Merge remote-tracking branch 'upstream/master' into vbsp-fixes.

author: Alan Edwardes <[email protected]> 2013-12-03 10:47:30 +0000
committer: Alan Edwardes <[email protected]> 2013-12-03 10:47:30 +0000
commit: 550992aebacbc7586553c15a3c2120f85a879126 (patch)
tree: c814cf654018acd5d69bb6e4be5dc9900391fd37 /mp/src/mathlib
parent: VBSP now checks all search paths for an FGD file. (diff)
parent: Make .xcconfigs text files too. (diff)
download: source-sdk-2013-550992aebacbc7586553c15a3c2120f85a879126.tar.xz
source-sdk-2013-550992aebacbc7586553c15a3c2120f85a879126.zip
27 files changed, 13946 insertions, 13948 deletions
diff --git a/mp/src/mathlib/3dnow.cpp b/mp/src/mathlib/3dnow.cpp
index 71657044..db17c8c1 100644
--- a/mp/src/mathlib/3dnow.cpp
+++ b/mp/src/mathlib/3dnow.cpp
@@ -1,197 +1,197 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 3DNow Math primitives.
-//
-//=====================================================================================//
-
-#include <math.h>
-#include <float.h>	// Needed for FLT_EPSILON
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/amd3dx.h"
-#include "mathlib/vector.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-#if !defined(COMPILER_MSVC64) && !defined(LINUX)
-// Implement for 64-bit Windows if needed.
-// Clang hits "fatal error: error in backend:" and other errors when trying
-// to compile the inline assembly below. 3DNow support is highly unlikely to
-// be useful/used, so it's not worth spending time on fixing.
-
-#pragma warning(disable:4244)   // "conversion from 'const int' to 'float', possible loss of data"
-#pragma warning(disable:4730)	// "mixing _m64 and floating point expressions may result in incorrect code"
-
-//-----------------------------------------------------------------------------
-// 3D Now Implementations of optimized routines:
-//-----------------------------------------------------------------------------
-float _3DNow_Sqrt(float x)
-{
-	Assert( s_bMathlibInitialized );
-	float	root = 0.f;
-#ifdef _WIN32
-	_asm
-	{
-		femms
-		movd		mm0, x
-		PFRSQRT		(mm1,mm0)
-		punpckldq	mm0, mm0
-		PFMUL		(mm0, mm1)
-		movd		root, mm0
-		femms
-	}
-#elif LINUX
- 	__asm __volatile__( "femms" );
- 	__asm __volatile__
-	(
-		"pfrsqrt    %y0, %y1 \n\t"
-		"punpckldq   %y1, %y1 \n\t"
-		"pfmul      %y1, %y0 \n\t"
-		: "=y" (root), "=y" (x)
- 		:"0" (x)
- 	);
- 	__asm __volatile__( "femms" );
-#else
-#error
-#endif
-
-	return root;
-}
-
-// NJS FIXME: Need to test Recripricol squareroot performance and accuraccy
-// on AMD's before using the specialized instruction.
-float _3DNow_RSqrt(float x)
-{
-	Assert( s_bMathlibInitialized );
-
-	return 1.f / _3DNow_Sqrt(x);
-}
-
-
-float FASTCALL _3DNow_VectorNormalize (Vector& vec)
-{
-	Assert( s_bMathlibInitialized );
-	float *v = &vec[0];
-	float	radius = 0.f;
-
-	if ( v[0] || v[1] || v[2] )
-	{
-#ifdef _WIN32
-	_asm
-		{
-			mov			eax, v
-			femms
-			movq		mm0, QWORD PTR [eax]
-			movd		mm1, DWORD PTR [eax+8]
-			movq		mm2, mm0
-			movq		mm3, mm1
-			PFMUL		(mm0, mm0)
-			PFMUL		(mm1, mm1)
-			PFACC		(mm0, mm0)
-			PFADD		(mm1, mm0)
-			PFRSQRT		(mm0, mm1)
-			punpckldq	mm1, mm1
-			PFMUL		(mm1, mm0)
-			PFMUL		(mm2, mm0)
-			PFMUL		(mm3, mm0)
-			movq		QWORD PTR [eax], mm2
-			movd		DWORD PTR [eax+8], mm3
-			movd		radius, mm1
-			femms
-		}
-#elif LINUX	
-		long long a,c;
-    		int b,d;
-    		memcpy(&a,&vec[0],sizeof(a));
-    		memcpy(&b,&vec[2],sizeof(b));
-    		memcpy(&c,&vec[0],sizeof(c));
-    		memcpy(&d,&vec[2],sizeof(d));
-
-      		__asm __volatile__( "femms" );
-        	__asm __volatile__
-        	(
-        		"pfmul           %y3, %y3\n\t"
-        		"pfmul           %y0, %y0 \n\t"
-        		"pfacc           %y3, %y3 \n\t"
-        		"pfadd           %y3, %y0 \n\t"
-        		"pfrsqrt         %y0, %y3 \n\t"
-        		"punpckldq       %y0, %y0 \n\t"
-        		"pfmul           %y3, %y0 \n\t"
-        		"pfmul           %y3, %y2 \n\t"
-        		"pfmul           %y3, %y1 \n\t"
-        		: "=y" (radius), "=y" (c), "=y" (d)
-        		: "y" (a), "0" (b), "1" (c), "2" (d)
-        	);
-      		memcpy(&vec[0],&c,sizeof(c));
-      		memcpy(&vec[2],&d,sizeof(d));		
-        	__asm __volatile__( "femms" );
-
-#else
-#error
-#endif
-	}
-    return radius;
-}
-
-
-void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec)
-{
-	_3DNow_VectorNormalize( vec );
-}
-
-
-// JAY: This complains with the latest processor pack
-#pragma warning(disable: 4730)
-
-float _3DNow_InvRSquared(const float* v)
-{
-	Assert( s_bMathlibInitialized );
-	float	r2 = 1.f;
-#ifdef _WIN32
-	_asm { // AMD 3DNow only routine
-		mov			eax, v
-		femms
-		movq		mm0, QWORD PTR [eax]
-		movd		mm1, DWORD PTR [eax+8]
-		movd		mm2, [r2]
-		PFMUL		(mm0, mm0)
-		PFMUL		(mm1, mm1)
-		PFACC		(mm0, mm0)
-		PFADD		(mm1, mm0)
-		PFMAX		(mm1, mm2)
-		PFRCP		(mm0, mm1)
-		movd		[r2], mm0
-		femms
-	}
-#elif LINUX
-		long long a,c;
-    		int b;
-    		memcpy(&a,&v[0],sizeof(a));
-    		memcpy(&b,&v[2],sizeof(b));
-    		memcpy(&c,&v[0],sizeof(c));
-
-      		__asm __volatile__( "femms" );
-        	__asm __volatile__
-        	(
-			"PFMUL          %y2, %y2 \n\t"
-                        "PFMUL          %y3, %y3 \n\t"
-                        "PFACC          %y2, %y2 \n\t"
-                        "PFADD          %y2, %y3 \n\t"
-                        "PFMAX          %y3, %y4 \n\t"
-                        "PFRCP          %y3, %y2 \n\t"
-                        "movq           %y2, %y0 \n\t"
-        		: "=y" (r2)
-        		: "0" (r2), "y" (a), "y" (b), "y" (c)
-        	);
-        	__asm __volatile__( "femms" );
-#else
-#error
-#endif
-
-	return r2;
-}
-
-#endif // COMPILER_MSVC64 
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 3DNow Math primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/amd3dx.h"
+#include "mathlib/vector.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+#if !defined(COMPILER_MSVC64) && !defined(LINUX)
+// Implement for 64-bit Windows if needed.
+// Clang hits "fatal error: error in backend:" and other errors when trying
+// to compile the inline assembly below. 3DNow support is highly unlikely to
+// be useful/used, so it's not worth spending time on fixing.
+
+#pragma warning(disable:4244)   // "conversion from 'const int' to 'float', possible loss of data"
+#pragma warning(disable:4730)	// "mixing _m64 and floating point expressions may result in incorrect code"
+
+//-----------------------------------------------------------------------------
+// 3D Now Implementations of optimized routines:
+//-----------------------------------------------------------------------------
+float _3DNow_Sqrt(float x)
+{
+	Assert( s_bMathlibInitialized );
+	float	root = 0.f;
+#ifdef _WIN32
+	_asm
+	{
+		femms
+		movd		mm0, x
+		PFRSQRT		(mm1,mm0)
+		punpckldq	mm0, mm0
+		PFMUL		(mm0, mm1)
+		movd		root, mm0
+		femms
+	}
+#elif LINUX
+ 	__asm __volatile__( "femms" );
+ 	__asm __volatile__
+	(
+		"pfrsqrt    %y0, %y1 \n\t"
+		"punpckldq   %y1, %y1 \n\t"
+		"pfmul      %y1, %y0 \n\t"
+		: "=y" (root), "=y" (x)
+ 		:"0" (x)
+ 	);
+ 	__asm __volatile__( "femms" );
+#else
+#error
+#endif
+
+	return root;
+}
+
+// NJS FIXME: Need to test Recripricol squareroot performance and accuraccy
+// on AMD's before using the specialized instruction.
+float _3DNow_RSqrt(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	return 1.f / _3DNow_Sqrt(x);
+}
+
+
+float FASTCALL _3DNow_VectorNormalize (Vector& vec)
+{
+	Assert( s_bMathlibInitialized );
+	float *v = &vec[0];
+	float	radius = 0.f;
+
+	if ( v[0] || v[1] || v[2] )
+	{
+#ifdef _WIN32
+	_asm
+		{
+			mov			eax, v
+			femms
+			movq		mm0, QWORD PTR [eax]
+			movd		mm1, DWORD PTR [eax+8]
+			movq		mm2, mm0
+			movq		mm3, mm1
+			PFMUL		(mm0, mm0)
+			PFMUL		(mm1, mm1)
+			PFACC		(mm0, mm0)
+			PFADD		(mm1, mm0)
+			PFRSQRT		(mm0, mm1)
+			punpckldq	mm1, mm1
+			PFMUL		(mm1, mm0)
+			PFMUL		(mm2, mm0)
+			PFMUL		(mm3, mm0)
+			movq		QWORD PTR [eax], mm2
+			movd		DWORD PTR [eax+8], mm3
+			movd		radius, mm1
+			femms
+		}
+#elif LINUX	
+		long long a,c;
+    		int b,d;
+    		memcpy(&a,&vec[0],sizeof(a));
+    		memcpy(&b,&vec[2],sizeof(b));
+    		memcpy(&c,&vec[0],sizeof(c));
+    		memcpy(&d,&vec[2],sizeof(d));
+
+      		__asm __volatile__( "femms" );
+        	__asm __volatile__
+        	(
+        		"pfmul           %y3, %y3\n\t"
+        		"pfmul           %y0, %y0 \n\t"
+        		"pfacc           %y3, %y3 \n\t"
+        		"pfadd           %y3, %y0 \n\t"
+        		"pfrsqrt         %y0, %y3 \n\t"
+        		"punpckldq       %y0, %y0 \n\t"
+        		"pfmul           %y3, %y0 \n\t"
+        		"pfmul           %y3, %y2 \n\t"
+        		"pfmul           %y3, %y1 \n\t"
+        		: "=y" (radius), "=y" (c), "=y" (d)
+        		: "y" (a), "0" (b), "1" (c), "2" (d)
+        	);
+      		memcpy(&vec[0],&c,sizeof(c));
+      		memcpy(&vec[2],&d,sizeof(d));		
+        	__asm __volatile__( "femms" );
+
+#else
+#error
+#endif
+	}
+    return radius;
+}
+
+
+void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec)
+{
+	_3DNow_VectorNormalize( vec );
+}
+
+
+// JAY: This complains with the latest processor pack
+#pragma warning(disable: 4730)
+
+float _3DNow_InvRSquared(const float* v)
+{
+	Assert( s_bMathlibInitialized );
+	float	r2 = 1.f;
+#ifdef _WIN32
+	_asm { // AMD 3DNow only routine
+		mov			eax, v
+		femms
+		movq		mm0, QWORD PTR [eax]
+		movd		mm1, DWORD PTR [eax+8]
+		movd		mm2, [r2]
+		PFMUL		(mm0, mm0)
+		PFMUL		(mm1, mm1)
+		PFACC		(mm0, mm0)
+		PFADD		(mm1, mm0)
+		PFMAX		(mm1, mm2)
+		PFRCP		(mm0, mm1)
+		movd		[r2], mm0
+		femms
+	}
+#elif LINUX
+		long long a,c;
+    		int b;
+    		memcpy(&a,&v[0],sizeof(a));
+    		memcpy(&b,&v[2],sizeof(b));
+    		memcpy(&c,&v[0],sizeof(c));
+
+      		__asm __volatile__( "femms" );
+        	__asm __volatile__
+        	(
+			"PFMUL          %y2, %y2 \n\t"
+                        "PFMUL          %y3, %y3 \n\t"
+                        "PFACC          %y2, %y2 \n\t"
+                        "PFADD          %y2, %y3 \n\t"
+                        "PFMAX          %y3, %y4 \n\t"
+                        "PFRCP          %y3, %y2 \n\t"
+                        "movq           %y2, %y0 \n\t"
+        		: "=y" (r2)
+        		: "0" (r2), "y" (a), "y" (b), "y" (c)
+        	);
+        	__asm __volatile__( "femms" );
+#else
+#error
+#endif
+
+	return r2;
+}
+
+#endif // COMPILER_MSVC64 
diff --git a/mp/src/mathlib/3dnow.h b/mp/src/mathlib/3dnow.h
index 47659c1f..c39b2ec5 100644
--- a/mp/src/mathlib/3dnow.h
+++ b/mp/src/mathlib/3dnow.h
@@ -1,16 +1,16 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-//=====================================================================================//
-
-#ifndef _3DNOW_H
-#define _3DNOW_H
-
-float _3DNow_Sqrt(float x);
-float _3DNow_RSqrt(float x);
-float FASTCALL _3DNow_VectorNormalize (Vector& vec);
-void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec);
-float _3DNow_InvRSquared(const float* v);
-
-#endif // _3DNOW_H
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#ifndef _3DNOW_H
+#define _3DNOW_H
+
+float _3DNow_Sqrt(float x);
+float _3DNow_RSqrt(float x);
+float FASTCALL _3DNow_VectorNormalize (Vector& vec);
+void FASTCALL _3DNow_VectorNormalizeFast (Vector& vec);
+float _3DNow_InvRSquared(const float* v);
+
+#endif // _3DNOW_H
diff --git a/mp/src/mathlib/IceKey.cpp b/mp/src/mathlib/IceKey.cpp
index b5f910b7..e739ce6f 100644
--- a/mp/src/mathlib/IceKey.cpp
+++ b/mp/src/mathlib/IceKey.cpp
@@ -1,393 +1,393 @@
-// Purpose: C++ implementation of the ICE encryption algorithm.
-//			Taken from public domain code, as written by Matthew Kwan - July 1996
-//			http://www.darkside.com.au/ice/
-
-#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
-
-#include "mathlib/IceKey.H"
-#include "tier0/memdbgon.h"
-#pragma warning(disable: 4244)
-
-
-	/* Structure of a single round subkey */
-class IceSubkey {
-    public:
-	unsigned long	val[3];
-};
-
-
-	/* The S-boxes */
-static unsigned long	ice_sbox[4][1024];
-static int		ice_sboxes_initialised = 0;
-
-
-	/* Modulo values for the S-boxes */
-static const int	ice_smod[4][4] = {
-				{333, 313, 505, 369},
-				{379, 375, 319, 391},
-				{361, 445, 451, 397},
-				{397, 425, 395, 505}};
-
-	/* XOR values for the S-boxes */
-static const int	ice_sxor[4][4] = {
-				{0x83, 0x85, 0x9b, 0xcd},
-				{0xcc, 0xa7, 0xad, 0x41},
-				{0x4b, 0x2e, 0xd4, 0x33},
-				{0xea, 0xcb, 0x2e, 0x04}};
-
-	/* Permutation values for the P-box */
-static const unsigned long	ice_pbox[32] = {
-		0x00000001, 0x00000080, 0x00000400, 0x00002000,
-		0x00080000, 0x00200000, 0x01000000, 0x40000000,
-		0x00000008, 0x00000020, 0x00000100, 0x00004000,
-		0x00010000, 0x00800000, 0x04000000, 0x20000000,
-		0x00000004, 0x00000010, 0x00000200, 0x00008000,
-		0x00020000, 0x00400000, 0x08000000, 0x10000000,
-		0x00000002, 0x00000040, 0x00000800, 0x00001000,
-		0x00040000, 0x00100000, 0x02000000, 0x80000000};
-
-	/* The key rotation schedule */
-static const int	ice_keyrot[16] = {
-				0, 1, 2, 3, 2, 1, 3, 0,
-				1, 3, 2, 0, 3, 1, 0, 2};
-
-
-/*
- * 8-bit Galois Field multiplication of a by b, modulo m.
- * Just like arithmetic multiplication, except that additions and
- * subtractions are replaced by XOR.
- */
-
-static unsigned int
-gf_mult (
-	register unsigned int	a,
-	register unsigned int	b,
-	register unsigned int	m
-) {
-	register unsigned int	res = 0;
-
-	while (b) {
-	    if (b & 1)
-		res ^= a;
-
-	    a <<= 1;
-	    b >>= 1;
-
-	    if (a >= 256)
-		a ^= m;
-	}
-
-	return (res);
-}
-
-
-/*
- * Galois Field exponentiation.
- * Raise the base to the power of 7, modulo m.
- */
-
-static unsigned long
-gf_exp7 (
-	register unsigned int	b,
-	unsigned int		m
-) {
-	register unsigned int	x;
-
-	if (b == 0)
-	    return (0);
-
-	x = gf_mult (b, b, m);
-	x = gf_mult (b, x, m);
-	x = gf_mult (x, x, m);
-	return (gf_mult (b, x, m));
-}
-
-
-/*
- * Carry out the ICE 32-bit P-box permutation.
- */
-
-static unsigned long
-ice_perm32 (
-	register unsigned long	x
-) {
-	register unsigned long		res = 0;
-	register const unsigned long	*pbox = ice_pbox;
-
-	while (x) {
-	    if (x & 1)
-		res |= *pbox;
-	    pbox++;
-	    x >>= 1;
-	}
-
-	return (res);
-}
-
-
-/*
- * Initialise the ICE S-boxes.
- * This only has to be done once.
- */
-
-static void
-ice_sboxes_init (void)
-{
-	register int	i;
-
-	for (i=0; i<1024; i++) {
-	    int			col = (i >> 1) & 0xff;
-	    int			row = (i & 0x1) | ((i & 0x200) >> 8);
-	    unsigned long	x;
-
-	    x = gf_exp7 (col ^ ice_sxor[0][row], ice_smod[0][row]) << 24;
-	    ice_sbox[0][i] = ice_perm32 (x);
-
-	    x = gf_exp7 (col ^ ice_sxor[1][row], ice_smod[1][row]) << 16;
-	    ice_sbox[1][i] = ice_perm32 (x);
-
-	    x = gf_exp7 (col ^ ice_sxor[2][row], ice_smod[2][row]) << 8;
-	    ice_sbox[2][i] = ice_perm32 (x);
-
-	    x = gf_exp7 (col ^ ice_sxor[3][row], ice_smod[3][row]);
-	    ice_sbox[3][i] = ice_perm32 (x);
-	}
-}
-
-
-/*
- * Create a new ICE key.
- */
-
-IceKey::IceKey (int n)
-{
-	if (!ice_sboxes_initialised) {
-	    ice_sboxes_init ();
-	    ice_sboxes_initialised = 1;
-	}
-
-	if (n < 1) {
-	    _size = 1;
-	    _rounds = 8;
-	} else {
-	    _size = n;
-	    _rounds = n * 16;
-	}
-
-	_keysched = new IceSubkey[_rounds];
-}
-
-
-/*
- * Destroy an ICE key.
- */
-
-IceKey::~IceKey ()
-{
-	int	i, j;
-
-	for (i=0; i<_rounds; i++)
-	    for (j=0; j<3; j++)
-		_keysched[i].val[j] = 0;
-
-	_rounds = _size = 0;
-
-	delete[] _keysched;
-}
-
-
-/*
- * The single round ICE f function.
- */
-
-static unsigned long
-ice_f (
-	register unsigned long	p,
-	const IceSubkey		*sk
-) {
-	unsigned long	tl, tr;		/* Expanded 40-bit values */
-	unsigned long	al, ar;		/* Salted expanded 40-bit values */
-
-					/* Left half expansion */
-	tl = ((p >> 16) & 0x3ff) | (((p >> 14) | (p << 18)) & 0xffc00);
-
-					/* Right half expansion */
-	tr = (p & 0x3ff) | ((p << 2) & 0xffc00);
-
-					/* Perform the salt permutation */
-			// al = (tr & sk->val[2]) | (tl & ~sk->val[2]);
-			// ar = (tl & sk->val[2]) | (tr & ~sk->val[2]);
-	al = sk->val[2] & (tl ^ tr);
-	ar = al ^ tr;
-	al ^= tl;
-
-	al ^= sk->val[0];		/* XOR with the subkey */
-	ar ^= sk->val[1];
-
-					/* S-box lookup and permutation */
-	return (ice_sbox[0][al >> 10] | ice_sbox[1][al & 0x3ff]
-		| ice_sbox[2][ar >> 10] | ice_sbox[3][ar & 0x3ff]);
-}
-
-
-/*
- * Encrypt a block of 8 bytes of data with the given ICE key.
- */
-
-void
-IceKey::encrypt (
-	const unsigned char	*ptext,
-	unsigned char		*ctext
-) const
-{
-	register int		i;
-	register unsigned long	l, r;
-
-	l = (((unsigned long) ptext[0]) << 24)
-				| (((unsigned long) ptext[1]) << 16)
-				| (((unsigned long) ptext[2]) << 8) | ptext[3];
-	r = (((unsigned long) ptext[4]) << 24)
-				| (((unsigned long) ptext[5]) << 16)
-				| (((unsigned long) ptext[6]) << 8) | ptext[7];
-
-	for (i = 0; i < _rounds; i += 2) {
-	    l ^= ice_f (r, &_keysched[i]);
-	    r ^= ice_f (l, &_keysched[i + 1]);
-	}
-
-	for (i = 0; i < 4; i++) {
-	    ctext[3 - i] = r & 0xff;
-	    ctext[7 - i] = l & 0xff;
-
-	    r >>= 8;
-	    l >>= 8;
-	}
-}
-
-
-/*
- * Decrypt a block of 8 bytes of data with the given ICE key.
- */
-
-void
-IceKey::decrypt (
-	const unsigned char	*ctext,
-	unsigned char		*ptext
-) const
-{
-	register int		i;
-	register unsigned long	l, r;
-
-	l = (((unsigned long) ctext[0]) << 24)
-				| (((unsigned long) ctext[1]) << 16)
-				| (((unsigned long) ctext[2]) << 8) | ctext[3];
-	r = (((unsigned long) ctext[4]) << 24)
-				| (((unsigned long) ctext[5]) << 16)
-				| (((unsigned long) ctext[6]) << 8) | ctext[7];
-
-	for (i = _rounds - 1; i > 0; i -= 2) {
-	    l ^= ice_f (r, &_keysched[i]);
-	    r ^= ice_f (l, &_keysched[i - 1]);
-	}
-
-	for (i = 0; i < 4; i++) {
-	    ptext[3 - i] = r & 0xff;
-	    ptext[7 - i] = l & 0xff;
-
-	    r >>= 8;
-	    l >>= 8;
-	}
-}
-
-
-/*
- * Set 8 rounds [n, n+7] of the key schedule of an ICE key.
- */
-
-void
-IceKey::scheduleBuild (
-	unsigned short	*kb,
-	int		n,
-	const int	*keyrot
-) {
-	int		i;
-
-	for (i=0; i<8; i++) {
-	    register int	j;
-	    register int	kr = keyrot[i];
-	    IceSubkey		*isk = &_keysched[n + i];
-
-	    for (j=0; j<3; j++)
-		isk->val[j] = 0;
-
-	    for (j=0; j<15; j++) {
-		register int	k;
-		unsigned long	*curr_sk = &isk->val[j % 3];
-
-		for (k=0; k<4; k++) {
-		    unsigned short	*curr_kb = &kb[(kr + k) & 3];
-		    register int	bit = *curr_kb & 1;
-
-		    *curr_sk = (*curr_sk << 1) | bit;
-		    *curr_kb = (*curr_kb >> 1) | ((bit ^ 1) << 15);
-		}
-	    }
-	}
-}
-
-
-/*
- * Set the key schedule of an ICE key.
- */
-
-void
-IceKey::set (
-	const unsigned char	*key
-) {
-	int		i;
-
-	if (_rounds == 8) {
-	    unsigned short	kb[4];
-
-	    for (i=0; i<4; i++)
-		kb[3 - i] = (key[i*2] << 8) | key[i*2 + 1];
-
-	    scheduleBuild (kb, 0, ice_keyrot);
-	    return;
-	}
-
-	for (i=0; i<_size; i++) {
-	    int			j;
-	    unsigned short	kb[4];
-
-	    for (j=0; j<4; j++)
-		kb[3 - j] = (key[i*8 + j*2] << 8) | key[i*8 + j*2 + 1];
-
-	    scheduleBuild (kb, i*8, ice_keyrot);
-	    scheduleBuild (kb, _rounds - 8 - i*8, &ice_keyrot[8]);
-	}
-}
-
-
-/*
- * Return the key size, in bytes.
- */
-
-int
-IceKey::keySize () const
-{
-	return (_size * 8);
-}
-
-
-/*
- * Return the block size, in bytes.
- */
-
-int
-IceKey::blockSize () const
-{
-	return (8);
-}
-
-#endif // !_STATIC_LINKED || _SHARED_LIB
+// Purpose: C++ implementation of the ICE encryption algorithm.
+//			Taken from public domain code, as written by Matthew Kwan - July 1996
+//			http://www.darkside.com.au/ice/
+
+#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
+
+#include "mathlib/IceKey.H"
+#include "tier0/memdbgon.h"
+#pragma warning(disable: 4244)
+
+
+	/* Structure of a single round subkey */
+class IceSubkey {
+    public:
+	unsigned long	val[3];
+};
+
+
+	/* The S-boxes */
+static unsigned long	ice_sbox[4][1024];
+static int		ice_sboxes_initialised = 0;
+
+
+	/* Modulo values for the S-boxes */
+static const int	ice_smod[4][4] = {
+				{333, 313, 505, 369},
+				{379, 375, 319, 391},
+				{361, 445, 451, 397},
+				{397, 425, 395, 505}};
+
+	/* XOR values for the S-boxes */
+static const int	ice_sxor[4][4] = {
+				{0x83, 0x85, 0x9b, 0xcd},
+				{0xcc, 0xa7, 0xad, 0x41},
+				{0x4b, 0x2e, 0xd4, 0x33},
+				{0xea, 0xcb, 0x2e, 0x04}};
+
+	/* Permutation values for the P-box */
+static const unsigned long	ice_pbox[32] = {
+		0x00000001, 0x00000080, 0x00000400, 0x00002000,
+		0x00080000, 0x00200000, 0x01000000, 0x40000000,
+		0x00000008, 0x00000020, 0x00000100, 0x00004000,
+		0x00010000, 0x00800000, 0x04000000, 0x20000000,
+		0x00000004, 0x00000010, 0x00000200, 0x00008000,
+		0x00020000, 0x00400000, 0x08000000, 0x10000000,
+		0x00000002, 0x00000040, 0x00000800, 0x00001000,
+		0x00040000, 0x00100000, 0x02000000, 0x80000000};
+
+	/* The key rotation schedule */
+static const int	ice_keyrot[16] = {
+				0, 1, 2, 3, 2, 1, 3, 0,
+				1, 3, 2, 0, 3, 1, 0, 2};
+
+
+/*
+ * 8-bit Galois Field multiplication of a by b, modulo m.
+ * Just like arithmetic multiplication, except that additions and
+ * subtractions are replaced by XOR.
+ */
+
+static unsigned int
+gf_mult (
+	register unsigned int	a,
+	register unsigned int	b,
+	register unsigned int	m
+) {
+	register unsigned int	res = 0;
+
+	while (b) {
+	    if (b & 1)
+		res ^= a;
+
+	    a <<= 1;
+	    b >>= 1;
+
+	    if (a >= 256)
+		a ^= m;
+	}
+
+	return (res);
+}
+
+
+/*
+ * Galois Field exponentiation.
+ * Raise the base to the power of 7, modulo m.
+ */
+
+static unsigned long
+gf_exp7 (
+	register unsigned int	b,
+	unsigned int		m
+) {
+	register unsigned int	x;
+
+	if (b == 0)
+	    return (0);
+
+	x = gf_mult (b, b, m);
+	x = gf_mult (b, x, m);
+	x = gf_mult (x, x, m);
+	return (gf_mult (b, x, m));
+}
+
+
+/*
+ * Carry out the ICE 32-bit P-box permutation.
+ */
+
+static unsigned long
+ice_perm32 (
+	register unsigned long	x
+) {
+	register unsigned long		res = 0;
+	register const unsigned long	*pbox = ice_pbox;
+
+	while (x) {
+	    if (x & 1)
+		res |= *pbox;
+	    pbox++;
+	    x >>= 1;
+	}
+
+	return (res);
+}
+
+
+/*
+ * Initialise the ICE S-boxes.
+ * This only has to be done once.
+ */
+
+static void
+ice_sboxes_init (void)
+{
+	register int	i;
+
+	for (i=0; i<1024; i++) {
+	    int			col = (i >> 1) & 0xff;
+	    int			row = (i & 0x1) | ((i & 0x200) >> 8);
+	    unsigned long	x;
+
+	    x = gf_exp7 (col ^ ice_sxor[0][row], ice_smod[0][row]) << 24;
+	    ice_sbox[0][i] = ice_perm32 (x);
+
+	    x = gf_exp7 (col ^ ice_sxor[1][row], ice_smod[1][row]) << 16;
+	    ice_sbox[1][i] = ice_perm32 (x);
+
+	    x = gf_exp7 (col ^ ice_sxor[2][row], ice_smod[2][row]) << 8;
+	    ice_sbox[2][i] = ice_perm32 (x);
+
+	    x = gf_exp7 (col ^ ice_sxor[3][row], ice_smod[3][row]);
+	    ice_sbox[3][i] = ice_perm32 (x);
+	}
+}
+
+
+/*
+ * Create a new ICE key.
+ */
+
+IceKey::IceKey (int n)
+{
+	if (!ice_sboxes_initialised) {
+	    ice_sboxes_init ();
+	    ice_sboxes_initialised = 1;
+	}
+
+	if (n < 1) {
+	    _size = 1;
+	    _rounds = 8;
+	} else {
+	    _size = n;
+	    _rounds = n * 16;
+	}
+
+	_keysched = new IceSubkey[_rounds];
+}
+
+
+/*
+ * Destroy an ICE key.
+ */
+
+IceKey::~IceKey ()
+{
+	int	i, j;
+
+	for (i=0; i<_rounds; i++)
+	    for (j=0; j<3; j++)
+		_keysched[i].val[j] = 0;
+
+	_rounds = _size = 0;
+
+	delete[] _keysched;
+}
+
+
+/*
+ * The single round ICE f function.
+ */
+
+static unsigned long
+ice_f (
+	register unsigned long	p,
+	const IceSubkey		*sk
+) {
+	unsigned long	tl, tr;		/* Expanded 40-bit values */
+	unsigned long	al, ar;		/* Salted expanded 40-bit values */
+
+					/* Left half expansion */
+	tl = ((p >> 16) & 0x3ff) | (((p >> 14) | (p << 18)) & 0xffc00);
+
+					/* Right half expansion */
+	tr = (p & 0x3ff) | ((p << 2) & 0xffc00);
+
+					/* Perform the salt permutation */
+			// al = (tr & sk->val[2]) | (tl & ~sk->val[2]);
+			// ar = (tl & sk->val[2]) | (tr & ~sk->val[2]);
+	al = sk->val[2] & (tl ^ tr);
+	ar = al ^ tr;
+	al ^= tl;
+
+	al ^= sk->val[0];		/* XOR with the subkey */
+	ar ^= sk->val[1];
+
+					/* S-box lookup and permutation */
+	return (ice_sbox[0][al >> 10] | ice_sbox[1][al & 0x3ff]
+		| ice_sbox[2][ar >> 10] | ice_sbox[3][ar & 0x3ff]);
+}
+
+
+/*
+ * Encrypt a block of 8 bytes of data with the given ICE key.
+ */
+
+void
+IceKey::encrypt (
+	const unsigned char	*ptext,
+	unsigned char		*ctext
+) const
+{
+	register int		i;
+	register unsigned long	l, r;
+
+	l = (((unsigned long) ptext[0]) << 24)
+				| (((unsigned long) ptext[1]) << 16)
+				| (((unsigned long) ptext[2]) << 8) | ptext[3];
+	r = (((unsigned long) ptext[4]) << 24)
+				| (((unsigned long) ptext[5]) << 16)
+				| (((unsigned long) ptext[6]) << 8) | ptext[7];
+
+	for (i = 0; i < _rounds; i += 2) {
+	    l ^= ice_f (r, &_keysched[i]);
+	    r ^= ice_f (l, &_keysched[i + 1]);
+	}
+
+	for (i = 0; i < 4; i++) {
+	    ctext[3 - i] = r & 0xff;
+	    ctext[7 - i] = l & 0xff;
+
+	    r >>= 8;
+	    l >>= 8;
+	}
+}
+
+
+/*
+ * Decrypt a block of 8 bytes of data with the given ICE key.
+ */
+
+void
+IceKey::decrypt (
+	const unsigned char	*ctext,
+	unsigned char		*ptext
+) const
+{
+	register int		i;
+	register unsigned long	l, r;
+
+	l = (((unsigned long) ctext[0]) << 24)
+				| (((unsigned long) ctext[1]) << 16)
+				| (((unsigned long) ctext[2]) << 8) | ctext[3];
+	r = (((unsigned long) ctext[4]) << 24)
+				| (((unsigned long) ctext[5]) << 16)
+				| (((unsigned long) ctext[6]) << 8) | ctext[7];
+
+	for (i = _rounds - 1; i > 0; i -= 2) {
+	    l ^= ice_f (r, &_keysched[i]);
+	    r ^= ice_f (l, &_keysched[i - 1]);
+	}
+
+	for (i = 0; i < 4; i++) {
+	    ptext[3 - i] = r & 0xff;
+	    ptext[7 - i] = l & 0xff;
+
+	    r >>= 8;
+	    l >>= 8;
+	}
+}
+
+
+/*
+ * Set 8 rounds [n, n+7] of the key schedule of an ICE key.
+ */
+
+void
+IceKey::scheduleBuild (
+	unsigned short	*kb,
+	int		n,
+	const int	*keyrot
+) {
+	int		i;
+
+	for (i=0; i<8; i++) {
+	    register int	j;
+	    register int	kr = keyrot[i];
+	    IceSubkey		*isk = &_keysched[n + i];
+
+	    for (j=0; j<3; j++)
+		isk->val[j] = 0;
+
+	    for (j=0; j<15; j++) {
+		register int	k;
+		unsigned long	*curr_sk = &isk->val[j % 3];
+
+		for (k=0; k<4; k++) {
+		    unsigned short	*curr_kb = &kb[(kr + k) & 3];
+		    register int	bit = *curr_kb & 1;
+
+		    *curr_sk = (*curr_sk << 1) | bit;
+		    *curr_kb = (*curr_kb >> 1) | ((bit ^ 1) << 15);
+		}
+	    }
+	}
+}
+
+
+/*
+ * Set the key schedule of an ICE key.
+ */
+
+void
+IceKey::set (
+	const unsigned char	*key
+) {
+	int		i;
+
+	if (_rounds == 8) {
+	    unsigned short	kb[4];
+
+	    for (i=0; i<4; i++)
+		kb[3 - i] = (key[i*2] << 8) | key[i*2 + 1];
+
+	    scheduleBuild (kb, 0, ice_keyrot);
+	    return;
+	}
+
+	for (i=0; i<_size; i++) {
+	    int			j;
+	    unsigned short	kb[4];
+
+	    for (j=0; j<4; j++)
+		kb[3 - j] = (key[i*8 + j*2] << 8) | key[i*8 + j*2 + 1];
+
+	    scheduleBuild (kb, i*8, ice_keyrot);
+	    scheduleBuild (kb, _rounds - 8 - i*8, &ice_keyrot[8]);
+	}
+}
+
+
+/*
+ * Return the key size, in bytes.
+ */
+
+int
+IceKey::keySize () const
+{
+	return (_size * 8);
+}
+
+
+/*
+ * Return the block size, in bytes.
+ */
+
+int
+IceKey::blockSize () const
+{
+	return (8);
+}
+
+#endif // !_STATIC_LINKED || _SHARED_LIB
diff --git a/mp/src/mathlib/almostequal.cpp b/mp/src/mathlib/almostequal.cpp
index d4d3fba2..53b8a9e3 100644
--- a/mp/src/mathlib/almostequal.cpp
+++ b/mp/src/mathlib/almostequal.cpp
@@ -1,97 +1,97 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: Fast ways to compare equality of two floats.  Assumes 
-// sizeof(float) == sizeof(int) and we are using IEEE format.
-//
-// Source:  http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
-//=====================================================================================//
-
-#include <float.h>
-#include <math.h>
-
-#include "mathlib/mathlib.h"
-
-static inline bool AE_IsInfinite(float a)
-{
-    const int kInfAsInt = 0x7F800000;
-
-    // An infinity has an exponent of 255 (shift left 23 positions) and
-    // a zero mantissa. There are two infinities - positive and negative.
-    if ((*(int*)&a & 0x7FFFFFFF) == kInfAsInt)
-        return true;
-    return false;
-}
-
-static inline bool AE_IsNan(float a)
-{
-    // a NAN has an exponent of 255 (shifted left 23 positions) and
-    // a non-zero mantissa.
-    int exp = *(int*)&a & 0x7F800000;
-    int mantissa = *(int*)&a & 0x007FFFFF;
-    if (exp == 0x7F800000 && mantissa != 0)
-        return true;
-    return false;
-}
-
-static inline int AE_Sign(float a)
-{
-    // The sign bit of a number is the high bit.
-    return (*(int*)&a) & 0x80000000;
-}
-
-// This is the 'final' version of the AlmostEqualUlps function.
-// The optional checks are included for completeness, but in many
-// cases they are not necessary, or even not desirable.
-bool AlmostEqual(float a, float b, int maxUlps)
-{
-    // There are several optional checks that you can do, depending
-    // on what behavior you want from your floating point comparisons.
-    // These checks should not be necessary and they are included
-    // mainly for completeness.
-
-    // If a or b are infinity (positive or negative) then
-    // only return true if they are exactly equal to each other -
-    // that is, if they are both infinities of the same sign.
-    // This check is only needed if you will be generating
-    // infinities and you don't want them 'close' to numbers
-    // near FLT_MAX.
-    if (AE_IsInfinite(a) || AE_IsInfinite(b))
-        return a == b;
-
-    // If a or b are a NAN, return false. NANs are equal to nothing,
-    // not even themselves.
-    // This check is only needed if you will be generating NANs
-    // and you use a maxUlps greater than 4 million or you want to
-    // ensure that a NAN does not equal itself.
-    if (AE_IsNan(a) || AE_IsNan(b))
-        return false;
-
-    // After adjusting floats so their representations are lexicographically
-    // ordered as twos-complement integers a very small positive number
-    // will compare as 'close' to a very small negative number. If this is
-    // not desireable, and if you are on a platform that supports
-    // subnormals (which is the only place the problem can show up) then
-    // you need this check.
-    // The check for a == b is because zero and negative zero have different
-    // signs but are equal to each other.
-    if (AE_Sign(a) != AE_Sign(b))
-        return a == b;
-
-    int aInt = *(int*)&a;
-    // Make aInt lexicographically ordered as a twos-complement int
-    if (aInt < 0)
-        aInt = 0x80000000 - aInt;
-    // Make bInt lexicographically ordered as a twos-complement int
-    int bInt = *(int*)&b;
-    if (bInt < 0)
-        bInt = 0x80000000 - bInt;
-
-    // Now we can compare aInt and bInt to find out how far apart a and b
-    // are.
-    int intDiff = abs(aInt - bInt);
-    if (intDiff <= maxUlps)
-        return true;
-    return false;
-}
-
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Fast ways to compare equality of two floats.  Assumes 
+// sizeof(float) == sizeof(int) and we are using IEEE format.
+//
+// Source:  http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
+//=====================================================================================//
+
+#include <float.h>
+#include <math.h>
+
+#include "mathlib/mathlib.h"
+
+static inline bool AE_IsInfinite(float a)
+{
+    const int kInfAsInt = 0x7F800000;
+
+    // An infinity has an exponent of 255 (shift left 23 positions) and
+    // a zero mantissa. There are two infinities - positive and negative.
+    if ((*(int*)&a & 0x7FFFFFFF) == kInfAsInt)
+        return true;
+    return false;
+}
+
+static inline bool AE_IsNan(float a)
+{
+    // a NAN has an exponent of 255 (shifted left 23 positions) and
+    // a non-zero mantissa.
+    int exp = *(int*)&a & 0x7F800000;
+    int mantissa = *(int*)&a & 0x007FFFFF;
+    if (exp == 0x7F800000 && mantissa != 0)
+        return true;
+    return false;
+}
+
+static inline int AE_Sign(float a)
+{
+    // The sign bit of a number is the high bit.
+    return (*(int*)&a) & 0x80000000;
+}
+
+// This is the 'final' version of the AlmostEqualUlps function.
+// The optional checks are included for completeness, but in many
+// cases they are not necessary, or even not desirable.
+bool AlmostEqual(float a, float b, int maxUlps)
+{
+    // There are several optional checks that you can do, depending
+    // on what behavior you want from your floating point comparisons.
+    // These checks should not be necessary and they are included
+    // mainly for completeness.
+
+    // If a or b are infinity (positive or negative) then
+    // only return true if they are exactly equal to each other -
+    // that is, if they are both infinities of the same sign.
+    // This check is only needed if you will be generating
+    // infinities and you don't want them 'close' to numbers
+    // near FLT_MAX.
+    if (AE_IsInfinite(a) || AE_IsInfinite(b))
+        return a == b;
+
+    // If a or b are a NAN, return false. NANs are equal to nothing,
+    // not even themselves.
+    // This check is only needed if you will be generating NANs
+    // and you use a maxUlps greater than 4 million or you want to
+    // ensure that a NAN does not equal itself.
+    if (AE_IsNan(a) || AE_IsNan(b))
+        return false;
+
+    // After adjusting floats so their representations are lexicographically
+    // ordered as twos-complement integers a very small positive number
+    // will compare as 'close' to a very small negative number. If this is
+    // not desireable, and if you are on a platform that supports
+    // subnormals (which is the only place the problem can show up) then
+    // you need this check.
+    // The check for a == b is because zero and negative zero have different
+    // signs but are equal to each other.
+    if (AE_Sign(a) != AE_Sign(b))
+        return a == b;
+
+    int aInt = *(int*)&a;
+    // Make aInt lexicographically ordered as a twos-complement int
+    if (aInt < 0)
+        aInt = 0x80000000 - aInt;
+    // Make bInt lexicographically ordered as a twos-complement int
+    int bInt = *(int*)&b;
+    if (bInt < 0)
+        bInt = 0x80000000 - bInt;
+
+    // Now we can compare aInt and bInt to find out how far apart a and b
+    // are.
+    int intDiff = abs(aInt - bInt);
+    if (intDiff <= maxUlps)
+        return true;
+    return false;
+}
+
+
diff --git a/mp/src/mathlib/anorms.cpp b/mp/src/mathlib/anorms.cpp
index 1970716d..5ce1c7e6 100644
--- a/mp/src/mathlib/anorms.cpp
+++ b/mp/src/mathlib/anorms.cpp
@@ -1,181 +1,181 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-//=============================================================================//
-#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
-
-
-#include "mathlib/vector.h"
-#include "mathlib/anorms.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-Vector g_anorms[NUMVERTEXNORMALS] =
-{
-	Vector(-0.525731, 0.000000, 0.850651), 
-	Vector(-0.442863, 0.238856, 0.864188), 
-	Vector(-0.295242, 0.000000, 0.955423), 
-	Vector(-0.309017, 0.500000, 0.809017), 
-	Vector(-0.162460, 0.262866, 0.951056), 
-	Vector(0.000000, 0.000000, 1.000000), 
-	Vector(0.000000, 0.850651, 0.525731), 
-	Vector(-0.147621, 0.716567, 0.681718), 
-	Vector(0.147621, 0.716567, 0.681718), 
-	Vector(0.000000, 0.525731, 0.850651), 
-	Vector(0.309017, 0.500000, 0.809017), 
-	Vector(0.525731, 0.000000, 0.850651), 
-	Vector(0.295242, 0.000000, 0.955423), 
-	Vector(0.442863, 0.238856, 0.864188), 
-	Vector(0.162460, 0.262866, 0.951056), 
-	Vector(-0.681718, 0.147621, 0.716567), 
-	Vector(-0.809017, 0.309017, 0.500000), 
-	Vector(-0.587785, 0.425325, 0.688191), 
-	Vector(-0.850651, 0.525731, 0.000000), 
-	Vector(-0.864188, 0.442863, 0.238856), 
-	Vector(-0.716567, 0.681718, 0.147621), 
-	Vector(-0.688191, 0.587785, 0.425325), 
-	Vector(-0.500000, 0.809017, 0.309017), 
-	Vector(-0.238856, 0.864188, 0.442863), 
-	Vector(-0.425325, 0.688191, 0.587785), 
-	Vector(-0.716567, 0.681718, -0.147621), 
-	Vector(-0.500000, 0.809017, -0.309017), 
-	Vector(-0.525731, 0.850651, 0.000000), 
-	Vector(0.000000, 0.850651, -0.525731), 
-	Vector(-0.238856, 0.864188, -0.442863), 
-	Vector(0.000000, 0.955423, -0.295242), 
-	Vector(-0.262866, 0.951056, -0.162460), 
-	Vector(0.000000, 1.000000, 0.000000), 
-	Vector(0.000000, 0.955423, 0.295242), 
-	Vector(-0.262866, 0.951056, 0.162460), 
-	Vector(0.238856, 0.864188, 0.442863), 
-	Vector(0.262866, 0.951056, 0.162460), 
-	Vector(0.500000, 0.809017, 0.309017), 
-	Vector(0.238856, 0.864188, -0.442863), 
-	Vector(0.262866, 0.951056, -0.162460), 
-	Vector(0.500000, 0.809017, -0.309017), 
-	Vector(0.850651, 0.525731, 0.000000), 
-	Vector(0.716567, 0.681718, 0.147621), 
-	Vector(0.716567, 0.681718, -0.147621), 
-	Vector(0.525731, 0.850651, 0.000000), 
-	Vector(0.425325, 0.688191, 0.587785), 
-	Vector(0.864188, 0.442863, 0.238856), 
-	Vector(0.688191, 0.587785, 0.425325), 
-	Vector(0.809017, 0.309017, 0.500000), 
-	Vector(0.681718, 0.147621, 0.716567), 
-	Vector(0.587785, 0.425325, 0.688191), 
-	Vector(0.955423, 0.295242, 0.000000), 
-	Vector(1.000000, 0.000000, 0.000000), 
-	Vector(0.951056, 0.162460, 0.262866), 
-	Vector(0.850651, -0.525731, 0.000000), 
-	Vector(0.955423, -0.295242, 0.000000), 
-	Vector(0.864188, -0.442863, 0.238856), 
-	Vector(0.951056, -0.162460, 0.262866), 
-	Vector(0.809017, -0.309017, 0.500000), 
-	Vector(0.681718, -0.147621, 0.716567), 
-	Vector(0.850651, 0.000000, 0.525731), 
-	Vector(0.864188, 0.442863, -0.238856), 
-	Vector(0.809017, 0.309017, -0.500000), 
-	Vector(0.951056, 0.162460, -0.262866), 
-	Vector(0.525731, 0.000000, -0.850651), 
-	Vector(0.681718, 0.147621, -0.716567), 
-	Vector(0.681718, -0.147621, -0.716567), 
-	Vector(0.850651, 0.000000, -0.525731), 
-	Vector(0.809017, -0.309017, -0.500000), 
-	Vector(0.864188, -0.442863, -0.238856), 
-	Vector(0.951056, -0.162460, -0.262866), 
-	Vector(0.147621, 0.716567, -0.681718), 
-	Vector(0.309017, 0.500000, -0.809017), 
-	Vector(0.425325, 0.688191, -0.587785), 
-	Vector(0.442863, 0.238856, -0.864188), 
-	Vector(0.587785, 0.425325, -0.688191), 
-	Vector(0.688191, 0.587785, -0.425325), 
-	Vector(-0.147621, 0.716567, -0.681718), 
-	Vector(-0.309017, 0.500000, -0.809017), 
-	Vector(0.000000, 0.525731, -0.850651), 
-	Vector(-0.525731, 0.000000, -0.850651), 
-	Vector(-0.442863, 0.238856, -0.864188), 
-	Vector(-0.295242, 0.000000, -0.955423), 
-	Vector(-0.162460, 0.262866, -0.951056), 
-	Vector(0.000000, 0.000000, -1.000000), 
-	Vector(0.295242, 0.000000, -0.955423), 
-	Vector(0.162460, 0.262866, -0.951056), 
-	Vector(-0.442863, -0.238856, -0.864188), 
-	Vector(-0.309017, -0.500000, -0.809017), 
-	Vector(-0.162460, -0.262866, -0.951056), 
-	Vector(0.000000, -0.850651, -0.525731), 
-	Vector(-0.147621, -0.716567, -0.681718), 
-	Vector(0.147621, -0.716567, -0.681718), 
-	Vector(0.000000, -0.525731, -0.850651), 
-	Vector(0.309017, -0.500000, -0.809017), 
-	Vector(0.442863, -0.238856, -0.864188), 
-	Vector(0.162460, -0.262866, -0.951056), 
-	Vector(0.238856, -0.864188, -0.442863), 
-	Vector(0.500000, -0.809017, -0.309017), 
-	Vector(0.425325, -0.688191, -0.587785), 
-	Vector(0.716567, -0.681718, -0.147621), 
-	Vector(0.688191, -0.587785, -0.425325), 
-	Vector(0.587785, -0.425325, -0.688191), 
-	Vector(0.000000, -0.955423, -0.295242), 
-	Vector(0.000000, -1.000000, 0.000000), 
-	Vector(0.262866, -0.951056, -0.162460), 
-	Vector(0.000000, -0.850651, 0.525731), 
-	Vector(0.000000, -0.955423, 0.295242), 
-	Vector(0.238856, -0.864188, 0.442863), 
-	Vector(0.262866, -0.951056, 0.162460), 
-	Vector(0.500000, -0.809017, 0.309017), 
-	Vector(0.716567, -0.681718, 0.147621), 
-	Vector(0.525731, -0.850651, 0.000000), 
-	Vector(-0.238856, -0.864188, -0.442863), 
-	Vector(-0.500000, -0.809017, -0.309017), 
-	Vector(-0.262866, -0.951056, -0.162460), 
-	Vector(-0.850651, -0.525731, 0.000000), 
-	Vector(-0.716567, -0.681718, -0.147621), 
-	Vector(-0.716567, -0.681718, 0.147621), 
-	Vector(-0.525731, -0.850651, 0.000000), 
-	Vector(-0.500000, -0.809017, 0.309017), 
-	Vector(-0.238856, -0.864188, 0.442863), 
-	Vector(-0.262866, -0.951056, 0.162460), 
-	Vector(-0.864188, -0.442863, 0.238856), 
-	Vector(-0.809017, -0.309017, 0.500000), 
-	Vector(-0.688191, -0.587785, 0.425325), 
-	Vector(-0.681718, -0.147621, 0.716567), 
-	Vector(-0.442863, -0.238856, 0.864188), 
-	Vector(-0.587785, -0.425325, 0.688191), 
-	Vector(-0.309017, -0.500000, 0.809017), 
-	Vector(-0.147621, -0.716567, 0.681718), 
-	Vector(-0.425325, -0.688191, 0.587785), 
-	Vector(-0.162460, -0.262866, 0.951056), 
-	Vector(0.442863, -0.238856, 0.864188), 
-	Vector(0.162460, -0.262866, 0.951056), 
-	Vector(0.309017, -0.500000, 0.809017), 
-	Vector(0.147621, -0.716567, 0.681718), 
-	Vector(0.000000, -0.525731, 0.850651), 
-	Vector(0.425325, -0.688191, 0.587785), 
-	Vector(0.587785, -0.425325, 0.688191), 
-	Vector(0.688191, -0.587785, 0.425325), 
-	Vector(-0.955423, 0.295242, 0.000000), 
-	Vector(-0.951056, 0.162460, 0.262866), 
-	Vector(-1.000000, 0.000000, 0.000000), 
-	Vector(-0.850651, 0.000000, 0.525731), 
-	Vector(-0.955423, -0.295242, 0.000000), 
-	Vector(-0.951056, -0.162460, 0.262866), 
-	Vector(-0.864188, 0.442863, -0.238856), 
-	Vector(-0.951056, 0.162460, -0.262866), 
-	Vector(-0.809017, 0.309017, -0.500000), 
-	Vector(-0.864188, -0.442863, -0.238856), 
-	Vector(-0.951056, -0.162460, -0.262866), 
-	Vector(-0.809017, -0.309017, -0.500000), 
-	Vector(-0.681718, 0.147621, -0.716567), 
-	Vector(-0.681718, -0.147621, -0.716567), 
-	Vector(-0.850651, 0.000000, -0.525731), 
-	Vector(-0.688191, 0.587785, -0.425325), 
-	Vector(-0.587785, 0.425325, -0.688191), 
-	Vector(-0.425325, 0.688191, -0.587785), 
-	Vector(-0.425325, -0.688191, -0.587785), 
-	Vector(-0.587785, -0.425325, -0.688191), 
-	Vector(-0.688191, -0.587785, -0.425325)
-};
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=============================================================================//
+#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
+
+
+#include "mathlib/vector.h"
+#include "mathlib/anorms.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+Vector g_anorms[NUMVERTEXNORMALS] =	// Precomputed direction table; 162 initializers follow. NUMVERTEXNORMALS is defined outside this file (presumably mathlib/anorms.h) - confirm it equals 162.
+{	// Entries spot-check as unit length (e.g. 0.525731^2 + 0.850651^2 ~= 1). Order is presumably significant to code that indexes this table - do not sort or dedupe.
+	Vector(-0.525731, 0.000000, 0.850651), 
+	Vector(-0.442863, 0.238856, 0.864188), 
+	Vector(-0.295242, 0.000000, 0.955423), 
+	Vector(-0.309017, 0.500000, 0.809017), 
+	Vector(-0.162460, 0.262866, 0.951056), 
+	Vector(0.000000, 0.000000, 1.000000), 
+	Vector(0.000000, 0.850651, 0.525731), 
+	Vector(-0.147621, 0.716567, 0.681718), 
+	Vector(0.147621, 0.716567, 0.681718), 
+	Vector(0.000000, 0.525731, 0.850651), 
+	Vector(0.309017, 0.500000, 0.809017), 
+	Vector(0.525731, 0.000000, 0.850651), 
+	Vector(0.295242, 0.000000, 0.955423), 
+	Vector(0.442863, 0.238856, 0.864188), 
+	Vector(0.162460, 0.262866, 0.951056), 
+	Vector(-0.681718, 0.147621, 0.716567), 
+	Vector(-0.809017, 0.309017, 0.500000), 
+	Vector(-0.587785, 0.425325, 0.688191), 
+	Vector(-0.850651, 0.525731, 0.000000), 
+	Vector(-0.864188, 0.442863, 0.238856), 
+	Vector(-0.716567, 0.681718, 0.147621), 
+	Vector(-0.688191, 0.587785, 0.425325), 
+	Vector(-0.500000, 0.809017, 0.309017), 
+	Vector(-0.238856, 0.864188, 0.442863), 
+	Vector(-0.425325, 0.688191, 0.587785), 
+	Vector(-0.716567, 0.681718, -0.147621), 
+	Vector(-0.500000, 0.809017, -0.309017), 
+	Vector(-0.525731, 0.850651, 0.000000), 
+	Vector(0.000000, 0.850651, -0.525731), 
+	Vector(-0.238856, 0.864188, -0.442863), 
+	Vector(0.000000, 0.955423, -0.295242), 
+	Vector(-0.262866, 0.951056, -0.162460), 
+	Vector(0.000000, 1.000000, 0.000000), 
+	Vector(0.000000, 0.955423, 0.295242), 
+	Vector(-0.262866, 0.951056, 0.162460), 
+	Vector(0.238856, 0.864188, 0.442863), 
+	Vector(0.262866, 0.951056, 0.162460), 
+	Vector(0.500000, 0.809017, 0.309017), 
+	Vector(0.238856, 0.864188, -0.442863), 
+	Vector(0.262866, 0.951056, -0.162460), 
+	Vector(0.500000, 0.809017, -0.309017), 
+	Vector(0.850651, 0.525731, 0.000000), 
+	Vector(0.716567, 0.681718, 0.147621), 
+	Vector(0.716567, 0.681718, -0.147621), 
+	Vector(0.525731, 0.850651, 0.000000), 
+	Vector(0.425325, 0.688191, 0.587785), 
+	Vector(0.864188, 0.442863, 0.238856), 
+	Vector(0.688191, 0.587785, 0.425325), 
+	Vector(0.809017, 0.309017, 0.500000), 
+	Vector(0.681718, 0.147621, 0.716567), 
+	Vector(0.587785, 0.425325, 0.688191), 
+	Vector(0.955423, 0.295242, 0.000000), 
+	Vector(1.000000, 0.000000, 0.000000), 
+	Vector(0.951056, 0.162460, 0.262866), 
+	Vector(0.850651, -0.525731, 0.000000), 
+	Vector(0.955423, -0.295242, 0.000000), 
+	Vector(0.864188, -0.442863, 0.238856), 
+	Vector(0.951056, -0.162460, 0.262866), 
+	Vector(0.809017, -0.309017, 0.500000), 
+	Vector(0.681718, -0.147621, 0.716567), 
+	Vector(0.850651, 0.000000, 0.525731), 
+	Vector(0.864188, 0.442863, -0.238856), 
+	Vector(0.809017, 0.309017, -0.500000), 
+	Vector(0.951056, 0.162460, -0.262866), 
+	Vector(0.525731, 0.000000, -0.850651), 
+	Vector(0.681718, 0.147621, -0.716567), 
+	Vector(0.681718, -0.147621, -0.716567), 
+	Vector(0.850651, 0.000000, -0.525731), 
+	Vector(0.809017, -0.309017, -0.500000), 
+	Vector(0.864188, -0.442863, -0.238856), 
+	Vector(0.951056, -0.162460, -0.262866), 
+	Vector(0.147621, 0.716567, -0.681718), 
+	Vector(0.309017, 0.500000, -0.809017), 
+	Vector(0.425325, 0.688191, -0.587785), 
+	Vector(0.442863, 0.238856, -0.864188), 
+	Vector(0.587785, 0.425325, -0.688191), 
+	Vector(0.688191, 0.587785, -0.425325), 
+	Vector(-0.147621, 0.716567, -0.681718), 
+	Vector(-0.309017, 0.500000, -0.809017), 
+	Vector(0.000000, 0.525731, -0.850651), 
+	Vector(-0.525731, 0.000000, -0.850651), 
+	Vector(-0.442863, 0.238856, -0.864188), 
+	Vector(-0.295242, 0.000000, -0.955423), 
+	Vector(-0.162460, 0.262866, -0.951056), 
+	Vector(0.000000, 0.000000, -1.000000), 
+	Vector(0.295242, 0.000000, -0.955423), 
+	Vector(0.162460, 0.262866, -0.951056), 
+	Vector(-0.442863, -0.238856, -0.864188), 
+	Vector(-0.309017, -0.500000, -0.809017), 
+	Vector(-0.162460, -0.262866, -0.951056), 
+	Vector(0.000000, -0.850651, -0.525731), 
+	Vector(-0.147621, -0.716567, -0.681718), 
+	Vector(0.147621, -0.716567, -0.681718), 
+	Vector(0.000000, -0.525731, -0.850651), 
+	Vector(0.309017, -0.500000, -0.809017), 
+	Vector(0.442863, -0.238856, -0.864188), 
+	Vector(0.162460, -0.262866, -0.951056), 
+	Vector(0.238856, -0.864188, -0.442863), 
+	Vector(0.500000, -0.809017, -0.309017), 
+	Vector(0.425325, -0.688191, -0.587785), 
+	Vector(0.716567, -0.681718, -0.147621), 
+	Vector(0.688191, -0.587785, -0.425325), 
+	Vector(0.587785, -0.425325, -0.688191), 
+	Vector(0.000000, -0.955423, -0.295242), 
+	Vector(0.000000, -1.000000, 0.000000), 
+	Vector(0.262866, -0.951056, -0.162460), 
+	Vector(0.000000, -0.850651, 0.525731), 
+	Vector(0.000000, -0.955423, 0.295242), 
+	Vector(0.238856, -0.864188, 0.442863), 
+	Vector(0.262866, -0.951056, 0.162460), 
+	Vector(0.500000, -0.809017, 0.309017), 
+	Vector(0.716567, -0.681718, 0.147621), 
+	Vector(0.525731, -0.850651, 0.000000), 
+	Vector(-0.238856, -0.864188, -0.442863), 
+	Vector(-0.500000, -0.809017, -0.309017), 
+	Vector(-0.262866, -0.951056, -0.162460), 
+	Vector(-0.850651, -0.525731, 0.000000), 
+	Vector(-0.716567, -0.681718, -0.147621), 
+	Vector(-0.716567, -0.681718, 0.147621), 
+	Vector(-0.525731, -0.850651, 0.000000), 
+	Vector(-0.500000, -0.809017, 0.309017), 
+	Vector(-0.238856, -0.864188, 0.442863), 
+	Vector(-0.262866, -0.951056, 0.162460), 
+	Vector(-0.864188, -0.442863, 0.238856), 
+	Vector(-0.809017, -0.309017, 0.500000), 
+	Vector(-0.688191, -0.587785, 0.425325), 
+	Vector(-0.681718, -0.147621, 0.716567), 
+	Vector(-0.442863, -0.238856, 0.864188), 
+	Vector(-0.587785, -0.425325, 0.688191), 
+	Vector(-0.309017, -0.500000, 0.809017), 
+	Vector(-0.147621, -0.716567, 0.681718), 
+	Vector(-0.425325, -0.688191, 0.587785), 
+	Vector(-0.162460, -0.262866, 0.951056), 
+	Vector(0.442863, -0.238856, 0.864188), 
+	Vector(0.162460, -0.262866, 0.951056), 
+	Vector(0.309017, -0.500000, 0.809017), 
+	Vector(0.147621, -0.716567, 0.681718), 
+	Vector(0.000000, -0.525731, 0.850651), 
+	Vector(0.425325, -0.688191, 0.587785), 
+	Vector(0.587785, -0.425325, 0.688191), 
+	Vector(0.688191, -0.587785, 0.425325), 
+	Vector(-0.955423, 0.295242, 0.000000), 
+	Vector(-0.951056, 0.162460, 0.262866), 
+	Vector(-1.000000, 0.000000, 0.000000), 
+	Vector(-0.850651, 0.000000, 0.525731), 
+	Vector(-0.955423, -0.295242, 0.000000), 
+	Vector(-0.951056, -0.162460, 0.262866), 
+	Vector(-0.864188, 0.442863, -0.238856), 
+	Vector(-0.951056, 0.162460, -0.262866), 
+	Vector(-0.809017, 0.309017, -0.500000), 
+	Vector(-0.864188, -0.442863, -0.238856), 
+	Vector(-0.951056, -0.162460, -0.262866), 
+	Vector(-0.809017, -0.309017, -0.500000), 
+	Vector(-0.681718, 0.147621, -0.716567), 
+	Vector(-0.681718, -0.147621, -0.716567), 
+	Vector(-0.850651, 0.000000, -0.525731), 
+	Vector(-0.688191, 0.587785, -0.425325), 
+	Vector(-0.587785, 0.425325, -0.688191), 
+	Vector(-0.425325, 0.688191, -0.587785), 
+	Vector(-0.425325, -0.688191, -0.587785), 
+	Vector(-0.587785, -0.425325, -0.688191), 
+	Vector(-0.688191, -0.587785, -0.425325)
+};
+
 #endif // !_STATIC_LINKED || _SHARED_LIB
 \ No newline at end of file
diff --git a/mp/src/mathlib/bumpvects.cpp b/mp/src/mathlib/bumpvects.cpp
index 5d1d278a..5edbe4d4 100644
--- a/mp/src/mathlib/bumpvects.cpp
+++ b/mp/src/mathlib/bumpvects.cpp
@@ -1,69 +1,69 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-// $Workfile:     $
-// $Date:         $
-//
-//-----------------------------------------------------------------------------
-// $Log: $
-//
-// $NoKeywords: $
-//=============================================================================//
-
-#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
-
-
-#ifdef QUIVER
-#include "r_local.h"
-#endif
-#include "mathlib/bumpvects.h"
-#include "mathlib/vector.h"
-#include <assert.h>
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-// z is coming out of the face.
-
-void GetBumpNormals( const Vector& sVect, const Vector& tVect, const Vector& flatNormal, 
-					 const Vector& phongNormal, Vector bumpNormals[NUM_BUMP_VECTS] )
-{
-	Vector tmpNormal;
-	bool leftHanded;
-	int i;
-
-	assert( NUM_BUMP_VECTS == 3 );
-	
-	// Are we left or right handed?
-	CrossProduct( sVect, tVect, tmpNormal );
-	if( DotProduct( flatNormal, tmpNormal ) < 0.0f )
-	{
-		leftHanded = true;
-	}
-	else
-	{
-		leftHanded = false;
-	}
-
-	// Build a basis for the face around the phong normal
-	matrix3x4_t smoothBasis;
-	CrossProduct( phongNormal.Base(), sVect.Base(), smoothBasis[1] );
-	VectorNormalize( smoothBasis[1] );
-	CrossProduct( smoothBasis[1], phongNormal.Base(), smoothBasis[0] );
-	VectorNormalize( smoothBasis[0] );
-	VectorCopy( phongNormal.Base(), smoothBasis[2] );
-	
-	if( leftHanded )
-	{
-		VectorNegate( smoothBasis[1] );
-	}
-	
-	// move the g_localBumpBasis into world space to create bumpNormals
-	for( i = 0; i < 3; i++ )
-	{
-		VectorIRotate( g_localBumpBasis[i], smoothBasis, bumpNormals[i] );
-	}
-}
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $Workfile:     $
+// $Date:         $
+//
+//-----------------------------------------------------------------------------
+// $Log: $
+//
+// $NoKeywords: $
+//=============================================================================//
+
+#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
+
+
+#ifdef QUIVER
+#include "r_local.h"
+#endif
+#include "mathlib/bumpvects.h"
+#include "mathlib/vector.h"
+#include <assert.h>
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+// GetBumpNormals: writes three bump-basis normals into bumpNormals[0..2] by running each g_localBumpBasis[i] through VectorIRotate with a basis built around phongNormal.
+// z is coming out of the face.
+void GetBumpNormals( const Vector& sVect, const Vector& tVect, const Vector& flatNormal, 
+					 const Vector& phongNormal, Vector bumpNormals[NUM_BUMP_VECTS] )
+{
+	Vector tmpNormal;	// receives sVect x tVect; used only for the handedness test
+	bool leftHanded;
+	int i;
+
+	assert( NUM_BUMP_VECTS == 3 );	// the loop at the bottom hard-codes three iterations
+	
+	// Are we left or right handed? Left-handed means sVect x tVect points against flatNormal.
+	CrossProduct( sVect, tVect, tmpNormal );
+	if( DotProduct( flatNormal, tmpNormal ) < 0.0f )
+	{
+		leftHanded = true;
+	}
+	else
+	{
+		leftHanded = false;
+	}
+
+	// Build a basis for the face around the phong normal: [1] = phongNormal x sVect, [0] = [1] x phongNormal, [2] = phongNormal.
+	matrix3x4_t smoothBasis;
+	CrossProduct( phongNormal.Base(), sVect.Base(), smoothBasis[1] );	// NOTE(review): zero-length if sVect is parallel to phongNormal; how VectorNormalize handles that is not visible here
+	VectorNormalize( smoothBasis[1] );
+	CrossProduct( smoothBasis[1], phongNormal.Base(), smoothBasis[0] );
+	VectorNormalize( smoothBasis[0] );
+	VectorCopy( phongNormal.Base(), smoothBasis[2] );	// copied as-is; not normalized in this function
+	
+	if( leftHanded )
+	{
+		VectorNegate( smoothBasis[1] );	// flip the second basis vector for left-handed faces
+	}
+	
+	// move the g_localBumpBasis into world space to create bumpNormals (VectorIRotate presumably applies the inverse rotation of smoothBasis - confirm in mathlib)
+	for( i = 0; i < 3; i++ )
+	{
+		VectorIRotate( g_localBumpBasis[i], smoothBasis, bumpNormals[i] );
+	}
+}
+
 #endif // !_STATIC_LINKED || _SHARED_LIB
 \ No newline at end of file
diff --git a/mp/src/mathlib/color_conversion.cpp b/mp/src/mathlib/color_conversion.cpp
index 3a30575b..c3125258 100644
--- a/mp/src/mathlib/color_conversion.cpp
+++ b/mp/src/mathlib/color_conversion.cpp
@@ -1,637 +1,637 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: Color conversion routines.
-//
-//=====================================================================================//
-
-#include <math.h>
-#include <float.h>	// Needed for FLT_EPSILON
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-//-----------------------------------------------------------------------------
-// Gamma conversion support
-//-----------------------------------------------------------------------------
-static byte		texgammatable[256];	// palette is sent through this to convert to screen gamma
-
-static float	texturetolinear[256];	// texture (0..255) to linear (0..1)
-static int		lineartotexture[1024];	// linear (0..1) to texture (0..255)
-static int		lineartoscreen[1024];	// linear (0..1) to gamma corrected vertex light (0..255)
-
-// build a lightmap texture to combine with surface texture, adjust for src*dst+dst*src, ramp reprogramming, etc
-float			lineartovertex[4096];	// linear (0..4) to screen corrected vertex space (0..1?)
-unsigned char	lineartolightmap[4096];	// linear (0..4) to screen corrected texture value (0..255)
-
-static float	g_Mathlib_GammaToLinear[256];	// gamma (0..1) to linear (0..1)
-static float	g_Mathlib_LinearToGamma[256];	// linear (0..1) to gamma (0..1)
-
-// This is aligned to 16-byte boundaries so that we can load it
-// onto SIMD registers easily if needed (used by SSE version of lightmaps)
-// TODO: move this into the one DLL that actually uses it, instead of statically
-// linking it everywhere via mathlib.
-ALIGN128 float	power2_n[256] = 			// 2**(index - 128) / 255
-{ 
-	1.152445441982634800E-041, 2.304890883965269600E-041, 4.609781767930539200E-041, 9.219563535861078400E-041, 
-	1.843912707172215700E-040, 3.687825414344431300E-040, 7.375650828688862700E-040, 1.475130165737772500E-039,
-	2.950260331475545100E-039, 5.900520662951090200E-039, 1.180104132590218000E-038, 2.360208265180436100E-038, 
-	4.720416530360872100E-038, 9.440833060721744200E-038, 1.888166612144348800E-037, 3.776333224288697700E-037, 
-	7.552666448577395400E-037, 1.510533289715479100E-036, 3.021066579430958200E-036, 6.042133158861916300E-036, 
-	1.208426631772383300E-035, 2.416853263544766500E-035, 4.833706527089533100E-035, 9.667413054179066100E-035, 
-	1.933482610835813200E-034, 3.866965221671626400E-034, 7.733930443343252900E-034, 1.546786088668650600E-033, 
-	3.093572177337301200E-033, 6.187144354674602300E-033, 1.237428870934920500E-032, 2.474857741869840900E-032, 
-	4.949715483739681800E-032, 9.899430967479363700E-032, 1.979886193495872700E-031, 3.959772386991745500E-031, 
-	7.919544773983491000E-031, 1.583908954796698200E-030, 3.167817909593396400E-030, 6.335635819186792800E-030, 
-	1.267127163837358600E-029, 2.534254327674717100E-029, 5.068508655349434200E-029, 1.013701731069886800E-028, 
-	2.027403462139773700E-028, 4.054806924279547400E-028, 8.109613848559094700E-028, 1.621922769711818900E-027, 
-	3.243845539423637900E-027, 6.487691078847275800E-027, 1.297538215769455200E-026, 2.595076431538910300E-026, 
-	5.190152863077820600E-026, 1.038030572615564100E-025, 2.076061145231128300E-025, 4.152122290462256500E-025, 
-	8.304244580924513000E-025, 1.660848916184902600E-024, 3.321697832369805200E-024, 6.643395664739610400E-024, 
-	1.328679132947922100E-023, 2.657358265895844200E-023, 5.314716531791688300E-023, 1.062943306358337700E-022, 
-	2.125886612716675300E-022, 4.251773225433350700E-022, 8.503546450866701300E-022, 1.700709290173340300E-021, 
-	3.401418580346680500E-021, 6.802837160693361100E-021, 1.360567432138672200E-020, 2.721134864277344400E-020, 
-	5.442269728554688800E-020, 1.088453945710937800E-019, 2.176907891421875500E-019, 4.353815782843751100E-019, 
-	8.707631565687502200E-019, 1.741526313137500400E-018, 3.483052626275000900E-018, 6.966105252550001700E-018, 
-	1.393221050510000300E-017, 2.786442101020000700E-017, 5.572884202040001400E-017, 1.114576840408000300E-016, 
-	2.229153680816000600E-016, 4.458307361632001100E-016, 8.916614723264002200E-016, 1.783322944652800400E-015, 
-	3.566645889305600900E-015, 7.133291778611201800E-015, 1.426658355722240400E-014, 2.853316711444480700E-014, 
-	5.706633422888961400E-014, 1.141326684577792300E-013, 2.282653369155584600E-013, 4.565306738311169100E-013, 
-	9.130613476622338300E-013, 1.826122695324467700E-012, 3.652245390648935300E-012, 7.304490781297870600E-012, 
-	1.460898156259574100E-011, 2.921796312519148200E-011, 5.843592625038296500E-011, 1.168718525007659300E-010, 
-	2.337437050015318600E-010, 4.674874100030637200E-010, 9.349748200061274400E-010, 1.869949640012254900E-009, 
-	3.739899280024509800E-009, 7.479798560049019500E-009, 1.495959712009803900E-008, 2.991919424019607800E-008, 
-	5.983838848039215600E-008, 1.196767769607843100E-007, 2.393535539215686200E-007, 4.787071078431372500E-007, 
-	9.574142156862745000E-007, 1.914828431372549000E-006, 3.829656862745098000E-006, 7.659313725490196000E-006, 
-	1.531862745098039200E-005, 3.063725490196078400E-005, 6.127450980392156800E-005, 1.225490196078431400E-004, 
-	2.450980392156862700E-004, 4.901960784313725400E-004, 9.803921568627450800E-004, 1.960784313725490200E-003, 
-	3.921568627450980300E-003, 7.843137254901960700E-003, 1.568627450980392100E-002, 3.137254901960784300E-002, 
-	6.274509803921568500E-002, 1.254901960784313700E-001, 2.509803921568627400E-001, 5.019607843137254800E-001, 
-	1.003921568627451000E+000, 2.007843137254901900E+000, 4.015686274509803900E+000, 8.031372549019607700E+000, 
-	1.606274509803921500E+001, 3.212549019607843100E+001, 6.425098039215686200E+001, 1.285019607843137200E+002, 
-	2.570039215686274500E+002, 5.140078431372548900E+002, 1.028015686274509800E+003, 2.056031372549019600E+003, 
-	4.112062745098039200E+003, 8.224125490196078300E+003, 1.644825098039215700E+004, 3.289650196078431300E+004, 
-	6.579300392156862700E+004, 1.315860078431372500E+005, 2.631720156862745100E+005, 5.263440313725490100E+005, 
-	1.052688062745098000E+006, 2.105376125490196000E+006, 4.210752250980392100E+006, 8.421504501960784200E+006, 
-	1.684300900392156800E+007, 3.368601800784313700E+007, 6.737203601568627400E+007, 1.347440720313725500E+008, 
-	2.694881440627450900E+008, 5.389762881254901900E+008, 1.077952576250980400E+009, 2.155905152501960800E+009, 
-	4.311810305003921500E+009, 8.623620610007843000E+009, 1.724724122001568600E+010, 3.449448244003137200E+010, 
-	6.898896488006274400E+010, 1.379779297601254900E+011, 2.759558595202509800E+011, 5.519117190405019500E+011, 
-	1.103823438081003900E+012, 2.207646876162007800E+012, 4.415293752324015600E+012, 8.830587504648031200E+012, 
-	1.766117500929606200E+013, 3.532235001859212500E+013, 7.064470003718425000E+013, 1.412894000743685000E+014, 
-	2.825788001487370000E+014, 5.651576002974740000E+014, 1.130315200594948000E+015, 2.260630401189896000E+015, 
-	4.521260802379792000E+015, 9.042521604759584000E+015, 1.808504320951916800E+016, 3.617008641903833600E+016, 
-	7.234017283807667200E+016, 1.446803456761533400E+017, 2.893606913523066900E+017, 5.787213827046133800E+017, 
-	1.157442765409226800E+018, 2.314885530818453500E+018, 4.629771061636907000E+018, 9.259542123273814000E+018, 
-	1.851908424654762800E+019, 3.703816849309525600E+019, 7.407633698619051200E+019, 1.481526739723810200E+020, 
-	2.963053479447620500E+020, 5.926106958895241000E+020, 1.185221391779048200E+021, 2.370442783558096400E+021, 
-	4.740885567116192800E+021, 9.481771134232385600E+021, 1.896354226846477100E+022, 3.792708453692954200E+022, 
-	7.585416907385908400E+022, 1.517083381477181700E+023, 3.034166762954363400E+023, 6.068333525908726800E+023, 
-	1.213666705181745400E+024, 2.427333410363490700E+024, 4.854666820726981400E+024, 9.709333641453962800E+024, 
-	1.941866728290792600E+025, 3.883733456581585100E+025, 7.767466913163170200E+025, 1.553493382632634000E+026, 
-	3.106986765265268100E+026, 6.213973530530536200E+026, 1.242794706106107200E+027, 2.485589412212214500E+027, 
-	4.971178824424429000E+027, 9.942357648848857900E+027, 1.988471529769771600E+028, 3.976943059539543200E+028, 
-	7.953886119079086300E+028, 1.590777223815817300E+029, 3.181554447631634500E+029, 6.363108895263269100E+029, 
-	1.272621779052653800E+030, 2.545243558105307600E+030, 5.090487116210615300E+030, 1.018097423242123100E+031, 
-	2.036194846484246100E+031, 4.072389692968492200E+031, 8.144779385936984400E+031, 1.628955877187396900E+032, 
-	3.257911754374793800E+032, 6.515823508749587500E+032, 1.303164701749917500E+033, 2.606329403499835000E+033, 
-	5.212658806999670000E+033, 1.042531761399934000E+034, 2.085063522799868000E+034, 4.170127045599736000E+034, 
-	8.340254091199472000E+034, 1.668050818239894400E+035, 3.336101636479788800E+035, 6.672203272959577600E+035 
-};
-
-// You can use this to double check the exponent table and assert that 
-// the precomputation is correct.
-#ifdef DBGFLAG_ASSERT
-#pragma warning(push)
-#pragma warning( disable : 4189 ) // disable unused local variable warning
-static void CheckExponentTable()
-{
-	for( int i = 0; i < 256; i++ )
-	{
-		float testAgainst = pow( 2.0f, i - 128 ) / 255.0f;
-		float diff = testAgainst - power2_n[i] ;
-		float relativeDiff = diff / testAgainst;
-		Assert( testAgainst == 0 ? 
-				power2_n[i] < 1.16E-041 :
-				power2_n[i] == testAgainst );
-	}
-}
-#pragma warning(pop)
-#endif
-
-void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright )
-{
-	int		i, inf;
-	float	g1, g3;
-
-	// Con_Printf("BuildGammaTable %.1f %.1f %.1f\n", g, v_lightgamma.GetFloat(), v_texgamma.GetFloat() );
-
-	float g = gamma;
-	if (g > 3.0) 
-	{
-		g = 3.0;
-	}
-
-	g = 1.0 / g;
-	g1 = texGamma * g; 
-
-	if (brightness <= 0.0) 
-	{
-		g3 = 0.125;
-	}
-	else if (brightness > 1.0) 
-	{
-		g3 = 0.05;
-	}
-	else 
-	{
-		g3 = 0.125 - (brightness * brightness) * 0.075;
-	}
-
-	for (i=0 ; i<256 ; i++)
-	{
-		inf = 255 * pow ( i/255.f, g1 ); 
-		if (inf < 0)
-			inf = 0;
-		if (inf > 255)
-			inf = 255;
-		texgammatable[i] = inf;
-	}
-
-	for (i=0 ; i<1024 ; i++)
-	{
-		float f;
-
-		f = i / 1023.0;
-
-		// scale up
-		if (brightness > 1.0)
-			f = f * brightness;
-
-		// shift up
-		if (f <= g3)
-			f = (f / g3) * 0.125;
-		else 
-			f = 0.125 + ((f - g3) / (1.0 - g3)) * 0.875;
-
-		// convert linear space to desired gamma space
-		inf = 255 * pow ( f, g ); 
-
-		if (inf < 0)
-			inf = 0;
-		if (inf > 255)
-			inf = 255;
-		lineartoscreen[i] = inf;
-	}
-
-	/*
-	for (i=0 ; i<1024 ; i++)
-	{
-		// convert from screen gamma space to linear space
-		lineargammatable[i] = 1023 * pow ( i/1023.0, v_gamma.GetFloat() );
-		// convert from linear gamma space to screen space
-		screengammatable[i] = 1023 * pow ( i/1023.0, 1.0 / v_gamma.GetFloat() );
-	}
-	*/
-
-	for (i=0 ; i<256 ; i++)
-	{
-		// convert from nonlinear texture space (0..255) to linear space (0..1)
-		texturetolinear[i] =  pow( i / 255.f, texGamma );
-
-		// convert from linear space (0..1) to nonlinear (sRGB) space (0..1)
-		g_Mathlib_LinearToGamma[i] =  LinearToGammaFullRange( i / 255.f );
-
-		// convert from sRGB gamma space (0..1) to linear space (0..1)
-		g_Mathlib_GammaToLinear[i] =  GammaToLinearFullRange( i / 255.f );
-	}
-
-	for (i=0 ; i<1024 ; i++)
-	{
-		// convert from linear space (0..1) to nonlinear texture space (0..255)
-		lineartotexture[i] =  pow( i / 1023.0, 1.0 / texGamma ) * 255;
-	}
-
-#if 0
-	for (i=0 ; i<256 ; i++)
-	{
-		float f;
-
-		// convert from nonlinear lightmap space (0..255) to linear space (0..4)
-		// f =  (i / 255.0) * sqrt( 4 );
-		f =  i * (2.0 / 255.0);
-		f = f * f;
-
-		texlighttolinear[i] = f;
-	}
-#endif
-
-	{
-		float f;
-		float overbrightFactor = 1.0f;
-
-		// Can't do overbright without texcombine
-		// UNDONE: Add GAMMA ramp to rectify this
-		if ( overbright == 2 )
-		{
-			overbrightFactor = 0.5;
-		}
-		else if ( overbright == 4 )
-		{
-			overbrightFactor = 0.25;
-		}
-
-		for (i=0 ; i<4096 ; i++)
-		{
-			// convert from linear 0..4 (x1024) to screen corrected vertex space (0..1?)
-			f = pow ( i/1024.0, 1.0 / gamma );
-
-			lineartovertex[i] = f * overbrightFactor;
-			if (lineartovertex[i] > 1)
-				lineartovertex[i] = 1;
-
-			int nLightmap = RoundFloatToInt( f * 255 * overbrightFactor );
-			nLightmap = clamp( nLightmap, 0, 255 );
-			lineartolightmap[i] = (unsigned char)nLightmap;
-		}
-	}
-}
-
-float GammaToLinearFullRange( float gamma )
-{
-	return pow( gamma, 2.2f );
-}
-
-float LinearToGammaFullRange( float linear )
-{
-	return pow( linear, 1.0f / 2.2f );
-}
-
-float GammaToLinear( float gamma )
-{
-	Assert( s_bMathlibInitialized );
-	if ( gamma < 0.0f )
-	{
-		return 0.0f;
-	}
-
-	if ( gamma >= 0.95f )
-	{
-		// Use GammaToLinearFullRange maybe if you trip this.
-// X360TEMP
-//		Assert( gamma <= 1.0f );
-		return 1.0f;
-	}
-
-	int index = RoundFloatToInt( gamma * 255.0f );
-	Assert( index >= 0 && index < 256 );
-	return g_Mathlib_GammaToLinear[index];
-}
-
-float LinearToGamma( float linear )
-{
-	Assert( s_bMathlibInitialized );
-	if ( linear < 0.0f )
-	{
-		return 0.0f;
-	}
-	if ( linear > 1.0f )
-	{
-		// Use LinearToGammaFullRange maybe if you trip this.
-		Assert( 0 );
-		return 1.0f;
-	}
-
-	int index = RoundFloatToInt( linear * 255.0f );
-	Assert( index >= 0 && index < 256 );
-	return g_Mathlib_LinearToGamma[index];
-}
-
-//-----------------------------------------------------------------------------
-// Helper functions to convert between sRGB and 360 gamma space
-//-----------------------------------------------------------------------------
-float SrgbGammaToLinear( float flSrgbGammaValue )
-{
-	float x = clamp( flSrgbGammaValue, 0.0f, 1.0f );
-	return ( x <= 0.04045f ) ? ( x / 12.92f ) : ( pow( ( x + 0.055f ) / 1.055f, 2.4f ) );
-}
-
-float SrgbLinearToGamma( float flLinearValue )
-{
-	float x = clamp( flLinearValue, 0.0f, 1.0f );
-	return ( x <= 0.0031308f ) ? ( x * 12.92f ) : ( 1.055f * pow( x, ( 1.0f / 2.4f ) ) ) - 0.055f;
-}
-
-float X360GammaToLinear( float fl360GammaValue )
-{
-	float flLinearValue;
-
-	fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f );
-	if ( fl360GammaValue < ( 96.0f / 255.0f ) )
-	{
-		if ( fl360GammaValue < ( 64.0f / 255.0f ) )
-		{
-			flLinearValue = fl360GammaValue * 255.0f;
-		}
-		else
-		{
-			flLinearValue = fl360GammaValue * ( 255.0f * 2.0f ) - 64.0f;
-			flLinearValue += floor( flLinearValue * ( 1.0f / 512.0f ) );
-		}
-	}
-	else
-	{
-		if( fl360GammaValue < ( 192.0f / 255.0f ) )
-		{
-			flLinearValue = fl360GammaValue * ( 255.0f * 4.0f ) - 256.0f;
-			flLinearValue += floor( flLinearValue * ( 1.0f / 256.0f ) );
-		}
-		else
-		{
-			flLinearValue = fl360GammaValue * ( 255.0f * 8.0f ) - 1024.0f;
-			flLinearValue += floor( flLinearValue * ( 1.0f / 128.0f ) );
-		}
-	}
-
-	flLinearValue *= 1.0f / 1023.0f;
-
-	flLinearValue = clamp( flLinearValue, 0.0f, 1.0f );
-	return flLinearValue;
-}
-
-float X360LinearToGamma( float flLinearValue )
-{
-	float fl360GammaValue;
-
-	flLinearValue = clamp( flLinearValue, 0.0f, 1.0f );
-	if ( flLinearValue < ( 128.0f / 1023.0f ) )
-	{
-		if ( flLinearValue < ( 64.0f / 1023.0f ) )
-		{
-			fl360GammaValue = flLinearValue * ( 1023.0f * ( 1.0f / 255.0f ) );
-		}
-		else
-		{
-			fl360GammaValue = flLinearValue * ( ( 1023.0f / 2.0f ) * ( 1.0f / 255.0f ) ) + ( 32.0f / 255.0f );
-		}
-	}
-	else
-	{
-		if ( flLinearValue < ( 512.0f / 1023.0f ) )
-		{
-			fl360GammaValue = flLinearValue * ( ( 1023.0f / 4.0f ) * ( 1.0f / 255.0f ) ) + ( 64.0f / 255.0f );
-		}
-		else
-		{
-			fl360GammaValue = flLinearValue * ( ( 1023.0f /8.0f ) * ( 1.0f / 255.0f ) ) + ( 128.0f /255.0f ); // 1.0 -> 1.0034313725490196078431372549016
-			if ( fl360GammaValue > 1.0f )
-			{
-				fl360GammaValue = 1.0f;
-			}
-		}
-	}
-
-	fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f );
-	return fl360GammaValue;
-}
-
-float SrgbGammaTo360Gamma( float flSrgbGammaValue )
-{
-	float flLinearValue = SrgbGammaToLinear( flSrgbGammaValue );
-	float fl360GammaValue = X360LinearToGamma( flLinearValue );
-	return fl360GammaValue;
-}
-
-// convert texture to linear 0..1 value
-float TextureToLinear( int c )
-{
-	Assert( s_bMathlibInitialized );
-	if (c < 0)
-		return 0;
-	if (c > 255)
-		return 1.0;
-
-	return texturetolinear[c];
-}
-
-// convert texture to linear 0..1 value
-int LinearToTexture( float f )
-{
-	Assert( s_bMathlibInitialized );
-	int i;
-	i = f * 1023;	// assume 0..1 range
-	if (i < 0)
-		i = 0;
-	if (i > 1023)
-		i = 1023;
-
-	return lineartotexture[i];
-}
-
-
-// converts 0..1 linear value to screen gamma (0..255)
-int LinearToScreenGamma( float f )
-{
-	Assert( s_bMathlibInitialized );
-	int i;
-	i = f * 1023;	// assume 0..1 range
-	if (i < 0)
-		i = 0;
-	if (i > 1023)
-		i = 1023;
-
-	return lineartoscreen[i];
-}
-
-void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector& out )
-{
-	Assert( s_bMathlibInitialized );
-	// FIXME: Why is there a factor of 255 built into this?
-	out.x = 255.0f * TexLightToLinear( in.r, in.exponent );
-	out.y = 255.0f * TexLightToLinear( in.g, in.exponent );
-	out.z = 255.0f * TexLightToLinear( in.b, in.exponent );
-}
-
-#if 0
-// assumes that the desired mantissa range is 128..255
-static int VectorToColorRGBExp32_CalcExponent( float in )
-{
-	int power = 0;
-	
-	if( in != 0.0f )
-	{
-		while( in > 255.0f )
-		{
-			power += 1;
-			in *= 0.5f;
-		}
-		
-		while( in < 128.0f )
-		{
-			power -= 1;
-			in *= 2.0f;
-		}
-	}
-
-	return power;
-}
-
-void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
-{
-	Vector v = vin;
-	Assert( s_bMathlibInitialized );
-	Assert( v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f );
-	int i;		
-	float max = v[0];				
-	for( i = 1; i < 3; i++ )
-	{
-		// Get the maximum value.
-		if( v[i] > max )
-		{
-			max = v[i];
-		}
-	}
-				
-	// figure out the exponent for this luxel.
-	int exponent = VectorToColorRGBExp32_CalcExponent( max );
-				
-	// make the exponent fits into a signed byte.
-	if( exponent < -128 )
-	{
-		exponent = -128;
-	}
-	else if( exponent > 127 )
-	{
-		exponent = 127;
-	}
-				
-	// undone: optimize with a table
-	float scalar = pow( 2.0f, -exponent );
-	// convert to mantissa x 2^exponent format
-	for( i = 0; i < 3; i++ )
-	{
-		v[i] *= scalar;
-		// clamp
-		if( v[i] > 255.0f )
-		{
-			v[i] = 255.0f;
-		}
-	}
-	c.r = ( unsigned char )v[0];
-	c.g = ( unsigned char )v[1];
-	c.b = ( unsigned char )v[2];
-	c.exponent = ( signed char )exponent;
-}
-
-#else
-
-// given a floating point number  f, return an exponent e such that
-// for f' = f * 2^e,  f is on [128..255].
-// Uses IEEE 754 representation to directly extract this information
-// from the float.
-inline static int VectorToColorRGBExp32_CalcExponent( const float *pin )
-{
-	// The thing we will take advantage of here is that the exponent component
-	// is stored in the float itself, and because we want to map to 128..255, we
-	// want an "ideal" exponent of 2^7. So, we compute the difference between the
-	// input exponent and 7 to work out the normalizing exponent. Thus if you pass in 
-	// 32 (represented in IEEE 754 as 2^5), this function will return 2
-	// (because 32 * 2^2 = 128)
-	if (*pin == 0.0f)
-		return 0;
-
-	unsigned int fbits = *reinterpret_cast<const unsigned int *>(pin);
-	
-	// the exponent component is bits 23..30, and biased by +127
-	const unsigned int biasedSeven = 7 + 127;
-
-	signed int expComponent = ( fbits & 0x7F800000 ) >> 23;
-	expComponent -= biasedSeven; // now the difference from seven (positive if was less than, etc)
-	return expComponent;
-}
-
-
-
-/// Slightly faster version of the function to turn a float-vector color into 
-/// a compressed-exponent notation 32bit color. However, still not SIMD optimized.
-/// PS3 developer: note there is a movement of a float onto an int here, which is
-/// bad on the base registers -- consider doing this as Altivec code, or better yet
-/// moving it onto the cell.
-/// \warning: Assumes an IEEE 754 single-precision float representation! Those of you
-/// porting to an 8080 are out of luck.
-void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( vin.x >= 0.0f && vin.y >= 0.0f && vin.z >= 0.0f );
-
-	// work out which of the channels is the largest ( we will use that to map the exponent )
-	// this is a sluggish branch-based decision tree -- most architectures will offer a [max]
-	// assembly opcode to do this faster.
-	const float *pMax;
-	if (vin.x > vin.y)
-	{
-		if (vin.x > vin.z)
-		{
-			pMax = &vin.x;
-		}
-		else
-		{
-			pMax = &vin.z;
-		}
-	}
-	else
-	{
-		if (vin.y > vin.z)
-		{
-			pMax = &vin.y;
-		}
-		else
-		{
-			pMax = &vin.z;
-		}
-	}
-
-	// now work out the exponent for this luxel. 
-	signed int exponent = VectorToColorRGBExp32_CalcExponent( pMax );
-
-	// make sure the exponent fits into a signed byte.
-	// (in single precision format this is assured because it was a signed byte to begin with)
-	Assert(exponent > -128 && exponent <= 127);
-
-	// promote the exponent back onto a scalar that we'll use to normalize all the numbers
-	float scalar;
-	{
-		unsigned int fbits = (127 - exponent) << 23;
-		scalar = *reinterpret_cast<float *>(&fbits);
-	}
-
-	// we should never need to clamp:
-	Assert(vin.x * scalar <= 255.0f && 
-		   vin.y * scalar <= 255.0f && 
-		   vin.z * scalar <= 255.0f);
-
-	// This awful construction is necessary to prevent VC2005 from using the 
-	// fldcw/fnstcw control words around every float-to-unsigned-char operation.
-	{
-		int red = (vin.x * scalar);
-		int green = (vin.y * scalar);
-		int blue = (vin.z * scalar);
-
-		c.r = red;
-		c.g = green;
-		c.b = blue;
-	}
-	/*
-	c.r = ( unsigned char )(vin.x * scalar);
-	c.g = ( unsigned char )(vin.y * scalar);
-	c.b = ( unsigned char )(vin.z * scalar);
-	*/
-
-	c.exponent = ( signed char )exponent;
-}
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Color conversion routines.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+//-----------------------------------------------------------------------------
+// Gamma conversion support
+//-----------------------------------------------------------------------------
+static byte		texgammatable[256];	// palette is sent through this to convert to screen gamma
+
+static float	texturetolinear[256];	// texture (0..255) to linear (0..1)
+static int		lineartotexture[1024];	// linear (0..1) to texture (0..255)
+static int		lineartoscreen[1024];	// linear (0..1) to gamma corrected vertex light (0..255)
+
+// build a lightmap texture to combine with surface texture, adjust for src*dst+dst*src, ramp reprogramming, etc
+float			lineartovertex[4096];	// linear (0..4) to screen corrected vertex space (0..1?)
+unsigned char	lineartolightmap[4096];	// linear (0..4) to screen corrected texture value (0..255)
+
+static float	g_Mathlib_GammaToLinear[256];	// gamma (0..1) to linear (0..1)
+static float	g_Mathlib_LinearToGamma[256];	// linear (0..1) to gamma (0..1)
+
+// This is aligned to 16-byte boundaries so that we can load it
+// onto SIMD registers easily if needed (used by SSE version of lightmaps)
+// TODO: move this into the one DLL that actually uses it, instead of statically
+// linking it everywhere via mathlib.
+ALIGN128 float	power2_n[256] = 			// 2**(index - 128) / 255
+{ 
+	1.152445441982634800E-041, 2.304890883965269600E-041, 4.609781767930539200E-041, 9.219563535861078400E-041, 
+	1.843912707172215700E-040, 3.687825414344431300E-040, 7.375650828688862700E-040, 1.475130165737772500E-039,
+	2.950260331475545100E-039, 5.900520662951090200E-039, 1.180104132590218000E-038, 2.360208265180436100E-038, 
+	4.720416530360872100E-038, 9.440833060721744200E-038, 1.888166612144348800E-037, 3.776333224288697700E-037, 
+	7.552666448577395400E-037, 1.510533289715479100E-036, 3.021066579430958200E-036, 6.042133158861916300E-036, 
+	1.208426631772383300E-035, 2.416853263544766500E-035, 4.833706527089533100E-035, 9.667413054179066100E-035, 
+	1.933482610835813200E-034, 3.866965221671626400E-034, 7.733930443343252900E-034, 1.546786088668650600E-033, 
+	3.093572177337301200E-033, 6.187144354674602300E-033, 1.237428870934920500E-032, 2.474857741869840900E-032, 
+	4.949715483739681800E-032, 9.899430967479363700E-032, 1.979886193495872700E-031, 3.959772386991745500E-031, 
+	7.919544773983491000E-031, 1.583908954796698200E-030, 3.167817909593396400E-030, 6.335635819186792800E-030, 
+	1.267127163837358600E-029, 2.534254327674717100E-029, 5.068508655349434200E-029, 1.013701731069886800E-028, 
+	2.027403462139773700E-028, 4.054806924279547400E-028, 8.109613848559094700E-028, 1.621922769711818900E-027, 
+	3.243845539423637900E-027, 6.487691078847275800E-027, 1.297538215769455200E-026, 2.595076431538910300E-026, 
+	5.190152863077820600E-026, 1.038030572615564100E-025, 2.076061145231128300E-025, 4.152122290462256500E-025, 
+	8.304244580924513000E-025, 1.660848916184902600E-024, 3.321697832369805200E-024, 6.643395664739610400E-024, 
+	1.328679132947922100E-023, 2.657358265895844200E-023, 5.314716531791688300E-023, 1.062943306358337700E-022, 
+	2.125886612716675300E-022, 4.251773225433350700E-022, 8.503546450866701300E-022, 1.700709290173340300E-021, 
+	3.401418580346680500E-021, 6.802837160693361100E-021, 1.360567432138672200E-020, 2.721134864277344400E-020, 
+	5.442269728554688800E-020, 1.088453945710937800E-019, 2.176907891421875500E-019, 4.353815782843751100E-019, 
+	8.707631565687502200E-019, 1.741526313137500400E-018, 3.483052626275000900E-018, 6.966105252550001700E-018, 
+	1.393221050510000300E-017, 2.786442101020000700E-017, 5.572884202040001400E-017, 1.114576840408000300E-016, 
+	2.229153680816000600E-016, 4.458307361632001100E-016, 8.916614723264002200E-016, 1.783322944652800400E-015, 
+	3.566645889305600900E-015, 7.133291778611201800E-015, 1.426658355722240400E-014, 2.853316711444480700E-014, 
+	5.706633422888961400E-014, 1.141326684577792300E-013, 2.282653369155584600E-013, 4.565306738311169100E-013, 
+	9.130613476622338300E-013, 1.826122695324467700E-012, 3.652245390648935300E-012, 7.304490781297870600E-012, 
+	1.460898156259574100E-011, 2.921796312519148200E-011, 5.843592625038296500E-011, 1.168718525007659300E-010, 
+	2.337437050015318600E-010, 4.674874100030637200E-010, 9.349748200061274400E-010, 1.869949640012254900E-009, 
+	3.739899280024509800E-009, 7.479798560049019500E-009, 1.495959712009803900E-008, 2.991919424019607800E-008, 
+	5.983838848039215600E-008, 1.196767769607843100E-007, 2.393535539215686200E-007, 4.787071078431372500E-007, 
+	9.574142156862745000E-007, 1.914828431372549000E-006, 3.829656862745098000E-006, 7.659313725490196000E-006, 
+	1.531862745098039200E-005, 3.063725490196078400E-005, 6.127450980392156800E-005, 1.225490196078431400E-004, 
+	2.450980392156862700E-004, 4.901960784313725400E-004, 9.803921568627450800E-004, 1.960784313725490200E-003, 
+	3.921568627450980300E-003, 7.843137254901960700E-003, 1.568627450980392100E-002, 3.137254901960784300E-002, 
+	6.274509803921568500E-002, 1.254901960784313700E-001, 2.509803921568627400E-001, 5.019607843137254800E-001, 
+	1.003921568627451000E+000, 2.007843137254901900E+000, 4.015686274509803900E+000, 8.031372549019607700E+000, 
+	1.606274509803921500E+001, 3.212549019607843100E+001, 6.425098039215686200E+001, 1.285019607843137200E+002, 
+	2.570039215686274500E+002, 5.140078431372548900E+002, 1.028015686274509800E+003, 2.056031372549019600E+003, 
+	4.112062745098039200E+003, 8.224125490196078300E+003, 1.644825098039215700E+004, 3.289650196078431300E+004, 
+	6.579300392156862700E+004, 1.315860078431372500E+005, 2.631720156862745100E+005, 5.263440313725490100E+005, 
+	1.052688062745098000E+006, 2.105376125490196000E+006, 4.210752250980392100E+006, 8.421504501960784200E+006, 
+	1.684300900392156800E+007, 3.368601800784313700E+007, 6.737203601568627400E+007, 1.347440720313725500E+008, 
+	2.694881440627450900E+008, 5.389762881254901900E+008, 1.077952576250980400E+009, 2.155905152501960800E+009, 
+	4.311810305003921500E+009, 8.623620610007843000E+009, 1.724724122001568600E+010, 3.449448244003137200E+010, 
+	6.898896488006274400E+010, 1.379779297601254900E+011, 2.759558595202509800E+011, 5.519117190405019500E+011, 
+	1.103823438081003900E+012, 2.207646876162007800E+012, 4.415293752324015600E+012, 8.830587504648031200E+012, 
+	1.766117500929606200E+013, 3.532235001859212500E+013, 7.064470003718425000E+013, 1.412894000743685000E+014, 
+	2.825788001487370000E+014, 5.651576002974740000E+014, 1.130315200594948000E+015, 2.260630401189896000E+015, 
+	4.521260802379792000E+015, 9.042521604759584000E+015, 1.808504320951916800E+016, 3.617008641903833600E+016, 
+	7.234017283807667200E+016, 1.446803456761533400E+017, 2.893606913523066900E+017, 5.787213827046133800E+017, 
+	1.157442765409226800E+018, 2.314885530818453500E+018, 4.629771061636907000E+018, 9.259542123273814000E+018, 
+	1.851908424654762800E+019, 3.703816849309525600E+019, 7.407633698619051200E+019, 1.481526739723810200E+020, 
+	2.963053479447620500E+020, 5.926106958895241000E+020, 1.185221391779048200E+021, 2.370442783558096400E+021, 
+	4.740885567116192800E+021, 9.481771134232385600E+021, 1.896354226846477100E+022, 3.792708453692954200E+022, 
+	7.585416907385908400E+022, 1.517083381477181700E+023, 3.034166762954363400E+023, 6.068333525908726800E+023, 
+	1.213666705181745400E+024, 2.427333410363490700E+024, 4.854666820726981400E+024, 9.709333641453962800E+024, 
+	1.941866728290792600E+025, 3.883733456581585100E+025, 7.767466913163170200E+025, 1.553493382632634000E+026, 
+	3.106986765265268100E+026, 6.213973530530536200E+026, 1.242794706106107200E+027, 2.485589412212214500E+027, 
+	4.971178824424429000E+027, 9.942357648848857900E+027, 1.988471529769771600E+028, 3.976943059539543200E+028, 
+	7.953886119079086300E+028, 1.590777223815817300E+029, 3.181554447631634500E+029, 6.363108895263269100E+029, 
+	1.272621779052653800E+030, 2.545243558105307600E+030, 5.090487116210615300E+030, 1.018097423242123100E+031, 
+	2.036194846484246100E+031, 4.072389692968492200E+031, 8.144779385936984400E+031, 1.628955877187396900E+032, 
+	3.257911754374793800E+032, 6.515823508749587500E+032, 1.303164701749917500E+033, 2.606329403499835000E+033, 
+	5.212658806999670000E+033, 1.042531761399934000E+034, 2.085063522799868000E+034, 4.170127045599736000E+034, 
+	8.340254091199472000E+034, 1.668050818239894400E+035, 3.336101636479788800E+035, 6.672203272959577600E+035 
+};
+
+// You can use this to double check the exponent table and assert that 
+// the precomputation is correct.
+#ifdef DBGFLAG_ASSERT
+#pragma warning(push)
+#pragma warning( disable : 4189 ) // disable unused local variable warning
+static void CheckExponentTable()
+{
+	for( int i = 0; i < 256; i++ )
+	{
+		float testAgainst = pow( 2.0f, i - 128 ) / 255.0f;
+		float diff = testAgainst - power2_n[i] ;
+		float relativeDiff = diff / testAgainst;
+		Assert( testAgainst == 0 ? 
+				power2_n[i] < 1.16E-041 :
+				power2_n[i] == testAgainst );
+	}
+}
+#pragma warning(pop)
+#endif
+
+void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright )
+{
+	int		i, inf;
+	float	g1, g3;
+
+	// Con_Printf("BuildGammaTable %.1f %.1f %.1f\n", g, v_lightgamma.GetFloat(), v_texgamma.GetFloat() );
+
+	float g = gamma;
+	if (g > 3.0) 
+	{
+		g = 3.0;
+	}
+
+	g = 1.0 / g;
+	g1 = texGamma * g; 
+
+	if (brightness <= 0.0) 
+	{
+		g3 = 0.125;
+	}
+	else if (brightness > 1.0) 
+	{
+		g3 = 0.05;
+	}
+	else 
+	{
+		g3 = 0.125 - (brightness * brightness) * 0.075;
+	}
+
+	for (i=0 ; i<256 ; i++)
+	{
+		inf = 255 * pow ( i/255.f, g1 ); 
+		if (inf < 0)
+			inf = 0;
+		if (inf > 255)
+			inf = 255;
+		texgammatable[i] = inf;
+	}
+
+	for (i=0 ; i<1024 ; i++)
+	{
+		float f;
+
+		f = i / 1023.0;
+
+		// scale up
+		if (brightness > 1.0)
+			f = f * brightness;
+
+		// shift up
+		if (f <= g3)
+			f = (f / g3) * 0.125;
+		else 
+			f = 0.125 + ((f - g3) / (1.0 - g3)) * 0.875;
+
+		// convert linear space to desired gamma space
+		inf = 255 * pow ( f, g ); 
+
+		if (inf < 0)
+			inf = 0;
+		if (inf > 255)
+			inf = 255;
+		lineartoscreen[i] = inf;
+	}
+
+	/*
+	for (i=0 ; i<1024 ; i++)
+	{
+		// convert from screen gamma space to linear space
+		lineargammatable[i] = 1023 * pow ( i/1023.0, v_gamma.GetFloat() );
+		// convert from linear gamma space to screen space
+		screengammatable[i] = 1023 * pow ( i/1023.0, 1.0 / v_gamma.GetFloat() );
+	}
+	*/
+
+	for (i=0 ; i<256 ; i++)
+	{
+		// convert from nonlinear texture space (0..255) to linear space (0..1)
+		texturetolinear[i] =  pow( i / 255.f, texGamma );
+
+		// convert from linear space (0..1) to nonlinear (sRGB) space (0..1)
+		g_Mathlib_LinearToGamma[i] =  LinearToGammaFullRange( i / 255.f );
+
+		// convert from sRGB gamma space (0..1) to linear space (0..1)
+		g_Mathlib_GammaToLinear[i] =  GammaToLinearFullRange( i / 255.f );
+	}
+
+	for (i=0 ; i<1024 ; i++)
+	{
+		// convert from linear space (0..1) to nonlinear texture space (0..255)
+		lineartotexture[i] =  pow( i / 1023.0, 1.0 / texGamma ) * 255;
+	}
+
+#if 0
+	for (i=0 ; i<256 ; i++)
+	{
+		float f;
+
+		// convert from nonlinear lightmap space (0..255) to linear space (0..4)
+		// f =  (i / 255.0) * sqrt( 4 );
+		f =  i * (2.0 / 255.0);
+		f = f * f;
+
+		texlighttolinear[i] = f;
+	}
+#endif
+
+	{
+		float f;
+		float overbrightFactor = 1.0f;
+
+		// Can't do overbright without texcombine
+		// UNDONE: Add GAMMA ramp to rectify this
+		if ( overbright == 2 )
+		{
+			overbrightFactor = 0.5;
+		}
+		else if ( overbright == 4 )
+		{
+			overbrightFactor = 0.25;
+		}
+
+		for (i=0 ; i<4096 ; i++)
+		{
+			// convert from linear 0..4 (x1024) to screen corrected vertex space (0..1?)
+			f = pow ( i/1024.0, 1.0 / gamma );
+
+			lineartovertex[i] = f * overbrightFactor;
+			if (lineartovertex[i] > 1)
+				lineartovertex[i] = 1;
+
+			int nLightmap = RoundFloatToInt( f * 255 * overbrightFactor );
+			nLightmap = clamp( nLightmap, 0, 255 );
+			lineartolightmap[i] = (unsigned char)nLightmap;
+		}
+	}
+}
+
+float GammaToLinearFullRange( float gamma )
+{
+	return pow( gamma, 2.2f );
+}
+
+float LinearToGammaFullRange( float linear )
+{
+	return pow( linear, 1.0f / 2.2f );
+}
+
+float GammaToLinear( float gamma )
+{
+	Assert( s_bMathlibInitialized );
+	if ( gamma < 0.0f )
+	{
+		return 0.0f;
+	}
+
+	if ( gamma >= 0.95f )
+	{
+		// Use GammaToLinearFullRange maybe if you trip this.
+// X360TEMP
+//		Assert( gamma <= 1.0f );
+		return 1.0f;
+	}
+
+	int index = RoundFloatToInt( gamma * 255.0f );
+	Assert( index >= 0 && index < 256 );
+	return g_Mathlib_GammaToLinear[index];
+}
+
+float LinearToGamma( float linear )
+{
+	Assert( s_bMathlibInitialized );
+	if ( linear < 0.0f )
+	{
+		return 0.0f;
+	}
+	if ( linear > 1.0f )
+	{
+		// Use LinearToGammaFullRange maybe if you trip this.
+		Assert( 0 );
+		return 1.0f;
+	}
+
+	int index = RoundFloatToInt( linear * 255.0f );
+	Assert( index >= 0 && index < 256 );
+	return g_Mathlib_LinearToGamma[index];
+}
+
+//-----------------------------------------------------------------------------
+// Helper functions to convert between sRGB and 360 gamma space
+//-----------------------------------------------------------------------------
+float SrgbGammaToLinear( float flSrgbGammaValue )
+{
+	float x = clamp( flSrgbGammaValue, 0.0f, 1.0f );
+	return ( x <= 0.04045f ) ? ( x / 12.92f ) : ( pow( ( x + 0.055f ) / 1.055f, 2.4f ) );
+}
+
+float SrgbLinearToGamma( float flLinearValue )
+{
+	float x = clamp( flLinearValue, 0.0f, 1.0f );
+	return ( x <= 0.0031308f ) ? ( x * 12.92f ) : ( 1.055f * pow( x, ( 1.0f / 2.4f ) ) ) - 0.055f;
+}
+
+float X360GammaToLinear( float fl360GammaValue )
+{
+	float flLinearValue;
+
+	fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f );
+	if ( fl360GammaValue < ( 96.0f / 255.0f ) )
+	{
+		if ( fl360GammaValue < ( 64.0f / 255.0f ) )
+		{
+			flLinearValue = fl360GammaValue * 255.0f;
+		}
+		else
+		{
+			flLinearValue = fl360GammaValue * ( 255.0f * 2.0f ) - 64.0f;
+			flLinearValue += floor( flLinearValue * ( 1.0f / 512.0f ) );
+		}
+	}
+	else
+	{
+		if( fl360GammaValue < ( 192.0f / 255.0f ) )
+		{
+			flLinearValue = fl360GammaValue * ( 255.0f * 4.0f ) - 256.0f;
+			flLinearValue += floor( flLinearValue * ( 1.0f / 256.0f ) );
+		}
+		else
+		{
+			flLinearValue = fl360GammaValue * ( 255.0f * 8.0f ) - 1024.0f;
+			flLinearValue += floor( flLinearValue * ( 1.0f / 128.0f ) );
+		}
+	}
+
+	flLinearValue *= 1.0f / 1023.0f;
+
+	flLinearValue = clamp( flLinearValue, 0.0f, 1.0f );
+	return flLinearValue;
+}
+
+float X360LinearToGamma( float flLinearValue )
+{
+	float fl360GammaValue;
+
+	flLinearValue = clamp( flLinearValue, 0.0f, 1.0f );
+	if ( flLinearValue < ( 128.0f / 1023.0f ) )
+	{
+		if ( flLinearValue < ( 64.0f / 1023.0f ) )
+		{
+			fl360GammaValue = flLinearValue * ( 1023.0f * ( 1.0f / 255.0f ) );
+		}
+		else
+		{
+			fl360GammaValue = flLinearValue * ( ( 1023.0f / 2.0f ) * ( 1.0f / 255.0f ) ) + ( 32.0f / 255.0f );
+		}
+	}
+	else
+	{
+		if ( flLinearValue < ( 512.0f / 1023.0f ) )
+		{
+			fl360GammaValue = flLinearValue * ( ( 1023.0f / 4.0f ) * ( 1.0f / 255.0f ) ) + ( 64.0f / 255.0f );
+		}
+		else
+		{
+			fl360GammaValue = flLinearValue * ( ( 1023.0f /8.0f ) * ( 1.0f / 255.0f ) ) + ( 128.0f /255.0f ); // 1.0 -> 1.0034313725490196078431372549016
+			if ( fl360GammaValue > 1.0f )
+			{
+				fl360GammaValue = 1.0f;
+			}
+		}
+	}
+
+	fl360GammaValue = clamp( fl360GammaValue, 0.0f, 1.0f );
+	return fl360GammaValue;
+}
+
+float SrgbGammaTo360Gamma( float flSrgbGammaValue )
+{
+	float flLinearValue = SrgbGammaToLinear( flSrgbGammaValue );
+	float fl360GammaValue = X360LinearToGamma( flLinearValue );
+	return fl360GammaValue;
+}
+
+// convert texture to linear 0..1 value
+float TextureToLinear( int c )
+{
+	Assert( s_bMathlibInitialized );
+	if (c < 0)
+		return 0;
+	if (c > 255)
+		return 1.0;
+
+	return texturetolinear[c];
+}
+
+// convert texture to linear 0..1 value
+int LinearToTexture( float f )
+{
+	Assert( s_bMathlibInitialized );
+	int i;
+	i = f * 1023;	// assume 0..1 range
+	if (i < 0)
+		i = 0;
+	if (i > 1023)
+		i = 1023;
+
+	return lineartotexture[i];
+}
+
+
+// converts 0..1 linear value to screen gamma (0..255)
+int LinearToScreenGamma( float f )
+{
+	Assert( s_bMathlibInitialized );
+	int i;
+	i = f * 1023;	// assume 0..1 range
+	if (i < 0)
+		i = 0;
+	if (i > 1023)
+		i = 1023;
+
+	return lineartoscreen[i];
+}
+
+void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector& out )
+{
+	Assert( s_bMathlibInitialized );
+	// FIXME: Why is there a factor of 255 built into this?
+	out.x = 255.0f * TexLightToLinear( in.r, in.exponent );
+	out.y = 255.0f * TexLightToLinear( in.g, in.exponent );
+	out.z = 255.0f * TexLightToLinear( in.b, in.exponent );
+}
+
+#if 0
+// assumes that the desired mantissa range is 128..255
+static int VectorToColorRGBExp32_CalcExponent( float in )
+{
+	int power = 0;
+	
+	if( in != 0.0f )
+	{
+		while( in > 255.0f )
+		{
+			power += 1;
+			in *= 0.5f;
+		}
+		
+		while( in < 128.0f )
+		{
+			power -= 1;
+			in *= 2.0f;
+		}
+	}
+
+	return power;
+}
+
+void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
+{
+	Vector v = vin;
+	Assert( s_bMathlibInitialized );
+	Assert( v.x >= 0.0f && v.y >= 0.0f && v.z >= 0.0f );
+	int i;		
+	float max = v[0];				
+	for( i = 1; i < 3; i++ )
+	{
+		// Get the maximum value.
+		if( v[i] > max )
+		{
+			max = v[i];
+		}
+	}
+				
+	// figure out the exponent for this luxel.
+	int exponent = VectorToColorRGBExp32_CalcExponent( max );
+				
+	// make the exponent fits into a signed byte.
+	if( exponent < -128 )
+	{
+		exponent = -128;
+	}
+	else if( exponent > 127 )
+	{
+		exponent = 127;
+	}
+				
+	// undone: optimize with a table
+	float scalar = pow( 2.0f, -exponent );
+	// convert to mantissa x 2^exponent format
+	for( i = 0; i < 3; i++ )
+	{
+		v[i] *= scalar;
+		// clamp
+		if( v[i] > 255.0f )
+		{
+			v[i] = 255.0f;
+		}
+	}
+	c.r = ( unsigned char )v[0];
+	c.g = ( unsigned char )v[1];
+	c.b = ( unsigned char )v[2];
+	c.exponent = ( signed char )exponent;
+}
+
+#else
+
+// given a floating point number  f, return an exponent e such that
+// for f' = f * 2^e,  f is on [128..255].
+// Uses IEEE 754 representation to directly extract this information
+// from the float.
+inline static int VectorToColorRGBExp32_CalcExponent( const float *pin )
+{
+	// The thing we will take advantage of here is that the exponent component
+	// is stored in the float itself, and because we want to map to 128..255, we
+	// want an "ideal" exponent of 2^7. So, we compute the difference between the
+	// input exponent and 7 to work out the normalizing exponent. Thus if you pass in 
+	// 32 (represented in IEEE 754 as 2^5), this function will return 2
+	// (because 32 * 2^2 = 128)
+	if (*pin == 0.0f)
+		return 0;
+
+	unsigned int fbits = *reinterpret_cast<const unsigned int *>(pin);
+	
+	// the exponent component is bits 23..30, and biased by +127
+	const unsigned int biasedSeven = 7 + 127;
+
+	signed int expComponent = ( fbits & 0x7F800000 ) >> 23;
+	expComponent -= biasedSeven; // now the difference from seven (positive if was less than, etc)
+	return expComponent;
+}
+
+
+
+/// Slightly faster version of the function to turn a float-vector color into 
+/// a compressed-exponent notation 32bit color. However, still not SIMD optimized.
+/// PS3 developer: note there is a movement of a float onto an int here, which is
+/// bad on the base registers -- consider doing this as Altivec code, or better yet
+/// moving it onto the cell.
+/// \warning: Assumes an IEEE 754 single-precision float representation! Those of you
+/// porting to an 8080 are out of luck.
+void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( vin.x >= 0.0f && vin.y >= 0.0f && vin.z >= 0.0f );
+
+	// work out which of the channels is the largest ( we will use that to map the exponent )
+	// this is a sluggish branch-based decision tree -- most architectures will offer a [max]
+	// assembly opcode to do this faster.
+	const float *pMax;
+	if (vin.x > vin.y)
+	{
+		if (vin.x > vin.z)
+		{
+			pMax = &vin.x;
+		}
+		else
+		{
+			pMax = &vin.z;
+		}
+	}
+	else
+	{
+		if (vin.y > vin.z)
+		{
+			pMax = &vin.y;
+		}
+		else
+		{
+			pMax = &vin.z;
+		}
+	}
+
+	// now work out the exponent for this luxel. 
+	signed int exponent = VectorToColorRGBExp32_CalcExponent( pMax );
+
+	// make sure the exponent fits into a signed byte.
+	// (in single precision format this is assured because it was a signed byte to begin with)
+	Assert(exponent > -128 && exponent <= 127);
+
+	// promote the exponent back onto a scalar that we'll use to normalize all the numbers
+	float scalar;
+	{
+		unsigned int fbits = (127 - exponent) << 23;
+		scalar = *reinterpret_cast<float *>(&fbits);
+	}
+
+	// we should never need to clamp:
+	Assert(vin.x * scalar <= 255.0f && 
+		   vin.y * scalar <= 255.0f && 
+		   vin.z * scalar <= 255.0f);
+
+	// This awful construction is necessary to prevent VC2005 from using the 
+	// fldcw/fnstcw control words around every float-to-unsigned-char operation.
+	{
+		int red = (vin.x * scalar);
+		int green = (vin.y * scalar);
+		int blue = (vin.z * scalar);
+
+		c.r = red;
+		c.g = green;
+		c.b = blue;
+	}
+	/*
+	c.r = ( unsigned char )(vin.x * scalar);
+	c.g = ( unsigned char )(vin.y * scalar);
+	c.b = ( unsigned char )(vin.z * scalar);
+	*/
+
+	c.exponent = ( signed char )exponent;
+}
+
 #endif
 \ No newline at end of file
diff --git a/mp/src/mathlib/datagen.pl b/mp/src/mathlib/datagen.pl
index 9e434034..9646002d 100644
--- a/mp/src/mathlib/datagen.pl
+++ b/mp/src/mathlib/datagen.pl
@@ -1,63 +1,63 @@
-#! perl
-use Text::Wrap;
-
-# generate output data for noise generators
-
-srand(31456);
-
-print <<END
-//========= Copyright � 1996-2006, Valve Corporation, All rights reserved. ============//
-//
-// Purpose: static data for noise() primitives.
-//
-// \$Workfile:     \$
-// \$NoKeywords: \$
-//=============================================================================//
-//
-//    **** DO NOT EDIT THIS FILE. GENERATED BY DATAGEN.PL ****
-//
-
-END
-;
-
-@perm_a=0..255;
-
-&fisher_yates_shuffle(\@perm_a);
-
-$Text::Wrap::Columns=78;
-$Text::Wrap::break=",";
-$Text::Wrap::separator=",\n";
-
-print "static int perm_a[]={\n",wrap('    ','   ',join(",",@perm_a)),"\n};\n\n";
-&fisher_yates_shuffle(\@perm_a);
-print "static int perm_b[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
-&fisher_yates_shuffle(\@perm_a);
-print "static int perm_c[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
-&fisher_yates_shuffle(\@perm_a);
-print "static int perm_d[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
-
-for ($i=0;$i<256;$i++)
-  {
-	$float_perm=(1.0/255.0)*$perm_a[$i];
-	$perm_a[$i] = sprintf("%f",$float_perm);
-  }
-&fisher_yates_shuffle(\@perm_a);
-print "static float impulse_xcoords[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
-&fisher_yates_shuffle(\@perm_a);
-print "static float impulse_ycoords[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
-&fisher_yates_shuffle(\@perm_a);
-print "static float impulse_zcoords[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
-
-
-
-# fisher_yates_shuffle( \@array ) : generate a random permutation
-# of @array in place
-sub fisher_yates_shuffle {
-    my $array = shift;
-    my $i;
-    for ($i = @$array; --$i; ) {
-        my $j = int rand ($i+1);
-        next if $i == $j;
-        @$array[$i,$j] = @$array[$j,$i];
-    }
-}
+#! perl
+use Text::Wrap;
+
+# generate output data for noise generators
+#
+# Prints C source to stdout: a banner comment, four shuffled integer tables
+# (perm_a .. perm_d) and three shuffled float tables (impulse_xcoords,
+# impulse_ycoords, impulse_zcoords).
+#
+# The random seed is fixed and every table is a further shuffle of the previous
+# one, so the output depends on the exact order of the shuffle calls below.
+# Do not reorder them.
+
+srand(31456);
+
+print <<END
+//========= Copyright � 1996-2006, Valve Corporation, All rights reserved. ============//
+//
+// Purpose: static data for noise() primitives.
+//
+// \$Workfile:     \$
+// \$NoKeywords: \$
+//=============================================================================//
+//
+//    **** DO NOT EDIT THIS FILE. GENERATED BY DATAGEN.PL ****
+//
+
+END
+;
+
+@perm_a=0..255;
+
+&fisher_yates_shuffle(\@perm_a);
+
+# Text::Wrap reads the lower-case $columns; the previous spelling
+# ($Text::Wrap::Columns) set an unrelated variable, so the requested width of 78
+# never took effect.
+$Text::Wrap::columns=78;
+# break lines at commas and re-emit the comma at the end of each wrapped line
+$Text::Wrap::break=",";
+$Text::Wrap::separator=",\n";
+
+# Integer permutation tables. All tables use the same four-space indent for the
+# first and the continuation lines (perm_a previously used three spaces for
+# continuation lines, unlike every other table).
+print "static int perm_a[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+&fisher_yates_shuffle(\@perm_a);
+print "static int perm_b[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+&fisher_yates_shuffle(\@perm_a);
+print "static int perm_c[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+&fisher_yates_shuffle(\@perm_a);
+print "static int perm_d[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+
+# Replace each entry k (0..255, still in perm_d order) by the "%f" text of k/255,
+# so the float tables below are shuffles of the same 256 evenly spaced values.
+for ($i=0;$i<256;$i++)
+  {
+	$float_perm=(1.0/255.0)*$perm_a[$i];
+	$perm_a[$i] = sprintf("%f",$float_perm);
+  }
+&fisher_yates_shuffle(\@perm_a);
+print "static float impulse_xcoords[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+&fisher_yates_shuffle(\@perm_a);
+print "static float impulse_ycoords[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+&fisher_yates_shuffle(\@perm_a);
+print "static float impulse_zcoords[]={\n",wrap('    ','    ',join(",",@perm_a)),"\n};\n\n";
+
+
+
+# fisher_yates_shuffle( \@array ) : generate a random permutation
+# of @array in place
+sub fisher_yates_shuffle {
+    # Takes one argument, a reference to an array, and permutes that array in
+    # place. Nothing useful is returned.
+    my $array = shift;
+    my $i;
+    # Walk $i from the last index down to 1, swapping element $i with a randomly
+    # chosen element at index 0..$i. The explicit "> 0" also stops the loop for an
+    # empty array; a bare "--$i" would start at -1 (true) and never terminate.
+    for ($i = @$array; --$i > 0; ) {
+        my $j = int rand ($i+1);
+        next if $i == $j;
+        @$array[$i,$j] = @$array[$j,$i];
+    }
+}
diff --git a/mp/src/mathlib/halton.cpp b/mp/src/mathlib/halton.cpp
index d0c56325..f9daae71 100644
--- a/mp/src/mathlib/halton.cpp
+++ b/mp/src/mathlib/halton.cpp
@@ -1,30 +1,30 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-//=====================================================================================//
-
-#include <halton.h>
-
-HaltonSequenceGenerator_t::HaltonSequenceGenerator_t(int b)
-{
-	base=b;
-	fbase=(float) b;
-	seed=1;
-
-}
-
-float HaltonSequenceGenerator_t::GetElement(int elem)
-{
-	int tmpseed=seed;
-	float ret=0.0;
-	float base_inv=1.0/fbase;
-	while(tmpseed)
-	{
-		int dig=tmpseed % base;
-		ret+=((float) dig)*base_inv;
-		base_inv/=fbase;
-		tmpseed/=base;
-	}
-	return ret;
-}
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#include <halton.h>
+
+// Creates a generator for radix 'b'. The radix is cached both as an integer and
+// as a float, and the running index ('seed') starts at 1.
+HaltonSequenceGenerator_t::HaltonSequenceGenerator_t(int b)
+{
+	seed = 1;
+	base = b;
+	fbase = static_cast<float>( b );
+}
+
+// Returns element 'elem' of the Halton sequence for this generator's radix:
+// the base-'base' digits of 'elem' are mirrored about the radix point, so the
+// least significant digit becomes the first fractional digit.
+// An 'elem' of 0 yields 0.
+float HaltonSequenceGenerator_t::GetElement(int elem)
+{
+	// Start from the requested index. This used to be initialised from the member
+	// 'seed', which meant the 'elem' argument was never read and every call
+	// returned the value for the current seed regardless of what was asked for.
+	int tmpseed=elem;
+	float ret=0.0;
+	float base_inv=1.0/fbase;	// weight of the current digit: 1/base, 1/base^2, ...
+	while(tmpseed)
+	{
+		int dig=tmpseed % base;	// peel off the least significant digit
+		ret+=((float) dig)*base_inv;
+		base_inv/=fbase;
+		tmpseed/=base;
+	}
+	return ret;
+}
diff --git a/mp/src/mathlib/imagequant.cpp b/mp/src/mathlib/imagequant.cpp
index fddafa74..47ba4f4c 100644
--- a/mp/src/mathlib/imagequant.cpp
+++ b/mp/src/mathlib/imagequant.cpp
@@ -1,96 +1,96 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-// $NoKeywords: $
-//
-//=============================================================================//
-#include <quantize.h>
-#include <minmax.h>
-
-#define N_EXTRAVALUES 1
-#define N_DIMENSIONS (3+N_EXTRAVALUES)
-
-#define PIXEL(x,y,c) Image[4*((x)+((Width*(y))))+c]
-
-static uint8 Weights[]={5,7,4,8};
-static int ExtraValueXForms[3*N_EXTRAVALUES]={
-	76,151,28,
-};
-
-  
-
-#define MAX_QUANTIZE_IMAGE_WIDTH 4096
-
-void ColorQuantize(uint8 const *Image,
-				   int Width,
-				   int Height,
-				   int flags, int ncolors,
-				   uint8 *out_pixels,
-				   uint8 *out_palette,
-				   int firstcolor)
-{
-	int Error[MAX_QUANTIZE_IMAGE_WIDTH+1][3][2];
-	struct Sample *s=AllocSamples(Width*Height,N_DIMENSIONS);
-	int x,y,c;
-	for(y=0;y<Height;y++)
-		for(x=0;x<Width;x++)
-		{
-			for(c=0;c<3;c++)
-				NthSample(s,y*Width+x,N_DIMENSIONS)->Value[c]=PIXEL(x,y,c);
-			// now, let's generate extra values to quantize on
-			for(int i=0;i<N_EXTRAVALUES;i++)
-			{
-				int val1=0;
-				for(c=0;c<3;c++)
-					val1+=PIXEL(x,y,c)*ExtraValueXForms[i*3+c];
-				val1>>=8;
-				NthSample(s,y*Width+x,N_DIMENSIONS)->Value[c]=(uint8)
-					(min(255,max(0,val1)));
-			}
-		}
-	struct QuantizedValue *q=Quantize(s,Width*Height,N_DIMENSIONS,
-									  ncolors,Weights,firstcolor);
-	delete[] s;
-	memset(out_palette,0x55,768);
-	for(int p=0;p<256;p++)
-	{
-		struct QuantizedValue *v=FindQNode(q,p);
-		if (v)
-			for(int c=0;c<3;c++)
-				out_palette[p*3+c]=v->Mean[c];
-	}
-	memset(Error,0,sizeof(Error));
-	for(y=0;y<Height;y++)
-	{
-		int ErrorUse=y & 1;
-		int ErrorUpdate=ErrorUse^1;
-		for(x=0;x<Width;x++)
-		{
-			uint8 samp[3];
-			for(c=0;c<3;c++)
-			{
-				int tryc=PIXEL(x,y,c);
-				if (! (flags & QUANTFLAGS_NODITHER))
-				{
-					tryc+=Error[x][c][ErrorUse];
-					Error[x][c][ErrorUse]=0;
-				}
-				samp[c]=(uint8) min(255,max(0,tryc));
-			}
-			struct QuantizedValue *f=FindMatch(samp,3,Weights,q);
-			out_pixels[Width*y+x]=(uint8) (f->value);
-			if (! (flags & QUANTFLAGS_NODITHER))
-				for(int i=0;i<3;i++)
-				{
-					int newerr=samp[i]-f->Mean[i];
-					int orthog_error=(newerr*3)/8;
-					Error[x+1][i][ErrorUse]+=orthog_error;
-					Error[x][i][ErrorUpdate]=orthog_error;
-					Error[x+1][i][ErrorUpdate]=newerr-2*orthog_error;
-				}
-		}
-	}
-	if (q) FreeQuantization(q);
-}
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#include <quantize.h>
+#include <minmax.h>
+
+#define N_EXTRAVALUES 1
+#define N_DIMENSIONS (3+N_EXTRAVALUES)
+
+#define PIXEL(x,y,c) Image[4*((x)+((Width*(y))))+c]
+
+static uint8 Weights[]={5,7,4,8};
+static int ExtraValueXForms[3*N_EXTRAVALUES]={
+	76,151,28,
+};
+
+  
+
+#define MAX_QUANTIZE_IMAGE_WIDTH 4096
+
+// Builds a palette for a Width x Height image with Quantize() and writes one
+// palette index per pixel, optionally with error-diffusion dithering.
+//   Image       - source pixels, 4 bytes per pixel; only bytes 0..2 of each pixel are read
+//   flags       - QUANTFLAGS_NODITHER disables the error diffusion
+//   ncolors,
+//   firstcolor  - forwarded unchanged to Quantize()
+//   out_pixels  - receives Width*Height indices, row by row
+//   out_palette - 256 3-byte entries; entries with no quantizer node keep the 0x55 fill
+// NOTE(review): the dither buffer is a fixed stack array sized for
+// MAX_QUANTIZE_IMAGE_WIDTH columns, and nothing here checks Width. With
+// dithering enabled a wider image would index past it -- confirm callers
+// guarantee the limit.
+void ColorQuantize(uint8 const *Image,
+				   int Width,
+				   int Height,
+				   int flags, int ncolors,
+				   uint8 *out_pixels,
+				   uint8 *out_palette,
+				   int firstcolor)
+{
+	// Error[column][channel][row parity]: pending diffusion error for the row
+	// being processed and for the row below it. The extra column absorbs the
+	// x+1 writes made at the right-hand edge.
+	int Error[MAX_QUANTIZE_IMAGE_WIDTH+1][3][2];
+	struct Sample *s=AllocSamples(Width*Height,N_DIMENSIONS);
+	int x,y,c;
+	for(y=0;y<Height;y++)
+		for(x=0;x<Width;x++)
+		{
+			for(c=0;c<3;c++)
+				NthSample(s,y*Width+x,N_DIMENSIONS)->Value[c]=PIXEL(x,y,c);
+			// now, let's generate extra values to quantize on
+			// (each is a weighted sum of the three channels, scaled down by 256)
+			for(int i=0;i<N_EXTRAVALUES;i++)
+			{
+				int val1=0;
+				for(c=0;c<3;c++)
+					val1+=PIXEL(x,y,c)*ExtraValueXForms[i*3+c];
+				val1>>=8;
+				// Extra value i gets its own dimension after the three color
+				// channels. This used to be Value[c], which relied on the loop
+				// above leaving c==3 and would have sent every extra value to
+				// the same slot once N_EXTRAVALUES exceeded 1.
+				NthSample(s,y*Width+x,N_DIMENSIONS)->Value[3+i]=(uint8)
+					(min(255,max(0,val1)));
+			}
+		}
+	struct QuantizedValue *q=Quantize(s,Width*Height,N_DIMENSIONS,
+									  ncolors,Weights,firstcolor);
+	delete[] s;
+	// 768 = 256 palette entries * 3 bytes
+	memset(out_palette,0x55,768);
+	for(int p=0;p<256;p++)
+	{
+		struct QuantizedValue *v=FindQNode(q,p);
+		if (v)
+			for(int c=0;c<3;c++)
+				out_palette[p*3+c]=v->Mean[c];
+	}
+	memset(Error,0,sizeof(Error));
+	for(y=0;y<Height;y++)
+	{
+		// the two error rows swap roles on every scanline
+		int ErrorUse=y & 1;
+		int ErrorUpdate=ErrorUse^1;
+		for(x=0;x<Width;x++)
+		{
+			uint8 samp[3];
+			for(c=0;c<3;c++)
+			{
+				int tryc=PIXEL(x,y,c);
+				if (! (flags & QUANTFLAGS_NODITHER))
+				{
+					// consume the error aimed at this pixel and clear the slot so
+					// it is clean when this row becomes the "next row" buffer
+					tryc+=Error[x][c][ErrorUse];
+					Error[x][c][ErrorUse]=0;
+				}
+				samp[c]=(uint8) min(255,max(0,tryc));
+			}
+			struct QuantizedValue *f=FindMatch(samp,3,Weights,q);
+			out_pixels[Width*y+x]=(uint8) (f->value);
+			if (! (flags & QUANTFLAGS_NODITHER))
+				for(int i=0;i<3;i++)
+				{
+					// 3/8 of the error goes right, 3/8 goes down and the
+					// remainder goes down-right.
+					int newerr=samp[i]-f->Mean[i];
+					int orthog_error=(newerr*3)/8;
+					Error[x+1][i][ErrorUse]+=orthog_error;
+					// Accumulate: this slot already holds the down-right share
+					// written by the previous pixel. Plain assignment here used
+					// to overwrite it, so that share was lost for every pixel.
+					Error[x][i][ErrorUpdate]+=orthog_error;
+					// first write to this slot for the next row, so assignment
+					// (not +=) is correct here
+					Error[x+1][i][ErrorUpdate]=newerr-2*orthog_error;
+				}
+		}
+	}
+	if (q) FreeQuantization(q);
+}
+
diff --git a/mp/src/mathlib/lightdesc.cpp b/mp/src/mathlib/lightdesc.cpp
index c6e7bc8d..7d69282d 100644
--- a/mp/src/mathlib/lightdesc.cpp
+++ b/mp/src/mathlib/lightdesc.cpp
@@ -1,312 +1,312 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-//=====================================================================================//
-
-#include <ssemath.h>
-#include <lightdesc.h>
-#include "mathlib.h"
-
-void LightDesc_t::RecalculateDerivedValues(void)
-{
-	m_Flags = LIGHTTYPE_OPTIMIZATIONFLAGS_DERIVED_VALUES_CALCED;
-	if (m_Attenuation0)
-		m_Flags|=LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0;
-	if (m_Attenuation1)
-		m_Flags|=LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1;
-	if (m_Attenuation2)
-		m_Flags|=LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2;
-	
-	if (m_Type==MATERIAL_LIGHT_SPOT)
-	{
-		m_ThetaDot=cos(m_Theta);
-		m_PhiDot=cos(m_Phi);
-		float spread=m_ThetaDot-m_PhiDot;
-		if (spread>1.0e-10)
-		{
-			// note - this quantity is very sensitive to round off error. the sse
-			// reciprocal approximation won't cut it here.
-			OneOver_ThetaDot_Minus_PhiDot=1.0/spread;
-		}
-		else
-		{
-			// hard falloff instead of divide by zero
-			OneOver_ThetaDot_Minus_PhiDot=1.0;
-		}				
-	}	
-	if (m_Type==MATERIAL_LIGHT_DIRECTIONAL)
-	{
-		// set position to be real far away in the right direction
-		m_Position=m_Direction;
-		m_Position *= 2.0e6;
-	}
-	
-	m_RangeSquared=m_Range*m_Range;
-
-}
-
-void LightDesc_t::ComputeLightAtPointsForDirectional(
-	const FourVectors &pos, const FourVectors &normal,
-	FourVectors &color, bool DoHalfLambert ) const
-{
-	FourVectors delta;
-	delta.DuplicateVector(m_Direction);
-//	delta.VectorNormalizeFast();
-	fltx4 strength=delta*normal;
-	if (DoHalfLambert)
-	{
-		strength=AddSIMD(MulSIMD(strength,Four_PointFives),Four_PointFives);
-	}
-	else
-		strength=MaxSIMD(Four_Zeros,delta*normal);
-		
-	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
-	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
-	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
-}
-
-
-void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
-										FourVectors &color, bool DoHalfLambert ) const
-{
-	FourVectors delta;
-	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
-	switch (m_Type)
-	{
-		case MATERIAL_LIGHT_POINT:
-		case MATERIAL_LIGHT_SPOT:
-			delta.DuplicateVector(m_Position);
-			delta-=pos;
-			break;
-				
-		case MATERIAL_LIGHT_DIRECTIONAL:
-			ComputeLightAtPointsForDirectional( pos, normal, color, DoHalfLambert );
-			return;
-	}
-
-	fltx4 dist2 = delta*delta;
-
-	dist2=MaxSIMD( Four_Ones, dist2 );
-
-	fltx4 falloff;
-
-	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
-	{
-		falloff = ReplicateX4(m_Attenuation0);
-	}
-	else
-		falloff= Four_Epsilons;
-
-	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
-	{
-		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation1),SqrtEstSIMD(dist2)));
-	}
-
-	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
-	{
-		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation2),dist2));
-	}
-
-	falloff=ReciprocalEstSIMD(falloff);
-	// Cull out light beyond this radius
-	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
-	if (m_Range != 0.f)
-	{
-		fltx4 RangeSquared=ReplicateX4(m_RangeSquared); // !!speed!!
-		falloff=AndSIMD(falloff,CmpLtSIMD(dist2,RangeSquared));
-	}
-
-	delta.VectorNormalizeFast();
-	fltx4 strength=delta*normal;
-	if (DoHalfLambert)
-	{
-		strength=AddSIMD(MulSIMD(strength,Four_PointFives),Four_PointFives);
-	}
-	else
-		strength=MaxSIMD(Four_Zeros,delta*normal);
-		
-	switch(m_Type)
-	{
-		case MATERIAL_LIGHT_POINT:
-			// half-lambert
-			break;
-				
-		case MATERIAL_LIGHT_SPOT:
-		{
-			fltx4 dot2=SubSIMD(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff
-
-
-			fltx4 cone_falloff_scale=MulSIMD(ReplicateX4(OneOver_ThetaDot_Minus_PhiDot),
-												 SubSIMD(dot2,ReplicateX4(m_PhiDot)));
-			cone_falloff_scale=MinSIMD(cone_falloff_scale,Four_Ones);
-			
-			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
-			{
-				// !!speed!! could compute integer exponent needed by powsimd and store in light
-				cone_falloff_scale=PowSIMD(cone_falloff_scale,m_Falloff);
-			}
-			strength=MulSIMD(cone_falloff_scale,strength);
-
-			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
-			// from pow function, etc
-			fltx4 OutsideMask=CmpGtSIMD(dot2,ReplicateX4(m_PhiDot)); // outside light cone?
-			strength=AndSIMD(OutsideMask,strength);
-		}
-		break;
-			
-
-	}
-	strength=MulSIMD(strength,falloff);
-	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
-	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
-	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
-}
-
-
-
-void LightDesc_t::ComputeNonincidenceLightAtPoints( const FourVectors &pos, FourVectors &color ) const
-{
-	FourVectors delta;
-	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
-	switch (m_Type)
-	{
-		case MATERIAL_LIGHT_POINT:
-		case MATERIAL_LIGHT_SPOT:
-			delta.DuplicateVector(m_Position);
-			delta-=pos;
-			break;
-				
-		case MATERIAL_LIGHT_DIRECTIONAL:
-			return;
-	}
-
-	fltx4 dist2 = delta*delta;
-
-	dist2=MaxSIMD( Four_Ones, dist2 );
-
-	fltx4 falloff;
-
-	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
-	{
-		falloff = ReplicateX4(m_Attenuation0);
-	}
-	else
-		falloff= Four_Epsilons;
-
-	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
-	{
-		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation1),SqrtEstSIMD(dist2)));
-	}
-
-	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
-	{
-		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation2),dist2));
-	}
-
-	falloff=ReciprocalEstSIMD(falloff);
-	// Cull out light beyond this radius
-	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
-	if (m_Range != 0.f)
-	{
-		fltx4 RangeSquared=ReplicateX4(m_RangeSquared); // !!speed!!
-		falloff=AndSIMD(falloff,CmpLtSIMD(dist2,RangeSquared));
-	}
-
-	delta.VectorNormalizeFast();
-	fltx4 strength = Four_Ones;
-	//fltx4 strength=delta;
-	//fltx4 strength = MaxSIMD(Four_Zeros,delta);
-		
-	switch(m_Type)
-	{
-		case MATERIAL_LIGHT_POINT:
-			// half-lambert
-			break;
-				
-		case MATERIAL_LIGHT_SPOT:
-		{
-			fltx4 dot2=SubSIMD(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff
-
-
-			fltx4 cone_falloff_scale=MulSIMD(ReplicateX4(OneOver_ThetaDot_Minus_PhiDot),
-												 SubSIMD(dot2,ReplicateX4(m_PhiDot)));
-			cone_falloff_scale=MinSIMD(cone_falloff_scale,Four_Ones);
-			
-			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
-			{
-				// !!speed!! could compute integer exponent needed by powsimd and store in light
-				cone_falloff_scale=PowSIMD(cone_falloff_scale,m_Falloff);
-			}
-			strength=MulSIMD(cone_falloff_scale,strength);
-
-			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
-			// from pow function, etc
-			fltx4 OutsideMask=CmpGtSIMD(dot2,ReplicateX4(m_PhiDot)); // outside light cone?
-			strength=AndSIMD(OutsideMask,strength);
-		}
-		break;
-			
-
-	}
-	strength=MulSIMD(strength,falloff);
-	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
-	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
-	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
-}
-
-
-
-void LightDesc_t::SetupOldStyleAttenuation( float fQuadraticAttn, float fLinearAttn, float fConstantAttn )
-{
-	// old-style manually typed quadrtiac coefficients
-	if ( fQuadraticAttn < EQUAL_EPSILON )
-		fQuadraticAttn = 0;
-	
-	if ( fLinearAttn < EQUAL_EPSILON)
-		fLinearAttn = 0;
-	
-	if ( fConstantAttn < EQUAL_EPSILON)
-		fConstantAttn = 0;
-	
-	if ( ( fConstantAttn < EQUAL_EPSILON ) && 
-		 ( fLinearAttn < EQUAL_EPSILON ) && 
-		 ( fQuadraticAttn < EQUAL_EPSILON ) )
-		fConstantAttn = 1;
-
-	m_Attenuation2=fQuadraticAttn;
-	m_Attenuation1=fLinearAttn;
-	m_Attenuation0=fConstantAttn;
-	float fScaleFactor = fQuadraticAttn * 10000 + fLinearAttn * 100 + fConstantAttn;
-	
-	if ( fScaleFactor > 0 )
-		m_Color *= fScaleFactor;
-}
-
-void LightDesc_t::SetupNewStyleAttenuation( float fFiftyPercentDistance, 
-											float fZeroPercentDistance )
-{
-	// new style storing 50% and 0% distances
-	float d50=fFiftyPercentDistance;
-	float d0=fZeroPercentDistance;
-	if (d0<d50)
-	{
-		// !!warning in lib code???!!!
-		Warning("light has _fifty_percent_distance of %f but no zero_percent_distance\n",d50);
-		d0=2.0*d50;
-	}
-	float a=0,b=1,c=0;
-	if (! SolveInverseQuadraticMonotonic(0,1.0,d50,2.0,d0,256.0,a,b,c))
-	{
-		Warning("can't solve quadratic for light %f %f\n",d50,d0);
-	}
-	float v50=c+d50*(b+d50*a);
-	float scale=2.0/v50;
-	a*=scale;
-	b*=scale;
-	c*=scale;
-	m_Attenuation2=a;
-	m_Attenuation1=b;
-	m_Attenuation0=c;
-}
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#include <ssemath.h>
+#include <lightdesc.h>
+#include "mathlib.h"
+
+// Refreshes every cached value that is derived from the light's primary
+// parameters: the optimization flags, the spot-cone cosines and their
+// reciprocal spread, the far-away position of a directional light, and the
+// squared range.
+void LightDesc_t::RecalculateDerivedValues(void)
+{
+	// Rebuild the flag word from scratch: "derived values are valid" plus one
+	// bit for each attenuation coefficient that is non-zero.
+	m_Flags = LIGHTTYPE_OPTIMIZATIONFLAGS_DERIVED_VALUES_CALCED;
+	if ( m_Attenuation0 )
+	{
+		m_Flags |= LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0;
+	}
+	if ( m_Attenuation1 )
+	{
+		m_Flags |= LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1;
+	}
+	if ( m_Attenuation2 )
+	{
+		m_Flags |= LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2;
+	}
+
+	switch ( m_Type )
+	{
+		case MATERIAL_LIGHT_SPOT:
+		{
+			m_ThetaDot = cos( m_Theta );
+			m_PhiDot = cos( m_Phi );
+			const float flConeSpread = m_ThetaDot - m_PhiDot;
+			// This quantity is very sensitive to round-off error, so it is
+			// computed with a real divide rather than the SSE reciprocal
+			// approximation. A (near-)zero spread gets a hard falloff instead
+			// of a divide by zero.
+			OneOver_ThetaDot_Minus_PhiDot = ( flConeSpread > 1.0e-10 ) ? ( 1.0 / flConeSpread ) : 1.0;
+			break;
+		}
+
+		case MATERIAL_LIGHT_DIRECTIONAL:
+			// Put the position very far away along the light's direction.
+			m_Position = m_Direction;
+			m_Position *= 2.0e6;
+			break;
+
+		default:
+			break;
+	}
+
+	m_RangeSquared = m_Range * m_Range;
+}
+
+// Adds this directional light's contribution to 'color' for four surface
+// points at once.
+//   pos           - not used by this function (a directional light has no distance term here)
+//   normal        - four surface normals
+//   color         - accumulated into, not overwritten
+//   DoHalfLambert - true: strength = 0.5 * dot + 0.5; false: strength = max( 0, dot )
+// m_Direction is used as stored; the normalization call is commented out.
+// NOTE(review): presumably m_Direction is already unit length -- confirm.
+void LightDesc_t::ComputeLightAtPointsForDirectional(
+	const FourVectors &pos, const FourVectors &normal,
+	FourVectors &color, bool DoHalfLambert ) const
+{
+	FourVectors delta;
+	delta.DuplicateVector(m_Direction);
+//	delta.VectorNormalizeFast();
+	fltx4 strength=delta*normal;
+	if (DoHalfLambert)
+	{
+		strength=AddSIMD(MulSIMD(strength,Four_PointFives),Four_PointFives);
+	}
+	else
+	{
+		// reuse the dot product computed above instead of evaluating it a second time
+		strength=MaxSIMD(Four_Zeros,strength);
+	}
+		
+	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
+	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
+	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
+}
+
+
+// Adds this light's contribution to 'color' for four surface points at once.
+//   pos           - four sample positions
+//   normal        - four surface normals
+//   color         - accumulated into, not overwritten
+//   DoHalfLambert - true: incidence = 0.5 * dot + 0.5; false: incidence = max( 0, dot )
+// Directional lights are forwarded to ComputeLightAtPointsForDirectional().
+// Point and spot lights get distance attenuation and an optional range cull;
+// spot lights additionally get the cone falloff.
+// The code reads m_Flags, m_RangeSquared, m_PhiDot and
+// OneOver_ThetaDot_Minus_PhiDot, which RecalculateDerivedValues() fills in.
+void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
+										FourVectors &color, bool DoHalfLambert ) const
+{
+	FourVectors delta;
+	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
+	switch (m_Type)
+	{
+		case MATERIAL_LIGHT_POINT:
+		case MATERIAL_LIGHT_SPOT:
+			// vector from each sample point to the light
+			delta.DuplicateVector(m_Position);
+			delta-=pos;
+			break;
+				
+		case MATERIAL_LIGHT_DIRECTIONAL:
+			ComputeLightAtPointsForDirectional( pos, normal, color, DoHalfLambert );
+			return;
+	}
+
+	fltx4 dist2 = delta*delta;
+
+	// squared distance is clamped to at least 1
+	dist2=MaxSIMD( Four_Ones, dist2 );
+
+	// falloff = 1 / ( a0 + a1*dist + a2*dist^2 ), built only from the terms whose
+	// flag is set. Without a constant term the sum starts from Four_Epsilons.
+	fltx4 falloff;
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
+	{
+		falloff = ReplicateX4(m_Attenuation0);
+	}
+	else
+		falloff= Four_Epsilons;
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation1),SqrtEstSIMD(dist2)));
+	}
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation2),dist2));
+	}
+
+	falloff=ReciprocalEstSIMD(falloff);
+	// Cull out light beyond this radius
+	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
+	// (a range of exactly 0 disables the cull)
+	if (m_Range != 0.f)
+	{
+		fltx4 RangeSquared=ReplicateX4(m_RangeSquared); // !!speed!!
+		falloff=AndSIMD(falloff,CmpLtSIMD(dist2,RangeSquared));
+	}
+
+	delta.VectorNormalizeFast();
+	fltx4 strength=delta*normal;
+	if (DoHalfLambert)
+	{
+		strength=AddSIMD(MulSIMD(strength,Four_PointFives),Four_PointFives);
+	}
+	else
+	{
+		// reuse the dot product computed above instead of evaluating it a second time
+		strength=MaxSIMD(Four_Zeros,strength);
+	}
+		
+	switch(m_Type)
+	{
+		case MATERIAL_LIGHT_POINT:
+			// no cone term for point lights
+			break;
+				
+		case MATERIAL_LIGHT_SPOT:
+		{
+			fltx4 dot2=SubSIMD(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff
+
+
+			// ( dot2 - m_PhiDot ) / ( m_ThetaDot - m_PhiDot ), capped at 1
+			fltx4 cone_falloff_scale=MulSIMD(ReplicateX4(OneOver_ThetaDot_Minus_PhiDot),
+												 SubSIMD(dot2,ReplicateX4(m_PhiDot)));
+			cone_falloff_scale=MinSIMD(cone_falloff_scale,Four_Ones);
+			
+			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
+			{
+				// !!speed!! could compute integer exponent needed by powsimd and store in light
+				cone_falloff_scale=PowSIMD(cone_falloff_scale,m_Falloff);
+			}
+			strength=MulSIMD(cone_falloff_scale,strength);
+
+			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
+			// from pow function, etc
+			fltx4 OutsideMask=CmpGtSIMD(dot2,ReplicateX4(m_PhiDot)); // set for lanes that are inside the cone
+			strength=AndSIMD(OutsideMask,strength);
+		}
+		break;
+			
+
+	}
+	strength=MulSIMD(strength,falloff);
+	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
+	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
+	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
+}
+
+
+
+// Adds this light's contribution to 'color' for four sample positions at once,
+// without any surface-normal (incidence) term: the strength starts at 1 and is
+// scaled only by the spot cone (for spot lights) and the distance attenuation.
+//   pos   - four sample positions
+//   color - accumulated into, not overwritten
+// Directional lights contribute nothing here (early return).
+void LightDesc_t::ComputeNonincidenceLightAtPoints( const FourVectors &pos, FourVectors &color ) const
+{
+	FourVectors delta;
+	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
+	switch (m_Type)
+	{
+		case MATERIAL_LIGHT_POINT:
+		case MATERIAL_LIGHT_SPOT:
+			// vector from each sample point to the light
+			delta.DuplicateVector(m_Position);
+			delta-=pos;
+			break;
+				
+		case MATERIAL_LIGHT_DIRECTIONAL:
+			return;
+	}
+
+	fltx4 dist2 = delta*delta;
+
+	// squared distance is clamped to at least 1
+	dist2=MaxSIMD( Four_Ones, dist2 );
+
+	// falloff = 1 / ( a0 + a1*dist + a2*dist^2 ), built only from the terms whose
+	// flag is set. Without a constant term the sum starts from Four_Epsilons.
+	fltx4 falloff;
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
+	{
+		falloff = ReplicateX4(m_Attenuation0);
+	}
+	else
+		falloff= Four_Epsilons;
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation1),SqrtEstSIMD(dist2)));
+	}
+
+	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
+	{
+		falloff=AddSIMD(falloff,MulSIMD(ReplicateX4(m_Attenuation2),dist2));
+	}
+
+	falloff=ReciprocalEstSIMD(falloff);
+	// Cull out light beyond this radius
+	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
+	// (a range of exactly 0 disables the cull)
+	if (m_Range != 0.f)
+	{
+		fltx4 RangeSquared=ReplicateX4(m_RangeSquared); // !!speed!!
+		falloff=AndSIMD(falloff,CmpLtSIMD(dist2,RangeSquared));
+	}
+
+	// the normalized direction is only read by the spot-light branch below
+	delta.VectorNormalizeFast();
+	fltx4 strength = Four_Ones;
+	//fltx4 strength=delta;
+	//fltx4 strength = MaxSIMD(Four_Zeros,delta);
+		
+	switch(m_Type)
+	{
+		case MATERIAL_LIGHT_POINT:
+			// no cone term for point lights
+			break;
+				
+		case MATERIAL_LIGHT_SPOT:
+		{
+			fltx4 dot2=SubSIMD(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff
+
+
+			// ( dot2 - m_PhiDot ) / ( m_ThetaDot - m_PhiDot ), capped at 1
+			fltx4 cone_falloff_scale=MulSIMD(ReplicateX4(OneOver_ThetaDot_Minus_PhiDot),
+												 SubSIMD(dot2,ReplicateX4(m_PhiDot)));
+			cone_falloff_scale=MinSIMD(cone_falloff_scale,Four_Ones);
+			
+			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
+			{
+				// !!speed!! could compute integer exponent needed by powsimd and store in light
+				cone_falloff_scale=PowSIMD(cone_falloff_scale,m_Falloff);
+			}
+			strength=MulSIMD(cone_falloff_scale,strength);
+
+			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
+			// from pow function, etc
+			fltx4 OutsideMask=CmpGtSIMD(dot2,ReplicateX4(m_PhiDot)); // set for lanes that are inside the cone
+			strength=AndSIMD(OutsideMask,strength);
+		}
+		break;
+			
+
+	}
+	strength=MulSIMD(strength,falloff);
+	color.x=AddSIMD(color.x,MulSIMD(strength,ReplicateX4(m_Color.x)));
+	color.y=AddSIMD(color.y,MulSIMD(strength,ReplicateX4(m_Color.y)));
+	color.z=AddSIMD(color.z,MulSIMD(strength,ReplicateX4(m_Color.z)));
+}
+
+
+
+// Configures the light from hand-entered quadratic / linear / constant
+// attenuation coefficients and rescales m_Color to match.
+// Any coefficient below EQUAL_EPSILON is treated as zero. If all three end up
+// below EQUAL_EPSILON, the constant term is forced to 1.
+void LightDesc_t::SetupOldStyleAttenuation( float fQuadraticAttn, float fLinearAttn, float fConstantAttn )
+{
+	// drop coefficients that are too small to matter
+	fQuadraticAttn = ( fQuadraticAttn < EQUAL_EPSILON ) ? 0 : fQuadraticAttn;
+	fLinearAttn = ( fLinearAttn < EQUAL_EPSILON ) ? 0 : fLinearAttn;
+	fConstantAttn = ( fConstantAttn < EQUAL_EPSILON ) ? 0 : fConstantAttn;
+
+	// no attenuation terms left at all: fall back to a constant term of 1
+	const bool bNoTermsLeft = ( fConstantAttn < EQUAL_EPSILON ) &&
+							  ( fLinearAttn < EQUAL_EPSILON ) &&
+							  ( fQuadraticAttn < EQUAL_EPSILON );
+	if ( bNoTermsLeft )
+	{
+		fConstantAttn = 1;
+	}
+
+	m_Attenuation0 = fConstantAttn;
+	m_Attenuation1 = fLinearAttn;
+	m_Attenuation2 = fQuadraticAttn;
+
+	// The color is multiplied by the attenuation polynomial evaluated at a
+	// distance of 100 ( q*100^2 + l*100 + c ), but only when that is positive.
+	const float fScaleFactor = fQuadraticAttn * 10000 + fLinearAttn * 100 + fConstantAttn;
+	if ( !( fScaleFactor > 0 ) )
+	{
+		return;
+	}
+	m_Color *= fScaleFactor;
+}
+
+// Configures the attenuation coefficients from two distances: the one at which
+// the light is at 50% and the one at which it reaches 0%.
+// If the zero-percent distance is smaller than the fifty-percent distance, a
+// warning is printed and twice the fifty-percent distance is used instead.
+// NOTE(review): the solver call presumably fits a quadratic through
+// (0,1), (d50,2) and (d0,256) -- confirm against SolveInverseQuadraticMonotonic.
+void LightDesc_t::SetupNewStyleAttenuation( float fFiftyPercentDistance, 
+											float fZeroPercentDistance )
+{
+	float flHalfDist = fFiftyPercentDistance;
+	float flZeroDist = fZeroPercentDistance;
+	if ( flZeroDist < flHalfDist )
+	{
+		// !!warning in lib code???!!!
+		Warning("light has _fifty_percent_distance of %f but no zero_percent_distance\n",flHalfDist);
+		flZeroDist = 2.0*flHalfDist;
+	}
+
+	// Defaults (purely linear) are what remains in use if the solver leaves
+	// the outputs untouched on failure.
+	float flQuad = 0, flLin = 1, flConst = 0;
+	const bool bSolved = SolveInverseQuadraticMonotonic(0,1.0,flHalfDist,2.0,flZeroDist,256.0,flQuad,flLin,flConst);
+	if ( !bSolved )
+	{
+		Warning("can't solve quadratic for light %f %f\n",flHalfDist,flZeroDist);
+	}
+
+	// Rescale all three coefficients so the quadratic evaluates to exactly 2 at
+	// the fifty-percent distance.
+	float flValueAtHalf = flConst+flHalfDist*(flLin+flHalfDist*flQuad);
+	float flRescale = 2.0/flValueAtHalf;
+	flQuad *= flRescale;
+	flLin *= flRescale;
+	flConst *= flRescale;
+
+	m_Attenuation2 = flQuad;
+	m_Attenuation1 = flLin;
+	m_Attenuation0 = flConst;
+}
+
diff --git a/mp/src/mathlib/mathlib.vpc b/mp/src/mathlib/mathlib.vpc
index 59a0c95f..17021025 100644
--- a/mp/src/mathlib/mathlib.vpc
+++ b/mp/src/mathlib/mathlib.vpc
@@ -1,84 +1,82 @@
-//-----------------------------------------------------------------------------
-//	MATHLIB.VPC
-//
-//	Project Script
-//-----------------------------------------------------------------------------
-
-$macro SRCDIR		".."
-$Macro OUTLIBDIR	"$SRCDIR\lib\public" [!$LINUX]
-
-$include "$SRCDIR\vpc_scripts\source_lib_base.vpc"
-
-$Configuration
-{
-	$Compiler
-	{
-		$AdditionalIncludeDirectories	"$BASE;..\public\mathlib"
-		$PreprocessorDefinitions		"$BASE;MATHLIB_LIB"
-	}
-}
-
-$Project "mathlib"
-{
-	$Folder	"Source Files"
-	{
-		$File	"color_conversion.cpp"
-		$File	"halton.cpp"
-		$File	"lightdesc.cpp"
-		$File	"mathlib_base.cpp"
-		$File	"powsse.cpp"
-		$File	"sparse_convolution_noise.cpp"
-		$File	"sseconst.cpp"
-		$File	"sse.cpp"					[$WINDOWS||$POSIX]
-		$File	"ssenoise.cpp"				
-		$File	"3dnow.cpp"					[$WINDOWS||$LINUX]
-		$File	"anorms.cpp"
-		$File	"bumpvects.cpp"
-		$File	"IceKey.cpp"
-		$File	"imagequant.cpp"
-		$File	"polyhedron.cpp"
-		$File	"quantize.cpp"
-		$File	"randsse.cpp"
-		$File	"spherical.cpp"
-		$File	"simdvectormatrix.cpp"
-		$File	"vector.cpp"
-		$File	"vmatrix.cpp"
-		$File	"almostequal.cpp"
-	}
-
-
-
-	$Folder	"Public Header Files"
-	{
-		$File	"$SRCDIR\public\mathlib\amd3dx.h"			[$WINDOWS||$LINUX]		
-		$File	"$SRCDIR\public\mathlib\anorms.h"
-		$File	"$SRCDIR\public\mathlib\bumpvects.h"		
-		$File	"$SRCDIR\public\mathlib\compressed_3d_unitvec.h"
-		$File	"$SRCDIR\public\mathlib\compressed_light_cube.h"
-		$File	"$SRCDIR\public\mathlib\compressed_vector.h"
-		$File	"$SRCDIR\public\mathlib\halton.h"
-		$File	"$SRCDIR\public\mathlib\IceKey.H"
-		$File	"$SRCDIR\public\mathlib\lightdesc.h"
-		$File	"$SRCDIR\public\mathlib\math_pfns.h"
-		$File	"$SRCDIR\public\mathlib\mathlib.h"
-		$File	"$SRCDIR\public\mathlib\noise.h"
-		$File	"$SRCDIR\public\mathlib\polyhedron.h"
-		$File	"$SRCDIR\public\mathlib\quantize.h"
-		$File	"$SRCDIR\public\mathlib\simdvectormatrix.h"
-		$File	"$SRCDIR\public\mathlib\spherical_geometry.h"		
-		$File	"$SRCDIR\public\mathlib\ssemath.h"		
-		$File	"$SRCDIR\public\mathlib\ssequaternion.h"		
-		$File	"$SRCDIR\public\mathlib\vector.h"
-		$File	"$SRCDIR\public\mathlib\vector2d.h"
-		$File	"$SRCDIR\public\mathlib\vector4d.h"
-		$File	"$SRCDIR\public\mathlib\vmatrix.h"
-		$File	"$SRCDIR\public\mathlib\vplane.h"
-	}
-
-	$Folder	"Header Files"
-	{
-		$File	"noisedata.h"
-		$File	"sse.h"					[$WINDOWS||$POSIX]
-		$File	"3dnow.h"				[$WINDOWS||$LINUX]
-	}
-}
+//-----------------------------------------------------------------------------
+//	MATHLIB.VPC
+//
+//	Project Script
+//-----------------------------------------------------------------------------
+
+$macro SRCDIR		".."
+$include "$SRCDIR\vpc_scripts\source_lib_base.vpc"
+
+// Compiler settings for this project. "$BASE" presumably stands for the value
+// inherited from the included base script -- confirm against the VPC docs.
+$Configuration
+{
+	$Compiler
+	{
+		// adds ..\public\mathlib to the include search path
+		$AdditionalIncludeDirectories	"$BASE;..\public\mathlib"
+		// adds the MATHLIB_LIB preprocessor define
+		$PreprocessorDefinitions		"$BASE;MATHLIB_LIB"
+	}
+}
+
+// File list for the mathlib project, grouped into folders. A trailing
+// [ ... ] expression is a platform condition on that entry (presumably the
+// file is only part of the project where the expression is true -- confirm
+// against the VPC docs).
+$Project "mathlib"
+{
+	// implementation files that live next to this script
+	$Folder	"Source Files"
+	{
+		$File	"color_conversion.cpp"
+		$File	"halton.cpp"
+		$File	"lightdesc.cpp"
+		$File	"mathlib_base.cpp"
+		$File	"powsse.cpp"
+		$File	"sparse_convolution_noise.cpp"
+		$File	"sseconst.cpp"
+		$File	"sse.cpp"					[$WINDOWS||$POSIX]
+		$File	"ssenoise.cpp"				
+		// NOTE(review): 3dnow.cpp wraps its implementation in
+		// "#if !defined(COMPILER_MSVC64) && !defined(LINUX)", so on Linux this
+		// entry appears to add no code -- confirm whether $LINUX is still wanted here.
+		$File	"3dnow.cpp"					[$WINDOWS||$LINUX]
+		$File	"anorms.cpp"
+		$File	"bumpvects.cpp"
+		$File	"IceKey.cpp"
+		$File	"imagequant.cpp"
+		$File	"polyhedron.cpp"
+		$File	"quantize.cpp"
+		$File	"randsse.cpp"
+		$File	"spherical.cpp"
+		$File	"simdvectormatrix.cpp"
+		$File	"vector.cpp"
+		$File	"vmatrix.cpp"
+		$File	"almostequal.cpp"
+	}
+
+
+
+	// headers under $SRCDIR\public\mathlib
+	$Folder	"Public Header Files"
+	{
+		$File	"$SRCDIR\public\mathlib\amd3dx.h"			[$WINDOWS||$LINUX]		
+		$File	"$SRCDIR\public\mathlib\anorms.h"
+		$File	"$SRCDIR\public\mathlib\bumpvects.h"		
+		$File	"$SRCDIR\public\mathlib\compressed_3d_unitvec.h"
+		$File	"$SRCDIR\public\mathlib\compressed_light_cube.h"
+		$File	"$SRCDIR\public\mathlib\compressed_vector.h"
+		$File	"$SRCDIR\public\mathlib\halton.h"
+		$File	"$SRCDIR\public\mathlib\IceKey.H"
+		$File	"$SRCDIR\public\mathlib\lightdesc.h"
+		$File	"$SRCDIR\public\mathlib\math_pfns.h"
+		$File	"$SRCDIR\public\mathlib\mathlib.h"
+		$File	"$SRCDIR\public\mathlib\noise.h"
+		$File	"$SRCDIR\public\mathlib\polyhedron.h"
+		$File	"$SRCDIR\public\mathlib\quantize.h"
+		$File	"$SRCDIR\public\mathlib\simdvectormatrix.h"
+		$File	"$SRCDIR\public\mathlib\spherical_geometry.h"		
+		$File	"$SRCDIR\public\mathlib\ssemath.h"		
+		$File	"$SRCDIR\public\mathlib\ssequaternion.h"		
+		$File	"$SRCDIR\public\mathlib\vector.h"
+		$File	"$SRCDIR\public\mathlib\vector2d.h"
+		$File	"$SRCDIR\public\mathlib\vector4d.h"
+		$File	"$SRCDIR\public\mathlib\vmatrix.h"
+		$File	"$SRCDIR\public\mathlib\vplane.h"
+	}
+
+	// headers that live next to this script; the platform conditions mirror
+	// those on the matching .cpp entries above
+	$Folder	"Header Files"
+	{
+		$File	"noisedata.h"
+		$File	"sse.h"					[$WINDOWS||$POSIX]
+		$File	"3dnow.h"				[$WINDOWS||$LINUX]
+	}
+}
diff --git a/mp/src/mathlib/mathlib_base.cpp b/mp/src/mathlib/mathlib_base.cpp
index fce3547b..a403ccfa 100644
--- a/mp/src/mathlib/mathlib_base.cpp
+++ b/mp/src/mathlib/mathlib_base.cpp
@@ -1,4293 +1,4293 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: Math primitives.
-//
-//===========================================================================//
-
-/// FIXME: As soon as all references to mathlib.c are gone, include it in here
-
-#include <math.h>
-#include <float.h>	// Needed for FLT_EPSILON
-
-#include "tier0/basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-
-#include "tier0/vprof.h"
-//#define _VPROF_MATHLIB
-
-#pragma warning(disable:4244)   // "conversion from 'const int' to 'float', possible loss of data"
-#pragma warning(disable:4730)	// "mixing _m64 and floating point expressions may result in incorrect code"
-
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-#if !defined( _X360 )
-#include "mathlib/amd3dx.h"
-#ifndef OSX
-#include "3dnow.h"
-#endif
-#include "sse.h"
-#endif
-
-#include "mathlib/ssemath.h"
-#include "mathlib/ssequaternion.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-bool s_bMathlibInitialized = false;
-
-#ifdef PARANOID
-// User must provide an implementation of Sys_Error()
-void Sys_Error (char *error, ...);
-#endif
-
-const Vector vec3_origin(0,0,0);
-const QAngle vec3_angle(0,0,0);
-const Vector vec3_invalid( FLT_MAX, FLT_MAX, FLT_MAX );
-const int nanmask = 255<<23;
-
-//-----------------------------------------------------------------------------
-// Standard C implementations of optimized routines:
-//-----------------------------------------------------------------------------
-// Portable square root built on the C runtime's sqrtf().
-float _sqrtf(float _X)
-{
-	Assert( s_bMathlibInitialized );
-
-	const float flRoot = sqrtf( _X );
-	return flRoot;
-}
-
-// Portable reciprocal square root: 1 / _sqrtf( x ).
-float _rsqrtf(float x)
-{
-	Assert( s_bMathlibInitialized );
-
-	const float flRoot = _sqrtf( x );
-	return 1.f / flRoot;
-}
-
-// Scales vec in place to (approximately) unit length and returns the length
-// it had before scaling.
-float FASTCALL _VectorNormalize (Vector& vec)
-{
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "_VectorNormalize", "Mathlib" );
-#endif
-	Assert( s_bMathlibInitialized );
-
-	const float flLength = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);
-
-	// Biasing the divisor by FLT_EPSILON keeps a zero-length vector from
-	// producing a divide by zero.
-	const float flScale = 1.f / ( flLength + FLT_EPSILON );
-
-	vec.x *= flScale;
-	vec.y *= flScale;
-	vec.z *= flScale;
-
-	return flLength;
-}
-
-// TODO: Add fast C VectorNormalizeFast.
-// Perhaps use approximate rsqrt trick, if the accuracy isn't too bad.
-//
-// Until then this performs a full-precision in-place normalize and returns
-// nothing.
-void FASTCALL _VectorNormalizeFast (Vector& vec)
-{
-	Assert( s_bMathlibInitialized );
-
-	const float flLength = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);
-
-	// Biasing the divisor by FLT_EPSILON keeps a zero-length vector from
-	// producing a divide by zero.
-	const float flScale = 1.f / ( flLength + FLT_EPSILON );
-
-	vec.x *= flScale;
-	vec.y *= flScale;
-	vec.z *= flScale;
-}
-
-// Returns 1 / DotProduct( v, v ), except that a squared length below 1 yields
-// exactly 1, so the result never exceeds 1.
-float _InvRSquared(const float* v)
-{
-	Assert( s_bMathlibInitialized );
-
-	const float flLengthSqr = DotProduct(v, v);
-	if ( flLengthSqr < 1.f )
-		return 1.f;
-
-	return 1/flLengthSqr;
-}
-
-//-----------------------------------------------------------------------------
-// Function pointers selecting the appropriate implementation
-//
-// Every pointer starts out bound to one of the portable routines above or to a
-// plain C runtime function; note that pfRSqrtFast initially shares _rsqrtf
-// with pfRSqrt, so there is no separate approximation by default.
-// NOTE(review): presumably these get re-pointed at CPU-specific versions
-// during mathlib initialization -- that code is not in this part of the file.
-//-----------------------------------------------------------------------------
-float (*pfSqrt)(float x)  = _sqrtf;
-float (*pfRSqrt)(float x) = _rsqrtf;
-float (*pfRSqrtFast)(float x) = _rsqrtf;
-float (FASTCALL *pfVectorNormalize)(Vector& v) = _VectorNormalize;
-void  (FASTCALL *pfVectorNormalizeFast)(Vector& v) = _VectorNormalizeFast;
-float (*pfInvRSquared)(const float* v) = _InvRSquared;
-void  (*pfFastSinCos)(float x, float* s, float* c) = SinCos;
-float (*pfFastCos)(float x) = cosf;
-
-// One full period of sin(), sampled at SIN_TABLE_SIZE evenly spaced angles.
-float SinCosTable[SIN_TABLE_SIZE];
-
-// Fills SinCosTable so that entry i holds sin( 2*pi * i / SIN_TABLE_SIZE ).
-void InitSinCosTable()
-{
-	int iEntry = 0;
-	while ( iEntry < SIN_TABLE_SIZE )
-	{
-		SinCosTable[iEntry] = sin(iEntry * 2.0 * M_PI / SIN_TABLE_SIZE);
-		++iEntry;
-	}
-}
-
-// Compares the first three floats of v1 and v2 with ==; no tolerance is applied.
-qboolean VectorsEqual( const float *v1, const float *v2 )
-{
-	Assert( s_bMathlibInitialized );
-
-	for ( int iComponent = 0; iComponent < 3; ++iComponent )
-	{
-		if ( !( v1[iComponent] == v2[iComponent] ) )
-			return false;
-	}
-
-	return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Generates Euler angles given a left-handed orientation matrix. The
-//			columns of the matrix contain the forward, left, and up vectors.
-// Input  : matrix - Left-handed orientation matrix.
-//			angles[PITCH, YAW, ROLL]. Receives right-handed counterclockwise
-//				rotations in degrees around Y, Z, and X respectively.
-//-----------------------------------------------------------------------------
-
-// Splits a transform into its rotation (written to angles) and its
-// translation (column 3 of the matrix, written to position).
-void MatrixAngles( const matrix3x4_t& matrix, RadianEuler &angles, Vector &position )
-{
-	// The two outputs are computed independently of each other.
-	MatrixAngles( matrix, angles );
-	MatrixGetColumn( matrix, 3, position );
-}
-
-// Decomposes a transform into a quaternion (rotation part) and a position
-// (column 3 of the matrix).
-//
-// One of four formulas is selected from the trace and the largest diagonal
-// element; each builds an unnormalized quaternion whose "trace" component is
-// the dominant one, and the result is normalized once at the end.
-void MatrixAngles( const matrix3x4_t &matrix, Quaternion &q, Vector &pos )
-{
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "MatrixQuaternion", "Mathlib" );
-#endif
-	float trace;
-	// Diagonal sum plus one.
-	trace = matrix[0][0] + matrix[1][1] + matrix[2][2] + 1.0f;
-	if( trace > 1.0f + FLT_EPSILON ) 
-	{
-		// Case A: w is the dominant component.
-		// VPROF_INCREMENT_COUNTER("MatrixQuaternion A",1);
-		q.x = ( matrix[2][1] - matrix[1][2] );
-		q.y = ( matrix[0][2] - matrix[2][0] );
-		q.z = ( matrix[1][0] - matrix[0][1] );
-		q.w = trace;
-	} 
-	else if ( matrix[0][0] > matrix[1][1] && matrix[0][0] > matrix[2][2] ) 
-	{
-		// Case B: [0][0] is the largest diagonal element, x is dominant.
-		// VPROF_INCREMENT_COUNTER("MatrixQuaternion B",1);
-		trace = 1.0f + matrix[0][0] - matrix[1][1] - matrix[2][2];
-		q.x = trace;
-		q.y = (matrix[1][0] + matrix[0][1] );
-		q.z = (matrix[0][2] + matrix[2][0] );
-		q.w = (matrix[2][1] - matrix[1][2] );
-	} 
-	else if (matrix[1][1] > matrix[2][2])
-	{
-		// Case C: [1][1] is the largest diagonal element, y is dominant.
-		// VPROF_INCREMENT_COUNTER("MatrixQuaternion C",1);
-		trace = 1.0f + matrix[1][1] - matrix[0][0] - matrix[2][2];
-		q.x = (matrix[0][1] + matrix[1][0] );
-		q.y = trace;
-		q.z = (matrix[2][1] + matrix[1][2] );
-		q.w = (matrix[0][2] - matrix[2][0] );
-	}
-	else
-	{
-		// Case D: [2][2] is the largest diagonal element, z is dominant.
-		// VPROF_INCREMENT_COUNTER("MatrixQuaternion D",1);
-		trace = 1.0f + matrix[2][2] - matrix[0][0] - matrix[1][1];
-		q.x = (matrix[0][2] + matrix[2][0] );
-		q.y = (matrix[2][1] + matrix[1][2] );
-		q.z = trace;
-		q.w = (matrix[1][0] - matrix[0][1] );
-	}
-
-	// All four cases leave q unnormalized.
-	QuaternionNormalize( q );
-
-#if 0
-	// check against the angle version
-	RadianEuler ang;
-	MatrixAngles( matrix, ang );
-	Quaternion test;
-	AngleQuaternion( ang, test );
-	float d = QuaternionDotProduct( q, test );
-	Assert( fabs(d) > 0.99 && fabs(d) < 1.01 );
-#endif
-
-	// Translation lives in column 3.
-	MatrixGetColumn( matrix, 3, pos );
-}
-
-// Extracts Euler angles, in degrees, from the rotation part of matrix.
-// Output layout: angles[0] = pitch, angles[1] = yaw, angles[2] = roll.
-// When forward is nearly parallel to the Z axis, roll is reported as 0 and
-// yaw is derived from the left vector instead.
-void MatrixAngles( const matrix3x4_t& matrix, float *angles )
-{ 
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "MatrixAngles", "Mathlib" );
-#endif
-	Assert( s_bMathlibInitialized );
-	float forward[3];
-	float left[3];
-	float up[3];
-
-	//
-	// Extract the basis vectors from the matrix. Since we only need the Z
-	// component of the up vector, we don't get X and Y.
-	// (up[0] and up[1] are therefore left uninitialized and never read.)
-	//
-	forward[0] = matrix[0][0];
-	forward[1] = matrix[1][0];
-	forward[2] = matrix[2][0];
-	left[0] = matrix[0][1];
-	left[1] = matrix[1][1];
-	left[2] = matrix[2][1];
-	up[2] = matrix[2][2];
-
-	// Length of forward projected onto the XY plane.
-	float xyDist = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );
-	
-	// enough here to get angles?
-	if ( xyDist > 0.001f )
-	{
-		// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
-		angles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );
-
-		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
-		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
-
-		// (roll)	z = ATAN( left.z, up.z );
-		angles[2] = RAD2DEG( atan2f( left[2], up[2] ) );
-	}
-	else	// forward is mostly Z, gimbal lock-
-	{
-		// (yaw)	y = ATAN( -left.x, left.y );			-- forward is mostly z, so use left for yaw
-		angles[1] = RAD2DEG( atan2f( -left[0], left[1] ) );
-
-		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
-		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
-
-		// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
-		angles[2] = 0;
-	}
-}
-
-
-// Transforms the point in1 by in2: each output component is the dot product of
-// in1 with one matrix row plus that row's fourth (translation) element.
-// in1 and out must not be the same array.
-void VectorTransform (const float *in1, const matrix3x4_t& in2, float *out)
-{
-	Assert( s_bMathlibInitialized );
-	Assert( in1 != out );
-
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		out[iRow] = DotProduct(in1, in2[iRow]) + in2[iRow][3];
-	}
-}
-
-
-// Inverse of VectorTransform for an orthonormal matrix: subtracts the
-// translation, then multiplies by the transpose of the 3x3 part (which is the
-// inverse when the matrix is orthonormal).
-void VectorITransform (const float *in1, const matrix3x4_t& in2, float *out)
-{
-	Assert( s_bMathlibInitialized );
-
-	// All of in1 is read into a temporary before anything is written to out.
-	float vecLocal[3];
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		vecLocal[iRow] = in1[iRow] - in2[iRow][3];
-	}
-
-	for ( int iCol = 0; iCol < 3; ++iCol )
-	{
-		out[iCol] = vecLocal[0] * in2[0][iCol] + vecLocal[1] * in2[1][iCol] + vecLocal[2] * in2[2][iCol];
-	}
-}
-
-
-// Rotates in1 by the 3x3 part of in2; the translation column is ignored.
-// in1 and out must not be the same array.
-void VectorRotate( const float *in1, const matrix3x4_t& in2, float *out )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( in1 != out );
-
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		out[iRow] = DotProduct( in1, in2[iRow] );
-	}
-}
-
-// Rotates in1 by the Euler angles in2, going through a temporary matrix.
-void VectorRotate( const Vector &in1, const QAngle &in2, Vector &out )
-{
-	matrix3x4_t rotation;
-	AngleMatrix( in2, rotation );
-
-	VectorRotate( in1, rotation, out );
-}
-
-// Rotates in1 by the quaternion in2, going through a temporary matrix.
-void VectorRotate( const Vector &in1, const Quaternion &in2, Vector &out )
-{
-	matrix3x4_t rotation;
-	QuaternionMatrix( in2, rotation );
-
-	VectorRotate( in1, rotation, out );
-}
-
-
-// Rotates in1 by the transpose of the 3x3 part of in2, i.e. by the inverse
-// rotation.  in1 and out must not be the same array.
-void VectorIRotate( const float *in1, const matrix3x4_t& in2, float *out )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( in1 != out );
-
-	for ( int iCol = 0; iCol < 3; ++iCol )
-	{
-		out[iCol] = in1[0]*in2[0][iCol] + in1[1]*in2[1][iCol] + in1[2]*in2[2][iCol];
-	}
-}
-
-#ifndef VECTOR_NO_SLOW_OPERATIONS
-// Re-expresses angles, given in the output space of parentMatrix, in the
-// input space of parentMatrix.
-QAngle TransformAnglesToLocalSpace( const QAngle &angles, const matrix3x4_t &parentMatrix )
-{
-	// Orientation described by the incoming angles.
-	matrix3x4_t matAngles;
-	AngleMatrix( angles, matAngles );
-
-	// Inverse of the parent transform.
-	matrix3x4_t matParentInverse;
-	MatrixInvert( parentMatrix, matParentInverse );
-
-	// local = parent^-1 * angles
-	matrix3x4_t matLocal;
-	ConcatTransforms( matParentInverse, matAngles, matLocal );
-
-	QAngle result;
-	MatrixAngles( matLocal, result );
-	return result;
-}
-
-// Re-expresses angles, given in the input space of parentMatrix, in the
-// output space of parentMatrix.
-QAngle TransformAnglesToWorldSpace( const QAngle &angles, const matrix3x4_t &parentMatrix )
-{
-	// Orientation described by the incoming angles.
-	matrix3x4_t matAngles;
-	AngleMatrix( angles, matAngles );
-
-	// world = parent * angles
-	matrix3x4_t matWorld;
-	ConcatTransforms( parentMatrix, matAngles, matWorld );
-
-	QAngle result;
-	MatrixAngles( matWorld, result );
-	return result;
-}
-
-#endif // VECTOR_NO_SLOW_OPERATIONS
-
-// Fills mat column by column: columns 0..2 take the X, Y and Z axis vectors
-// and column 3 takes the origin.
-void MatrixInitialize( matrix3x4_t &mat, const Vector &vecOrigin, const Vector &vecXAxis, const Vector &vecYAxis, const Vector &vecZAxis )
-{
-	// Translation
-	MatrixSetColumn( vecOrigin, 3, mat );
-
-	// Basis vectors
-	MatrixSetColumn( vecXAxis, 0, mat );
-	MatrixSetColumn( vecYAxis, 1, mat );
-	MatrixSetColumn( vecZAxis, 2, mat );
-}
-
-// Copies all twelve floats (3 rows x 4 columns) of in to out.
-void MatrixCopy( const matrix3x4_t& in, matrix3x4_t& out )
-{
-	Assert( s_bMathlibInitialized );
-
-	const size_t nMatrixBytes = sizeof( float ) * 3 * 4;
-	memcpy( out.Base(), in.Base(), nMatrixBytes );
-}
-
-//-----------------------------------------------------------------------------
-// Matrix equality test: returns false as soon as a pair of corresponding
-// elements (all 3x4, translation column included) differs by more than
-// flTolerance, and true otherwise.
-//-----------------------------------------------------------------------------
-bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float flTolerance )
-{
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		for ( int iCol = 0; iCol < 4; ++iCol )
-		{
-			const bool bOutOfTolerance = fabs( src1[iRow][iCol] - src2[iRow][iCol] ) > flTolerance;
-			if ( bOutOfTolerance )
-			{
-				return false;
-			}
-		}
-	}
-
-	return true;
-}
-
-// NOTE: This is just the transpose not a general inverse
-//
-// Inverts a transform by transposing its 3x3 part and re-expressing the
-// translation in the transposed basis.  Calling it with &in == &out is
-// supported and handled in place.
-void MatrixInvert( const matrix3x4_t& in, matrix3x4_t& out )
-{
-	Assert( s_bMathlibInitialized );
-	if ( &in == &out )
-	{
-		// In place: swap the three off-diagonal pairs.
-		V_swap(out[0][1],out[1][0]);
-		V_swap(out[0][2],out[2][0]);
-		V_swap(out[1][2],out[2][1]);
-	}
-	else
-	{
-		// transpose the matrix
-		out[0][0] = in[0][0];
-		out[0][1] = in[1][0];
-		out[0][2] = in[2][0];
-
-		out[1][0] = in[0][1];
-		out[1][1] = in[1][1];
-		out[1][2] = in[2][1];
-
-		out[2][0] = in[0][2];
-		out[2][1] = in[1][2];
-		out[2][2] = in[2][2];
-	}
-
-	// now fix up the translation to be in the other space
-	// (copied to a temporary first so the in-place case still reads the
-	// original translation after out's column 3 starts being overwritten)
-	float tmp[3];
-	tmp[0] = in[0][3];
-	tmp[1] = in[1][3];
-	tmp[2] = in[2][3];
-
-	out[0][3] = -DotProduct( tmp, out[0] );
-	out[1][3] = -DotProduct( tmp, out[1] );
-	out[2][3] = -DotProduct( tmp, out[2] );
-}
-
-// Copies the three rows of the given column of in into out (x, y, z).
-// The column index is not range checked here.
-void MatrixGetColumn( const matrix3x4_t& in, int column, Vector &out )
-{
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		out[iRow] = in[iRow][column];
-	}
-}
-
-// Writes in (x, y, z) into the three rows of the given column of out.
-// The column index is not range checked here.
-void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
-{
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		out[iRow][column] = in[iRow];
-	}
-}
-
-// Multiplies the 3x3 part of out by flScale; the translation column (3) is
-// left untouched.
-void MatrixScaleBy ( const float flScale, matrix3x4_t &out )
-{
-	for ( int iCol = 0; iCol < 3; ++iCol )
-	{
-		for ( int iRow = 0; iRow < 3; ++iRow )
-		{
-			out[iRow][iCol] *= flScale;
-		}
-	}
-}
-
-// Zeroes the 3x3 part of out; the translation column (3) is left untouched.
-void MatrixScaleByZero ( matrix3x4_t &out )
-{
-	for ( int iCol = 0; iCol < 3; ++iCol )
-	{
-		for ( int iRow = 0; iRow < 3; ++iRow )
-		{
-			out[iRow][iCol] = 0.0f;
-		}
-	}
-}
-
-
-
-// Returns 1 when the first three floats of v1 and v2 compare equal with ==,
-// and 0 otherwise.  No tolerance is applied.
-int VectorCompare (const float *v1, const float *v2)
-{
-	Assert( s_bMathlibInitialized );
-
-	const bool bIdentical = ( v1[0] == v2[0] ) && ( v1[1] == v2[1] ) && ( v1[2] == v2[2] );
-	return bIdentical ? 1 : 0;
-}
-
-// Writes the cross product v1 x v2 into cross.
-// cross must not be the same array as v1 or v2 (asserted): the components of
-// cross are stored one at a time while v1 and v2 are still being read.
-void CrossProduct (const float* v1, const float* v2, float* cross)
-{
-	Assert( s_bMathlibInitialized );
-	Assert( v1 != cross );
-	Assert( v2 != cross );
-	cross[0] = v1[1]*v2[2] - v1[2]*v2[1];
-	cross[1] = v1[2]*v2[0] - v1[0]*v2[2];
-	cross[2] = v1[0]*v2[1] - v1[1]*v2[0];
-}
-
-// Returns floor( log2( val ) ) for val >= 1, i.e. the index of the highest set
-// bit.  Values <= 0 have no logarithm and return 0.
-int Q_log2(int val)
-{
-	// Guard non-positive input: with an arithmetic right shift a negative
-	// value settles at -1 and never becomes 0, so the loop below would
-	// never terminate.
-	if ( val <= 0 )
-		return 0;
-
-	int answer=0;
-	while (val>>=1)
-		answer++;
-	return answer;
-}
-
-// Matrix is right-handed x=forward, y=left, z=up.  We use a left-handed convention for vectors in the game code (forward, right, up)
-// All three output pointers are dereferenced unconditionally, so none of them may be NULL.
-void MatrixVectors( const matrix3x4_t &matrix, Vector* pForward, Vector *pRight, Vector *pUp )
-{
-	MatrixGetColumn( matrix, 0, *pForward );
-	MatrixGetColumn( matrix, 1, *pRight );
-	MatrixGetColumn( matrix, 2, *pUp );
-	// Column 1 holds "left"; negate it to get "right".
-	*pRight *= -1.0f;
-}
-
-
-// Derives right and up vectors to accompany forward, using +Z as the reference
-// up direction.  When forward has no X/Y component that reference is useless,
-// so fixed axes are returned instead.
-void VectorVectors( const Vector &forward, Vector &right, Vector &up )
-{
-	Assert( s_bMathlibInitialized );
-
-	const bool bStraightUpOrDown = ( forward[0] == 0 && forward[1] == 0 );
-	if ( bStraightUpOrDown )
-	{
-		// pitch 90 degrees up/down from identity
-		right[0] = 0;
-		right[1] = -1;
-		right[2] = 0;
-		up[0] = -forward[2];
-		up[1] = 0;
-		up[2] = 0;
-		return;
-	}
-
-	Vector vecWorldUp;
-	vecWorldUp[0] = 0;
-	vecWorldUp[1] = 0;
-	vecWorldUp[2] = 1.0;
-
-	// right = forward x worldUp, up = right x forward, each normalized.
-	CrossProduct( forward, vecWorldUp, right );
-	VectorNormalize( right );
-	CrossProduct( right, forward, up );
-	VectorNormalize( up );
-}
-
-// Builds the rotation columns of matrix from a forward direction:
-// column 0 = forward, column 1 = -right (i.e. left), column 2 = up.
-// Column 3 (translation) is not written by this function.
-void VectorMatrix( const Vector &forward, matrix3x4_t& matrix)
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector vecRight, vecUp;
-	VectorVectors(forward, vecRight, vecUp);
-
-	MatrixSetColumn( forward, 0, matrix );
-	MatrixSetColumn( -vecRight, 1, matrix );
-	MatrixSetColumn( vecUp, 2, matrix );
-}
-
-
-// Converts a forward direction into Euler angles in degrees:
-// angles[0] = pitch, angles[1] = yaw, angles[2] = roll (always 0).
-// Pitch and yaw are wrapped into [0, 360).
-void VectorAngles( const float *forward, float *angles )
-{
-	Assert( s_bMathlibInitialized );
-
-	float flYaw;
-	float flPitch;
-
-	const bool bStraightUpOrDown = ( forward[1] == 0 && forward[0] == 0 );
-	if ( !bStraightUpOrDown )
-	{
-		flYaw = (atan2(forward[1], forward[0]) * 180 / M_PI);
-		if (flYaw < 0)
-			flYaw += 360;
-
-		// Length of forward projected onto the XY plane.
-		const float flXYLength = sqrt (forward[0]*forward[0] + forward[1]*forward[1]);
-		flPitch = (atan2(-forward[2], flXYLength) * 180 / M_PI);
-		if (flPitch < 0)
-			flPitch += 360;
-	}
-	else
-	{
-		// No heading can be derived; only the sign of Z picks the pitch.
-		flYaw = 0;
-		flPitch = ( forward[2] > 0 ) ? 270 : 90;
-	}
-
-	angles[0] = flPitch;
-	angles[1] = flYaw;
-	angles[2] = 0;
-}
-
-
-/*
-================
-ConcatRotations
-
-out = in1 * in2 for 3x3 matrices.  out must not be the same array as either
-input.
-================
-*/
-void ConcatRotations (const float in1[3][3], const float in2[3][3], float out[3][3])
-{
-	Assert( s_bMathlibInitialized );
-	Assert( in1 != out );
-	Assert( in2 != out );
-
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		for ( int iCol = 0; iCol < 3; ++iCol )
-		{
-			out[iRow][iCol] = in1[iRow][0] * in2[0][iCol] + in1[iRow][1] * in2[1][iCol] +
-							  in1[iRow][2] * in2[2][iCol];
-		}
-	}
-}
-
-// SIMD concatenation of two 3x4 transforms: out = m0 * m1, treating each as a
-// 4x4 matrix with an implicit (0,0,0,1) bottom row.  All three matrices must
-// be 16-byte aligned (asserted).  Every load happens before the first store.
-void ConcatTransforms_Aligned( const matrix3x4_t &m0, const matrix3x4_t &m1, matrix3x4_t &out )
-{
-	Assert( (((size_t)&m0) % 16) == 0 );
-	Assert( (((size_t)&m1) % 16) == 0 );
-	Assert( (((size_t)&out) % 16) == 0 );
-
-	// NOTE(review): presumably a mask that keeps only the fourth lane -- it is
-	// used below to isolate m0's translation element in each row; confirm
-	// against g_SIMD_ComponentMask's definition.
-	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);
-	fltx4 rowA0 = LoadAlignedSIMD( m0.m_flMatVal[0] );
-	fltx4 rowA1 = LoadAlignedSIMD( m0.m_flMatVal[1] );
-	fltx4 rowA2 = LoadAlignedSIMD( m0.m_flMatVal[2] );
-
-	fltx4 rowB0 = LoadAlignedSIMD( m1.m_flMatVal[0] );
-	fltx4 rowB1 = LoadAlignedSIMD( m1.m_flMatVal[1] );
-	fltx4 rowB2 = LoadAlignedSIMD( m1.m_flMatVal[2] );
-
-	// now we have the rows of m0 and the rows of m1
-	// each output row i is m0[i][0]*rowB0 + m0[i][1]*rowB1 + m0[i][2]*rowB2
-	// first output row
-	fltx4 A0 = SplatXSIMD(rowA0);
-	fltx4 A1 = SplatYSIMD(rowA0);
-	fltx4 A2 = SplatZSIMD(rowA0);
-	fltx4 mul00 = MulSIMD( A0, rowB0 );
-	fltx4 mul01 = MulSIMD( A1, rowB1 );
-	fltx4 mul02 = MulSIMD( A2, rowB2 );
-	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );
-
-	// second output row
-	A0 = SplatXSIMD(rowA1);
-	A1 = SplatYSIMD(rowA1);
-	A2 = SplatZSIMD(rowA1);
-	fltx4 mul10 = MulSIMD( A0, rowB0 );
-	fltx4 mul11 = MulSIMD( A1, rowB1 );
-	fltx4 mul12 = MulSIMD( A2, rowB2 );
-	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );
-
-	// third output row
-	A0 = SplatXSIMD(rowA2);
-	A1 = SplatYSIMD(rowA2);
-	A2 = SplatZSIMD(rowA2);
-	fltx4 mul20 = MulSIMD( A0, rowB0 );
-	fltx4 mul21 = MulSIMD( A1, rowB1 );
-	fltx4 mul22 = MulSIMD( A2, rowB2 );
-	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );
-
-	// add in translation vector
-	// (the masked rows of m0 are added on top of the rotated rows of m1)
-	A0 = AndSIMD(rowA0,lastMask);
-	A1 = AndSIMD(rowA1,lastMask);
-	A2 = AndSIMD(rowA2,lastMask);
-	out0 = AddSIMD(out0, A0);
-	out1 = AddSIMD(out1, A1);
-	out2 = AddSIMD(out2, A2);
-
-	StoreAlignedSIMD( out.m_flMatVal[0], out0 );
-	StoreAlignedSIMD( out.m_flMatVal[1], out1 );
-	StoreAlignedSIMD( out.m_flMatVal[2], out2 );
-}
-
-/*
-================
-R_ConcatTransforms
-
-out = in1 * in2 for 3x4 transforms, treating each as a 4x4 matrix with an
-implicit (0,0,0,1) bottom row.  Uses unaligned SIMD loads and stores, so the
-arguments need no particular alignment.  Every load happens before the first
-store.
-================
-*/
-
-void ConcatTransforms (const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out)
-{
-#if 0
-	// test for ones that'll be 2x faster
-	if ( (((size_t)&in1) % 16) == 0 && (((size_t)&in2) % 16) == 0 && (((size_t)&out) % 16) == 0 )
-	{
-		ConcatTransforms_Aligned( in1, in2, out );
-		return;
-	}
-#endif
-
-	// NOTE(review): presumably a mask that keeps only the fourth lane -- it is
-	// used below to isolate in1's translation element in each row; confirm
-	// against g_SIMD_ComponentMask's definition.
-	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);
-	fltx4 rowA0 = LoadUnalignedSIMD( in1.m_flMatVal[0] );
-	fltx4 rowA1 = LoadUnalignedSIMD( in1.m_flMatVal[1] );
-	fltx4 rowA2 = LoadUnalignedSIMD( in1.m_flMatVal[2] );
-
-	fltx4 rowB0 = LoadUnalignedSIMD( in2.m_flMatVal[0] );
-	fltx4 rowB1 = LoadUnalignedSIMD( in2.m_flMatVal[1] );
-	fltx4 rowB2 = LoadUnalignedSIMD( in2.m_flMatVal[2] );
-
-	// now we have the rows of in1 and the rows of in2
-	// each output row i is in1[i][0]*rowB0 + in1[i][1]*rowB1 + in1[i][2]*rowB2
-	// first output row
-	fltx4 A0 = SplatXSIMD(rowA0);
-	fltx4 A1 = SplatYSIMD(rowA0);
-	fltx4 A2 = SplatZSIMD(rowA0);
-	fltx4 mul00 = MulSIMD( A0, rowB0 );
-	fltx4 mul01 = MulSIMD( A1, rowB1 );
-	fltx4 mul02 = MulSIMD( A2, rowB2 );
-	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );
-
-	// second output row
-	A0 = SplatXSIMD(rowA1);
-	A1 = SplatYSIMD(rowA1);
-	A2 = SplatZSIMD(rowA1);
-	fltx4 mul10 = MulSIMD( A0, rowB0 );
-	fltx4 mul11 = MulSIMD( A1, rowB1 );
-	fltx4 mul12 = MulSIMD( A2, rowB2 );
-	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );
-
-	// third output row
-	A0 = SplatXSIMD(rowA2);
-	A1 = SplatYSIMD(rowA2);
-	A2 = SplatZSIMD(rowA2);
-	fltx4 mul20 = MulSIMD( A0, rowB0 );
-	fltx4 mul21 = MulSIMD( A1, rowB1 );
-	fltx4 mul22 = MulSIMD( A2, rowB2 );
-	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );
-
-	// add in translation vector
-	// (the masked rows of in1 are added on top of the rotated rows of in2)
-	A0 = AndSIMD(rowA0,lastMask);
-	A1 = AndSIMD(rowA1,lastMask);
-	A2 = AndSIMD(rowA2,lastMask);
-	out0 = AddSIMD(out0, A0);
-	out1 = AddSIMD(out1, A1);
-	out2 = AddSIMD(out2, A2);
-
-	// write to output
-	StoreUnalignedSIMD( out.m_flMatVal[0], out0 );
-	StoreUnalignedSIMD( out.m_flMatVal[1], out1 );
-	StoreUnalignedSIMD( out.m_flMatVal[2], out2 );
-}
-
-
-/*
-===================
-FloorDivMod
-
-Returns mathematically correct (floor-based) quotient and remainder for
-numer and denom, both of which should contain no fractional part. The
-quotient must fit in 32 bits.
-
-denom is expected to be positive; that is only checked when PARANOID is
-defined. For a negative numer the quotient is rounded toward negative
-infinity and the remainder is reported in [0, denom).
-====================
-*/
-
-void FloorDivMod (double numer, double denom, int *quotient,
-		int *rem)
-{
-	Assert( s_bMathlibInitialized );
-	int		q, r;
-	double	x;
-
-#ifdef PARANOID
-	// denom is a double, so it has to be printed with %f; handing a double to
-	// a %d conversion is undefined behavior in a varargs call.
-	if (denom <= 0.0)
-		Sys_Error ("FloorDivMod: bad denominator %f\n", denom);
-
-//	if ((floor(numer) != numer) || (floor(denom) != denom))
-//		Sys_Error ("FloorDivMod: non-integer numer or denom %f %f\n",
-//				numer, denom);
-#endif
-
-	if (numer >= 0.0)
-	{
-
-		x = floor(numer / denom);
-		q = (int)x;
-		r = Floor2Int(numer - (x * denom));
-	}
-	else
-	{
-		//
-		// perform operations with positive values, and fix mod to make floor-based
-		//
-		x = floor(-numer / denom);
-		q = -(int)x;
-		r = Floor2Int(-numer - (x * denom));
-		if (r != 0)
-		{
-			// Not an exact multiple: step the quotient down one more and
-			// flip the remainder so it is measured from the lower multiple.
-			q--;
-			r = (int)denom - r;
-		}
-	}
-
-	*quotient = q;
-	*rem = r;
-}
-
-
-/*
-===================
-GreatestCommonDivisor
-
-Returns the greatest common divisor of i1 and i2 (Euclid's algorithm).
-The signs of the arguments are ignored and the result is non-negative.
-If one argument is 0 the magnitude of the other is returned; (0, 0) gives 0.
-====================
-*/
-int GreatestCommonDivisor (int i1, int i2)
-{
-	Assert( s_bMathlibInitialized );
-
-	// Work on magnitudes.  C++'s % truncates toward zero, so with a negative
-	// operand the remainder sequence of the recursive formulation never
-	// reaches 0 and the recursion does not terminate.  Negating in unsigned
-	// arithmetic also avoids signed overflow for the most negative int.
-	unsigned int a = ( i1 < 0 ) ? 0u - (unsigned int)i1 : (unsigned int)i1;
-	unsigned int b = ( i2 < 0 ) ? 0u - (unsigned int)i2 : (unsigned int)i2;
-
-	while ( b != 0 )
-	{
-		const unsigned int remainder = a % b;
-		a = b;
-		b = remainder;
-	}
-
-	return (int)a;
-}
-
-
-// Returns true when val is a denormal (subnormal) float: all exponent bits
-// are zero while the mantissa is non-zero.  Zero (either sign) is not
-// denormal.  The sign bit is ignored.
-bool IsDenormal( const float &val )
-{
-	// Copy the bit pattern out with memcpy; reading a float object through an
-	// int pointer breaks the strict-aliasing rule.
-	int x = 0; // needs 32-bit int
-	memcpy( &x, &val, sizeof( float ) );
-
-	const int abs_mantissa = x & 0x007FFFFF;
-	const int biased_exponent = x & 0x7F800000;
-	
-	return  ( biased_exponent == 0 && abs_mantissa != 0 );
-}
-
-// Packs the signs of the plane normal into a 3-bit mask: bit j is set when
-// normal[j] is negative.
-int SignbitsForPlane (cplane_t *out)
-{
-	Assert( s_bMathlibInitialized );
-
-	// for fast box on planeside test
-	int nSignBits = 0;
-	for ( int iAxis = 0; iAxis < 3; ++iAxis )
-	{
-		const bool bNegative = ( out->normal[iAxis] < 0 );
-		if ( bNegative )
-		{
-			nSignBits |= ( 1 << iAxis );
-		}
-	}
-
-	return nSignBits;
-}
-
-/*
-==================
-BoxOnPlaneSide
-
-Returns 1, 2, or 1 + 2
-
-Classifies the axis-aligned box [emins, emaxs] against plane p:
-  bit 1 is set when some part of the box is at or in front of the plane,
-  bit 2 is set when some part of the box is behind the plane,
-so 3 means the box straddles the plane.
-==================
-*/
-int __cdecl BoxOnPlaneSide (const float *emins, const float *emaxs, const cplane_t *p)
-{
-	Assert( s_bMathlibInitialized );
-	float	dist1, dist2;
-	int		sides;
-
-	// fast axial cases
-	// (p->type < 3 is used directly as the axis index into emins/emaxs)
-	if (p->type < 3)
-	{
-		if (p->dist <= emins[p->type])
-			return 1;
-		if (p->dist >= emaxs[p->type])
-			return 2;
-		return 3;
-	}
-	
-	// general case
-	// For every axis, a set signbit swaps which of emins/emaxs feeds dist1 and
-	// which feeds dist2.  With signbits as produced by SignbitsForPlane (bit j
-	// set when normal[j] < 0), dist1 is the largest and dist2 the smallest
-	// value of normal . corner over the eight box corners.
-	switch (p->signbits)
-	{
-	case 0:
-		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
-		dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
-		break;
-	case 1:
-		dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
-		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
-		break;
-	case 2:
-		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
-		dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
-		break;
-	case 3:
-		dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
-		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
-		break;
-	case 4:
-		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
-		dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
-		break;
-	case 5:
-		dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
-		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
-		break;
-	case 6:
-		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
-		dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
-		break;
-	case 7:
-		dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
-		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
-		break;
-	default:
-		dist1 = dist2 = 0;		// shut up compiler
-		Assert( 0 );
-		break;
-	}
-
-	sides = 0;
-	if (dist1 >= p->dist)
-		sides = 1;
-	if (dist2 < p->dist)
-		sides |= 2;
-
-	// At least one of the two tests above is expected to have fired.
-	Assert( sides != 0 );
-
-	return sides;
-}
-
-//-----------------------------------------------------------------------------
-// Euler QAngle -> forward basis vector only.
-// Only pitch and yaw are read; forward must not be NULL.
-//-----------------------------------------------------------------------------
-
-void AngleVectors (const QAngle &angles, Vector *forward)
-{
-	Assert( s_bMathlibInitialized );
-	Assert( forward );
-
-	float flSinYaw, flCosYaw;
-	float flSinPitch, flCosPitch;
-
-	SinCos( DEG2RAD( angles[YAW] ), &flSinYaw, &flCosYaw );
-	SinCos( DEG2RAD( angles[PITCH] ), &flSinPitch, &flCosPitch );
-
-	forward->x = flCosPitch*flCosYaw;
-	forward->y = flCosPitch*flSinYaw;
-	forward->z = -flSinPitch;
-}
-
-//-----------------------------------------------------------------------------
-// Euler QAngle -> Basis Vectors.  Each vector is optional
-// (pass NULL for any output that is not needed; it is simply skipped).
-// Angles are read in degrees via the PITCH, YAW and ROLL indices.
-//-----------------------------------------------------------------------------
-void AngleVectors( const QAngle &angles, Vector *forward, Vector *right, Vector *up )
-{
-	Assert( s_bMathlibInitialized );
-	
-	float sr, sp, sy, cr, cp, cy;
-
-#ifdef _X360
-	// Xbox 360: convert all three angles to radians and take their sines and
-	// cosines with one SIMD call; lanes 0/1/2 are read back as pitch/yaw/roll.
-	fltx4 radians, scale, sine, cosine;
-	radians = LoadUnaligned3SIMD( angles.Base() );
-	scale = ReplicateX4( M_PI_F / 180.f ); 
-	radians = MulSIMD( radians, scale );
-	SinCos3SIMD( sine, cosine, radians ); 	
-	sp = SubFloat( sine, 0 );	sy = SubFloat( sine, 1 );	sr = SubFloat( sine, 2 );
-	cp = SubFloat( cosine, 0 );	cy = SubFloat( cosine, 1 );	cr = SubFloat( cosine, 2 );
-#else
-	// Other platforms: one scalar SinCos call per angle.
-	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
-	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
-	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );
-#endif
-
-	if (forward)
-	{
-		forward->x = cp*cy;
-		forward->y = cp*sy;
-		forward->z = -sp;
-	}
-
-	if (right)
-	{
-		// Each term carries an explicit -1 factor relative to the
-		// corresponding "left" expression.
-		right->x = (-1*sr*sp*cy+-1*cr*-sy);
-		right->y = (-1*sr*sp*sy+-1*cr*cy);
-		right->z = -1*sr*cp;
-	}
-
-	if (up)
-	{
-		up->x = (cr*sp*cy+-sr*-sy);
-		up->y = (cr*sp*sy+-sr*cy);
-		up->z = cr*cp;
-	}
-}
-
-//-----------------------------------------------------------------------------
-// Euler QAngle -> Basis Vectors transposed
-//
-// The outputs are the rows of the matrix whose columns are the forward, left
-// (negated right) and up vectors that AngleVectors() produces: "forward"
-// receives their x components, "right" their y components and "up" their z
-// components.  Each output is optional (NULL is skipped).
-//-----------------------------------------------------------------------------
-
-void AngleVectorsTranspose (const QAngle &angles, Vector *forward, Vector *right, Vector *up)
-{
-	Assert( s_bMathlibInitialized );
-	float sr, sp, sy, cr, cp, cy;
-	
-	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
-	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
-	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );
-
-	if (forward)
-	{
-		// x components of (forward, left, up)
-		forward->x	= cp*cy;
-		forward->y	= (sr*sp*cy+cr*-sy);
-		forward->z	= (cr*sp*cy+-sr*-sy);
-	}
-	
-	if (right)
-	{
-		// y components of (forward, left, up)
-		right->x	= cp*sy;
-		right->y	= (sr*sp*sy+cr*cy);
-		right->z	= (cr*sp*sy+-sr*cy);
-	}
-
-	if (up)
-	{
-		// z components of (forward, left, up)
-		up->x		= -sp;
-		up->y		= sr*cp;
-		up->z		= cr*cp;
-	}
-}
-
-//-----------------------------------------------------------------------------
-// Forward direction vector -> Euler angles (degrees).
-// angles[0] = pitch, angles[1] = yaw, angles[2] = roll (always 0); pitch and
-// yaw are wrapped into [0, 360).
-//-----------------------------------------------------------------------------
-
-void VectorAngles( const Vector& forward, QAngle &angles )
-{
-	Assert( s_bMathlibInitialized );
-
-	float flYaw;
-	float flPitch;
-
-	const bool bStraightUpOrDown = ( forward[1] == 0 && forward[0] == 0 );
-	if ( !bStraightUpOrDown )
-	{
-		flYaw = (atan2(forward[1], forward[0]) * 180 / M_PI);
-		if (flYaw < 0)
-			flYaw += 360;
-
-		// Length of forward projected onto the XY plane.
-		const float flXYLength = FastSqrt (forward[0]*forward[0] + forward[1]*forward[1]);
-		flPitch = (atan2(-forward[2], flXYLength) * 180 / M_PI);
-		if (flPitch < 0)
-			flPitch += 360;
-	}
-	else
-	{
-		// No heading can be derived; only the sign of Z picks the pitch.
-		flYaw = 0;
-		flPitch = ( forward[2] > 0 ) ? 270 : 90;
-	}
-
-	angles[0] = flPitch;
-	angles[1] = flYaw;
-	angles[2] = 0;
-}
-
-//-----------------------------------------------------------------------------
-// Forward direction vector with a reference up vector -> Euler angles
-//
-// Output is in degrees: angles[0] = pitch, angles[1] = yaw, angles[2] = roll.
-// Unlike the two-argument overload, the results are not wrapped to [0, 360).
-//-----------------------------------------------------------------------------
-
-void VectorAngles( const Vector &forward, const Vector &pseudoup, QAngle &angles )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector left;
-
-	// left = pseudoup x forward, normalized
-	CrossProduct( pseudoup, forward, left );
-	VectorNormalizeFast( left );		
-	
-	// Length of forward projected onto the XY plane.
-	float xyDist = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );
-
-	// enough here to get angles?
-	if ( xyDist > 0.001f )
-	{
-		// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
-		angles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );
-
-		// The engine does pitch inverted from this, but we always end up negating it in the DLL
-		// UNDONE: Fix the engine to make it consistent
-		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
-		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
-
-		// Z component of (forward x left); only this component of "up" is needed.
-		float up_z = (left[1] * forward[0]) - (left[0] * forward[1]);
-
-		// (roll)	z = ATAN( left.z, up.z );
-		angles[2] = RAD2DEG( atan2f( left[2], up_z ) );
-	}
-	else	// forward is mostly Z, gimbal lock-
-	{
-		// (yaw)	y = ATAN( -left.x, left.y );			-- forward is mostly z, so use left for yaw
-		angles[1] = RAD2DEG( atan2f( -left[0], left[1] ) ); //This was originally copied from the "void MatrixAngles( const matrix3x4_t& matrix, float *angles )" code, and it's 180 degrees off, negated the values and it all works now (Dave Kircher)
-
-		// The engine does pitch inverted from this, but we always end up negating it in the DLL
-		// UNDONE: Fix the engine to make it consistent
-		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
-		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
-
-		// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
-		angles[2] = 0;
-	}	
-}
-
-// Sets matrix to identity: ones on the diagonal of the 3x3 part, zeros
-// everywhere else (translation column included).
-void SetIdentityMatrix( matrix3x4_t& matrix )
-{
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		for ( int iCol = 0; iCol < 4; ++iCol )
-		{
-			matrix[iRow][iCol] = ( iRow == iCol ) ? 1.0f : 0.0f;
-		}
-	}
-}
-
-
-//-----------------------------------------------------------------------------
-// Builds a scale matrix: x, y and z on the diagonal, zeros everywhere else
-// (translation column included).
-//-----------------------------------------------------------------------------
-void SetScaleMatrix( float x, float y, float z, matrix3x4_t &dst )
-{
-	// Clear everything first ...
-	for ( int iRow = 0; iRow < 3; ++iRow )
-	{
-		for ( int iCol = 0; iCol < 4; ++iCol )
-		{
-			dst[iRow][iCol] = 0.0f;
-		}
-	}
-
-	// ... then drop the scale factors onto the diagonal.
-	dst[0][0] = x;
-	dst[1][1] = y;
-	dst[2][2] = z;
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Builds the matrix for a counterclockwise rotation about an arbitrary axis.
-//
-//		   | ax2 + (1 - ax2)cosQ		axay(1 - cosQ) - azsinQ		azax(1 - cosQ) + aysinQ |
-// Ra(Q) = | axay(1 - cosQ) + azsinQ	ay2 + (1 - ay2)cosQ			ayaz(1 - cosQ) - axsinQ |
-//		   | azax(1 - cosQ) - aysinQ	ayaz(1 - cosQ) + axsinQ		az2 + (1 - az2)cosQ     |
-//          
-// Input  : vAxisOfRot - axis to rotate about; it is used as given and is not
-//				normalized here (NOTE(review): the formula above presumably
-//				expects a unit-length axis -- confirm with callers).
-//			angleDegrees - rotation angle in degrees.
-// Output : dst - receives the rotation; its translation column is set to zero.
-//-----------------------------------------------------------------------------
-void MatrixBuildRotationAboutAxis( const Vector &vAxisOfRot, float angleDegrees, matrix3x4_t &dst )
-{
-	float radians;
-	float axisXSquared;
-	float axisYSquared;
-	float axisZSquared;
-	float fSin;
-	float fCos;
-
-	// degrees -> radians
-	radians = angleDegrees * ( M_PI / 180.0 );
-	fSin = sin( radians );
-	fCos = cos( radians );
-
-	axisXSquared = vAxisOfRot[0] * vAxisOfRot[0];
-	axisYSquared = vAxisOfRot[1] * vAxisOfRot[1];
-	axisZSquared = vAxisOfRot[2] * vAxisOfRot[2];
-
-	// Column 0:
-	dst[0][0] = axisXSquared + (1 - axisXSquared) * fCos;
-	dst[1][0] = vAxisOfRot[0] * vAxisOfRot[1] * (1 - fCos) + vAxisOfRot[2] * fSin;
-	dst[2][0] = vAxisOfRot[2] * vAxisOfRot[0] * (1 - fCos) - vAxisOfRot[1] * fSin;
-
-	// Column 1:
-	dst[0][1] = vAxisOfRot[0] * vAxisOfRot[1] * (1 - fCos) - vAxisOfRot[2] * fSin;
-	dst[1][1] = axisYSquared + (1 - axisYSquared) * fCos;
-	dst[2][1] = vAxisOfRot[1] * vAxisOfRot[2] * (1 - fCos) + vAxisOfRot[0] * fSin;
-
-	// Column 2:
-	dst[0][2] = vAxisOfRot[2] * vAxisOfRot[0] * (1 - fCos) + vAxisOfRot[1] * fSin;
-	dst[1][2] = vAxisOfRot[1] * vAxisOfRot[2] * (1 - fCos) - vAxisOfRot[0] * fSin;
-	dst[2][2] = axisZSquared + (1 - axisZSquared) * fCos;
-
-	// Column 3:
-	dst[0][3] = 0;
-	dst[1][3] = 0;
-	dst[2][3] = 0;
-}
-
-
-//-----------------------------------------------------------------------------
-// Transposes the first three columns of mat in place; column 3 is not touched.
-//-----------------------------------------------------------------------------
-void MatrixTranspose( matrix3x4_t& mat )
-{
-	// Exchange each entry above the diagonal with its mirror below it.
-	for ( int nRow = 0; nRow < 3; ++nRow )
-	{
-		for ( int nCol = nRow + 1; nCol < 3; ++nCol )
-		{
-			const vec_t flSaved = mat[nRow][nCol];
-			mat[nRow][nCol] = mat[nCol][nRow];
-			mat[nCol][nRow] = flSaved;
-		}
-	}
-}
-
-// Writes the transpose of src's first three columns into dst and zeroes
-// dst's fourth column.
-void MatrixTranspose( const matrix3x4_t& src, matrix3x4_t& dst )
-{
-	for ( int nRow = 0; nRow < 3; ++nRow )
-	{
-		for ( int nCol = 0; nCol < 3; ++nCol )
-		{
-			dst[nRow][nCol] = src[nCol][nRow];
-		}
-		dst[nRow][3] = 0.0f;
-	}
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: converts engine euler angles into a matrix
-// Input  : vec3_t angles - PITCH, YAW, ROLL
-// Output : *matrix - left-handed column matrix
-//			the basis vectors for the rotations will be in the columns as follows:
-//			matrix[][0] is forward
-//			matrix[][1] is left
-//			matrix[][2] is up
-//-----------------------------------------------------------------------------
-// Builds the rotation for 'angles' with the (RadianEuler, matrix) overload,
-// then stores 'position' in column 3 of 'matrix'.
-void AngleMatrix( RadianEuler const &angles, const Vector &position, matrix3x4_t& matrix )
-{
-	AngleMatrix( angles, matrix );
-	MatrixSetColumn( position, 3, matrix );
-}
-
-// Converts radian euler angles to a matrix by repacking them, in degrees and
-// in (y, z, x) order, into a QAngle and deferring to the QAngle overload.
-void AngleMatrix( const RadianEuler& angles, matrix3x4_t& matrix )
-{
-	const float flFirstDeg = RAD2DEG( angles.y );
-	const float flSecondDeg = RAD2DEG( angles.z );
-	const float flThirdDeg = RAD2DEG( angles.x );
-
-	QAngle quakeEuler( flFirstDeg, flSecondDeg, flThirdDeg );
-	AngleMatrix( quakeEuler, matrix );
-}
-
-
-// Builds the rotation for 'angles' with the (QAngle, matrix) overload, then
-// stores 'position' in column 3 of 'matrix'.
-void AngleMatrix( const QAngle &angles, const Vector &position, matrix3x4_t& matrix )
-{
-	AngleMatrix( angles, matrix );
-	MatrixSetColumn( position, 3, matrix );
-}
-
-// Builds the rotation part of 'matrix' from angles given in degrees
-// (indexed by PITCH, YAW, ROLL) and zeroes column 3.
-void AngleMatrix( const QAngle &angles, matrix3x4_t& matrix )
-{
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "AngleMatrix", "Mathlib" );
-#endif
-	Assert( s_bMathlibInitialized );
-
-	// s* / c* are the sine / cosine of roll (r), pitch (p) and yaw (y).
-	float sr, sp, sy, cr, cp, cy;
-
-#ifdef _X360
-	// Scale all three angles to radians at once and take sin/cos in one SIMD call.
-	fltx4 radians, scale, sine, cosine;
-	radians = LoadUnaligned3SIMD( angles.Base() );
-	scale = ReplicateX4( M_PI_F / 180.f ); 
-	radians = MulSIMD( radians, scale );
-	SinCos3SIMD( sine, cosine, radians ); 	
-
-	// Lane 0 feeds pitch, lane 1 yaw, lane 2 roll.
-	sp = SubFloat( sine, 0 );	sy = SubFloat( sine, 1 );	sr = SubFloat( sine, 2 );
-	cp = SubFloat( cosine, 0 );	cy = SubFloat( cosine, 1 );	cr = SubFloat( cosine, 2 );
-#else
-	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
-	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
-	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );
-#endif
-
-	// matrix = (YAW * PITCH) * ROLL
-	matrix[0][0] = cp*cy;
-	matrix[1][0] = cp*sy;
-	matrix[2][0] = -sp;
-
-	// Products shared by columns 1 and 2.
-	float crcy = cr*cy;
-	float crsy = cr*sy;
-	float srcy = sr*cy;
-	float srsy = sr*sy;
-	matrix[0][1] = sp*srcy-crsy;
-	matrix[1][1] = sp*srsy+crcy;
-	matrix[2][1] = sr*cp;
-
-	matrix[0][2] = (sp*crcy+srsy);
-	matrix[1][2] = (sp*crsy-srcy);
-	matrix[2][2] = cr*cp;
-
-	// No translation.
-	matrix[0][3] = 0.0f;
-	matrix[1][3] = 0.0f;
-	matrix[2][3] = 0.0f;
-}
-
-// Radian euler version of AngleIMatrix: repacks the angles, in degrees and in
-// (y, z, x) order, into a QAngle and defers to the QAngle overload.
-void AngleIMatrix( const RadianEuler& angles, matrix3x4_t& matrix )
-{
-	const float flFirstDeg = RAD2DEG( angles.y );
-	const float flSecondDeg = RAD2DEG( angles.z );
-	const float flThirdDeg = RAD2DEG( angles.x );
-
-	QAngle quakeEuler( flFirstDeg, flSecondDeg, flThirdDeg );
-	AngleIMatrix( quakeEuler, matrix );
-}
-
-// Builds the rotation for angles given in degrees with rows and columns
-// swapped relative to what AngleMatrix writes (the same terms, transposed),
-// and zeroes column 3.
-void AngleIMatrix (const QAngle& angles, matrix3x4_t& matrix )
-{
-	Assert( s_bMathlibInitialized );
-
-	float flSinYaw, flCosYaw;
-	float flSinPitch, flCosPitch;
-	float flSinRoll, flCosRoll;
-
-	SinCos( DEG2RAD( angles[YAW] ), &flSinYaw, &flCosYaw );
-	SinCos( DEG2RAD( angles[PITCH] ), &flSinPitch, &flCosPitch );
-	SinCos( DEG2RAD( angles[ROLL] ), &flSinRoll, &flCosRoll );
-
-	// Row 0
-	matrix[0][0] = flCosPitch*flCosYaw;
-	matrix[0][1] = flCosPitch*flSinYaw;
-	matrix[0][2] = -flSinPitch;
-	matrix[0][3] = 0.f;
-
-	// Row 1
-	matrix[1][0] = flSinRoll*flSinPitch*flCosYaw+flCosRoll*-flSinYaw;
-	matrix[1][1] = flSinRoll*flSinPitch*flSinYaw+flCosRoll*flCosYaw;
-	matrix[1][2] = flSinRoll*flCosPitch;
-	matrix[1][3] = 0.f;
-
-	// Row 2
-	matrix[2][0] = (flCosRoll*flSinPitch*flCosYaw+-flSinRoll*-flSinYaw);
-	matrix[2][1] = (flCosRoll*flSinPitch*flSinYaw+-flSinRoll*flCosYaw);
-	matrix[2][2] = flCosRoll*flCosPitch;
-	matrix[2][3] = 0.f;
-}
-
-// Builds the AngleIMatrix rotation for 'angles', then sets column 3 to the
-// negated result of rotating 'position' by that rotation.
-void AngleIMatrix (const QAngle &angles, const Vector &position, matrix3x4_t &mat )
-{
-	AngleIMatrix( angles, mat );
-
-	// Rotate the position with the matrix just built, then flip its sign.
-	Vector vecRotatedPos;
-	VectorRotate( position, mat, vecRotatedPos );
-	vecRotatedPos *= -1.0f;
-
-	MatrixSetColumn( vecRotatedPos, 3, mat );
-}
-
-
-//-----------------------------------------------------------------------------
-// Bounding box construction methods
-//-----------------------------------------------------------------------------
-
-// Resets a box to an inverted state (mins = 99999, maxs = -99999 on every
-// axis) so that any point later passed to AddPointToBounds with coordinates
-// inside that range replaces both extents.
-void ClearBounds (Vector& mins, Vector& maxs)
-{
-	Assert( s_bMathlibInitialized );
-	for ( int nAxis = 0; nAxis < 3; ++nAxis )
-	{
-		mins[nAxis] = 99999;
-		maxs[nAxis] = -99999;
-	}
-}
-
-// Expands the box [mins, maxs] on each axis so that it contains point v.
-void AddPointToBounds (const Vector& v, Vector& mins, Vector& maxs)
-{
-	Assert( s_bMathlibInitialized );
-
-	for ( int nAxis = 0; nAxis < 3; ++nAxis )
-	{
-		const vec_t flCoord = v[nAxis];
-
-		// Both tests run on every axis; they are not mutually exclusive.
-		if ( mins[nAxis] > flCoord )
-		{
-			mins[nAxis] = flCoord;
-		}
-		if ( maxs[nAxis] < flCoord )
-		{
-			maxs[nAxis] = flCoord;
-		}
-	}
-}
-
-// Solves a x^2 + b x + c = 0 for real roots.
-// Returns false when there is no real solution (negative discriminant, or
-// a == b == 0 with c != 0); otherwise fills root1/root2 and returns true.
-// Linear and all-zero inputs report the same value in both roots.
-bool SolveQuadratic( float a, float b, float c, float &root1, float &root2 )
-{
-	Assert( s_bMathlibInitialized );
-
-	if ( a != 0 )
-	{
-		const float flDiscriminant = b * b - 4.0f * a * c;
-		if ( flDiscriminant < 0 )
-		{
-			// imaginary number, bah, no solution.
-			return false;
-		}
-
-		const float flSqrtDisc = sqrt( flDiscriminant );
-		root1 = (-b + flSqrtDisc) / (2.0f * a);
-		root2 = (-b - flSqrtDisc) / (2.0f * a);
-		return true;
-	}
-
-	if ( b != 0 )
-	{
-		// no x^2 component, it's a linear system
-		root1 = root2 = -c / b;
-		return true;
-	}
-
-	if ( c != 0 )
-	{
-		// constant, non-zero: nothing satisfies it
-		return false;
-	}
-
-	// all zero's
-	root1 = root2 = 0;
-	return true;
-}
-
-// Fits "a x^2 + b x + c = y" through the three points (x1,y1), (x2,y2), (x3,y3).
-// Returns false, leaving a/b/c untouched, when the determinant is exactly
-// zero (two of the x values are equal); returns true otherwise.
-bool SolveInverseQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c )
-{
-	const float flDet = (x1 - x2)*(x1 - x3)*(x2 - x3);
-
-	// FIXME: check with some sort of epsilon
-	if ( flDet != 0.0 )
-	{
-		a = (x3*(-y1 + y2) + x2*(y1 - y3) + x1*(-y2 + y3)) / flDet;
-		b = (x3*x3*(y1 - y2) + x1*x1*(y2 - y3) + x2*x2*(-y1 + y3)) / flDet;
-		c = (x1*x3*(-x1 + x3)*y2 + x2*x2*(x3*y1 - x1*y3) + x2*(-(x3*x3*y1) + x1*x1*y3)) / flDet;
-		return true;
-	}
-
-	return false;
-}
-
-// Fits "a x^2 + b x + c = y" through three points like SolveInverseQuadratic,
-// but when the y values are strictly increasing (or strictly decreasing) in x
-// order, it keeps the fitted curve monotonic over [x1, x3] by moving the
-// middle point toward the straight line between the two end points until the
-// curve's slope has the right sign at both ends of the interval.
-//
-// Returns false if SolveInverseQuadratic fails (coincident x values), true
-// otherwise.  When the input y values are not monotonic, the plain fit is
-// returned unchanged.
-bool SolveInverseQuadraticMonotonic( float x1, float y1, float x2, float y2, float x3, float y3, 
-									 float &a, float &b, float &c )
-{
-	// first, sort the points by x
-	if (x1>x2)
-	{
-		V_swap(x1,x2);
-		V_swap(y1,y2);
-	}
-	if (x2>x3)
-	{
-		V_swap(x2,x3);
-		V_swap(y2,y3);
-	}
-	if (x1>x2)
-	{
-		V_swap(x1,x2);
-		V_swap(y1,y2);
-	}
-
-	const bool bIncreasing = (y1<y2) && (y2<y3);
-	const bool bDecreasing = (y1>y2) && (y2>y3);
-
-	// y of the middle point if it sat on the line between the end points
-	const float flLinearY2 = FLerp(y1,y3,x1,x3,x2);
-
-	// this code is not fast. what it does is when the curve would be non-monotonic, slowly shifts
-	// the center point closer to the linear line between the endpoints. Should anyone need this
-	// function to be actually fast, it would be fairly easy to change it to be so.
-	//
-	// An integer step counter is used so that the final step is exactly the
-	// fully linear blend (factor 1.0); accumulating 0.05 in a float is not
-	// guaranteed to land on 1.0.
-	const int nBlendSteps = 20;
-	for ( int nStep = 0; nStep <= nBlendSteps; ++nStep )
-	{
-		const float flBlendToLinear = (float)nStep / (float)nBlendSteps;
-		const float tempy2 = (1.0f-flBlendToLinear)*y2 + flBlendToLinear*flLinearY2;
-		if (!SolveInverseQuadratic(x1,y1,x2,tempy2,x3,y3,a,b,c))
-			return false;
-
-		if ( !bIncreasing && !bDecreasing )
-			return true;									// nothing to enforce
-
-		// The slope of a x^2 + b x + c is 2 a x + b, which is linear in x, so
-		// it keeps one sign over [x1, x3] exactly when it has that sign at
-		// both ends of the interval.
-		const float flSlopeAtStart = 2.0f*a*x1 + b;
-		const float flSlopeAtEnd = 2.0f*a*x3 + b;
-
-		if ( bIncreasing )
-		{
-			if ( (flSlopeAtStart>=0.0f) && (flSlopeAtEnd>=0.0f) )
-				return true;
-		}
-		else
-		{
-			if ( (flSlopeAtStart<=0.0f) && (flSlopeAtEnd<=0.0f) )
-				return true;
-		}
-	}
-
-	// The last step fitted the fully linear middle point; accept it.
-	return true;
-}
-
-
-// Fits "1/(a x^2 + b x + c ) = y" through three points.
-// Returns false, leaving a/b/c untouched, when the determinant is exactly
-// zero (two x values are equal or any y is zero); returns true otherwise.
-bool SolveInverseReciprocalQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c )
-{
-	const float flDet = (x1 - x2)*(x1 - x3)*(x2 - x3)*y1*y2*y3;
-
-	// FIXME: check with some sort of epsilon
-	if ( flDet != 0.0 )
-	{
-		a = (x1*y1*(y2 - y3) + x3*(y1 - y2)*y3 + x2*y2*(-y1 + y3)) / flDet;
-		b = (x2*x2*y2*(y1 - y3) + x3*x3*(-y1 + y2)*y3 + x1*x1*y1*(-y2 + y3)) / flDet;
-		c = (x2*(x2 - x3)*x3*y2*y3 + x1*x1*y1*(x2*y2 - x3*y3) + x1*(-(x2*x2*y1*y2) + x3*x3*y1*y3)) / flDet;
-		return true;
-	}
-
-	return false;
-}
-
-
-// Rotate a vector around the Z axis (YAW) by flYaw degrees.
-// 'out' may be the same object as 'in'.
-void VectorYawRotate( const Vector &in, float flYaw, Vector &out)
-{
-	Assert( s_bMathlibInitialized );
-
-	float flSinYaw, flCosYaw;
-	SinCos( DEG2RAD(flYaw), &flSinYaw, &flCosYaw );
-
-	// Read every input component before writing anything, so the result is
-	// correct even when in and out alias.
-	const float flInX = in.x;
-	const float flInY = in.y;
-	const float flInZ = in.z;
-
-	out.x = flInX * flCosYaw - flInY * flSinYaw;
-	out.y = flInX * flSinYaw + flInY * flCosYaw;
-	out.z = flInZ;
-}
-
-
-
-// Returns x raised to the power log(biasAmt) / log(0.5), so that
-// Bias( 0.5, biasAmt ) is approximately biasAmt.
-//
-// The exponent is computed on every call and no state is kept between
-// calls.  (A cached exponent keyed on biasAmt gains nothing unless the key is
-// refreshed after each recompute, and it would make the function unsafe to
-// call from more than one thread.)
-float Bias( float x, float biasAmt )
-{
-	const float flExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
-	return pow( x, flExponent );
-}
-
-
-// Applies Bias() with amount (1 - biasAmt) to each half of the range: values
-// of x below 0.5 use the curve directly, the rest use it mirrored about
-// (0.5, 0.5).  Thread safety is whatever Bias() provides.
-float Gain( float x, float biasAmt )
-{
-	const float flInvBias = 1-biasAmt;
-
-	if( x < 0.5 )
-	{
-		return 0.5f * Bias( 2*x, flInvBias );
-	}
-
-	return 1 - 0.5f * Bias( 2 - 2*x, flInvBias );
-}
-
-
-// Cosine ease curve: returns (1 - cos( x * pi )) / 2, which is 0 at x = 0
-// and 1 at x = 1.
-float SmoothCurve( float x )
-{
-	const double flCosine = cos( x * M_PI );
-	return (1 - flCosine) * 0.5f;
-}
-
-
-// Piecewise-linear remap that sends x = flPeakPos to 0.5:
-// [0, flPeakPos) is scaled onto [0, 0.5) and the rest onto [0.5, ...).
-inline float MovePeak( float x, float flPeakPos )
-{
-	// Todo: make this higher-order?
-	if( x < flPeakPos )
-	{
-		return x * 0.5f / flPeakPos;
-	}
-
-	return 0.5 + 0.5 * (x - flPeakPos) / (1 - flPeakPos);
-}
-
-
-// SmoothCurve with an adjustable peak: x is first remapped by MovePeak so
-// flPeakPos lands on 0.5, then shaped by Gain with flPeakSharpness, and the
-// result is fed to SmoothCurve.
-float SmoothCurve_Tweak( float x, float flPeakPos, float flPeakSharpness )
-{
-	return SmoothCurve( Gain( MovePeak( x, flPeakPos ), flPeakSharpness ) );
-}
-
-//-----------------------------------------------------------------------------
-// Writes to qt whichever of q or -q is closer to p, measured by the squared
-// component-wise distance.  qt may be the same object as q.
-//-----------------------------------------------------------------------------
-
-void QuaternionAlign( const Quaternion &p, const Quaternion &q, Quaternion &qt )
-{
-	Assert( s_bMathlibInitialized );
-
-	// FIXME: can this be done with a quat dot product?
-
-	// Squared distance from p to q, and from p to -q.
-	float flDistSqrToQ = 0;
-	float flDistSqrToNegQ = 0;
-	for ( int nComp = 0; nComp < 4; ++nComp )
-	{
-		const float flDiff = p[nComp] - q[nComp];
-		const float flSum = p[nComp] + q[nComp];
-		flDistSqrToQ += flDiff * flDiff;
-		flDistSqrToNegQ += flSum * flSum;
-	}
-
-	if ( flDistSqrToQ > flDistSqrToNegQ )
-	{
-		// -q is the closer one
-		for ( int nComp = 0; nComp < 4; ++nComp )
-		{
-			qt[nComp] = -q[nComp];
-		}
-		return;
-	}
-
-	if ( &qt == &q )
-	{
-		// already holds q, nothing to copy
-		return;
-	}
-
-	for ( int nComp = 0; nComp < 4; ++nComp )
-	{
-		qt[nComp] = q[nComp];
-	}
-}
-
-
-//-----------------------------------------------------------------------------
-// Do a piecewise addition of the quaternion elements. This actually makes little 
-// mathematical sense, but it's a cheap way to simulate a slerp.
-// t = 0.0 returns p, t = 1.0 returns q (see QuaternionBlendNoAlign).
-//-----------------------------------------------------------------------------
-void QuaternionBlend( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
-{
-	Assert( s_bMathlibInitialized );
-#if ALLOW_SIMD_QUATERNION_MATH
-	// SIMD path: load both quaternions, blend, store the result.
-	fltx4 psimd, qsimd, qtsimd;
-	psimd = LoadUnalignedSIMD( p.Base() );
-	qsimd = LoadUnalignedSIMD( q.Base() );
-	qtsimd = QuaternionBlendSIMD( psimd, qsimd, t );
-	StoreUnalignedSIMD( qt.Base(), qtsimd );
-#else
-	// decide if one of the quaternions is backwards
-	Quaternion q2;
-	QuaternionAlign( p, q, q2 );
-	// blend p with the aligned copy of q
-	QuaternionBlendNoAlign( p, q2, t, qt );
-#endif
-}
-
-
-// Component-wise linear blend of p and q followed by a normalize.
-// No sign alignment is done on q; callers that need it use QuaternionBlend.
-void QuaternionBlendNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
-{
-	Assert( s_bMathlibInitialized );
-
-	// 0.0 returns p, 1.0 return q.
-	const float flWeightP = 1.0f - t;
-	const float flWeightQ = t;
-
-	for ( int nComp = 0; nComp < 4; ++nComp )
-	{
-		qt[nComp] = flWeightP * p[nComp] + flWeightQ * q[nComp];
-	}
-
-	QuaternionNormalize( qt );
-}
-
-
-
-// Blends p toward the identity rotation and normalizes the result:
-// t = 0.0 gives p, t = 1.0 gives (0, 0, 0, +/-1).
-// The identity is taken with the same sign of w as p, so the blend never
-// has to cross to the opposite sign.  qt may be the same object as p.
-void QuaternionIdentityBlend( const Quaternion &p, float t, Quaternion &qt )
-{
-	Assert( s_bMathlibInitialized );
-
-	const float sclp = 1.0f - t;
-
-	qt.x = p.x * sclp;
-	qt.y = p.y * sclp;
-	qt.z = p.z * sclp;
-
-	// The sign test must read the input p.w: qt.w has not been written yet
-	// and is indeterminate unless qt happens to alias p.
-	if (p.w < 0.0)
-	{
-		qt.w = p.w * sclp - t;
-	}
-	else
-	{
-		qt.w = p.w * sclp + t;
-	}
-	QuaternionNormalize( qt );
-}
-
-//-----------------------------------------------------------------------------
-// Quaternion spherical linear interpolation
-// t = 0.0 returns p, t = 1.0 returns q (possibly negated by the alignment step).
-//-----------------------------------------------------------------------------
-
-void QuaternionSlerp( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
-{
-	// Pick q or -q, whichever is closer to p, then interpolate toward that.
-	Quaternion qAligned;
-	QuaternionAlign( p, q, qAligned );
-
-	QuaternionSlerpNoAlign( p, qAligned, t, qt );
-}
-
-
-// Spherical linear interpolation between p and q with no sign alignment of q.
-// The result is not normalized here.
-void QuaternionSlerpNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
-{
-	Assert( s_bMathlibInitialized );
-	float omega, cosom, sinom, sclp, sclq;
-	int i;
-
-	// 0.0 returns p, 1.0 return q.
-
-	// 4D dot product of p and q
-	cosom = p[0]*q[0] + p[1]*q[1] + p[2]*q[2] + p[3]*q[3];
-
-	if ((1.0f + cosom) > 0.000001f) {
-		// p and q are not (nearly) exact opposites
-		if ((1.0f - cosom) > 0.000001f) {
-			// general case: weights are sin((1-t)*omega)/sin(omega) and sin(t*omega)/sin(omega)
-			omega = acos( cosom );
-			sinom = sin( omega );
-			sclp = sin( (1.0f - t)*omega) / sinom;
-			sclq = sin( t*omega ) / sinom;
-		}
-		else {
-			// p and q nearly equal: fall back to linear weights, which also avoids dividing by a tiny sinom
-			// TODO: add short circuit for cosom == 1.0f?
-			sclp = 1.0f - t;
-			sclq = t;
-		}
-		for (i = 0; i < 4; i++) {
-			qt[i] = sclp * p[i] + sclq * q[i];
-		}
-	}
-	else {
-		// p and q are (nearly) opposite.  qt is used as scratch storage below,
-		// so it must not be the same object as q.
-		Assert( &qt != &q );
-
-		// Build a quaternion from q's components with pairs swapped and two signs flipped.
-		qt[0] = -q[1];
-		qt[1] = q[0];
-		qt[2] = -q[3];
-		qt[3] = q[2];
-		// Weights for interpolating across a quarter turn (pi/2).
-		sclp = sin( (1.0f - t) * (0.5f * M_PI));
-		sclq = sin( t * (0.5f * M_PI));
-		// Only components 0..2 are blended; qt[3] keeps the value q[2] assigned above.
-		// NOTE(review): confirm that leaving qt[3] unblended is intended.
-		for (i = 0; i < 3; i++) {
-			qt[i] = sclp * p[i] + sclq * qt[i];
-		}
-	}
-
-	Assert( qt.IsValid() );
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Returns the angular delta between the two normalized quaternions in degrees.
-//			The active (#if 1) path returns 2 * asin( |xyz part of p * conjugate(q)| ),
-//			with the asin argument clamped to at most 1.
-//-----------------------------------------------------------------------------
-float QuaternionAngleDiff( const Quaternion &p, const Quaternion &q )
-{
-#if 1
-	// this code path is here for 2 reasons:
-	// 1 - acos maps 1-epsilon to values much larger than epsilon (vs asin, which maps epsilon to itself)
-	//     this means that in floats, anything below ~0.05 degrees truncates to 0
-	// 2 - normalized quaternions are frequently slightly non-normalized due to float precision issues,
-	//     and the epsilon off of normalized can be several percents of a degree
-	Quaternion qInv, diff;
-	QuaternionConjugate( q, qInv );
-	// diff = p * conjugate(q)
-	QuaternionMult( p, qInv, diff );
-
-	// Note if the quaternion is slightly non-normalized the square root below may be more than 1,
-	// the value is clamped to one otherwise it may result in asin() returning an undefined result.
-	float sinang = MIN( 1.0f, sqrt( diff.x * diff.x + diff.y * diff.y + diff.z * diff.z ) );
-	float angle = RAD2DEG( 2 * asin( sinang ) );
-	return angle;
-#else
-	// Disabled alternative: angle from the dot product of p and the aligned q.
-	Quaternion q2;
-	QuaternionAlign( p, q, q2 );
-
-	Assert( s_bMathlibInitialized );
-	float cosom = p.x * q2.x + p.y * q2.y + p.z * q2.z + p.w * q2.w;
-
-	if ( cosom > -1.0f )
-	{
-		if ( cosom < 1.0f )
-		{
-			float omega = 2 * fabs( acos( cosom ) );
-			return RAD2DEG( omega );
-		}
-		return 0.0f;
-	}
-
-	return 180.0f;
-#endif
-}
-
-// Writes the conjugate of p to q: x, y and z are negated, w is copied.
-// q may be the same object as p.
-void QuaternionConjugate( const Quaternion &p, Quaternion &q )
-{
-	Assert( s_bMathlibInitialized );
-	// Validate the input.  q is purely an output here and may legitimately
-	// hold indeterminate values on entry, so it must not be checked.
-	Assert( p.IsValid() );
-
-	q.x = -p.x;
-	q.y = -p.y;
-	q.z = -p.z;
-	q.w = p.w;
-}
-
-// Writes the inverse of p to q: the conjugate of p divided by p's squared
-// magnitude.  If the squared magnitude is zero, q is left as the plain
-// conjugate (and the assert below fires in builds where Assert is active).
-void QuaternionInvert( const Quaternion &p, Quaternion &q )
-{
-	Assert( s_bMathlibInitialized );
-	// Validate the input.  q is purely an output here and may legitimately
-	// hold indeterminate values on entry, so it must not be checked.
-	Assert( p.IsValid() );
-
-	QuaternionConjugate( p, q );
-
-	float magnitudeSqr = QuaternionDotProduct( p, p );
-	Assert( magnitudeSqr );
-	if ( magnitudeSqr )
-	{
-		float inv = 1.0f / magnitudeSqr;
-		q.x *= inv;
-		q.y *= inv;
-		q.z *= inv;
-		q.w *= inv;
-	}
-}
-
-//-----------------------------------------------------------------------------
-// Scales q to unit length in place and returns the length it had before.
-// A quaternion whose squared length is exactly zero is left untouched and
-// zero is returned.
-//-----------------------------------------------------------------------------
-float QuaternionNormalize( Quaternion &q )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( q.IsValid() );
-
-	float flLength = q[0]*q[0] + q[1]*q[1] + q[2]*q[2] + q[3]*q[3];
-
-	// Only an exact zero is skipped; no epsilon test is applied.
-	if ( !flLength )
-	{
-		return flLength;
-	}
-
-	flLength = sqrt( flLength );
-	const float flInvLength = 1.0f/flLength;
-	for ( int nComp = 3; nComp >= 0; --nComp )
-	{
-		q[nComp] *= flInvLength;
-	}
-
-	return flLength;
-}
-
-
-// Scales the rotation in p by the factor t and writes the result to q.
-// The active (#else) path scales the xyz part so its length becomes
-// sin( asin(|xyz|) * t ) and rebuilds w from that length, keeping the sign of p.w.
-void QuaternionScale( const Quaternion &p, float t, Quaternion &q )
-{
-	Assert( s_bMathlibInitialized );
-
-#if 0
-	// Disabled slerp-based version.  Note that it declares a local 'q' that
-	// would shadow the output parameter.
-	Quaternion p0;
-	Quaternion q;
-	p0.Init( 0.0, 0.0, 0.0, 1.0 );
-
-	// slerp in "reverse order" so that p doesn't get realigned
-	QuaternionSlerp( p, p0, 1.0 - fabs( t ), q );
-	if (t < 0.0)
-	{
-		q.w = -q.w;
-	}
-#else
-	float r;
-
-	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
-	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
-	// length of the xyz part, clamped to 1 so asin() stays in its domain
-	float sinom = sqrt( DotProduct( &p.x, &p.x ) );
-	sinom = min( sinom, 1.f );
-
-	float sinsom = sin( asin( sinom ) * t );
-
-	// t is reused as the xyz scale factor; FLT_EPSILON guards the divide when sinom is 0
-	t = sinsom / (sinom + FLT_EPSILON);
-	VectorScale( &p.x, t, &q.x );
-
-	// rescale rotation
-	r = 1.0f - sinsom * sinsom;
-
-	// Assert( r >= 0 );
-	if (r < 0.0f) 
-		r = 0.0f;
-	r = sqrt( r );
-
-	// keep sign of rotation
-	if (p.w < 0)
-		q.w = -r;
-	else
-		q.w = r;
-#endif
-
-	Assert( q.IsValid() );
-
-	return;
-}
-
-
-// Component-wise sum of p and the sign-aligned copy of q (q or -q, whichever
-// QuaternionAlign picks).  The result is not normalized.
-void QuaternionAdd( const Quaternion &p, const Quaternion &q, Quaternion &qt )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( p.IsValid() );
-	Assert( q.IsValid() );
-
-	// decide if one of the quaternions is backwards
-	Quaternion qAligned;
-	QuaternionAlign( p, q, qAligned );
-
-	// is this right???
-	for ( int nComp = 0; nComp < 4; ++nComp )
-	{
-		qt[nComp] = p[nComp] + qAligned[nComp];
-	}
-}
-
-
-// Returns the four-component dot product of p and q.
-float QuaternionDotProduct( const Quaternion &p, const Quaternion &q )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( p.IsValid() );
-	Assert( q.IsValid() );
-
-	// Accumulated in x, y, z, w order.
-	float flDot = p.x * q.x;
-	flDot += p.y * q.y;
-	flDot += p.z * q.z;
-	flDot += p.w * q.w;
-	return flDot;
-}
-
-
-// qt = p * q
-// q is first sign-aligned to p with QuaternionAlign, so the product uses q or
-// -q.  qt may be the same object as p or q.
-void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( p.IsValid() );
-	Assert( q.IsValid() );
-
-	// decide if one of the quaternions is backwards
-	Quaternion qAligned;
-	QuaternionAlign( p, q, qAligned );
-
-	// Evaluate all four components before storing any of them, so the
-	// result is correct when qt aliases p.
-	const float flX =  p.x * qAligned.w + p.y * qAligned.z - p.z * qAligned.y + p.w * qAligned.x;
-	const float flY = -p.x * qAligned.z + p.y * qAligned.w + p.z * qAligned.x + p.w * qAligned.y;
-	const float flZ =  p.x * qAligned.y - p.y * qAligned.x + p.z * qAligned.w + p.w * qAligned.z;
-	const float flW = -p.x * qAligned.x - p.y * qAligned.y - p.z * qAligned.z + p.w * qAligned.w;
-
-	qt.x = flX;
-	qt.y = flY;
-	qt.z = flZ;
-	qt.w = flW;
-}
-
-
-// Builds the rotation for q with the (q, matrix) overload, then writes pos
-// into the fourth column of matrix.
-void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t& matrix )
-{
-	Assert( pos.IsValid() );
-
-	QuaternionMatrix( q, matrix );
-
-	// translation goes in column 3
-	matrix[0][3] = pos.x;
-	matrix[1][3] = pos.y;
-	matrix[2][3] = pos.z;
-}
-
-// Converts quaternion q into the first three columns of matrix and zeroes
-// column 3.  The formulas assume nothing is done here to normalize q.
-void QuaternionMatrix( const Quaternion &q, matrix3x4_t& matrix )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( q.IsValid() );
-
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "QuaternionMatrix", "Mathlib" );
-#endif
-
-// Original code
-// This should produce the same code as below with optimization, but looking at the assmebly,
-// it doesn't.  There are 7 extra multiplies in the release build of this, go figure.
-#if 1
-	// Column 0 is written with double literals (1.0 / 2.0); the other
-	// columns use float literals.
-	matrix[0][0] = 1.0 - 2.0 * q.y * q.y - 2.0 * q.z * q.z;
-	matrix[1][0] = 2.0 * q.x * q.y + 2.0 * q.w * q.z;
-	matrix[2][0] = 2.0 * q.x * q.z - 2.0 * q.w * q.y;
-
-	matrix[0][1] = 2.0f * q.x * q.y - 2.0f * q.w * q.z;
-	matrix[1][1] = 1.0f - 2.0f * q.x * q.x - 2.0f * q.z * q.z;
-	matrix[2][1] = 2.0f * q.y * q.z + 2.0f * q.w * q.x;
-
-	matrix[0][2] = 2.0f * q.x * q.z + 2.0f * q.w * q.y;
-	matrix[1][2] = 2.0f * q.y * q.z - 2.0f * q.w * q.x;
-	matrix[2][2] = 1.0f - 2.0f * q.x * q.x - 2.0f * q.y * q.y;
-
-	matrix[0][3] = 0.0f;
-	matrix[1][3] = 0.0f;
-	matrix[2][3] = 0.0f;
-#else
-   // Disabled variant that shares the doubled components and pairwise products.
-   float wx, wy, wz, xx, yy, yz, xy, xz, zz, x2, y2, z2;
-
-    // precalculate common multiplitcations
-    x2 = q.x + q.x; 
-	y2 = q.y + q.y; 
-    z2 = q.z + q.z;
-    xx = q.x * x2;
-	xy = q.x * y2;
-	xz = q.x * z2;
-    yy = q.y * y2;
-	yz = q.y * z2;
-	zz = q.z * z2;
-    wx = q.w * x2;
-	wy = q.w * y2;
-	wz = q.w * z2;
-
-    matrix[0][0] = 1.0 - (yy + zz);
-    matrix[0][1] = xy - wz;
-	matrix[0][2] = xz + wy;
-    matrix[0][3] = 0.0f;
-
-    matrix[1][0] = xy + wz;
-	matrix[1][1] = 1.0 - (xx + zz);
-    matrix[1][2] = yz - wx;
-	matrix[1][3] = 0.0f;
-
-    matrix[2][0] = xz - wy;
-	matrix[2][1] = yz + wx;
-    matrix[2][2] = 1.0 - (xx + yy);
-	matrix[2][3] = 0.0f;
-#endif
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Converts a quaternion into engine angles
-// Input  : *quaternion - q3 + q0.i + q1.j + q2.k
-//			*outAngles - PITCH, YAW, ROLL
-// Note   : the active (#if 1) path converts through a matrix
-//			(QuaternionMatrix followed by MatrixAngles).
-//-----------------------------------------------------------------------------
-void QuaternionAngles( const Quaternion &q, QAngle &angles )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( q.IsValid() );
-
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "QuaternionAngles", "Mathlib" );
-#endif
-
-#if 1
-	// FIXME: doing it this way calculates too much data, needs to do an optimized version...
-	matrix3x4_t matrix;
-	QuaternionMatrix( q, matrix );
-	MatrixAngles( matrix, angles );
-#else
-	// Disabled direct version: computes only the five matrix terms the angles need.
-	float m11, m12, m13, m23, m33;
-
-	m11 = ( 2.0f * q.w * q.w ) + ( 2.0f * q.x * q.x ) - 1.0f;
-	m12 = ( 2.0f * q.x * q.y ) + ( 2.0f * q.w * q.z );
-	m13 = ( 2.0f * q.x * q.z ) - ( 2.0f * q.w * q.y );
-	m23 = ( 2.0f * q.y * q.z ) + ( 2.0f * q.w * q.x );
-	m33 = ( 2.0f * q.w * q.w ) + ( 2.0f * q.z * q.z ) - 1.0f;
-
-	// FIXME: this code has a singularity near PITCH +-90
-	angles[YAW] = RAD2DEG( atan2(m12, m11) );
-	angles[PITCH] = RAD2DEG( asin(-m13) );
-	angles[ROLL] = RAD2DEG( atan2(m23, m33) );
-#endif
-
-	Assert( angles.IsValid() );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Converts a quaternion to an axis / angle in degrees
-//			(exponential map)
-// Input  : q - quaternion to convert
-// Output : axis - the xyz part of q, passed through VectorNormalize
-//			angle - rotation in degrees, wrapped into (-180, 180]
-//-----------------------------------------------------------------------------
-void QuaternionAxisAngle( const Quaternion &q, Vector &axis, float &angle )
-{
-	// A quaternion that is only slightly off unit length can have |w| a hair
-	// above 1, which is outside acos()'s domain and would produce NaN.
-	// Clamp first; a NaN w fails both comparisons and passes through unchanged.
-	float flCosHalfAngle = q.w;
-	if ( flCosHalfAngle > 1.0f )
-	{
-		flCosHalfAngle = 1.0f;
-	}
-	else if ( flCosHalfAngle < -1.0f )
-	{
-		flCosHalfAngle = -1.0f;
-	}
-
-	angle = RAD2DEG(2 * acos(flCosHalfAngle));
-	if ( angle > 180 )
-	{
-		angle -= 360;
-	}
-	axis.x = q.x;
-	axis.y = q.y;
-	axis.z = q.z;
-	VectorNormalize( axis );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Converts an exponential map (ang/axis) to a quaternion
-// Input  : axis - rotation axis; used as passed, it is not normalized here
-//			angle - rotation about the axis, in degrees
-// Output : q - (axis * sin(angle/2), cos(angle/2))
-//-----------------------------------------------------------------------------
-void AxisAngleQuaternion( const Vector &axis, float angle, Quaternion &q )
-{
-	float flSinHalf, flCosHalf;
-	SinCos( DEG2RAD(angle) * 0.5f, &flSinHalf, &flCosHalf );
-
-	q.w = flCosHalf;
-	q.x = axis.x * flSinHalf;
-	q.y = axis.y * flSinHalf;
-	q.z = axis.z * flSinHalf;
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Converts radian-euler axis aligned angles to a quaternion
-// Input  : *pfAngles - Right-handed Euler angles in radians
-//			*outQuat - quaternion of form (i,j,k,real)
-// Note   : angles.x feeds the roll terms (sr/cr), angles.y the pitch terms
-//			(sp/cp) and angles.z the yaw terms (sy/cy).
-//-----------------------------------------------------------------------------
-void AngleQuaternion( const RadianEuler &angles, Quaternion &outQuat )
-{
-	Assert( s_bMathlibInitialized );
-//	Assert( angles.IsValid() );
-
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "AngleQuaternion", "Mathlib" );
-#endif
-
-	// sine / cosine of the half angles
-	float sr, sp, sy, cr, cp, cy;
-
-#ifdef _X360
-	// Halve all three angles at once and take sin/cos in one SIMD call.
-	fltx4 radians, scale, sine, cosine;
-	radians = LoadUnaligned3SIMD( &angles.x );
-	scale = ReplicateX4( 0.5f ); 
-	radians = MulSIMD( radians, scale );
-	SinCos3SIMD( sine, cosine, radians ); 	
-
-	// NOTE: The ordering here is *different* from the AngleQuaternion below
-	// because p, y, r are not in the same locations in QAngle + RadianEuler. Yay!
-	sr = SubFloat( sine, 0 );	sp = SubFloat( sine, 1 );	sy = SubFloat( sine, 2 );	
-	cr = SubFloat( cosine, 0 );	cp = SubFloat( cosine, 1 );	cy = SubFloat( cosine, 2 );	
-#else
-	SinCos( angles.z * 0.5f, &sy, &cy );
-	SinCos( angles.y * 0.5f, &sp, &cp );
-	SinCos( angles.x * 0.5f, &sr, &cr );
-#endif
-
-	// NJS: for some reason VC6 wasn't recognizing the common subexpressions:
-	float srXcp = sr * cp, crXsp = cr * sp;
-	outQuat.x = srXcp*cy-crXsp*sy; // X
-	outQuat.y = crXsp*cy+srXcp*sy; // Y
-
-	float crXcp = cr * cp, srXsp = sr * sp;
-	outQuat.z = crXcp*sy-srXsp*cy; // Z
-	outQuat.w = crXcp*cy+srXsp*sy; // W (real component)
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Converts engine-format euler angles to a quaternion
-// Input  : angles - Right-handed Euler angles in degrees as follows:
-//				[0]: PITCH: Clockwise rotation around the Y axis.
-//				[1]: YAW:	Counterclockwise rotation around the Z axis.
-//				[2]: ROLL:	Counterclockwise rotation around the X axis.
-//			*outQuat - quaternion of form (i,j,k,real)
-// Note   : unlike the RadianEuler overload, this one does not assert
-//			s_bMathlibInitialized.
-//-----------------------------------------------------------------------------
-void AngleQuaternion( const QAngle &angles, Quaternion &outQuat )
-{
-#ifdef _VPROF_MATHLIB
-	VPROF_BUDGET( "AngleQuaternion", "Mathlib" );
-#endif
-
-	// sine / cosine of the half angles
-	float sr, sp, sy, cr, cp, cy;
-
-#ifdef _X360
-	// Convert to half-angle radians in one multiply, then sin/cos in one SIMD call.
-	fltx4 radians, scale, sine, cosine;
-	radians = LoadUnaligned3SIMD( angles.Base() );
-	scale = ReplicateX4( 0.5f * M_PI_F / 180.f ); 
-	radians = MulSIMD( radians, scale );
-	SinCos3SIMD( sine, cosine, radians ); 	
-
-	// NOTE: The ordering here is *different* from the AngleQuaternion above
-	// because p, y, r are not in the same locations in QAngle + RadianEuler. Yay!
-	sp = SubFloat( sine, 0 );	sy = SubFloat( sine, 1 );	sr = SubFloat( sine, 2 );	
-	cp = SubFloat( cosine, 0 );	cy = SubFloat( cosine, 1 );	cr = SubFloat( cosine, 2 );	
-#else
-	SinCos( DEG2RAD( angles.y ) * 0.5f, &sy, &cy );
-	SinCos( DEG2RAD( angles.x ) * 0.5f, &sp, &cp );
-	SinCos( DEG2RAD( angles.z ) * 0.5f, &sr, &cr );
-#endif
-
-	// NJS: for some reason VC6 wasn't recognizing the common subexpressions:
-	float srXcp = sr * cp, crXsp = cr * sp;
-	outQuat.x = srXcp*cy-crXsp*sy; // X
-	outQuat.y = crXsp*cy+srXcp*sy; // Y
-
-	float crXcp = cr * cp, srXsp = sr * sp;
-	outQuat.z = crXcp*sy-srXsp*cy; // Z
-	outQuat.w = crXcp*cy+srXsp*sy; // W (real component)
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Converts a basis to a quaternion
-// Input  : vecForward, vecRight, vecUp - basis vectors; each is asserted to
-//			have a squared length within 1e-3 of 1
-// Output : q - quaternion obtained by going basis -> matrix -> angles -> quaternion
-//-----------------------------------------------------------------------------
-void BasisToQuaternion( const Vector &vecForward, const Vector &vecRight, const Vector &vecUp, Quaternion &q )
-{
-	Assert( fabs( vecForward.LengthSqr() - 1.0f ) < 1e-3 );
-	Assert( fabs( vecRight.LengthSqr() - 1.0f ) < 1e-3 );
-	Assert( fabs( vecUp.LengthSqr() - 1.0f ) < 1e-3 );
-
-	// The matrix columns below are forward / left / up, so negate right.
-	Vector vecLeft;
-	VectorMultiply( vecRight, -1.0f, vecLeft );
-
-	// FIXME: Don't know why, but this doesn't match at all with other result
-	// so we can't use this super-fast way.
-	/*
-	// Find the trace of the matrix:
-	float flTrace = vecForward.x + vecLeft.y + vecUp.z + 1.0f;
-	if ( flTrace > 1e-6 )
-	{
-		float flSqrtTrace = FastSqrt( flTrace );
-		float s = 0.5f / flSqrtTrace;
-		q.x = ( vecUp.y - vecLeft.z ) * s;
-		q.y = ( vecForward.z - vecUp.x ) * s;
-		q.z = ( vecLeft.x - vecForward.y ) * s;
-		q.w = 0.5f * flSqrtTrace;
-	}
-	else
-	{
-		if (( vecForward.x > vecLeft.y ) && ( vecForward.x > vecUp.z ) )
-		{
-			float flSqrtTrace = FastSqrt( 1.0f + vecForward.x - vecLeft.y - vecUp.z );
-			float s = 0.5f / flSqrtTrace;
-			q.x = 0.5f * flSqrtTrace;
-			q.y = ( vecForward.y + vecLeft.x ) * s;
-			q.z = ( vecUp.x + vecForward.z ) * s;
-			q.w = ( vecUp.y - vecLeft.z ) * s;
-		}
-		else if ( vecLeft.y > vecUp.z )
-		{
-			float flSqrtTrace = FastSqrt( 1.0f + vecLeft.y - vecForward.x - vecUp.z );
-			float s = 0.5f / flSqrtTrace;
-			q.x = ( vecForward.y + vecLeft.x ) * s;
-			q.y = 0.5f * flSqrtTrace;
-			q.z = ( vecUp.y + vecLeft.z ) * s;
-			q.w = ( vecForward.z - vecUp.x ) * s;
-		}
-		else
-		{
-			float flSqrtTrace = FastSqrt( 1.0 + vecUp.z - vecForward.x - vecLeft.y );
-			float s = 0.5f / flSqrtTrace;
-			q.x = ( vecUp.x + vecForward.z ) * s;
-			q.y = ( vecUp.y + vecLeft.z ) * s;
-			q.z = 0.5f * flSqrtTrace;
-			q.w = ( vecLeft.x - vecForward.y ) * s;
-		}
-	}
-	QuaternionNormalize( q );
-	*/
-
-	// Version 2: Go through angles
-
-	// Only columns 0..2 of mat are set; column 3 is left unwritten.
-	// NOTE(review): presumably MatrixAngles reads only the rotation part - confirm.
-	matrix3x4_t mat;
-	MatrixSetColumn( vecForward, 0, mat );
-	MatrixSetColumn( vecLeft, 1, mat );
-	MatrixSetColumn( vecUp, 2, mat );
-
-	QAngle angles;
-	MatrixAngles( mat, angles );
-
-//	Quaternion q2;
-	AngleQuaternion( angles, q );
-
-//	Assert( fabs(q.x - q2.x) < 1e-3 );
-//	Assert( fabs(q.y - q2.y) < 1e-3 );
-//	Assert( fabs(q.z - q2.z) < 1e-3 );
-//	Assert( fabs(q.w - q2.w) < 1e-3 );
-}
-
-// Converts the rotation in mat to a quaternion by way of euler angles
-// (MatrixAngles followed by AngleQuaternion).
-// FIXME: Optimize!
-void MatrixQuaternion( const matrix3x4_t &mat, Quaternion &q )
-{
-	QAngle angles;
-	MatrixAngles( mat, angles );
-	AngleQuaternion( angles, q );
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Converts a quaternion into radian euler angles
-// Input  : q - quaternion to convert
-// Output : angles - RadianEuler filled in by MatrixAngles from the matrix
-//			built out of q
-//-----------------------------------------------------------------------------
-void QuaternionAngles( const Quaternion &q, RadianEuler &angles )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( q.IsValid() );
-
-	// FIXME: doing it this way calculates too much data, needs to do an optimized version...
-	matrix3x4_t matrix;
-	QuaternionMatrix( q, matrix );
-	MatrixAngles( matrix, angles );
-
-	Assert( angles.IsValid() );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: A helper function to normalize p2.x->p1.x and p3.x->p4.x to 
-//  be the same length as p2.x->p3.x
-// Input  : p1, p2, p3, p4 - the four control points, in order
-// Output : p1n - p1 moved along the p2->p1 line so that p2.x - p1n.x equals p3.x - p2.x
-//			p4n - p4 moved along the p3->p4 line so that p4n.x - p3.x equals p3.x - p2.x
-//			Each output is a plain copy of its input when p3.x == p2.x, or when
-//			the corresponding outer point has the same x as its inner neighbor.
-//-----------------------------------------------------------------------------
-void Spline_Normalize( 
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	Vector& p1n,
-	Vector& p4n )
-{
-	const float flInnerSpan = p3.x - p2.x;
-
-	// Start from unmodified copies; they are only replaced when a rescale is possible.
-	p1n = p1;
-	p4n = p4;
-
-	if ( flInnerSpan == 0.0 )
-	{
-		return;
-	}
-
-	if ( p1.x != p2.x )
-	{
-		// Equivalent to p1n = p2 - (p2 - p1) * (dt / (p2.x - p1.x));
-		VectorLerp( p2, p1, flInnerSpan / (p2.x - p1.x), p1n );
-	}
-
-	if ( p4.x != p3.x )
-	{
-		// Equivalent to p4n = p3 + (p4 - p3) * (dt / (p4.x - p3.x));
-		VectorLerp( p3, p4, flInnerSpan / (p4.x - p3.x), p4n );
-	}
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Evaluates the Catmull-Rom spline through p2 and p3 at parameter t:
-//			0.5 * [ t^3 (-p1 + 3 p2 - 3 p3 + p4) + t^2 (2 p1 - 5 p2 + 4 p3 - p4)
-//			        + t (-p1 + p3) ] + p2
-// Input  : p1, p2, p3, p4 - control points; output must not alias any of them
-//			t - spline parameter (t = 0 yields p2)
-// Output : output - the evaluated point
-//-----------------------------------------------------------------------------
-
-void Catmull_Rom_Spline(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Assert( s_bMathlibInitialized );
-	// The common factor 0.5 is folded into each power of t up front:
-	// tSqr = 0.5 t^2, tSqrSqr = 0.5 t^3 (despite the name), t = 0.5 t.
-	float tSqr = t*t*0.5f;
-	float tSqrSqr = t*tSqr;
-	t *= 0.5f;
-
-	// output is cleared and accumulated into below, so it cannot be an input.
-	Assert( &output != &p1 );
-	Assert( &output != &p2 );
-	Assert( &output != &p3 );
-	Assert( &output != &p4 );
-
-	output.Init();
-
-	Vector a, b, c, d;
-
-	// matrix row 1
-	VectorScale( p1, -tSqrSqr, a );		// 0.5 t^3 * [ (-1*p1) + ( 3*p2) + (-3*p3) + p4 ]
-	VectorScale( p2, tSqrSqr*3, b );
-	VectorScale( p3, tSqrSqr*-3, c );
-	VectorScale( p4, tSqrSqr, d );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-	VectorAdd( c, output, output );
-	VectorAdd( d, output, output );
-
-	// matrix row 2
-	VectorScale( p1, tSqr*2,  a );		// 0.5 t^2 * [ ( 2*p1) + (-5*p2) + ( 4*p3) - p4 ]
-	VectorScale( p2, tSqr*-5, b );
-	VectorScale( p3, tSqr*4,  c );
-	VectorScale( p4, -tSqr,    d );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-	VectorAdd( c, output, output );
-	VectorAdd( d, output, output );
-
-	// matrix row 3
-	VectorScale( p1, -t, a );			// 0.5 t * [ (-1*p1) + p3 ]
-	VectorScale( p3, t,  b );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-
-	// matrix row 4
-	VectorAdd( p2, output, output );	// p2
-}
-
-// Evaluates the derivative with respect to t of the Catmull-Rom spline
-// computed by Catmull_Rom_Spline:
-//   0.5 * [ 3 t^2 (-p1 + 3 p2 - 3 p3 + p4) + 2 t (2 p1 - 5 p2 + 4 p3 - p4) + (-p1 + p3) ]
-// output must not alias any of the control points.
-void Catmull_Rom_Spline_Tangent(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Assert( s_bMathlibInitialized );
-	// Derivatives of t^3, t^2 and t, each pre-multiplied by the common 0.5.
-	float tOne = 3*t*t*0.5f;
-	float tTwo = 2*t*0.5f;
-	float tThree = 0.5;
-
-	// output is cleared and accumulated into below, so it cannot be an input.
-	Assert( &output != &p1 );
-	Assert( &output != &p2 );
-	Assert( &output != &p3 );
-	Assert( &output != &p4 );
-
-	output.Init();
-
-	Vector a, b, c, d;
-
-	// matrix row 1
-	VectorScale( p1, -tOne, a );		// 0.5 * 3 t^2 * [ (-1*p1) + ( 3*p2) + (-3*p3) + p4 ]
-	VectorScale( p2, tOne*3, b );
-	VectorScale( p3, tOne*-3, c );
-	VectorScale( p4, tOne, d );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-	VectorAdd( c, output, output );
-	VectorAdd( d, output, output );
-
-	// matrix row 2
-	VectorScale( p1, tTwo*2,  a );		// 0.5 * 2 t * [ ( 2*p1) + (-5*p2) + ( 4*p3) - p4 ]
-	VectorScale( p2, tTwo*-5, b );
-	VectorScale( p3, tTwo*4,  c );
-	VectorScale( p4, -tTwo,    d );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-	VectorAdd( c, output, output );
-	VectorAdd( d, output, output );
-
-	// matrix row 3
-	VectorScale( p1, -tThree, a );			// 0.5 * [ (-1*p1) + p3 ]
-	VectorScale( p3, tThree,  b );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-}
-
-// Area under the Catmull-Rom curve over [0..t]: the term-by-term
-// antiderivative of the cubic evaluated by Catmull_Rom_Spline().
-void Catmull_Rom_Spline_Integral( 
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	// One term per power of t; all are built before output is written.
-	const Vector vecLinear    = p2*t;
-	const Vector vecQuadratic = 0.25f*(p1 - p3)*t*t;
-	const Vector vecCubic     = (1.0f/6.0f)*(2.0f*p1 - 5.0f*p2 + 4.0f*p3 - p4)*t*t*t;
-	const Vector vecQuartic   = 0.125f*(p1 - 3.0f*p2 + 3.0f*p3 - p4)*t*t*t*t;
-
-	output = vecLinear - vecQuadratic + vecCubic - vecQuartic;
-}
-
-
-// Area under the Catmull-Rom curve over the whole segment [0..1].
-// The weights are the t = 1 case of the [0..t] integral collected per control
-// point: (-0.25, 3.25, 3.25, -0.25) / 6.
-void Catmull_Rom_Spline_Integral( 
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	Vector& output )
-{
-	const Vector vecWeighted = -0.25f * p1 + 3.25f * p2 + 3.25f * p3 - 0.25f * p4;
-	output = vecWeighted * (1.0f / 6.0f);
-}
-
-
-// Catmull-Rom evaluation in which the outer control points are first moved
-// along their original directions so that |p2->p1| and |p3->p4| both equal
-// |p2->p3|.
-void Catmull_Rom_Spline_Normalize(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	// Length of the segment actually being interpolated
-	const float flSegmentLength = p3.DistTo(p2);
-
-	// Replacement for p1: same direction from p2, segment length away
-	Vector vecLead;
-	VectorSubtract( p1, p2, vecLead );
-	VectorNormalize( vecLead );
-	VectorMA( p2, flSegmentLength, vecLead, vecLead );
-
-	// Replacement for p4: same direction from p3, segment length away
-	Vector vecTrail;
-	VectorSubtract( p4, p3, vecTrail );
-	VectorNormalize( vecTrail );
-	VectorMA( p3, flSegmentLength, vecTrail, vecTrail );
-	
-	Catmull_Rom_Spline( vecLead, p2, p3, vecTrail, t, output );
-}
-
-
-// Integral over [0..t] of the Catmull-Rom curve after the outer control
-// points have been moved so that |p2->p1| and |p3->p4| both equal |p2->p3|.
-void Catmull_Rom_Spline_Integral_Normalize(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	// Length of the segment actually being interpolated
-	const float flSegmentLength = p3.DistTo(p2);
-
-	// Replacement for p1: same direction from p2, segment length away
-	Vector vecLead;
-	VectorSubtract( p1, p2, vecLead );
-	VectorNormalize( vecLead );
-	VectorMA( p2, flSegmentLength, vecLead, vecLead );
-
-	// Replacement for p4: same direction from p3, segment length away
-	Vector vecTrail;
-	VectorSubtract( p4, p3, vecTrail );
-	VectorNormalize( vecTrail );
-	VectorMA( p3, flSegmentLength, vecTrail, vecTrail );
-	
-	Catmull_Rom_Spline_Integral( vecLead, p2, p3, vecTrail, t, output );
-}
-
-
-// Catmull-Rom evaluation using the adjusted outer control points produced by
-// Spline_Normalize() in place of p1 and p4.
-void Catmull_Rom_Spline_NormalizeX(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Vector vecAdjustedP1;
-	Vector vecAdjustedP4;
-
-	Spline_Normalize( p1, p2, p3, p4, vecAdjustedP1, vecAdjustedP4 );
-	Catmull_Rom_Spline( vecAdjustedP1, p2, p3, vecAdjustedP4, t, output );
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Basic cubic Hermite spline. t = 0 returns p1 and t = 1 returns p2;
-//			d1 is the slope of the curve as it leaves p1 and d2 the slope as
-//			it arrives at p2.
-// Input  : p1, p2 - end points
-//			d1, d2 - entry and exit slopes
-//			t - curve parameter
-//			output - receives the interpolated point; must not alias an input
-//-----------------------------------------------------------------------------
-
-void Hermite_Spline(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &d1,
-	const Vector &d2,
-	float t, 
-	Vector& output )
-{
-	Assert( s_bMathlibInitialized );
-
-	Assert( &output != &p1 );
-	Assert( &output != &p2 );
-	Assert( &output != &d1 );
-	Assert( &output != &d2 );
-
-	const float flT2 = t*t;
-	const float flT3 = t*flT2;
-
-	// Hermite basis weights for the two points and the two slopes
-	const float flWeightP1 = 2.0f*flT3-3.0f*flT2+1.0f;
-	const float flWeightP2 = 1.0f - flWeightP1; // equals -2*t^3 + 3*t^2
-	const float flWeightD1 = flT3-2*flT2+t;
-	const float flWeightD2 = flT3-flT2;
-
-	// output = p1*w1 + p2*w2 + d1*w3 + d2*w4, accumulated in that order
-	VectorScale( p1, flWeightP1, output );
-	VectorMA( output, flWeightP2, p2, output );
-	VectorMA( output, flWeightD1, d1, output );
-	VectorMA( output, flWeightD2, d2, output );
-}
-
-// Scalar form of the basic Hermite spline: returns p1 at t = 0 and p2 at
-// t = 1, with d1 and d2 as the entry and exit slopes.
-float Hermite_Spline(
-	float p1,
-	float p2,
-	float d1,
-	float d2,
-	float t )
-{
-	Assert( s_bMathlibInitialized );
-
-	const float flT2 = t*t;
-	const float flT3 = t*flT2;
-
-	// Hermite basis weights for the two values and the two slopes
-	const float flWeightP1 = 2.0f*flT3-3.0f*flT2+1.0f;
-	const float flWeightP2 = 1.0f - flWeightP1; // equals -2*t^3 + 3*t^2
-	const float flWeightD1 = flT3-2*flT2+t;
-	const float flWeightD2 = flT3-flT2;
-
-	float flResult = p1 * flWeightP1;
-	flResult += p2 * flWeightP2;
-	flResult += d1 * flWeightD1;
-	flResult += d2 * flWeightD2;
-
-	return flResult;
-}
-
-
-// Fills basis[0..3] with the Hermite weights at parameter t, in the order
-// start point, end point, start slope, end slope.
-void Hermite_SplineBasis( float t, float basis[4] )
-{
-	const float flT2 = t*t;
-	const float flT3 = t*flT2;
-
-	const float flStartWeight = 2.0f*flT3-3.0f*flT2+1.0f;
-
-	basis[0] = flStartWeight;
-	basis[1] = 1.0f - flStartWeight; // equals -2*t^3 + 3*t^2
-	basis[2] = flT3-2*flT2+t;
-	basis[3] = flT3-flT2;
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Three data point Hermite spline.
-//			t = 0 returns p1, t = 1 returns p2.
-//			The entry slope is the p0->p1 segment and the exit slope is the
-//			p1->p2 segment; a reasonable C1 method when there is no "p3" data
-//			yet.
-// Input  : p0, p1, p2 - consecutive data points
-//			t - curve parameter
-//			output - receives the interpolated point
-//-----------------------------------------------------------------------------
-
-// BUG: the VectorSubtract()'s calls go away if the global optimizer is enabled,
-// so it is switched off around this function.
-#pragma optimize( "g", off )
-
-void Hermite_Spline( const Vector &p0, const Vector &p1, const Vector &p2, float t, Vector& output )
-{
-	Vector vecEntrySlope;
-	Vector vecExitSlope;
-
-	VectorSubtract( p1, p0, vecEntrySlope );
-	VectorSubtract( p2, p1, vecExitSlope );
-
-	Hermite_Spline( p1, p2, vecEntrySlope, vecExitSlope, t, output );
-}
-
-#pragma optimize( "", on )
-
-// Scalar three data point Hermite spline: interpolates from p1 (t = 0) to p2
-// (t = 1) using the p0->p1 and p1->p2 differences as entry and exit slopes.
-float Hermite_Spline( float p0, float p1, float p2,	float t )
-{
-	const float flEntrySlope = p1 - p0;
-	const float flExitSlope = p2 - p1;
-
-	return Hermite_Spline( p1, p2, flEntrySlope, flExitSlope, t );
-}
-
-
-// Cheap, hacked quaternion version of the three data point Hermite spline:
-// each of x, y, z, w is interpolated independently with the scalar spline and
-// the result is renormalized.
-void Hermite_Spline( const Quaternion &q0, const Quaternion &q1, const Quaternion &q2, float t, Quaternion &output )
-{
-	// q0 and q1 are passed through QuaternionAlign() against q2 before the
-	// component-wise blend (presumably so all three share a sign convention -
-	// confirm against QuaternionAlign).
-	Quaternion alignedQ0;
-	Quaternion alignedQ1;
-
-	QuaternionAlign( q2, q0, alignedQ0 );
-	QuaternionAlign( q2, q1, alignedQ1 );
-
-	output.x = Hermite_Spline( alignedQ0.x, alignedQ1.x, q2.x, t );
-	output.y = Hermite_Spline( alignedQ0.y, alignedQ1.y, q2.y, t );
-	output.z = Hermite_Spline( alignedQ0.z, alignedQ1.z, q2.z, t );
-	output.w = Hermite_Spline( alignedQ0.w, alignedQ1.w, q2.w, t );
-
-	QuaternionNormalize( output );
-}
-
-// Kochanek-Bartels (TCB) spline.
-// See http://en.wikipedia.org/wiki/Kochanek-Bartels_curves
-// 
-// Tension:  -1 = Round -> 1 = Tight
-// Bias:     -1 = Pre-shoot (bias left) -> 1 = Post-shoot (bias right)
-// Continuity: -1 = Box corners -> 1 = Inverted corners
-//
-// If T=B=C=0 the coefficients reduce to the Catmull-Rom matrix.
-// If T=1 & B=C=0 they reduce to the ones used by Cubic_Spline.
-// If T=B=0 & C=-1 the result is plain linear interpolation from p2 to p3.
-// 
-// The "Splines.bas.txt" attachment posted to news.povray.org
-// (povray.binaries.tutorials) has example code and descriptions of various
-// spline types.
-// 
-// The curve runs from p2 (t = 0) to p3 (t = 1); output must not alias an input.
-void Kochanek_Bartels_Spline(
-	float tension, 
-	float bias, 
-	float continuity,
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Assert( s_bMathlibInitialized );
-
-	// The four tension/continuity/bias products that appear in the basis matrix
-	const float ffa = ( 1.0f - tension ) * ( 1.0f + continuity ) * ( 1.0f + bias );
-	const float ffb = ( 1.0f - tension ) * ( 1.0f - continuity ) * ( 1.0f - bias );
-	const float ffc = ( 1.0f - tension ) * ( 1.0f - continuity ) * ( 1.0f + bias );
-	const float ffd = ( 1.0f - tension ) * ( 1.0f + continuity ) * ( 1.0f - bias );
-
-	// The basis matrix has a common factor of 0.5; fold it into the powers of t.
-	const float flHalfT2 = t*t*0.5f;
-	const float flHalfT3 = t*flHalfT2;
-	const float flHalfT  = t * 0.5f;
-
-	Assert( &output != &p1 );
-	Assert( &output != &p2 );
-	Assert( &output != &p3 );
-	Assert( &output != &p4 );
-
-	Vector term1, term2, term3, term4;
-
-	output.Init();
-
-	// cubic row
-	VectorScale( p1, flHalfT3 * -ffa, term1 );		
-	VectorScale( p2, flHalfT3 * ( 4.0f + ffa - ffb - ffc ), term2 );
-	VectorScale( p3, flHalfT3 * ( -4.0f + ffb + ffc - ffd ), term3 );
-	VectorScale( p4, flHalfT3 * ffd, term4 );
-
-	VectorAdd( term1, output, output );
-	VectorAdd( term2, output, output );
-	VectorAdd( term3, output, output );
-	VectorAdd( term4, output, output );
-
-	// quadratic row
-	VectorScale( p1, flHalfT2* 2 * ffa,  term1 );		
-	VectorScale( p2, flHalfT2 * ( -6 - 2 * ffa + 2 * ffb + ffc ), term2 );
-	VectorScale( p3, flHalfT2 * ( 6 - 2 * ffb - ffc + ffd ),  term3 );
-	VectorScale( p4, flHalfT2 * -ffd,    term4 );
-
-	VectorAdd( term1, output, output );
-	VectorAdd( term2, output, output );
-	VectorAdd( term3, output, output );
-	VectorAdd( term4, output, output );
-
-	// linear row; p4 has no linear contribution
-	VectorScale( p1, flHalfT * -ffa,  term1 );		
-	VectorScale( p2, flHalfT * ( ffa - ffb ), term2 );
-	VectorScale( p3, flHalfT * ffb,  term3 );
-
-	VectorAdd( term1, output, output );
-	VectorAdd( term2, output, output );
-	VectorAdd( term3, output, output );
-
-	// constant row: only p2 contributes, with weight 1, so add it directly
-	VectorAdd( p2, output, output );
-}
-
-// Kochanek-Bartels evaluation using the adjusted outer control points
-// produced by Spline_Normalize() in place of p1 and p4.
-void Kochanek_Bartels_Spline_NormalizeX(
-	float tension, 
-	float bias, 
-	float continuity,
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Vector vecAdjustedP1;
-	Vector vecAdjustedP4;
-
-	Spline_Normalize( p1, p2, p3, p4, vecAdjustedP1, vecAdjustedP4 );
-	Kochanek_Bartels_Spline( tension, bias, continuity, vecAdjustedP1, p2, p3, vecAdjustedP4, t, output );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Cubic ease between p2 (t = 0) and p3 (t = 1):
-//			output = p2 + (2*t^3 - 3*t^2) * (p2 - p3).
-//			p1 and p4 are accepted so the signature matches the other spline
-//			evaluators, but they do not influence the result.
-// Input  : p1, p2, p3, p4 - control points
-//			t - curve parameter
-//			output - receives the interpolated point; must not alias an input
-//-----------------------------------------------------------------------------
-void Cubic_Spline(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Assert( s_bMathlibInitialized );
-
-	float tSqr = t*t;
-	float tSqrSqr = t*tSqr;	// despite the name this is t cubed
-
-	Assert( &output != &p1 );
-	Assert( &output != &p2 );
-	Assert( &output != &p3 );
-	Assert( &output != &p4 );
-
-	output.Init();
-
-	// Only p2 and p3 contribute, so only two temporaries are needed.
-	Vector b, c;
-
-	// matrix row 1: t^3 * [ 2*p2 - 2*p3 ]
-	VectorScale( p2, tSqrSqr * 2, b );
-	VectorScale( p3, tSqrSqr * -2, c );
-
-	VectorAdd( b, output, output );
-	VectorAdd( c, output, output );
-
-	// matrix row 2: t^2 * [ -3*p2 + 3*p3 ]
-	VectorScale( p2, tSqr * -3, b );
-	VectorScale( p3, tSqr * 3,  c );
-
-	VectorAdd( b, output, output );
-	VectorAdd( c, output, output );
-
-	// matrix row 3: no linear term
-
-	// matrix row 4: constant term is p2
-	VectorAdd( p2, output, output );
-}
-
-// Cubic_Spline evaluation using the adjusted outer control points produced by
-// Spline_Normalize() in place of p1 and p4.
-void Cubic_Spline_NormalizeX(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Vector vecAdjustedP1;
-	Vector vecAdjustedP4;
-
-	Spline_Normalize( p1, p2, p3, p4, vecAdjustedP1, vecAdjustedP4 );
-	Cubic_Spline( vecAdjustedP1, p2, p3, vecAdjustedP4, t, output );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Evaluates a cubic B-spline segment:
-//			1/6 * [ t^3 * ( -p1 + 3*p2 - 3*p3 + p4 )
-//			      + t^2 * ( 3*p1 - 6*p2 + 3*p3 )
-//			      + t   * ( -3*p1 + 3*p3 )
-//			      +       ( p1 + 4*p2 + p3 ) ]
-// Input  : p1, p2, p3, p4 - control points
-//			t - curve parameter
-//			output - receives the point on the curve; must not alias an input
-//-----------------------------------------------------------------------------
-void BSpline(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Assert( s_bMathlibInitialized );
-
-	const float flSixth = 1.0f / 6.0f;
-
-	// Fold the common 1/6 factor into the powers of t.
-	const float flSixthT2 = t * t * flSixth;
-	const float flSixthT3 = t*flSixthT2;
-	const float flSixthT  = t * flSixth;
-
-	Assert( &output != &p1 );
-	Assert( &output != &p2 );
-	Assert( &output != &p3 );
-	Assert( &output != &p4 );
-
-	Vector term1, term2, term3, term4;
-
-	output.Init();
-
-	// cubic row: [ -p1 + 3*p2 - 3*p3 + p4 ]
-	VectorScale( p1, -flSixthT3, term1 );		
-	VectorScale( p2, flSixthT3 * 3.0f, term2 );
-	VectorScale( p3, flSixthT3 * -3.0f, term3 );
-	VectorScale( p4, flSixthT3, term4 );
-
-	VectorAdd( term1, output, output );
-	VectorAdd( term2, output, output );
-	VectorAdd( term3, output, output );
-	VectorAdd( term4, output, output );
-
-	// quadratic row: [ 3*p1 - 6*p2 + 3*p3 ]
-	VectorScale( p1, flSixthT2 * 3.0f,  term1 );		
-	VectorScale( p2, flSixthT2 * -6.0f, term2 );
-	VectorScale( p3, flSixthT2 * 3.0f,  term3 );
-
-	VectorAdd( term1, output, output );
-	VectorAdd( term2, output, output );
-	VectorAdd( term3, output, output );
-
-	// linear row: [ -3*p1 + 3*p3 ]
-	VectorScale( p1, flSixthT * -3.0f,  term1 );		
-	VectorScale( p3, flSixthT * 3.0f,  term3 );
-
-	VectorAdd( term1, output, output );
-	VectorAdd( term3, output, output );
-
-	// constant row: [ p1 + 4*p2 + p3 ], still scaled by the common 1/6
-	VectorScale( p1, flSixth, term1 );
-	VectorScale( p2, 4.0f * flSixth, term2 );
-	VectorScale( p3, flSixth, term3 );
-
-	VectorAdd( term1, output, output );
-	VectorAdd( term2, output, output );
-	VectorAdd( term3, output, output );
-}
-
-// BSpline evaluation using the adjusted outer control points produced by
-// Spline_Normalize() in place of p1 and p4.
-void BSpline_NormalizeX(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Vector vecAdjustedP1;
-	Vector vecAdjustedP4;
-
-	Spline_Normalize( p1, p2, p3, p4, vecAdjustedP1, vecAdjustedP4 );
-	BSpline( vecAdjustedP1, p2, p3, vecAdjustedP4, t, output );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Evaluates a parabolic (quadratic) blend of p1, p2 and p3:
-//			0.5 * [ t^2 * ( p1 - 2*p2 + p3 ) + t * ( -2*p1 + 2*p2 ) + ( p1 + p2 ) ]
-//			t = 0 yields the midpoint of p1 and p2, t = 1 the midpoint of p2
-//			and p3. p4 is accepted so the signature matches the other spline
-//			evaluators, but it does not influence the result.
-// Input  : p1, p2, p3, p4 - control points
-//			t - curve parameter
-//			output - receives the point on the curve; must not alias an input
-//-----------------------------------------------------------------------------
-void Parabolic_Spline(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Assert( s_bMathlibInitialized );
-
-	// Fold the common 0.5 factor into the powers of t.
-	float tSqr = t*t*0.5f;
-	t *= 0.5f;
-
-	Assert( &output != &p1 );
-	Assert( &output != &p2 );
-	Assert( &output != &p3 );
-	Assert( &output != &p4 );
-
-	output.Init();
-
-	// Three points contribute at most, so only three temporaries are needed.
-	Vector a, b, c;
-
-	// matrix row 1
-	// no influence from t cubed
-
-	// matrix row 2: 0.5 t^2 * [ p1 - 2*p2 + p3 ]
-	VectorScale( p1, tSqr,  a );		
-	VectorScale( p2, tSqr * -2.0f, b );
-	VectorScale( p3, tSqr,  c );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-	VectorAdd( c, output, output );
-
-	// matrix row 3: 0.5 t * [ -2*p1 + 2*p2 ]
-	VectorScale( p1, t * -2.0f,  a );		
-	VectorScale( p2, t * 2.0f,  b );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-
-	// matrix row 4: 0.5 * [ p1 + p2 ]
-	VectorScale( p1, 0.5f,  a );		
-	VectorScale( p2, 0.5f,  b );
-
-	VectorAdd( a, output, output );
-	VectorAdd( b, output, output );
-}
-
-// Parabolic_Spline evaluation using the adjusted outer control points
-// produced by Spline_Normalize() in place of p1 and p4.
-void Parabolic_Spline_NormalizeX(
-	const Vector &p1,
-	const Vector &p2,
-	const Vector &p3,
-	const Vector &p4,
-	float t, 
-	Vector& output )
-{
-	Vector vecAdjustedP1;
-	Vector vecAdjustedP4;
-
-	Spline_Normalize( p1, p2, p3, p4, vecAdjustedP1, vecAdjustedP4 );
-	Parabolic_Spline( vecAdjustedP1, p2, p3, vecAdjustedP4, t, output );
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Soft-limits (flValue + flBase) to the range [flMin, flMax].
-//			The sum is mapped onto -1..1 across the range. Within 75% of the
-//			way from the middle to either end it passes through unchanged;
-//			between 75% and 200% it is compressed with a Hermite curve into
-//			75%..100%; beyond 200% it is clamped to the end of the range.
-//			flBase (clamped to the range) is subtracted again before returning.
-//-----------------------------------------------------------------------------
-
-float RangeCompressor( float flValue, float flMin, float flMax, float flBase )
-{
-	// clamp base
-	if (flBase < flMin)
-		flBase = flMin;
-	if (flBase > flMax)
-		flBase = flMax;
-
-	flValue += flBase;
-
-	// position within the range as 0..1, then as -1..1
-	float flFraction = (flValue - flMin) / (flMax - flMin);
-	float flSigned = flFraction * 2 - 1;
-
-	const float flMagnitude = fabs(flSigned);
-	if (flMagnitude > 0.75)
-	{
-		// t runs from 0 at 75% to 1 at 200%
-		const float t = (flMagnitude - 0.75) / (1.25);
-
-		// compressed magnitude; the sign is restored afterwards
-		const float flCompressed = (t < 1.0) ? Hermite_Spline( 0.75, 1, 0.75, 0, t ) : 1.0f;
-
-		flSigned = (flSigned > 0) ? flCompressed : -flCompressed;
-	}
-
-	// back from -1..1 to 0..1, then to the caller's units
-	flFraction = (flSigned + 1 ) / 2.0;
-	flValue = flMin * (1 - flFraction) + flMax * flFraction;
-
-	return flValue - flBase;
-}
-
-
-//#pragma optimize( "", on )
-
-//-----------------------------------------------------------------------------
-// Transforms an AABB into another space and returns the axis-aligned box that
-// encloses the result, so the box will generally grow.
-//-----------------------------------------------------------------------------
-void TransformAABB( const matrix3x4_t& transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
-{
-	// Work with the box as center + half-extents
-	Vector vecCenterIn;
-	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
-	vecCenterIn *= 0.5f;
-
-	Vector vecExtentsIn;
-	VectorSubtract( vecMaxsIn, vecCenterIn, vecExtentsIn );
-
-	// The center goes through the full transform
-	Vector vecCenterOut;
-	VectorTransform( vecCenterIn, transform, vecCenterOut );
-
-	// Each output half-extent combines the input half-extents with one matrix row
-	Vector vecExtentsOut;
-	for ( int iAxis = 0; iAxis < 3; iAxis++ )
-	{
-		vecExtentsOut[iAxis] = DotProductAbs( vecExtentsIn, transform[iAxis] );
-	}
-
-	VectorSubtract( vecCenterOut, vecExtentsOut, vecMinsOut );
-	VectorAdd( vecCenterOut, vecExtentsOut, vecMaxsOut );
-}
-
-
-//-----------------------------------------------------------------------------
-// Like TransformAABB, but applies the inverse of 'transform': the center goes
-// through VectorITransform and the extents use the matrix columns.
-//-----------------------------------------------------------------------------
-void ITransformAABB( const matrix3x4_t& transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
-{
-	// Work with the box as center + half-extents
-	Vector vecCenterIn;
-	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
-	vecCenterIn *= 0.5f;
-
-	Vector vecExtentsIn;
-	VectorSubtract( vecMaxsIn, vecCenterIn, vecExtentsIn );
-
-	Vector vecCenterOut;
-	VectorITransform( vecCenterIn, transform, vecCenterOut );
-
-	// Each output half-extent sums the absolute contributions down one column
-	Vector vecExtentsOut;
-	for ( int iAxis = 0; iAxis < 3; iAxis++ )
-	{
-		vecExtentsOut[iAxis] =	FloatMakePositive( vecExtentsIn.x * transform[0][iAxis] ) + 
-								FloatMakePositive( vecExtentsIn.y * transform[1][iAxis] ) + 
-								FloatMakePositive( vecExtentsIn.z * transform[2][iAxis] );
-	}
-
-	VectorSubtract( vecCenterOut, vecExtentsOut, vecMinsOut );
-	VectorAdd( vecCenterOut, vecExtentsOut, vecMaxsOut );
-}
-
-
-//-----------------------------------------------------------------------------
-// Rotates an AABB into another space and returns the axis-aligned box that
-// encloses the result, so the box will generally grow.
-// (same as TransformAABB, but the center goes through VectorRotate, so the
-// translation is not taken into account)
-//-----------------------------------------------------------------------------
-void RotateAABB( const matrix3x4_t &transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
-{
-	// Work with the box as center + half-extents
-	Vector vecCenterIn;
-	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
-	vecCenterIn *= 0.5f;
-
-	Vector vecExtentsIn;
-	VectorSubtract( vecMaxsIn, vecCenterIn, vecExtentsIn );
-
-	Vector vecCenterOut;
-	VectorRotate( vecCenterIn, transform, vecCenterOut );
-
-	// Each output half-extent combines the input half-extents with one matrix row
-	Vector vecExtentsOut;
-	for ( int iAxis = 0; iAxis < 3; iAxis++ )
-	{
-		vecExtentsOut[iAxis] = DotProductAbs( vecExtentsIn, transform[iAxis] );
-	}
-
-	VectorSubtract( vecCenterOut, vecExtentsOut, vecMinsOut );
-	VectorAdd( vecCenterOut, vecExtentsOut, vecMaxsOut );
-}
-
-
-//-----------------------------------------------------------------------------
-// Like RotateAABB, but applies the inverse rotation of 'transform': the
-// center goes through VectorIRotate and the extents use the matrix columns.
-//-----------------------------------------------------------------------------
-void IRotateAABB( const matrix3x4_t &transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
-{
-	// Work with the box as center + half-extents
-	Vector vecCenterIn;
-	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
-	vecCenterIn *= 0.5f;
-
-	Vector vecExtentsIn;
-	VectorSubtract( vecMaxsIn, vecCenterIn, vecExtentsIn );
-
-	Vector vecCenterOut;
-	VectorIRotate( vecCenterIn, transform, vecCenterOut );
-
-	// Each output half-extent sums the absolute contributions down one column
-	Vector vecExtentsOut;
-	for ( int iAxis = 0; iAxis < 3; iAxis++ )
-	{
-		vecExtentsOut[iAxis] =	FloatMakePositive( vecExtentsIn.x * transform[0][iAxis] ) + 
-								FloatMakePositive( vecExtentsIn.y * transform[1][iAxis] ) + 
-								FloatMakePositive( vecExtentsIn.z * transform[2][iAxis] );
-	}
-
-	VectorSubtract( vecCenterOut, vecExtentsOut, vecMinsOut );
-	VectorAdd( vecCenterOut, vecExtentsOut, vecMaxsOut );
-}
-
-
-// Returns the squared distance from 'point' to the box [mins, maxs];
-// 0 when the point lies inside or on the box.
-float CalcSqrDistanceToAABB( const Vector &mins, const Vector &maxs, const Vector &point )
-{
-	float flDistSqr = 0.0f;
-
-	// Only axes on which the point lies outside the slab contribute.
-	for ( int iAxis = 0; iAxis < 3; iAxis++ )
-	{
-		if ( point[iAxis] < mins[iAxis] )
-		{
-			const float flBelow = (mins[iAxis] - point[iAxis]);
-			flDistSqr += flBelow * flBelow;
-		}
-		else if ( point[iAxis] > maxs[iAxis] )
-		{
-			const float flAbove = (point[iAxis] - maxs[iAxis]);
-			flDistSqr += flAbove * flAbove;
-		}
-	}
-
-	return flDistSqr;
-}
-
-
-// Writes to closestOut the point of the box [mins, maxs] nearest to 'point',
-// found by clamping each coordinate to the box.
-void CalcClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut )
-{
-	for ( int iAxis = 0; iAxis < 3; iAxis++ )
-	{
-		closestOut[iAxis] = clamp( point[iAxis], mins[iAxis], maxs[iAxis] );
-	}
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Finds the point of the box [mins, maxs] nearest to 'point' together
-//			with the squared distance from 'point' to it.
-// Input  : mins, maxs - box corners
-//			point - query point
-//			closestOut - receives the nearest point on or in the box
-//			distSqrOut - receives the squared distance (0 if point is inside)
-//-----------------------------------------------------------------------------
-void CalcSqrDistAndClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut, float &distSqrOut )
-{
-	distSqrOut = 0.0f;
-	for ( int i = 0; i < 3; i++ )
-	{
-		// Read the coordinate first so the result is right even if closestOut
-		// is the same object as point.
-		const float flCoord = point[i];
-
-		if ( flCoord < mins[i] )
-		{
-			// The distance is measured from the query point to the face it was
-			// clamped to. Measuring from closestOut, which was just set to the
-			// face value, would always give zero.
-			const float flDelta = mins[i] - flCoord;
-			closestOut[i] = mins[i];
-			distSqrOut += flDelta * flDelta;
-		}
-		else if ( flCoord > maxs[i] )
-		{
-			const float flDelta = flCoord - maxs[i];
-			closestOut[i] = maxs[i];
-			distSqrOut += flDelta * flDelta;
-		}
-		else
-		{
-			closestOut[i] = flCoord;
-		}
-	}
-
-}
-
-// Returns the parameter t of the point A + (B - A)*t on the line through
-// vLineA and vLineB that is nearest to P, and writes B - A to vDir.
-// Returns 0 when A and B are too close together to define a direction.
-float CalcClosestPointToLineT( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vDir )
-{
-	Assert( s_bMathlibInitialized );
-	VectorSubtract( vLineB, vLineA, vDir );
-
-	// The nearest point satisfies D dot [P - (A + D*t)] = 0,
-	// which gives t = ( DP - DA ) / DD
-	const float flDirLengthSqr = vDir.Dot( vDir );
-	if( flDirLengthSqr < 0.00001f )
-	{
-		// degenerate line
-		return 0;
-	}
-
-	return (vDir.Dot( P ) - vDir.Dot( vLineA )) / flDirLengthSqr;
-}
-
-// Writes to vClosest the point on the infinite line through vLineA and vLineB
-// nearest to P. If outT is non-NULL it receives the line parameter
-// (0 at vLineA, 1 at vLineB).
-void CalcClosestPointOnLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector vecLineDir;
-	const float flParam = CalcClosestPointToLineT( P, vLineA, vLineB, vecLineDir );
-	if ( outT )
-	{
-		*outT = flParam;
-	}
-
-	// vClosest = vLineA + vecLineDir * flParam
-	vClosest.MulAdd( vLineA, vecLineDir, flParam );
-}
-
-
-// Distance from P to the infinite line through vLineA and vLineB.
-// outT is forwarded to CalcClosestPointOnLine() and may be NULL.
-float CalcDistanceToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector vecNearest;
-	CalcClosestPointOnLine( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistTo(vecNearest);
-}
-
-// Squared distance from P to the infinite line through vLineA and vLineB.
-// outT is forwarded to CalcClosestPointOnLine() and may be NULL.
-float CalcDistanceSqrToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector vecNearest;
-	CalcClosestPointOnLine( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistToSqr(vecNearest);
-}
-
-// Writes to vClosest the point on the segment vLineA..vLineB nearest to P.
-// The line parameter is clamped to [0, 1] so the result never leaves the
-// segment; if outT is non-NULL it receives the clamped parameter.
-void CalcClosestPointOnLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *outT )
-{
-	Vector vecSegmentDir;
-	const float flLineParam = CalcClosestPointToLineT( P, vLineA, vLineB, vecSegmentDir );
-	const float flSegmentParam = clamp( flLineParam, 0.f, 1.f );
-
-	if ( outT ) 
-	{
-		*outT = flSegmentParam;
-	}
-
-	// vClosest = vLineA + vecSegmentDir * flSegmentParam
-	vClosest.MulAdd( vLineA, vecSegmentDir, flSegmentParam );
-}
-
-
-// Distance from P to the segment vLineA..vLineB.
-// outT is forwarded to CalcClosestPointOnLineSegment() and may be NULL.
-float CalcDistanceToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector vecNearest;
-	CalcClosestPointOnLineSegment( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistTo( vecNearest );
-}
-
-// Squared distance from P to the segment vLineA..vLineB.
-// outT is forwarded to CalcClosestPointOnLineSegment() and may be NULL.
-float CalcDistanceSqrToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector vecNearest;
-	CalcClosestPointOnLineSegment( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistToSqr(vecNearest);
-}
-
-// 2D version: returns the parameter t of the point A + (B - A)*t on the line
-// through vLineA and vLineB that is nearest to P, and writes B - A to vDir.
-// Returns 0 when A and B are too close together to define a direction.
-float CalcClosestPointToLineT2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vDir )
-{
-	Assert( s_bMathlibInitialized );
-	Vector2DSubtract( vLineB, vLineA, vDir );
-
-	// The nearest point satisfies D dot [P - (A + D*t)] = 0,
-	// which gives t = (DP - DA) / DD
-	const float flDirLengthSqr = vDir.Dot( vDir );
-	if( flDirLengthSqr < 0.00001f )
-	{
-		// degenerate line
-		return 0;
-	}
-
-	return (vDir.Dot( P ) - vDir.Dot( vLineA )) / flDirLengthSqr;
-}
-
-// 2D version: writes to vClosest the point on the infinite line through
-// vLineA and vLineB nearest to P. If outT is non-NULL it receives the line
-// parameter (0 at vLineA, 1 at vLineB).
-void CalcClosestPointOnLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vClosest, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector2D vecLineDir;
-	const float flParam = CalcClosestPointToLineT2D( P, vLineA, vLineB, vecLineDir );
-	if ( outT )
-	{
-		*outT = flParam;
-	}
-
-	// vClosest = vLineA + vecLineDir * flParam
-	vClosest.MulAdd( vLineA, vecLineDir, flParam );
-}
-
-// 2D version: distance from P to the infinite line through vLineA and vLineB.
-// outT is forwarded to CalcClosestPointOnLine2D() and may be NULL.
-float CalcDistanceToLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector2D vecNearest;
-	CalcClosestPointOnLine2D( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistTo( vecNearest );
-}
-
-// 2D version: squared distance from P to the infinite line through vLineA
-// and vLineB. outT is forwarded to CalcClosestPointOnLine2D() and may be NULL.
-float CalcDistanceSqrToLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector2D vecNearest;
-	CalcClosestPointOnLine2D( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistToSqr(vecNearest);
-}
-
-// 2D version: writes to vClosest the point on the segment vLineA..vLineB
-// nearest to P. The line parameter is clamped to [0, 1] so the result never
-// leaves the segment; if outT is non-NULL it receives the clamped parameter.
-void CalcClosestPointOnLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vClosest, float *outT )
-{
-	Vector2D vecSegmentDir;
-	const float flLineParam = CalcClosestPointToLineT2D( P, vLineA, vLineB, vecSegmentDir );
-	const float flSegmentParam = clamp( flLineParam, 0.f, 1.f );
-
-	if ( outT )
-	{
-		*outT = flSegmentParam;
-	}
-
-	// vClosest = vLineA + vecSegmentDir * flSegmentParam
-	vClosest.MulAdd( vLineA, vecSegmentDir, flSegmentParam );
-}
-
-// 2D version: distance from P to the segment vLineA..vLineB.
-// outT is forwarded to CalcClosestPointOnLineSegment2D() and may be NULL.
-float CalcDistanceToLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector2D vecNearest;
-	CalcClosestPointOnLineSegment2D( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistTo( vecNearest );
-}
-
-// 2D version: squared distance from P to the segment vLineA..vLineB.
-// outT is forwarded to CalcClosestPointOnLineSegment2D() and may be NULL.
-float CalcDistanceSqrToLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
-{
-	Assert( s_bMathlibInitialized );
-
-	Vector2D vecNearest;
-	CalcClosestPointOnLineSegment2D( P, vLineA, vLineB, vecNearest, outT );
-
-	return P.DistToSqr( vecNearest );
-}
-
-// Do we have another epsilon we could use
-#define LINE_EPS ( 0.000001f )
-
-//-----------------------------------------------------------------------------
-// Purpose: Given lines p1->p2 and p3->p4, computes the connecting segment
-//			(s1->s2) between them, with s1 on the first line and s2 on the
-//			second, and returns the multipliers along each line for those
-//			points.
-// Input  : p1, p2 - points defining the first line
-//			p3, p4 - points defining the second line
-//			*s1 - receives the point on the first line,  p1 + t1 * (p2 - p1)
-//			*s2 - receives the point on the second line, p3 + t2 * (p4 - p3)
-//			*t1 - receives the multiplier along p1->p2
-//			*t2 - receives the multiplier along p3->p4
-//			All four output pointers are written unconditionally on success,
-//			so none of them may be NULL.
-// Output : Returns true on success. Returns false, leaving the outputs
-//			untouched, when either line is degenerate or the denominator of
-//			the solution is below LINE_EPS.
-//-----------------------------------------------------------------------------
-bool CalcLineToLineIntersectionSegment(
-   const Vector& p1,const Vector& p2,const Vector& p3,const Vector& p4,Vector *s1,Vector *s2,
-   float *t1, float *t2)
-{
-   Vector vecP3ToP1, vecLine2, vecLine1;
-
-   vecP3ToP1.x = p1.x - p3.x;
-   vecP3ToP1.y = p1.y - p3.y;
-   vecP3ToP1.z = p1.z - p3.z;
-   vecLine2.x = p4.x - p3.x;
-   vecLine2.y = p4.y - p3.y;
-   vecLine2.z = p4.z - p3.z;
-
-   // second line has no usable direction
-   if (fabs(vecLine2.x)  < LINE_EPS && fabs(vecLine2.y)  < LINE_EPS && fabs(vecLine2.z)  < LINE_EPS)
-      return false;
-
-   vecLine1.x = p2.x - p1.x;
-   vecLine1.y = p2.y - p1.y;
-   vecLine1.z = p2.z - p1.z;
-
-   // first line has no usable direction
-   if (fabs(vecLine1.x)  < LINE_EPS && fabs(vecLine1.y)  < LINE_EPS && fabs(vecLine1.z)  < LINE_EPS)
-      return false;
-
-   // dot products; dABCD stands for (pA - pB) . (pC - pD)
-   const float d1343 = vecP3ToP1.x * vecLine2.x + vecP3ToP1.y * vecLine2.y + vecP3ToP1.z * vecLine2.z;
-   const float d4321 = vecLine2.x * vecLine1.x + vecLine2.y * vecLine1.y + vecLine2.z * vecLine1.z;
-   const float d1321 = vecP3ToP1.x * vecLine1.x + vecP3ToP1.y * vecLine1.y + vecP3ToP1.z * vecLine1.z;
-   const float d4343 = vecLine2.x * vecLine2.x + vecLine2.y * vecLine2.y + vecLine2.z * vecLine2.z;
-   const float d2121 = vecLine1.x * vecLine1.x + vecLine1.y * vecLine1.y + vecLine1.z * vecLine1.z;
-
-   const float flDenom = d2121 * d4343 - d4321 * d4321;
-   if (fabs(flDenom) < LINE_EPS)
-      return false;
-   const float flNumer = d1343 * d4321 - d1321 * d4343;
-
-   *t1 = flNumer / flDenom;
-   *t2 = (d1343 + d4321 * (*t1)) / d4343;
-
-   s1->x = p1.x + *t1 * vecLine1.x;
-   s1->y = p1.y + *t1 * vecLine1.y;
-   s1->z = p1.z + *t1 * vecLine1.z;
-   s2->x = p3.x + *t2 * vecLine2.x;
-   s2->y = p3.y + *t2 * vecLine2.y;
-   s2->z = p3.z + *t2 * vecLine2.z;
-
-   return true;
-}
-
-#pragma optimize( "", off )
-
-// Fallback definition, used only if nothing earlier has already defined it.
-#ifndef EXCEPTION_EXECUTE_HANDLER
-#define EXCEPTION_EXECUTE_HANDLER       1
-#endif
-
-#pragma optimize( "", on )
-
-// Which optional instruction-set paths are in use. Written by MathLib_Init()
-// and reported by the matching MathLib_*Enabled() accessors.
-static bool s_b3DNowEnabled = false;
-static bool s_bMMXEnabled = false;
-static bool s_bSSEEnabled = false;
-static bool s_bSSE2Enabled = false;
-
-//-----------------------------------------------------------------------------
-// Purpose: One-time setup of the math library: picks the implementations
-//			behind the pf* function pointers according to the CPU features
-//			reported by GetCPUInformation() and the bAllow* switches, then
-//			builds the sin/cos and gamma tables. Calls made after the library
-//			is initialized return immediately.
-// Input  : gamma, texGamma, brightness, overbright - forwarded to BuildGammaTable()
-//			bAllow3DNow, bAllowSSE, bAllowSSE2, bAllowMMX - a feature is only
-//				enabled when its switch is set and the CPU reports it
-//-----------------------------------------------------------------------------
-void MathLib_Init( float gamma, float texGamma, float brightness, int overbright, bool bAllow3DNow, bool bAllowSSE, bool bAllowSSE2, bool bAllowMMX )
-{
-	// Already set up - nothing to do.
-	if ( s_bMathlibInitialized )
-		return;
-
-	// FIXME: Hook SSE into VectorAligned + Vector4DAligned
-
-	// All CPU-specific routine selection is skipped when _X360 is defined.
-#if !defined( _X360 )
-	// Grab the processor information:
-	const CPUInformation& pi = *GetCPUInformation();
-
-	// Select the default generic routines.
-	pfSqrt = _sqrtf;
-	pfRSqrt = _rsqrtf;
-	pfRSqrtFast = _rsqrtf;
-	pfVectorNormalize = _VectorNormalize;
-	pfVectorNormalizeFast = _VectorNormalizeFast;
-	pfInvRSquared = _InvRSquared;
-	pfFastSinCos = SinCos;
-	pfFastCos = cosf;
-
-	// MMX only sets a flag here; no function pointers depend on it.
-	if ( bAllowMMX && pi.m_bMMX )
-	{
-		// Select the MMX specific routines if available
-		// (MMX routines were used by SW span fillers - not currently used for HW)
-		s_bMMXEnabled = true;
-	}
-	else
-	{
-		s_bMMXEnabled = false;
-	}
-
-	// SSE Generally performs better than 3DNow when present, so this is placed 
-	// first to allow SSE to override these settings.
-	// When the 3DNow section below is compiled out, the braces after the
-	// #endif are a plain block, so s_b3DNowEnabled is always set to false.
-#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX)
-	if ( bAllow3DNow && pi.m_b3DNow )
-	{
-		s_b3DNowEnabled = true;
-
-		// Select the 3DNow specific routines if available;
-		pfVectorNormalize = _3DNow_VectorNormalize;
-		pfVectorNormalizeFast = _3DNow_VectorNormalizeFast;
-		pfInvRSquared = _3DNow_InvRSquared;
-		pfSqrt = _3DNow_Sqrt;
-		pfRSqrt = _3DNow_RSqrt;
-		pfRSqrtFast = _3DNow_RSqrt;
-	}
-	else
-#endif
-	{
-		s_b3DNowEnabled = false;
-	}
-
-	if ( bAllowSSE && pi.m_bSSE )
-	{
-		s_bSSEEnabled = true;
-
-#ifndef PLATFORM_WINDOWS_PC64
-		// The SSE routines are not yet available on PLATFORM_WINDOWS_PC64, so
-		// they are only selected on other platforms.
-		// Note that pfVectorNormalize goes back to the generic routine here,
-		// replacing any 3DNow choice made above.
-		pfVectorNormalize = _VectorNormalize;
-		pfVectorNormalizeFast = _SSE_VectorNormalizeFast;
-		pfInvRSquared = _SSE_InvRSquared;
-		pfSqrt = _SSE_Sqrt;
-		pfRSqrt = _SSE_RSqrtAccurate;
-		pfRSqrtFast = _SSE_RSqrtFast;
-#endif
-#ifdef PLATFORM_WINDOWS_PC32
-		// SSE sin/cos replacements are only selected on PLATFORM_WINDOWS_PC32.
-		pfFastSinCos = _SSE_SinCos;
-		pfFastCos = _SSE_cos;
-#endif
-	}
-	else
-	{
-		s_bSSEEnabled = false;
-	}
-
-	// SSE2 is checked last, so its sin/cos routines win over the SSE ones.
-	if ( bAllowSSE2 && pi.m_bSSE2 )
-	{
-		s_bSSE2Enabled = true;
-#ifdef PLATFORM_WINDOWS_PC32
-		pfFastSinCos = _SSE2_SinCos;
-		pfFastCos = _SSE2_cos;
-#endif
-	} 
-	else
-	{
-		s_bSSE2Enabled = false;
-	}
-#endif
-
-	// The flag is set before the tables are built.
-	s_bMathlibInitialized = true;
-
-	InitSinCosTable();
-	BuildGammaTable( gamma, texGamma, brightness, overbright );
-}
-
-// Returns the 3DNow flag recorded by MathLib_Init().
-bool MathLib_3DNowEnabled( void )
-{
-	Assert( s_bMathlibInitialized );
-
-	const bool bEnabled = s_b3DNowEnabled;
-	return bEnabled;
-}
-
-// Returns the MMX flag recorded by MathLib_Init().
-bool MathLib_MMXEnabled( void )
-{
-	Assert( s_bMathlibInitialized );
-
-	const bool bEnabled = s_bMMXEnabled;
-	return bEnabled;
-}
-
-// Returns the SSE flag recorded by MathLib_Init().
-bool MathLib_SSEEnabled( void )
-{
-	Assert( s_bMathlibInitialized );
-
-	const bool bEnabled = s_bSSEEnabled;
-	return bEnabled;
-}
-
-// Returns the SSE2 flag recorded by MathLib_Init().
-bool MathLib_SSE2Enabled( void )
-{
-	Assert( s_bMathlibInitialized );
-
-	const bool bEnabled = s_bSSE2Enabled;
-	return bEnabled;
-}
-
-// Moves 'value' toward 'target' by at most 'speed' and returns the new value;
-// returns 'target' itself once it is within 'speed' of it.
-float Approach( float target, float value, float speed )
-{
-	const float flRemaining = target - value;
-
-	if ( flRemaining > speed )
-	{
-		// still more than one step below the target
-		return value + speed;
-	}
-
-	if ( flRemaining < -speed )
-	{
-		// still more than one step above the target
-		return value - speed;
-	}
-
-	return target;
-}
-
-// Moves the angle 'value' toward 'target' by at most |speed| degrees, going
-// the short way around the circle. Both angles are passed through anglemod()
-// first.
-// BUGBUG: Why doesn't this call angle diff?!?!?
-float ApproachAngle( float target, float value, float speed )
-{
-	target = anglemod( target );
-	value = anglemod( value );
-
-	// Speed is assumed to be positive
-	if ( speed < 0 )
-		speed = -speed;
-
-	// Remaining turn, wrapped so that it is never more than half a circle
-	float flTurn = target - value;
-	if ( flTurn < -180 )
-	{
-		flTurn += 360;
-	}
-	else if ( flTurn > 180 )
-	{
-		flTurn -= 360;
-	}
-
-	if ( flTurn > speed )
-		return value + speed;
-
-	if ( flTurn < -speed )
-		return value - speed;
-
-	// within one step of the target
-	return target;
-}
-
-
-// Returns destAngle - srcAngle reduced modulo 360 and then wrapped toward the
-// short way around, depending on which of the two angles is larger.
-// BUGBUG: Why do we need both of these?
-float AngleDiff( float destAngle, float srcAngle )
-{
-	float flDelta = fmodf(destAngle - srcAngle, 360.0f);
-	const bool bDestIsLarger = ( destAngle > srcAngle );
-
-	if ( bDestIsLarger && flDelta >= 180 )
-	{
-		flDelta -= 360;
-	}
-	else if ( !bDestIsLarger && flDelta <= -180 )
-	{
-		flDelta += 360;
-	}
-
-	return flDelta;
-}
-
-
-// Returns next - cur with a single 360 degree wrap applied when the raw
-// difference lies outside [-180, 180]. No modulo is taken beforehand.
-float AngleDistance( float next, float cur )
-{
-	const float flRaw = next - cur;
-
-	if ( flRaw < -180 )
-		return flRaw + 360;
-
-	if ( flRaw > 180 )
-		return flRaw - 360;
-
-	return flRaw;
-}
-
-
-// Wraps an angle in degrees into the range [-180, 180].
-float AngleNormalize( float angle )
-{
-	// fmodf keeps the sign of its argument, so this lies strictly between
-	// -360 and 360
-	const float flWrapped = fmodf(angle, 360.0f);
-
-	if ( flWrapped > 180 )
-	{
-		return flWrapped - 360;
-	}
-
-	if ( flWrapped < -180 )
-	{
-		return flWrapped + 360;
-	}
-
-	return flWrapped;
-}
-
-//--------------------------------------------------------------------------------------------------------------
-// ensure that 0 <= angle <= 360: take the angle modulo 360 and shift negative remainders up by a full turn
-float AngleNormalizePositive( float angle )
-{
-	const float flRemainder = fmodf( angle, 360.0f );
-
-	return ( flRemainder < 0.0f ) ? ( flRemainder + 360.0f ) : flRemainder;
-}
-
-//--------------------------------------------------------------------------------------------------------------
-// True when the two angles differ, as measured by AngleDiff(), by strictly less than 'tolerance'.
-bool AnglesAreEqual( float a, float b, float tolerance )
-{
-	const float flDifference = AngleDiff( a, b );
-	return (fabs( flDifference ) < tolerance);
-}
-
-// Expresses the rotation that takes srcAngles to destAngles as an axis and an
-// angle. The delta is built with quaternions as dest * (src scaled by -1).
-void RotationDeltaAxisAngle( const QAngle &srcAngles, const QAngle &destAngles, Vector &deltaAxis, float &deltaAngle )
-{
-	Quaternion qSrc;
-	Quaternion qDest;
-	AngleQuaternion( srcAngles, qSrc );
-	AngleQuaternion( destAngles, qDest );
-
-	// Scaling by -1 stands in for inverting the source rotation
-	// (presumably it negates the rotation angle - confirm against QuaternionScale).
-	Quaternion qSrcInverse;
-	QuaternionScale( qSrc, -1, qSrcInverse );
-
-	Quaternion qDelta;
-	QuaternionMult( qDest, qSrcInverse, qDelta );
-
-	QuaternionNormalize( qDelta );
-	QuaternionAxisAngle( qDelta, deltaAxis, deltaAngle );
-}
-
-void RotationDelta( const QAngle &srcAngles, const QAngle &destAngles, QAngle *out )
-{
-	matrix3x4_t src, srcInv;
-	matrix3x4_t dest;
-	AngleMatrix( srcAngles, src );
-	AngleMatrix( destAngles, dest );
-	// xform = src(-1) * dest
-	MatrixInvert( src, srcInv );
-	matrix3x4_t xform;
-	ConcatTransforms( dest, srcInv, xform );
-	QAngle xformAngles;
-	MatrixAngles( xform, xformAngles );
-	if ( out )
-	{
-		*out = xformAngles;
-	}
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: Computes a triangle normal
-//-----------------------------------------------------------------------------
-void ComputeTrianglePlane( const Vector& v1, const Vector& v2, const Vector& v3, Vector& normal, float& intercept )
-{
-	Vector e1, e2;
-	VectorSubtract( v2, v1, e1 );
-	VectorSubtract( v3, v1, e2 );
-	CrossProduct( e1, e2, normal );
-	VectorNormalize( normal );
-	intercept = DotProduct( normal, v1 ); 
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: This is a clone of BaseWindingForPlane()
-// Input  : *outVerts - an array of preallocated verts to build the polygon in
-//			normal - the plane normal
-//			dist - the plane constant
-// Output : int - vert count (always 4)
-//-----------------------------------------------------------------------------
-int PolyFromPlane( Vector *outVerts, const Vector& normal, float dist, float fHalfScale )
-{
-	int		i, x;
-	vec_t	max, v;
-	Vector	org, vright, vup;
-
-	// find the major axis
-
-	max = -16384; //MAX_COORD_INTEGER
-	x = -1;
-	for (i=0 ; i<3; i++)
-	{
-		v = fabs(normal[i]);
-		if (v > max)
-		{
-			x = i;
-			max = v;
-		}
-	}
-
-	if (x==-1)
-		return 0;
-
-	// Build a unit vector along something other than the major axis
-	VectorCopy (vec3_origin, vup);	
-	switch (x)
-	{
-	case 0:
-	case 1:
-		vup[2] = 1;
-		break;		
-	case 2:
-		vup[0] = 1;
-		break;		
-	}
-
-	// Remove the component of this vector along the normal
-	v = DotProduct (vup, normal);
-	VectorMA (vup, -v, normal, vup);
-	// Make it a unit (perpendicular)
-	VectorNormalize (vup);
-
-	// Center of the poly is at normal * dist
-	VectorScale (normal, dist, org);
-	// Calculate the third orthonormal basis vector for our plane space (this one and vup are in the plane)
-	CrossProduct (vup, normal, vright);
-
-	// Make the plane's basis vectors big (these are the half-sides of the polygon we're making)
-	VectorScale (vup, fHalfScale, vup);
-	VectorScale (vright, fHalfScale, vright);
-
-	// Move diagonally away from org to create the corner verts
-	VectorSubtract (org, vright, outVerts[0]);	// left
-	VectorAdd (outVerts[0], vup, outVerts[0]);	// up
-
-	VectorAdd (org, vright, outVerts[1]);		// right
-	VectorAdd (outVerts[1], vup, outVerts[1]);	// up
-
-	VectorAdd (org, vright, outVerts[2]);		// right
-	VectorSubtract (outVerts[2], vup, outVerts[2]);	// down
-
-	VectorSubtract (org, vright, outVerts[3]);		// left
-	VectorSubtract (outVerts[3], vup, outVerts[3]);	// down
-
-	// The four corners form a planar quadrilateral normal to "normal"
-	return 4;
-}
-
-//-----------------------------------------------------------------------------
-// Purpose: clip a poly to the plane and return the poly on the front side of the plane
-// Input  : *inVerts - input polygon
-//			vertCount - # verts in input poly
-//			*outVerts - destination poly
-//			normal - plane normal
-//			dist - plane constant
-// Output : int - # verts in output poly
-//-----------------------------------------------------------------------------
-
-int ClipPolyToPlane( Vector *inVerts, int vertCount, Vector *outVerts, const Vector& normal, float dist, float fOnPlaneEpsilon )
-{
-	vec_t	*dists = (vec_t *)stackalloc( sizeof(vec_t) * vertCount * 4 ); //4x vertcount should cover all cases
-	int		*sides = (int *)stackalloc( sizeof(vec_t) * vertCount * 4 );
-	int		counts[3];
-	vec_t	dot;
-	int		i, j;
-	Vector	mid = vec3_origin;
-	int		outCount;
-
-	counts[0] = counts[1] = counts[2] = 0;
-
-	// determine sides for each point
-	for ( i = 0; i < vertCount; i++ )
-	{
-		dot = DotProduct( inVerts[i], normal) - dist;
-		dists[i] = dot;
-		if ( dot > fOnPlaneEpsilon )
-		{
-			sides[i] = SIDE_FRONT;
-		}
-		else if ( dot < -fOnPlaneEpsilon )
-		{
-			sides[i] = SIDE_BACK;
-		}
-		else
-		{
-			sides[i] = SIDE_ON;
-		}
-		counts[sides[i]]++;
-	}
-	sides[i] = sides[0];
-	dists[i] = dists[0];
-
-	if (!counts[0])
-		return 0;
-
-	if (!counts[1])
-	{
-		// Copy to output verts
-		for ( i = 0; i < vertCount; i++ )
-		{
-			VectorCopy( inVerts[i], outVerts[i] );
-		}
-		return vertCount;
-	}
-
-	outCount = 0;
-	for ( i = 0; i < vertCount; i++ )
-	{
-		Vector& p1 = inVerts[i];
-
-		if (sides[i] == SIDE_ON)
-		{
-			VectorCopy( p1, outVerts[outCount]);
-			outCount++;
-			continue;
-		}
-
-		if (sides[i] == SIDE_FRONT)
-		{
-			VectorCopy( p1, outVerts[outCount]);
-			outCount++;
-		}
-
-		if (sides[i+1] == SIDE_ON || sides[i+1] == sides[i])
-			continue;
-
-		// generate a split point
-		Vector& p2 = inVerts[(i+1)%vertCount];
-
-		dot = dists[i] / (dists[i]-dists[i+1]);
-		for (j=0 ; j<3 ; j++)
-		{	// avoid round off error when possible
-			if (normal[j] == 1)
-				mid[j] = dist;
-			else if (normal[j] == -1)
-				mid[j] = -dist;
-			else
-				mid[j] = p1[j] + dot*(p2[j]-p1[j]);
-		}
-
-		VectorCopy (mid, outVerts[outCount]);
-		outCount++;
-	}
-
-	return outCount;
-}
-
-
-int ClipPolyToPlane_Precise( double *inVerts, int vertCount, double *outVerts, const double *normal, double dist, double fOnPlaneEpsilon )
-{
-	double	*dists = (double *)stackalloc( sizeof(double) * vertCount * 4 ); //4x vertcount should cover all cases
-	int		*sides = (int *)stackalloc( sizeof(double) * vertCount * 4 );
-	int		counts[3];
-	double	dot;
-	int		i, j;
-	//Vector	mid = vec3_origin;
-	double mid[3];
-	mid[0] = 0.0;
-	mid[1] = 0.0;
-	mid[2] = 0.0;
-	int		outCount;
-
-	counts[0] = counts[1] = counts[2] = 0;
-
-	// determine sides for each point
-	for ( i = 0; i < vertCount; i++ )
-	{
-		//dot = DotProduct( inVerts[i], normal) - dist;
-		dot = ((inVerts[i*3 + 0] * normal[0]) + (inVerts[i*3 + 1] * normal[1]) + (inVerts[i*3 + 2] * normal[2])) - dist;
-		dists[i] = dot;
-		if ( dot > fOnPlaneEpsilon )
-		{
-			sides[i] = SIDE_FRONT;
-		}
-		else if ( dot < -fOnPlaneEpsilon )
-		{
-			sides[i] = SIDE_BACK;
-		}
-		else
-		{
-			sides[i] = SIDE_ON;
-		}
-		counts[sides[i]]++;
-	}
-	sides[i] = sides[0];
-	dists[i] = dists[0];
-
-	if (!counts[0])
-		return 0;
-
-	if (!counts[1])
-	{
-		// Copy to output verts
-		//for ( i = 0; i < vertCount; i++ )
-		for ( i = 0; i < vertCount * 3; i++ )
-		{
-			//VectorCopy( inVerts[i], outVerts[i] );
-			outVerts[i] = inVerts[i];
-		}
-		return vertCount;
-	}
-
-	outCount = 0;
-	for ( i = 0; i < vertCount; i++ )
-	{
-		//Vector& p1 = inVerts[i];
-		double *p1 = &inVerts[i*3];
-		//p1[0] = inVerts[i*3 + 0];
-		//p1[1] = inVerts[i*3 + 1];
-		//p1[2] = inVerts[i*3 + 2];
-
-		if (sides[i] == SIDE_ON)
-		{
-			//VectorCopy( p1, outVerts[outCount]);
-			outVerts[outCount*3 + 0] = p1[0];
-			outVerts[outCount*3 + 1] = p1[1];
-			outVerts[outCount*3 + 2] = p1[2];
-			outCount++;
-			continue;
-		}
-
-		if (sides[i] == SIDE_FRONT)
-		{
-			//VectorCopy( p1, outVerts[outCount]);
-			outVerts[outCount*3 + 0] = p1[0];
-			outVerts[outCount*3 + 1] = p1[1];
-			outVerts[outCount*3 + 2] = p1[2];
-			outCount++;
-		}
-
-		if (sides[i+1] == SIDE_ON || sides[i+1] == sides[i])
-			continue;
-
-		// generate a split point
-		//Vector& p2 = inVerts[(i+1)%vertCount];
-		int wrappedindex = (i+1)%vertCount;
-		double *p2 = &inVerts[wrappedindex*3];
-		//p2[0] = inVerts[wrappedindex*3 + 0];
-		//p2[1] = inVerts[wrappedindex*3 + 1];
-		//p2[2] = inVerts[wrappedindex*3 + 2];
-
-		dot = dists[i] / (dists[i]-dists[i+1]);
-		for (j=0 ; j<3 ; j++)
-		{
-			mid[j] = (double)p1[j] + dot*((double)p2[j]-(double)p1[j]);
-		}
-
-		//VectorCopy (mid, outVerts[outCount]);
-		outVerts[outCount*3 + 0] = mid[0];
-		outVerts[outCount*3 + 1] = mid[1];
-		outVerts[outCount*3 + 2] = mid[2];
-		outCount++;
-	}
-
-	return outCount;
-}
-
-int CeilPow2( int in )
-{
-	int retval;
-	
-	retval = 1;
-	while( retval < in )
-		retval <<= 1;
-	return retval;
-}
-
-int FloorPow2( int in )
-{
-	int retval;
-	
-	retval = 1;
-	while( retval < in )
-		retval <<= 1;
-	return retval >> 1;
-}
-
-
-//-----------------------------------------------------------------------------
-// Computes Y fov from an X fov and a screen aspect ratio
-//-----------------------------------------------------------------------------
-float CalcFovY( float flFovX, float flAspect )
-{
-	if ( flFovX < 1 || flFovX > 179)
-	{
-		flFovX = 90;	// error, set to 90
-	}
-
-	// The long, but illustrative version (more closely matches CShaderAPIDX8::PerspectiveX, which
-	// is what it's based on).
-	//
-	//float width = 2 * zNear * tan( DEG2RAD( fov_x / 2.0 ) );
-	//float height = width / screenaspect;
-	//float yRadians = atan( (height/2.0) / zNear );
-	//return RAD2DEG( yRadians ) * 2;
-
-	// The short and sweet version.
-	float val = atan( tan( DEG2RAD( flFovX ) * 0.5f ) / flAspect );
-	val = RAD2DEG( val ) * 2.0f;
-	return val;
-}
-
-float CalcFovX( float flFovY, float flAspect )
-{
-	return RAD2DEG( atan( tan( DEG2RAD( flFovY ) * 0.5f ) * flAspect ) ) * 2.0f;
-}
-
-
-//-----------------------------------------------------------------------------
-// Generate a frustum based on perspective view parameters
-//-----------------------------------------------------------------------------
-void GeneratePerspectiveFrustum( const Vector& origin, const Vector &forward, 
-	const Vector &right, const Vector &up, float flZNear, float flZFar, 
-	float flFovX, float flFovY, Frustum_t &frustum )
-{
-	float flIntercept = DotProduct( origin, forward );
-
-	// Setup the near and far planes.
-	frustum.SetPlane( FRUSTUM_FARZ, PLANE_ANYZ, -forward, -flZFar - flIntercept );
-	frustum.SetPlane( FRUSTUM_NEARZ, PLANE_ANYZ, forward, flZNear + flIntercept );
-
-	flFovX *= 0.5f;
-	flFovY *= 0.5f;
-
-	float flTanX = tan( DEG2RAD( flFovX ) );
-	float flTanY = tan( DEG2RAD( flFovY ) );
-
-	// OPTIMIZE: Normalizing these planes is not necessary for culling
-	Vector normalPos, normalNeg;
-
-	VectorMA( right, flTanX, forward, normalPos );
-	VectorMA( normalPos, -2.0f, right, normalNeg );
-
-	VectorNormalize( normalPos );
-	VectorNormalize( normalNeg );
-
-	frustum.SetPlane( FRUSTUM_LEFT, PLANE_ANYZ, normalPos, normalPos.Dot( origin ) );
-	frustum.SetPlane( FRUSTUM_RIGHT, PLANE_ANYZ, normalNeg, normalNeg.Dot( origin ) );
-
-	VectorMA( up, flTanY, forward, normalPos );
-	VectorMA( normalPos, -2.0f, up, normalNeg );
-
-	VectorNormalize( normalPos );
-	VectorNormalize( normalNeg );
-
-	frustum.SetPlane( FRUSTUM_BOTTOM, PLANE_ANYZ, normalPos, normalPos.Dot( origin ) );
-	frustum.SetPlane( FRUSTUM_TOP, PLANE_ANYZ, normalNeg, normalNeg.Dot( origin ) );
-}
-
-
-//-----------------------------------------------------------------------------
-// Version that accepts angles instead of vectors
-//-----------------------------------------------------------------------------
-void GeneratePerspectiveFrustum( const Vector& origin, const QAngle &angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t &frustum )
-{
-	Vector vecForward, vecRight, vecUp;
-	AngleVectors( angles, &vecForward, &vecRight, &vecUp );
-	float flFovY = CalcFovY( flFovX, flAspectRatio );
-	GeneratePerspectiveFrustum( origin, vecForward, vecRight, vecUp, flZNear, flZFar, flFovX, flFovY, frustum );
-}
-
-bool R_CullBox( const Vector& mins, const Vector& maxs, const Frustum_t &frustum )
-{
-	return (( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT) ) == 2 ) || 
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_LEFT) ) == 2 ) ||
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_TOP) ) == 2 ) ||
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM) ) == 2 ) ||
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_NEARZ) ) == 2 ) ||
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_FARZ) ) == 2 ) );
-}
-
-bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t &frustum )
-{
-	return (( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT) ) == 2 ) || 
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_LEFT) ) == 2 ) ||
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_TOP) ) == 2 ) ||
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM) ) == 2 ) ||
-			( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_FARZ) ) == 2 ) );
-}
-
-
-// NOTE: This routine was taken (and modified) from NVidia's BlinnReflection demo
-// Creates basis vectors, based on a vertex and index list.
-// See the NVidia white paper 'GDC2K PerPixel Lighting' for a description
-// of how this computation works
-#define SMALL_FLOAT 1e-12
-
-void CalcTriangleTangentSpace( const Vector &p0, const Vector &p1, const Vector &p2,
-							   const Vector2D &t0, const Vector2D &t1, const Vector2D& t2,
-							   Vector &sVect, Vector &tVect )
-{
-	/* Compute the partial derivatives of X, Y, and Z with respect to S and T. */
-	sVect.Init( 0.0f, 0.0f, 0.0f );
-	tVect.Init( 0.0f, 0.0f, 0.0f );
-
-	// x, s, t
-	Vector edge01( p1.x - p0.x, t1.x - t0.x, t1.y - t0.y );
-	Vector edge02( p2.x - p0.x, t2.x - t0.x, t2.y - t0.y );
-
-	Vector cross;
-	CrossProduct( edge01, edge02, cross );
-	if ( fabs( cross.x ) > SMALL_FLOAT )
-	{
-		sVect.x += -cross.y / cross.x;
-		tVect.x += -cross.z / cross.x;
-	}
-
-	// y, s, t
-	edge01.Init( p1.y - p0.y, t1.x - t0.x, t1.y - t0.y );
-	edge02.Init( p2.y - p0.y, t2.x - t0.x, t2.y - t0.y );
-
-	CrossProduct( edge01, edge02, cross );
-	if ( fabs( cross.x ) > SMALL_FLOAT )
-	{
-		sVect.y += -cross.y / cross.x;
-		tVect.y += -cross.z / cross.x;
-	}
-
-	// z, s, t
-	edge01.Init( p1.z - p0.z, t1.x - t0.x, t1.y - t0.y );
-	edge02.Init( p2.z - p0.z, t2.x - t0.x, t2.y - t0.y );
-
-	CrossProduct( edge01, edge02, cross );
-	if( fabs( cross.x ) > SMALL_FLOAT )
-	{
-		sVect.z += -cross.y / cross.x;
-		tVect.z += -cross.z / cross.x;
-	}
-
-	// Normalize sVect and tVect
-	VectorNormalize( sVect );
-	VectorNormalize( tVect );
-}
-
-
-//-----------------------------------------------------------------------------
-// Convert RGB to HSV
-//-----------------------------------------------------------------------------
-void RGBtoHSV( const Vector &rgb, Vector &hsv )
-{
-	float flMax = max( rgb.x, rgb.y );
-	flMax = max( flMax, rgb.z );
-	float flMin = min( rgb.x, rgb.y );
-	flMin = min( flMin, rgb.z );
-
-	// hsv.z is the value
-	hsv.z = flMax;
-
-	// hsv.y is the saturation
-	if (flMax != 0.0F)
-	{
-		hsv.y = (flMax - flMin) / flMax;
-	}
-	else
-	{
-		hsv.y = 0.0F;
-	}
-
-	// hsv.x is the hue
-	if (hsv.y == 0.0F)
-	{
-		hsv.x = -1.0f;
-	}
-	else
-	{
-		float32 d = flMax - flMin;
-		if (rgb.x == flMax)		
-		{
-			hsv.x = (rgb.y - rgb.z) / d;
-		}
-		else if (rgb.y == flMax)	
-		{
-			hsv.x = 2.0F + (rgb.z - rgb.x) / d;
-		}
-		else				
-		{
-			hsv.x = 4.0F + (rgb.x - rgb.y) / d;
-		}
-		hsv.x *= 60.0F;
-		if ( hsv.x < 0.0F ) 
-		{
-			hsv.x += 360.0F;
-		}
-	}
-}
-
-
-//-----------------------------------------------------------------------------
-// Convert HSV to RGB
-//-----------------------------------------------------------------------------
-void HSVtoRGB( const Vector &hsv, Vector &rgb )
-{         
-	if ( hsv.y == 0.0F )
-	{
-		rgb.Init( hsv.z, hsv.z, hsv.z );
-		return;
-	}
-
-	float32 hue = hsv.x;
-	if (hue == 360.0F) 
-	{	
-		hue = 0.0F;
-	}
-	hue /= 60.0F;
-	int     i = hue;        // integer part
-	float32 f = hue - i;    // fractional part
-	float32 p = hsv.z * (1.0F - hsv.y);
-	float32 q = hsv.z * (1.0F - hsv.y * f);
-	float32 t = hsv.z * (1.0F - hsv.y * (1.0F - f));
-	switch(i)
-	{
-	case 0: rgb.Init( hsv.z, t, p ); break;
-	case 1: rgb.Init( q, hsv.z, p ); break;
-	case 2: rgb.Init( p, hsv.z, t ); break;
-	case 3: rgb.Init( p, q, hsv.z ); break;
-	case 4: rgb.Init( t, p, hsv.z ); break;
-	case 5: rgb.Init( hsv.z, p, q ); break;
-	}
-}
-
-
-void GetInterpolationData( float const *pKnotPositions, 
-						   float const *pKnotValues,
-						   int nNumValuesinList,
-						   int nInterpolationRange,
-						   float flPositionToInterpolateAt,
-						   bool bWrap,
-						   float *pValueA, 
-						   float *pValueB,
-						   float *pInterpolationValue)
-{
-	// first, find the bracketting knots by looking for the first knot >= our index
-	
-	int idx;
-	for(idx = 0; idx < nNumValuesinList; idx++ )
-	{
-		if ( pKnotPositions[idx] >= flPositionToInterpolateAt )
-			break;
-	}
-	int nKnot1, nKnot2;
-	float flOffsetFromStartOfGap, flSizeOfGap;
-	if ( idx == 0)
-	{
-		if ( bWrap )
-		{
-			nKnot1 = nNumValuesinList-1;
-			nKnot2 = 0;
-			flSizeOfGap =
-				( pKnotPositions[nKnot2] + ( nInterpolationRange-pKnotPositions[nKnot1] ) );
-			flOffsetFromStartOfGap = 
-				flPositionToInterpolateAt + ( nInterpolationRange-pKnotPositions[nKnot1] );
-		}
-		else
-		{
-			*pValueA = *pValueB = pKnotValues[0];
-			*pInterpolationValue = 1.0;
-			return;
-		}
-	}
-	else if ( idx == nNumValuesinList )						// ran out of values
-	{
-		if ( bWrap )
-		{
-			nKnot1 = nNumValuesinList -1;
-			nKnot2 = 0;
-			flSizeOfGap = ( pKnotPositions[nKnot2] + 
-						 ( nInterpolationRange-pKnotPositions[nKnot1] ) );
-			flOffsetFromStartOfGap = flPositionToInterpolateAt - pKnotPositions[nKnot1];
-		}
-		else
-		{
-			*pValueA = *pValueB = pKnotValues[nNumValuesinList-1];
-			*pInterpolationValue = 1.0;
-			return;
-		}
-
-	}
-	else
-	{
-		nKnot1 = idx-1;
-		nKnot2 = idx;
-		flSizeOfGap = pKnotPositions[nKnot2]-pKnotPositions[nKnot1];
-		flOffsetFromStartOfGap = flPositionToInterpolateAt-pKnotPositions[nKnot1];
-	}
-
-	*pValueA = pKnotValues[nKnot1];
-	*pValueB = pKnotValues[nKnot2];
-	*pInterpolationValue = FLerp( 0, 1, 0, flSizeOfGap, flOffsetFromStartOfGap );
-	return;
-}
-
-float RandomVectorInUnitSphere( Vector *pVector )
-{
-	// Guarantee uniform random distribution within a sphere
-	// Graphics gems III contains this algorithm ("Nonuniform random point sets via warping")
-	float u = ((float)rand() / VALVE_RAND_MAX);
-	float v = ((float)rand() / VALVE_RAND_MAX);
-	float w = ((float)rand() / VALVE_RAND_MAX);
-
-	float flPhi = acos( 1 - 2 * u );
-	float flTheta = 2 * M_PI * v;
-	float flRadius = powf( w, 1.0f / 3.0f );
-
-	float flSinPhi, flCosPhi;
-	float flSinTheta, flCosTheta;
-	SinCos( flPhi, &flSinPhi, &flCosPhi );
-	SinCos( flTheta, &flSinTheta, &flCosTheta );
-
-	pVector->x = flRadius * flSinPhi * flCosTheta;
-	pVector->y = flRadius * flSinPhi * flSinTheta;
-	pVector->z = flRadius * flCosPhi;
-	return flRadius;
-}
-
-float RandomVectorInUnitCircle( Vector2D *pVector )
-{
-	// Guarantee uniform random distribution within a sphere
-	// Graphics gems III contains this algorithm ("Nonuniform random point sets via warping")
-	float u = ((float)rand() / VALVE_RAND_MAX);
-	float v = ((float)rand() / VALVE_RAND_MAX);
-
-	float flTheta = 2 * M_PI * v;
-	float flRadius = powf( u, 1.0f / 2.0f );
-
-	float flSinTheta, flCosTheta;
-	SinCos( flTheta, &flSinTheta, &flCosTheta );
-
-	pVector->x = flRadius * flCosTheta;
-	pVector->y = flRadius * flSinTheta;
-	return flRadius;
-}
-#ifdef FP_EXCEPTIONS_ENABLED
-#include <float.h> // For _clearfp and _controlfp_s
-#endif
-
-// FPExceptionDisable and FPExceptionEnabler taken from my blog post
-// at http://www.altdevblogaday.com/2012/04/20/exceptional-floating-point/
-
-#ifdef FP_EXCEPTIONS_ENABLED
-// These functions are all inlined NOPs if FP_EXCEPTIONS_ENABLED is not defined.
-FPExceptionDisabler::FPExceptionDisabler()
-{
-	// Retrieve the current state of the exception flags. This
-	// must be done before changing them. _MCW_EM is a bit
-	// mask representing all available exception masks.
-	_controlfp_s(&mOldValues, _MCW_EM, _MCW_EM);
-	// Set all of the exception flags, which suppresses FP
-	// exceptions on the x87 and SSE units.
-	_controlfp_s(0, _MCW_EM, _MCW_EM);
-}
-
-FPExceptionDisabler::~FPExceptionDisabler()
-{
-	// Clear any pending FP exceptions. This must be done
-	// prior to enabling FP exceptions since otherwise there
-	// may be a 'deferred crash' as soon the exceptions are
-	// enabled.
-	_clearfp();
-
-	// Reset (possibly enabling) the exception status.
-	_controlfp_s(0, mOldValues, _MCW_EM);
-}
-
-// Overflow, divide-by-zero, and invalid-operation are the FP
-// exceptions most frequently associated with bugs.
-FPExceptionEnabler::FPExceptionEnabler(unsigned int enableBits /*= _EM_OVERFLOW | _EM_ZERODIVIDE | _EM_INVALID*/)
-{
-	// Retrieve the current state of the exception flags. This
-	// must be done before changing them. _MCW_EM is a bit
-	// mask representing all available exception masks.
-	_controlfp_s(&mOldValues, _MCW_EM, _MCW_EM);
-
-	// Make sure no non-exception flags have been specified,
-	// to avoid accidental changing of rounding modes, etc.
-	enableBits &= _MCW_EM;
-
-	// Clear any pending FP exceptions. This must be done
-	// prior to enabling FP exceptions since otherwise there
-	// may be a 'deferred crash' as soon the exceptions are
-	// enabled.
-	_clearfp();
-
-	// Zero out the specified bits, leaving other bits alone.
-	_controlfp_s(0, ~enableBits, enableBits);
-}
-
-FPExceptionEnabler::~FPExceptionEnabler()
-{
-	// Reset the exception state.
-	_controlfp_s(0, mOldValues, _MCW_EM);
-}
-#endif
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Math primitives.
+//
+//===========================================================================//
+
+/// FIXME: As soon as all references to mathlib.c are gone, include it in here
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+
+#include "tier0/basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+
+#include "tier0/vprof.h"
+//#define _VPROF_MATHLIB
+
+#pragma warning(disable:4244)   // "conversion from 'const int' to 'float', possible loss of data"
+#pragma warning(disable:4730)	// "mixing _m64 and floating point expressions may result in incorrect code"
+
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#if !defined( _X360 )
+#include "mathlib/amd3dx.h"
+#ifndef OSX
+#include "3dnow.h"
+#endif
+#include "sse.h"
+#endif
+
+#include "mathlib/ssemath.h"
+#include "mathlib/ssequaternion.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+bool s_bMathlibInitialized = false;
+
+#ifdef PARANOID
+// User must provide an implementation of Sys_Error()
+void Sys_Error (char *error, ...);
+#endif
+
+const Vector vec3_origin(0,0,0);
+const QAngle vec3_angle(0,0,0);
+const Vector vec3_invalid( FLT_MAX, FLT_MAX, FLT_MAX );
+const int nanmask = 255<<23;
+
+//-----------------------------------------------------------------------------
+// Standard C implementations of optimized routines:
+//-----------------------------------------------------------------------------
+float _sqrtf(float _X)
+{
+	Assert( s_bMathlibInitialized );
+	return sqrtf(_X); 
+}
+
+float _rsqrtf(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	return 1.f / _sqrtf( x );
+}
+
+float FASTCALL _VectorNormalize (Vector& vec)
+{
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "_VectorNormalize", "Mathlib" );
+#endif
+	Assert( s_bMathlibInitialized );
+	float radius = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);
+
+	// FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero.
+	float iradius = 1.f / ( radius + FLT_EPSILON );
+	
+	vec.x *= iradius;
+	vec.y *= iradius;
+	vec.z *= iradius;
+	
+	return radius;
+}
+
+// TODO: Add fast C VectorNormalizeFast.
+// Perhaps use approximate rsqrt trick, if the accuracy isn't too bad.
+void FASTCALL _VectorNormalizeFast (Vector& vec)
+{
+	Assert( s_bMathlibInitialized );
+
+	// FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero.
+	float iradius = 1.f / ( sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z) + FLT_EPSILON );
+	
+	vec.x *= iradius;
+	vec.y *= iradius;
+	vec.z *= iradius;
+	
+}
+
+float _InvRSquared(const float* v)
+{
+	Assert( s_bMathlibInitialized );
+	float	r2 = DotProduct(v, v);
+	return r2 < 1.f ? 1.f : 1/r2;
+}
+
+//-----------------------------------------------------------------------------
+// Function pointers selecting the appropriate implementation
+//-----------------------------------------------------------------------------
+float (*pfSqrt)(float x)  = _sqrtf;
+float (*pfRSqrt)(float x) = _rsqrtf;
+float (*pfRSqrtFast)(float x) = _rsqrtf;
+float (FASTCALL *pfVectorNormalize)(Vector& v) = _VectorNormalize;
+void  (FASTCALL *pfVectorNormalizeFast)(Vector& v) = _VectorNormalizeFast;
+float (*pfInvRSquared)(const float* v) = _InvRSquared;
+void  (*pfFastSinCos)(float x, float* s, float* c) = SinCos;
+float (*pfFastCos)(float x) = cosf;
+
+float SinCosTable[SIN_TABLE_SIZE];
+void InitSinCosTable()
+{
+	for( int i = 0; i < SIN_TABLE_SIZE; i++ )
+	{
+		SinCosTable[i] = sin(i * 2.0 * M_PI / SIN_TABLE_SIZE);
+	}
+}
+
+qboolean VectorsEqual( const float *v1, const float *v2 )
+{
+	Assert( s_bMathlibInitialized );
+	return ( ( v1[0] == v2[0] ) &&
+		     ( v1[1] == v2[1] ) &&
+			 ( v1[2] == v2[2] ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Generates Euler angles given a left-handed orientation matrix. The
+//			columns of the matrix contain the forward, left, and up vectors.
+// Input  : matrix - Left-handed orientation matrix.
+//			angles[PITCH, YAW, ROLL]. Receives right-handed counterclockwise
+//				rotations in degrees around Y, Z, and X respectively.
+//-----------------------------------------------------------------------------
+
+void MatrixAngles( const matrix3x4_t& matrix, RadianEuler &angles, Vector &position )
+{
+	MatrixGetColumn( matrix, 3, position );
+	MatrixAngles( matrix, angles );
+}
+
+void MatrixAngles( const matrix3x4_t &matrix, Quaternion &q, Vector &pos )
+{
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "MatrixQuaternion", "Mathlib" );
+#endif
+	float trace;
+	trace = matrix[0][0] + matrix[1][1] + matrix[2][2] + 1.0f;
+	if( trace > 1.0f + FLT_EPSILON ) 
+	{
+		// VPROF_INCREMENT_COUNTER("MatrixQuaternion A",1);
+		q.x = ( matrix[2][1] - matrix[1][2] );
+		q.y = ( matrix[0][2] - matrix[2][0] );
+		q.z = ( matrix[1][0] - matrix[0][1] );
+		q.w = trace;
+	} 
+	else if ( matrix[0][0] > matrix[1][1] && matrix[0][0] > matrix[2][2] ) 
+	{
+		// VPROF_INCREMENT_COUNTER("MatrixQuaternion B",1);
+		trace = 1.0f + matrix[0][0] - matrix[1][1] - matrix[2][2];
+		q.x = trace;
+		q.y = (matrix[1][0] + matrix[0][1] );
+		q.z = (matrix[0][2] + matrix[2][0] );
+		q.w = (matrix[2][1] - matrix[1][2] );
+	} 
+	else if (matrix[1][1] > matrix[2][2])
+	{
+		// VPROF_INCREMENT_COUNTER("MatrixQuaternion C",1);
+		trace = 1.0f + matrix[1][1] - matrix[0][0] - matrix[2][2];
+		q.x = (matrix[0][1] + matrix[1][0] );
+		q.y = trace;
+		q.z = (matrix[2][1] + matrix[1][2] );
+		q.w = (matrix[0][2] - matrix[2][0] );
+	}
+	else
+	{
+		// VPROF_INCREMENT_COUNTER("MatrixQuaternion D",1);
+		trace = 1.0f + matrix[2][2] - matrix[0][0] - matrix[1][1];
+		q.x = (matrix[0][2] + matrix[2][0] );
+		q.y = (matrix[2][1] + matrix[1][2] );
+		q.z = trace;
+		q.w = (matrix[1][0] - matrix[0][1] );
+	}
+
+	QuaternionNormalize( q );
+
+#if 0
+	// check against the angle version
+	RadianEuler ang;
+	MatrixAngles( matrix, ang );
+	Quaternion test;
+	AngleQuaternion( ang, test );
+	float d = QuaternionDotProduct( q, test );
+	Assert( fabs(d) > 0.99 && fabs(d) < 1.01 );
+#endif
+
+	MatrixGetColumn( matrix, 3, pos );
+}
+
+void MatrixAngles( const matrix3x4_t& matrix, float *angles )
+{ 
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "MatrixAngles", "Mathlib" );
+#endif
+	Assert( s_bMathlibInitialized );
+	float forward[3];
+	float left[3];
+	float up[3];
+
+	//
+	// Extract the basis vectors from the matrix. Since we only need the Z
+	// component of the up vector, we don't get X and Y.
+	//
+	forward[0] = matrix[0][0];
+	forward[1] = matrix[1][0];
+	forward[2] = matrix[2][0];
+	left[0] = matrix[0][1];
+	left[1] = matrix[1][1];
+	left[2] = matrix[2][1];
+	up[2] = matrix[2][2];
+
+	float xyDist = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );
+	
+	// enough here to get angles?
+	if ( xyDist > 0.001f )
+	{
+		// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
+		angles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );
+
+		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
+		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
+
+		// (roll)	z = ATAN( left.z, up.z );
+		angles[2] = RAD2DEG( atan2f( left[2], up[2] ) );
+	}
+	else	// forward is mostly Z, gimbal lock-
+	{
+		// (yaw)	y = ATAN( -left.x, left.y );			-- forward is mostly z, so use right for yaw
+		angles[1] = RAD2DEG( atan2f( -left[0], left[1] ) );
+
+		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
+		angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
+
+		// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
+		angles[2] = 0;
+	}
+}
+
+
+// transform in1 by the matrix in2
+void VectorTransform (const float *in1, const matrix3x4_t& in2, float *out)
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out );
+	out[0] = DotProduct(in1, in2[0]) + in2[0][3];
+	out[1] = DotProduct(in1, in2[1]) + in2[1][3];
+	out[2] = DotProduct(in1, in2[2]) + in2[2][3];
+}
+
+
+// assuming the matrix is orthonormal, transform in1 by the transpose (also the inverse in this case) of in2.
+void VectorITransform (const float *in1, const matrix3x4_t& in2, float *out)
+{
+	Assert( s_bMathlibInitialized );
+	float in1t[3];
+
+	in1t[0] = in1[0] - in2[0][3];
+	in1t[1] = in1[1] - in2[1][3];
+	in1t[2] = in1[2] - in2[2][3];
+
+	out[0] = in1t[0] * in2[0][0] + in1t[1] * in2[1][0] + in1t[2] * in2[2][0];
+	out[1] = in1t[0] * in2[0][1] + in1t[1] * in2[1][1] + in1t[2] * in2[2][1];
+	out[2] = in1t[0] * in2[0][2] + in1t[1] * in2[1][2] + in1t[2] * in2[2][2];
+}
+
+
+// assume in2 is a rotation and rotate the input vector
+void VectorRotate( const float *in1, const matrix3x4_t& in2, float *out )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out );
+	out[0] = DotProduct( in1, in2[0] );
+	out[1] = DotProduct( in1, in2[1] );
+	out[2] = DotProduct( in1, in2[2] );
+}
+
+// assume in2 is a rotation and rotate the input vector
+void VectorRotate( const Vector &in1, const QAngle &in2, Vector &out )
+{
+	matrix3x4_t matRotate;
+	AngleMatrix( in2, matRotate );
+	VectorRotate( in1, matRotate, out );
+}
+
+// assume in2 is a rotation and rotate the input vector
+void VectorRotate( const Vector &in1, const Quaternion &in2, Vector &out )
+{
+	matrix3x4_t matRotate;
+	QuaternionMatrix( in2, matRotate );
+	VectorRotate( in1, matRotate, out );
+}
+
+
+// rotate by the inverse of the matrix
+void VectorIRotate( const float *in1, const matrix3x4_t& in2, float *out )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out );
+	out[0] = in1[0]*in2[0][0] + in1[1]*in2[1][0] + in1[2]*in2[2][0];
+	out[1] = in1[0]*in2[0][1] + in1[1]*in2[1][1] + in1[2]*in2[2][1];
+	out[2] = in1[0]*in2[0][2] + in1[1]*in2[1][2] + in1[2]*in2[2][2];
+}
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+// transform a set of angles in the output space of parentMatrix to the input space
+QAngle TransformAnglesToLocalSpace( const QAngle &angles, const matrix3x4_t &parentMatrix )
+{
+	matrix3x4_t angToWorld, worldToParent, localMatrix;
+	MatrixInvert( parentMatrix, worldToParent );
+	AngleMatrix( angles, angToWorld );
+	ConcatTransforms( worldToParent, angToWorld, localMatrix );
+	
+	QAngle out;
+	MatrixAngles( localMatrix, out );
+	return out;
+}
+
+// Converts angles expressed in the input space of parentMatrix into the
+// output space of parentMatrix: the angle matrix is pre-multiplied by the
+// parent matrix and Euler angles are read back from the product.
+QAngle TransformAnglesToWorldSpace( const QAngle &angles, const matrix3x4_t &parentMatrix )
+{
+	matrix3x4_t matLocalAngles;
+	AngleMatrix( angles, matLocalAngles );
+
+	matrix3x4_t matWorldAngles;
+	ConcatTransforms( parentMatrix, matLocalAngles, matWorldAngles );
+
+	QAngle worldAngles;
+	MatrixAngles( matWorldAngles, worldAngles );
+	return worldAngles;
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+// Fills every entry of mat from three basis axes and an origin: the axes go
+// into columns 0..2 and the origin into column 3.
+void MatrixInitialize( matrix3x4_t &mat, const Vector &vecOrigin, const Vector &vecXAxis, const Vector &vecYAxis, const Vector &vecZAxis )
+{
+	const Vector *pColumns[4] = { &vecXAxis, &vecYAxis, &vecZAxis, &vecOrigin };
+
+	for ( int iColumn = 0; iColumn < 4; ++iColumn )
+	{
+		MatrixSetColumn( *pColumns[iColumn], iColumn, mat );
+	}
+}
+
+// Copies all 3x4 floats of in to out.
+// Copying a matrix onto itself is a no-op; memcpy must not be handed
+// overlapping source and destination ranges, so that case is skipped.
+void MatrixCopy( const matrix3x4_t& in, matrix3x4_t& out )
+{
+	Assert( s_bMathlibInitialized );
+	if ( &in == &out )
+		return;
+
+	memcpy( out.Base(), in.Base(), sizeof( float ) * 3 * 4 );
+}
+
+//-----------------------------------------------------------------------------
+// Matrix equality test: true when no pair of corresponding entries (all
+// three rows, all four columns) differs by more than flTolerance.
+//-----------------------------------------------------------------------------
+bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float flTolerance )
+{
+	for ( int iRow = 0; iRow < 3; ++iRow )
+	{
+		const float *pRowA = src1[iRow];
+		const float *pRowB = src2[iRow];
+
+		for ( int iCol = 0; iCol < 4; ++iCol )
+		{
+			// Bail out on the first entry that is out of tolerance.
+			if ( fabs( pRowA[iCol] - pRowB[iCol] ) > flTolerance )
+				return false;
+		}
+	}
+
+	return true;
+}
+
+// Inverts a rotation + translation transform.
+// NOTE: The 3x3 part is only transposed, so this is not a general inverse;
+// it is correct when that part is a pure rotation.
+// in and out may be the same matrix.
+void MatrixInvert( const matrix3x4_t& in, matrix3x4_t& out )
+{
+	Assert( s_bMathlibInitialized );
+
+	if ( &in != &out )
+	{
+		// separate destination: write the transposed 3x3 part directly
+		for ( int iRow = 0; iRow < 3; ++iRow )
+		{
+			for ( int iCol = 0; iCol < 3; ++iCol )
+			{
+				out[iRow][iCol] = in[iCol][iRow];
+			}
+		}
+	}
+	else
+	{
+		// in place: swap the off-diagonal pairs
+		V_swap(out[0][1],out[1][0]);
+		V_swap(out[0][2],out[2][0]);
+		V_swap(out[1][2],out[2][1]);
+	}
+
+	// Grab the old translation before column 3 of out is overwritten
+	// (in and out may alias), then re-express it in the other space.
+	float oldTranslation[3];
+	for ( int i = 0; i < 3; ++i )
+	{
+		oldTranslation[i] = in[i][3];
+	}
+
+	for ( int iRow = 0; iRow < 3; ++iRow )
+	{
+		out[iRow][3] = -DotProduct( oldTranslation, out[iRow] );
+	}
+}
+
+// Reads one column of the matrix (the entry at index `column` of each of
+// the three rows) into out.
+void MatrixGetColumn( const matrix3x4_t& in, int column, Vector &out )
+{
+	const float flFromRow0 = in[0][column];
+	const float flFromRow1 = in[1][column];
+	const float flFromRow2 = in[2][column];
+
+	out.x = flFromRow0;
+	out.y = flFromRow1;
+	out.z = flFromRow2;
+}
+
+// Writes the vector into one column of the matrix (the entry at index
+// `column` of each of the three rows); other entries are left untouched.
+void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
+{
+	const float flComponents[3] = { in.x, in.y, in.z };
+
+	for ( int iRow = 0; iRow < 3; ++iRow )
+	{
+		out[iRow][column] = flComponents[iRow];
+	}
+}
+
+// Multiplies the 3x3 part of the matrix by flScale in place; column 3 is
+// not modified.
+void MatrixScaleBy ( const float flScale, matrix3x4_t &out )
+{
+	for ( int iCol = 0; iCol < 3; ++iCol )
+	{
+		for ( int iRow = 0; iRow < 3; ++iRow )
+		{
+			out[iRow][iCol] *= flScale;
+		}
+	}
+}
+
+// Sets the 3x3 part of the matrix to zero; column 3 is not modified.
+void MatrixScaleByZero ( matrix3x4_t &out )
+{
+	for ( int iCol = 0; iCol < 3; ++iCol )
+	{
+		for ( int iRow = 0; iRow < 3; ++iRow )
+		{
+			out[iRow][iCol] = 0.0f;
+		}
+	}
+}
+
+
+
+// Exact comparison of two 3-component vectors.
+// Returns 1 when all three components compare equal, 0 otherwise.
+int VectorCompare (const float *v1, const float *v2)
+{
+	Assert( s_bMathlibInitialized );
+
+	const bool bAllEqual = ( v1[0] == v2[0] ) && ( v1[1] == v2[1] ) && ( v1[2] == v2[2] );
+	return bAllEqual ? 1 : 0;
+}
+
+// Computes cross = v1 x v2 for 3-component vectors.
+// cross must not be the same buffer as v1 or v2, because it is written
+// while the inputs are still being read.
+void CrossProduct (const float* v1, const float* v2, float* cross)
+{
+	Assert( s_bMathlibInitialized );
+	Assert( v1 != cross );
+	Assert( v2 != cross );
+
+	for ( int i = 0; i < 3; ++i )
+	{
+		// cyclic successors of i: (0 -> 1,2) (1 -> 2,0) (2 -> 0,1)
+		const int j = ( i + 1 ) % 3;
+		const int k = ( i + 2 ) % 3;
+		cross[i] = v1[j]*v2[k] - v1[k]*v2[j];
+	}
+}
+
+// Returns floor( log2( val ) ) for positive val, i.e. the index of the
+// highest set bit. Zero and negative inputs return 0: shifting a negative
+// int right typically never reaches zero, so they are rejected up front
+// rather than looping forever.
+int Q_log2(int val)
+{
+	if ( val <= 0 )
+		return 0;
+
+	int answer = 0;
+	unsigned int remaining = (unsigned int)val;
+	while ( remaining >>= 1 )
+		answer++;
+	return answer;
+}
+
+// Extracts basis vectors from the matrix columns.
+// The matrix is right-handed (x=forward, y=left, z=up). Game code uses a
+// left-handed convention for vectors (forward, right, up), so column 1 is
+// negated to turn "left" into "right".
+// All three output pointers are dereferenced and must be valid.
+void MatrixVectors( const matrix3x4_t &matrix, Vector* pForward, Vector *pRight, Vector *pUp )
+{
+	Vector &vecForward = *pForward;
+	Vector &vecRight = *pRight;
+	Vector &vecUp = *pUp;
+
+	MatrixGetColumn( matrix, 0, vecForward );
+	MatrixGetColumn( matrix, 1, vecRight );
+	MatrixGetColumn( matrix, 2, vecUp );
+
+	vecRight *= -1.0f;
+}
+
+
+// Builds right and up vectors to accompany a forward direction.
+// In the general case right = normalize( forward x (0,0,1) ) and
+// up = normalize( right x forward ). When forward has no x/y part the cross
+// product with (0,0,1) would vanish, so fixed axes are used instead.
+void VectorVectors( const Vector &forward, Vector &right, Vector &up )
+{
+	Assert( s_bMathlibInitialized );
+
+	const bool bHasHorizontalPart = !( forward[0] == 0 && forward[1] == 0 );
+	if ( bHasHorizontalPart )
+	{
+		Vector worldUp;
+		worldUp[0] = 0;
+		worldUp[1] = 0;
+		worldUp[2] = 1.0;
+
+		CrossProduct( forward, worldUp, right );
+		VectorNormalize( right );
+		CrossProduct( right, forward, up );
+		VectorNormalize( up );
+		return;
+	}
+
+	// pitch 90 degrees up/down from identity
+	right[0] = 0;
+	right[1] = -1;
+	right[2] = 0;
+
+	up[0] = -forward[2];
+	up[1] = 0;
+	up[2] = 0;
+}
+
+// Builds the rotation columns of a matrix from a forward direction:
+// column 0 = forward, column 1 = -right (i.e. left), column 2 = up, with
+// right/up supplied by VectorVectors. Column 3 is not written.
+void VectorMatrix( const Vector &forward, matrix3x4_t& matrix)
+{
+	Assert( s_bMathlibInitialized );
+
+	Vector vecRight, vecUp;
+	VectorVectors( forward, vecRight, vecUp );
+
+	MatrixSetColumn( forward, 0, matrix );
+	MatrixSetColumn( -vecRight, 1, matrix );
+	MatrixSetColumn( vecUp, 2, matrix );
+}
+
+
+// Converts a forward direction into Euler angles, written as
+// angles[0] = pitch, angles[1] = yaw, angles[2] = roll (always 0).
+// Pitch and yaw are in degrees and negative results are wrapped by
+// adding 360. A direction with no x/y part yields yaw 0 and pitch 270
+// (positive z) or 90 (otherwise).
+void VectorAngles( const float *forward, float *angles )
+{
+	Assert( s_bMathlibInitialized );
+
+	float yaw, pitch;
+
+	const bool bVertical = ( forward[1] == 0 && forward[0] == 0 );
+	if ( !bVertical )
+	{
+		yaw = (atan2(forward[1], forward[0]) * 180 / M_PI);
+		if (yaw < 0)
+			yaw += 360;
+
+		// length of the projection onto the xy plane
+		const float flatLength = sqrt (forward[0]*forward[0] + forward[1]*forward[1]);
+		pitch = (atan2(-forward[2], flatLength) * 180 / M_PI);
+		if (pitch < 0)
+			pitch += 360;
+	}
+	else
+	{
+		yaw = 0;
+		pitch = ( forward[2] > 0 ) ? 270 : 90;
+	}
+
+	angles[0] = pitch;
+	angles[1] = yaw;
+	angles[2] = 0;
+}
+
+
+/*
+================
+ConcatRotations
+
+Multiplies two 3x3 matrices: out = in1 * in2.
+out must not be the same array as in1 or in2, because it is written
+while the inputs are still being read.
+================
+*/
+void ConcatRotations (const float in1[3][3], const float in2[3][3], float out[3][3])
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out );
+	Assert( in2 != out );
+
+	for ( int iRow = 0; iRow < 3; ++iRow )
+	{
+		for ( int iCol = 0; iCol < 3; ++iCol )
+		{
+			// row iRow of in1 times column iCol of in2
+			out[iRow][iCol] = in1[iRow][0] * in2[0][iCol] + in1[iRow][1] * in2[1][iCol] +
+						in1[iRow][2] * in2[2][iCol];
+		}
+	}
+}
+
+// SIMD concatenation of two 3x4 transforms using aligned loads/stores.
+// All three matrices must be 16-byte aligned (asserted below).
+// Each output row i is
+//     m0[i].x * m1 row 0 + m0[i].y * m1 row 1 + m0[i].z * m1 row 2
+// plus row i of m0 masked with g_SIMD_ComponentMask[3].
+void ConcatTransforms_Aligned( const matrix3x4_t &m0, const matrix3x4_t &m1, matrix3x4_t &out )
+{
+	Assert( (((size_t)&m0) % 16) == 0 );
+	Assert( (((size_t)&m1) % 16) == 0 );
+	Assert( (((size_t)&out) % 16) == 0 );
+
+	// NOTE(review): presumably keeps only the 4th (translation) lane -- confirm against g_SIMD_ComponentMask.
+	fltx4 translationMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);
+
+	// Load every input row before anything is stored.
+	fltx4 lhsRow0 = LoadAlignedSIMD( m0.m_flMatVal[0] );
+	fltx4 lhsRow1 = LoadAlignedSIMD( m0.m_flMatVal[1] );
+	fltx4 lhsRow2 = LoadAlignedSIMD( m0.m_flMatVal[2] );
+
+	fltx4 rhsRow0 = LoadAlignedSIMD( m1.m_flMatVal[0] );
+	fltx4 rhsRow1 = LoadAlignedSIMD( m1.m_flMatVal[1] );
+	fltx4 rhsRow2 = LoadAlignedSIMD( m1.m_flMatVal[2] );
+
+	// first output row
+	fltx4 termX = MulSIMD( SplatXSIMD(lhsRow0), rhsRow0 );
+	fltx4 termY = MulSIMD( SplatYSIMD(lhsRow0), rhsRow1 );
+	fltx4 termZ = MulSIMD( SplatZSIMD(lhsRow0), rhsRow2 );
+	fltx4 result0 = AddSIMD( termX, AddSIMD(termY,termZ) );
+	result0 = AddSIMD( result0, AndSIMD(lhsRow0,translationMask) );
+
+	// second output row
+	termX = MulSIMD( SplatXSIMD(lhsRow1), rhsRow0 );
+	termY = MulSIMD( SplatYSIMD(lhsRow1), rhsRow1 );
+	termZ = MulSIMD( SplatZSIMD(lhsRow1), rhsRow2 );
+	fltx4 result1 = AddSIMD( termX, AddSIMD(termY,termZ) );
+	result1 = AddSIMD( result1, AndSIMD(lhsRow1,translationMask) );
+
+	// third output row
+	termX = MulSIMD( SplatXSIMD(lhsRow2), rhsRow0 );
+	termY = MulSIMD( SplatYSIMD(lhsRow2), rhsRow1 );
+	termZ = MulSIMD( SplatZSIMD(lhsRow2), rhsRow2 );
+	fltx4 result2 = AddSIMD( termX, AddSIMD(termY,termZ) );
+	result2 = AddSIMD( result2, AndSIMD(lhsRow2,translationMask) );
+
+	StoreAlignedSIMD( out.m_flMatVal[0], result0 );
+	StoreAlignedSIMD( out.m_flMatVal[1], result1 );
+	StoreAlignedSIMD( out.m_flMatVal[2], result2 );
+}
+
+/*
+================
+ConcatTransforms
+
+Concatenates two 3x4 transforms using unaligned SIMD loads/stores.
+Each output row i is
+    in1[i].x * in2 row 0 + in1[i].y * in2 row 1 + in1[i].z * in2 row 2
+plus row i of in1 masked with g_SIMD_ComponentMask[3].
+All inputs are loaded before out is stored.
+================
+*/
+
+void ConcatTransforms (const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out)
+{
+#if 0
+	// test for ones that'll be 2x faster
+	if ( (((size_t)&in1) % 16) == 0 && (((size_t)&in2) % 16) == 0 && (((size_t)&out) % 16) == 0 )
+	{
+		ConcatTransforms_Aligned( in1, in2, out );
+		return;
+	}
+#endif
+
+	// NOTE(review): presumably keeps only the 4th (translation) lane -- confirm against g_SIMD_ComponentMask.
+	fltx4 translationMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);
+
+	fltx4 lhsRow0 = LoadUnalignedSIMD( in1.m_flMatVal[0] );
+	fltx4 lhsRow1 = LoadUnalignedSIMD( in1.m_flMatVal[1] );
+	fltx4 lhsRow2 = LoadUnalignedSIMD( in1.m_flMatVal[2] );
+
+	fltx4 rhsRow0 = LoadUnalignedSIMD( in2.m_flMatVal[0] );
+	fltx4 rhsRow1 = LoadUnalignedSIMD( in2.m_flMatVal[1] );
+	fltx4 rhsRow2 = LoadUnalignedSIMD( in2.m_flMatVal[2] );
+
+	// first output row
+	fltx4 termX = MulSIMD( SplatXSIMD(lhsRow0), rhsRow0 );
+	fltx4 termY = MulSIMD( SplatYSIMD(lhsRow0), rhsRow1 );
+	fltx4 termZ = MulSIMD( SplatZSIMD(lhsRow0), rhsRow2 );
+	fltx4 result0 = AddSIMD( termX, AddSIMD(termY,termZ) );
+	result0 = AddSIMD( result0, AndSIMD(lhsRow0,translationMask) );
+
+	// second output row
+	termX = MulSIMD( SplatXSIMD(lhsRow1), rhsRow0 );
+	termY = MulSIMD( SplatYSIMD(lhsRow1), rhsRow1 );
+	termZ = MulSIMD( SplatZSIMD(lhsRow1), rhsRow2 );
+	fltx4 result1 = AddSIMD( termX, AddSIMD(termY,termZ) );
+	result1 = AddSIMD( result1, AndSIMD(lhsRow1,translationMask) );
+
+	// third output row
+	termX = MulSIMD( SplatXSIMD(lhsRow2), rhsRow0 );
+	termY = MulSIMD( SplatYSIMD(lhsRow2), rhsRow1 );
+	termZ = MulSIMD( SplatZSIMD(lhsRow2), rhsRow2 );
+	fltx4 result2 = AddSIMD( termX, AddSIMD(termY,termZ) );
+	result2 = AddSIMD( result2, AndSIMD(lhsRow2,translationMask) );
+
+	// write to output
+	StoreUnalignedSIMD( out.m_flMatVal[0], result0 );
+	StoreUnalignedSIMD( out.m_flMatVal[1], result1 );
+	StoreUnalignedSIMD( out.m_flMatVal[2], result2 );
+}
+
+
+/*
+===================
+FloorDivMod
+
+Returns mathematically correct (floor-based) quotient and remainder for
+numer and denom, both of which should contain no fractional part. The
+quotient must fit in 32 bits.
+
+denom is expected to be positive (checked only in PARANOID builds). For a
+negative numer the division is carried out on -numer and the result is then
+adjusted so that the remainder is non-negative.
+====================
+*/
+
+void FloorDivMod (double numer, double denom, int *quotient,
+		int *rem)
+{
+	Assert( s_bMathlibInitialized );
+	int		q, r;
+	double	x;
+
+#ifdef PARANOID
+	// denom is a double, so it has to be printed with %f (not %d)
+	if (denom <= 0.0)
+		Sys_Error ("FloorDivMod: bad denominator %f\n", denom);
+
+//	if ((floor(numer) != numer) || (floor(denom) != denom))
+//		Sys_Error ("FloorDivMod: non-integer numer or denom %f %f\n",
+//				numer, denom);
+#endif
+
+	if (numer >= 0.0)
+	{
+
+		x = floor(numer / denom);
+		q = (int)x;
+		r = Floor2Int(numer - (x * denom));
+	}
+	else
+	{
+		//
+		// perform operations with positive values, and fix mod to make floor-based
+		//
+		x = floor(-numer / denom);
+		q = -(int)x;
+		r = Floor2Int(-numer - (x * denom));
+		if (r != 0)
+		{
+			// round the quotient toward -infinity and flip the remainder
+			q--;
+			r = (int)denom - r;
+		}
+	}
+
+	*quotient = q;
+	*rem = r;
+}
+
+
+/*
+===================
+GreatestCommonDivisor
+
+Euclid's algorithm. Returns the greatest common divisor of |i1| and |i2|;
+when one argument is zero the magnitude of the other is returned, and
+(0, 0) gives 0. Negative arguments are handled through their magnitudes
+(the recursive formulation never terminated for them).
+====================
+*/
+int GreatestCommonDivisor (int i1, int i2)
+{
+	Assert( s_bMathlibInitialized );
+
+	// Take magnitudes in unsigned arithmetic so that the most negative int
+	// can be negated without overflow.
+	unsigned int a = ( i1 < 0 ) ? ( 0u - (unsigned int)i1 ) : (unsigned int)i1;
+	unsigned int b = ( i2 < 0 ) ? ( 0u - (unsigned int)i2 ) : (unsigned int)i2;
+
+	while ( b != 0 )
+	{
+		const unsigned int remainder = a % b;
+		a = b;
+		b = remainder;
+	}
+
+	return (int)a;
+}
+
+
+// Returns true when val is a denormal (subnormal) float: the exponent bits
+// are all zero while the mantissa is non-zero. Zero itself is not denormal.
+// The bit pattern is read with memcpy rather than a pointer cast so that
+// the access does not violate the strict-aliasing rules.
+bool IsDenormal( const float &val )
+{
+	int x; // needs 32-bit int
+	memcpy( &x, &val, sizeof( x ) );
+
+	const int abs_mantissa = x & 0x007FFFFF;
+	const int biased_exponent = x & 0x7F800000;
+	
+	return  ( biased_exponent == 0 && abs_mantissa != 0 );
+}
+
+// Packs the signs of the plane normal into a 3-bit mask, for the fast box
+// on planeside test: bit j (value 1<<j) is set when normal[j] is negative.
+int SignbitsForPlane (cplane_t *out)
+{
+	Assert( s_bMathlibInitialized );
+
+	int nSignBits = 0;
+
+	if ( out->normal[0] < 0 )
+		nSignBits |= 1;
+	if ( out->normal[1] < 0 )
+		nSignBits |= 2;
+	if ( out->normal[2] < 0 )
+		nSignBits |= 4;
+
+	return nSignBits;
+}
+
+/*
+==================
+BoxOnPlaneSide
+
+Classifies the axis-aligned box [emins, emaxs] against plane p.
+
+Returns 1, 2, or 1 + 2:
+  bit 1 is set when the box corner furthest along the plane normal lies at
+        or beyond p->dist,
+  bit 2 is set when the opposite corner lies short of p->dist,
+  so 3 means the box straddles the plane.
+
+p->signbits selects which of emins/emaxs supplies each component of the
+two extreme corners (bit j set means the emins component is the one
+furthest along the normal on axis j).
+==================
+*/
+int __cdecl BoxOnPlaneSide (const float *emins, const float *emaxs, const cplane_t *p)
+{
+	Assert( s_bMathlibInitialized );
+	float	dist1, dist2;
+	int		sides;
+
+	// fast axial cases
+	// (p->type < 3 is used directly as the axis index, so only that one
+	// coordinate of the box has to be compared with the plane distance)
+	if (p->type < 3)
+	{
+		if (p->dist <= emins[p->type])
+			return 1;
+		if (p->dist >= emaxs[p->type])
+			return 2;
+		return 3;
+	}
+	
+	// general case
+	// dist1 = normal . (corner chosen to maximize the projection)
+	// dist2 = normal . (diagonally opposite corner)
+	switch (p->signbits)
+	{
+	case 0:
+		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
+		dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
+		break;
+	case 1:
+		dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
+		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
+		break;
+	case 2:
+		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
+		dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
+		break;
+	case 3:
+		dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
+		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
+		break;
+	case 4:
+		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
+		dist2 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
+		break;
+	case 5:
+		dist1 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emins[2];
+		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emaxs[2];
+		break;
+	case 6:
+		dist1 = p->normal[0]*emaxs[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
+		dist2 = p->normal[0]*emins[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
+		break;
+	case 7:
+		dist1 = p->normal[0]*emins[0] + p->normal[1]*emins[1] + p->normal[2]*emins[2];
+		dist2 = p->normal[0]*emaxs[0] + p->normal[1]*emaxs[1] + p->normal[2]*emaxs[2];
+		break;
+	default:
+		// signbits outside 0..7 is a caller error
+		dist1 = dist2 = 0;		// shut up compiler
+		Assert( 0 );
+		break;
+	}
+
+	// Combine the two corner tests into the 1 / 2 / 3 result.
+	sides = 0;
+	if (dist1 >= p->dist)
+		sides = 1;
+	if (dist2 < p->dist)
+		sides |= 2;
+
+	// With finite inputs at least one of the two tests is expected to pass.
+	Assert( sides != 0 );
+
+	return sides;
+}
+
+//-----------------------------------------------------------------------------
+// Euler QAngle -> forward basis vector only.
+// Angles are in degrees; only PITCH and YAW are used. forward must be a
+// valid pointer.
+//-----------------------------------------------------------------------------
+
+void AngleVectors (const QAngle &angles, Vector *forward)
+{
+	Assert( s_bMathlibInitialized );
+	Assert( forward );
+	
+	float sinPitch, sinYaw, cosPitch, cosYaw;
+	
+	SinCos( DEG2RAD( angles[YAW] ), &sinYaw, &cosYaw );
+	SinCos( DEG2RAD( angles[PITCH] ), &sinPitch, &cosPitch );
+	
+	forward->x = cosPitch*cosYaw;
+	forward->y = cosPitch*sinYaw;
+	forward->z = -sinPitch;
+}
+
+//-----------------------------------------------------------------------------
+// Euler QAngle -> Basis Vectors.
+// Angles are in degrees. Each output pointer is optional: a null pointer
+// means that vector is not computed.
+//-----------------------------------------------------------------------------
+void AngleVectors( const QAngle &angles, Vector *forward, Vector *right, Vector *up )
+{
+	Assert( s_bMathlibInitialized );
+	
+	float sinRoll, sinPitch, sinYaw, cosRoll, cosPitch, cosYaw;
+
+#ifdef _X360
+	// All three sines/cosines are evaluated in one SIMD call; lanes 0, 1, 2
+	// are read back as pitch, yaw, roll.
+	fltx4 radians, scale, sine, cosine;
+	radians = LoadUnaligned3SIMD( angles.Base() );
+	scale = ReplicateX4( M_PI_F / 180.f ); 
+	radians = MulSIMD( radians, scale );
+	SinCos3SIMD( sine, cosine, radians ); 	
+	sinPitch = SubFloat( sine, 0 );	sinYaw = SubFloat( sine, 1 );	sinRoll = SubFloat( sine, 2 );
+	cosPitch = SubFloat( cosine, 0 );	cosYaw = SubFloat( cosine, 1 );	cosRoll = SubFloat( cosine, 2 );
+#else
+	SinCos( DEG2RAD( angles[YAW] ), &sinYaw, &cosYaw );
+	SinCos( DEG2RAD( angles[PITCH] ), &sinPitch, &cosPitch );
+	SinCos( DEG2RAD( angles[ROLL] ), &sinRoll, &cosRoll );
+#endif
+
+	if (forward)
+	{
+		forward->x = cosPitch*cosYaw;
+		forward->y = cosPitch*sinYaw;
+		forward->z = -sinPitch;
+	}
+
+	if (right)
+	{
+		right->x = (-1*sinRoll*sinPitch*cosYaw+-1*cosRoll*-sinYaw);
+		right->y = (-1*sinRoll*sinPitch*sinYaw+-1*cosRoll*cosYaw);
+		right->z = -1*sinRoll*cosPitch;
+	}
+
+	if (up)
+	{
+		up->x = (cosRoll*sinPitch*cosYaw+-sinRoll*-sinYaw);
+		up->y = (cosRoll*sinPitch*sinYaw+-sinRoll*cosYaw);
+		up->z = cosRoll*cosPitch;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Euler QAngle -> Basis Vectors transposed.
+// Angles are in degrees. Each output pointer is optional: a null pointer
+// means that vector is not computed.
+//-----------------------------------------------------------------------------
+
+void AngleVectorsTranspose (const QAngle &angles, Vector *forward, Vector *right, Vector *up)
+{
+	Assert( s_bMathlibInitialized );
+	float sinRoll, sinPitch, sinYaw, cosRoll, cosPitch, cosYaw;
+	
+	SinCos( DEG2RAD( angles[YAW] ), &sinYaw, &cosYaw );
+	SinCos( DEG2RAD( angles[PITCH] ), &sinPitch, &cosPitch );
+	SinCos( DEG2RAD( angles[ROLL] ), &sinRoll, &cosRoll );
+
+	if (forward)
+	{
+		forward->x	= cosPitch*cosYaw;
+		forward->y	= (sinRoll*sinPitch*cosYaw+cosRoll*-sinYaw);
+		forward->z	= (cosRoll*sinPitch*cosYaw+-sinRoll*-sinYaw);
+	}
+	
+	if (right)
+	{
+		right->x	= cosPitch*sinYaw;
+		right->y	= (sinRoll*sinPitch*sinYaw+cosRoll*cosYaw);
+		right->z	= (cosRoll*sinPitch*sinYaw+-sinRoll*cosYaw);
+	}
+
+	if (up)
+	{
+		up->x		= -sinPitch;
+		up->y		= sinRoll*cosPitch;
+		up->z		= cosRoll*cosPitch;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Forward direction vector -> Euler angles.
+// Writes angles[0] = pitch, angles[1] = yaw, angles[2] = 0 (no roll).
+// Pitch and yaw are in degrees and negative results are wrapped by
+// adding 360. A direction with no x/y part yields yaw 0 and pitch 270
+// (positive z) or 90 (otherwise).
+//-----------------------------------------------------------------------------
+
+void VectorAngles( const Vector& forward, QAngle &angles )
+{
+	Assert( s_bMathlibInitialized );
+
+	float yaw, pitch;
+
+	const bool bVertical = ( forward[1] == 0 && forward[0] == 0 );
+	if ( !bVertical )
+	{
+		yaw = (atan2(forward[1], forward[0]) * 180 / M_PI);
+		if (yaw < 0)
+			yaw += 360;
+
+		// length of the projection onto the xy plane
+		const float flatLength = FastSqrt (forward[0]*forward[0] + forward[1]*forward[1]);
+		pitch = (atan2(-forward[2], flatLength) * 180 / M_PI);
+		if (pitch < 0)
+			pitch += 360;
+	}
+	else
+	{
+		yaw = 0;
+		pitch = ( forward[2] > 0 ) ? 270 : 90;
+	}
+	
+	angles[0] = pitch;
+	angles[1] = yaw;
+	angles[2] = 0;
+}
+
+//-----------------------------------------------------------------------------
+// Forward direction vector with a reference up vector -> Euler angles.
+// Unlike the two-argument version this also produces a roll, derived from
+// left = normalize( pseudoup x forward ). When forward is (nearly) parallel
+// to the z axis, yaw is taken from the left vector and roll is set to 0.
+//-----------------------------------------------------------------------------
+
+void VectorAngles( const Vector &forward, const Vector &pseudoup, QAngle &angles )
+{
+	Assert( s_bMathlibInitialized );
+
+	Vector vecLeft;
+	CrossProduct( pseudoup, forward, vecLeft );
+	VectorNormalizeFast( vecLeft );		
+	
+	// length of forward projected onto the xy plane
+	const float flFlatLength = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );
+
+	// forward is mostly Z, gimbal lock
+	if ( !( flFlatLength > 0.001f ) )
+	{
+		// (yaw)	y = ATAN( -left.x, left.y );	-- forward is mostly z, so the left vector supplies the yaw.
+		// The signs are negated relative to the MatrixAngles code this was copied from,
+		// which was 180 degrees off (Dave Kircher).
+		angles[1] = RAD2DEG( atan2f( -vecLeft[0], vecLeft[1] ) );
+
+		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
+		// The engine does pitch inverted from this, but we always end up negating it in the DLL
+		// UNDONE: Fix the engine to make it consistent
+		angles[0] = RAD2DEG( atan2f( -forward[2], flFlatLength ) );
+
+		// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
+		angles[2] = 0;
+		return;
+	}
+
+	// enough horizontal extent to read the angles off directly
+
+	// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
+	angles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );
+
+	// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
+	// The engine does pitch inverted from this, but we always end up negating it in the DLL
+	// UNDONE: Fix the engine to make it consistent
+	angles[0] = RAD2DEG( atan2f( -forward[2], flFlatLength ) );
+
+	// (roll)	z = ATAN( left.z, up.z );
+	const float flUpZ = (vecLeft[1] * forward[0]) - (vecLeft[0] * forward[1]);
+	angles[2] = RAD2DEG( atan2f( vecLeft[2], flUpZ ) );
+}
+
+// Resets the matrix to identity: every entry is cleared and the three
+// diagonal entries of the 3x3 part are then set to 1.
+void SetIdentityMatrix( matrix3x4_t& matrix )
+{
+	memset( matrix.Base(), 0, sizeof(float)*3*4 );
+
+	for ( int iDiag = 0; iDiag < 3; ++iDiag )
+	{
+		matrix[iDiag][iDiag] = 1.0;
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Builds a scale matrix: x, y, z on the diagonal of the 3x3 part and zero
+// everywhere else, including the translation column.
+//-----------------------------------------------------------------------------
+void SetScaleMatrix( float x, float y, float z, matrix3x4_t &dst )
+{
+	const float flDiagonal[3] = { x, y, z };
+
+	for ( int iRow = 0; iRow < 3; ++iRow )
+	{
+		for ( int iCol = 0; iCol < 4; ++iCol )
+		{
+			dst[iRow][iCol] = ( iRow == iCol ) ? flDiagonal[iRow] : 0.0f;
+		}
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Builds the matrix for a counterclockwise rotation about an arbitrary axis.
+//
+//		   | ax2 + (1 - ax2)cosQ		axay(1 - cosQ) - azsinQ		azax(1 - cosQ) + aysinQ |
+// Ra(Q) = | axay(1 - cosQ) + azsinQ	ay2 + (1 - ay2)cosQ			ayaz(1 - cosQ) - axsinQ |
+//		   | azax(1 - cosQ) - aysinQ	ayaz(1 - cosQ) + axsinQ		az2 + (1 - az2)cosQ     |
+//          
+// Input  : vAxisOfRot - axis to rotate about (NOTE(review): the formula
+//						 presumably expects a unit-length axis -- confirm with callers)
+//			angleDegrees - rotation angle, in degrees
+// Output : dst - receives the rotation; its translation column is zeroed
+//-----------------------------------------------------------------------------
+void MatrixBuildRotationAboutAxis( const Vector &vAxisOfRot, float angleDegrees, matrix3x4_t &dst )
+{
+	const float flRadians = angleDegrees * ( M_PI / 180.0 );
+	const float flSin = sin( flRadians );
+	const float flCos = cos( flRadians );
+
+	const float flXSquared = vAxisOfRot[0] * vAxisOfRot[0];
+	const float flYSquared = vAxisOfRot[1] * vAxisOfRot[1];
+	const float flZSquared = vAxisOfRot[2] * vAxisOfRot[2];
+
+	// Column 0:
+	dst[0][0] = flXSquared + (1 - flXSquared) * flCos;
+	dst[1][0] = vAxisOfRot[0] * vAxisOfRot[1] * (1 - flCos) + vAxisOfRot[2] * flSin;
+	dst[2][0] = vAxisOfRot[2] * vAxisOfRot[0] * (1 - flCos) - vAxisOfRot[1] * flSin;
+
+	// Column 1:
+	dst[0][1] = vAxisOfRot[0] * vAxisOfRot[1] * (1 - flCos) - vAxisOfRot[2] * flSin;
+	dst[1][1] = flYSquared + (1 - flYSquared) * flCos;
+	dst[2][1] = vAxisOfRot[1] * vAxisOfRot[2] * (1 - flCos) + vAxisOfRot[0] * flSin;
+
+	// Column 2:
+	dst[0][2] = vAxisOfRot[2] * vAxisOfRot[0] * (1 - flCos) + vAxisOfRot[1] * flSin;
+	dst[1][2] = vAxisOfRot[1] * vAxisOfRot[2] * (1 - flCos) - vAxisOfRot[0] * flSin;
+	dst[2][2] = flZSquared + (1 - flZSquared) * flCos;
+
+	// Column 3: no translation
+	dst[0][3] = 0;
+	dst[1][3] = 0;
+	dst[2][3] = 0;
+}
+
+
+//-----------------------------------------------------------------------------
+// Computes the transpose of the 3x3 part in place by swapping the three
+// off-diagonal pairs. Column 3 is not modified.
+//-----------------------------------------------------------------------------
+void MatrixTranspose( matrix3x4_t& mat )
+{
+	for ( int iRow = 0; iRow < 3; ++iRow )
+	{
+		for ( int iCol = iRow + 1; iCol < 3; ++iCol )
+		{
+			const vec_t flUpper = mat[iRow][iCol];
+			mat[iRow][iCol] = mat[iCol][iRow];
+			mat[iCol][iRow] = flUpper;
+		}
+	}
+}
+
+// Writes the transpose of the 3x3 part of src into dst and zeroes dst's
+// translation column. src and dst may be the same matrix; that case is
+// handled by swapping, since a straight element-by-element copy would
+// overwrite entries before they have been read.
+void MatrixTranspose( const matrix3x4_t& src, matrix3x4_t& dst )
+{
+	if ( &src == &dst )
+	{
+		// in place: swap the off-diagonal pairs
+		float tmp;
+		tmp = dst[0][1]; dst[0][1] = dst[1][0]; dst[1][0] = tmp;
+		tmp = dst[0][2]; dst[0][2] = dst[2][0]; dst[2][0] = tmp;
+		tmp = dst[1][2]; dst[1][2] = dst[2][1]; dst[2][1] = tmp;
+	}
+	else
+	{
+		dst[0][0] = src[0][0]; dst[0][1] = src[1][0]; dst[0][2] = src[2][0];
+		dst[1][0] = src[0][1]; dst[1][1] = src[1][1]; dst[1][2] = src[2][1];
+		dst[2][0] = src[0][2]; dst[2][1] = src[1][2]; dst[2][2] = src[2][2];
+	}
+
+	// the transposed matrix carries no translation
+	dst[0][3] = 0.0f;
+	dst[1][3] = 0.0f;
+	dst[2][3] = 0.0f;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: converts euler angles into a matrix
+// Input  : angles - rotation as a RadianEuler
+//			position - written into the translation column (column 3)
+// Output : matrix - the basis vectors for the rotations will be in the
+//			columns as follows:
+//			matrix[][0] is forward
+//			matrix[][1] is left
+//			matrix[][2] is up
+//-----------------------------------------------------------------------------
+void AngleMatrix( RadianEuler const &angles, const Vector &position, matrix3x4_t& matrix )
+{
+	// rotation first (this also clears column 3) ...
+	AngleMatrix( angles, matrix );
+
+	// ... then drop the position into column 3
+	MatrixSetColumn( position, 3, matrix );
+}
+
+// Converts a RadianEuler to a rotation matrix by re-ordering its components
+// into a degree-based QAngle (y, z, x become the three QAngle arguments)
+// and forwarding to the QAngle overload.
+void AngleMatrix( const RadianEuler& angles, matrix3x4_t& matrix )
+{
+	const QAngle anglesInDegrees( RAD2DEG( angles.y ), RAD2DEG( angles.z ), RAD2DEG( angles.x ) );
+	AngleMatrix( anglesInDegrees, matrix );
+}
+
+
+// Builds a full transform: rotation from the Euler angles (degrees) and
+// position written into the translation column (column 3).
+void AngleMatrix( const QAngle &angles, const Vector &position, matrix3x4_t& matrix )
+{
+	// rotation first (this also clears column 3) ...
+	AngleMatrix( angles, matrix );
+
+	// ... then drop the position into column 3
+	MatrixSetColumn( position, 3, matrix );
+}
+
+// Converts Euler angles (degrees) to a rotation matrix.
+// Column 0 receives the forward vector, columns 1 and 2 the other two basis
+// vectors, and the translation column is zeroed.
+void AngleMatrix( const QAngle &angles, matrix3x4_t& matrix )
+{
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "AngleMatrix", "Mathlib" );
+#endif
+	Assert( s_bMathlibInitialized );
+
+	float sinRoll, sinPitch, sinYaw, cosRoll, cosPitch, cosYaw;
+
+#ifdef _X360
+	// All three sines/cosines are evaluated in one SIMD call; lanes 0, 1, 2
+	// are read back as pitch, yaw, roll.
+	fltx4 radians, scale, sine, cosine;
+	radians = LoadUnaligned3SIMD( angles.Base() );
+	scale = ReplicateX4( M_PI_F / 180.f ); 
+	radians = MulSIMD( radians, scale );
+	SinCos3SIMD( sine, cosine, radians ); 	
+
+	sinPitch = SubFloat( sine, 0 );	sinYaw = SubFloat( sine, 1 );	sinRoll = SubFloat( sine, 2 );
+	cosPitch = SubFloat( cosine, 0 );	cosYaw = SubFloat( cosine, 1 );	cosRoll = SubFloat( cosine, 2 );
+#else
+	SinCos( DEG2RAD( angles[YAW] ), &sinYaw, &cosYaw );
+	SinCos( DEG2RAD( angles[PITCH] ), &sinPitch, &cosPitch );
+	SinCos( DEG2RAD( angles[ROLL] ), &sinRoll, &cosRoll );
+#endif
+
+	// matrix = (YAW * PITCH) * ROLL
+	matrix[0][0] = cosPitch*cosYaw;
+	matrix[1][0] = cosPitch*sinYaw;
+	matrix[2][0] = -sinPitch;
+
+	// products shared by columns 1 and 2
+	float cosRollCosYaw = cosRoll*cosYaw;
+	float cosRollSinYaw = cosRoll*sinYaw;
+	float sinRollCosYaw = sinRoll*cosYaw;
+	float sinRollSinYaw = sinRoll*sinYaw;
+	matrix[0][1] = sinPitch*sinRollCosYaw-cosRollSinYaw;
+	matrix[1][1] = sinPitch*sinRollSinYaw+cosRollCosYaw;
+	matrix[2][1] = sinRoll*cosPitch;
+
+	matrix[0][2] = (sinPitch*cosRollCosYaw+sinRollSinYaw);
+	matrix[1][2] = (sinPitch*cosRollSinYaw-sinRollCosYaw);
+	matrix[2][2] = cosRoll*cosPitch;
+
+	// no translation
+	matrix[0][3] = 0.0f;
+	matrix[1][3] = 0.0f;
+	matrix[2][3] = 0.0f;
+}
+
+// Inverse-rotation counterpart of AngleMatrix for a RadianEuler: re-orders
+// the components into a degree-based QAngle (y, z, x become the three
+// QAngle arguments) and forwards to the QAngle overload.
+void AngleIMatrix( const RadianEuler& angles, matrix3x4_t& matrix )
+{
+	const QAngle anglesInDegrees( RAD2DEG( angles.y ), RAD2DEG( angles.z ), RAD2DEG( angles.x ) );
+	AngleIMatrix( anglesInDegrees, matrix );
+}
+
+// Converts Euler angles (degrees) to the inverse rotation matrix: the same
+// basis vectors AngleMatrix stores in columns are stored in rows here, i.e.
+// the transposed rotation. The translation column is zeroed.
+void AngleIMatrix (const QAngle& angles, matrix3x4_t& matrix )
+{
+	Assert( s_bMathlibInitialized );
+	float sinRoll, sinPitch, sinYaw, cosRoll, cosPitch, cosYaw;
+	
+	SinCos( DEG2RAD( angles[YAW] ), &sinYaw, &cosYaw );
+	SinCos( DEG2RAD( angles[PITCH] ), &sinPitch, &cosPitch );
+	SinCos( DEG2RAD( angles[ROLL] ), &sinRoll, &cosRoll );
+
+	// matrix = (YAW * PITCH) * ROLL
+	// row 0: forward
+	matrix[0][0] = cosPitch*cosYaw;
+	matrix[0][1] = cosPitch*sinYaw;
+	matrix[0][2] = -sinPitch;
+	// row 1
+	matrix[1][0] = sinRoll*sinPitch*cosYaw+cosRoll*-sinYaw;
+	matrix[1][1] = sinRoll*sinPitch*sinYaw+cosRoll*cosYaw;
+	matrix[1][2] = sinRoll*cosPitch;
+	// row 2
+	matrix[2][0] = (cosRoll*sinPitch*cosYaw+-sinRoll*-sinYaw);
+	matrix[2][1] = (cosRoll*sinPitch*sinYaw+-sinRoll*cosYaw);
+	matrix[2][2] = cosRoll*cosPitch;
+	// no translation
+	matrix[0][3] = 0.f;
+	matrix[1][3] = 0.f;
+	matrix[2][3] = 0.f;
+}
+
+// Builds the inverse of the transform described by angles + position: the
+// inverse rotation goes into the 3x3 part, and the translation column is
+// the position rotated by that inverse rotation, negated.
+void AngleIMatrix (const QAngle &angles, const Vector &position, matrix3x4_t &mat )
+{
+	AngleIMatrix( angles, mat );
+
+	Vector vecInversePosition;
+	VectorRotate( position, mat, vecInversePosition );
+	vecInversePosition *= -1.0f;
+
+	MatrixSetColumn( vecInversePosition, 3, mat );
+}
+
+
+//-----------------------------------------------------------------------------
+// Bounding box construction methods
+//-----------------------------------------------------------------------------
+
+// Resets a bounding box to "empty": mins start at +99999 and maxs at -99999
+// so that the first point added replaces both.
+void ClearBounds (Vector& mins, Vector& maxs)
+{
+	Assert( s_bMathlibInitialized );
+
+	for ( int iAxis = 0; iAxis < 3; ++iAxis )
+	{
+		mins[iAxis] = 99999;
+		maxs[iAxis] = -99999;
+	}
+}
+
+// Grows the bounding box [mins, maxs] so that it contains point v; each axis
+// is handled independently.
+void AddPointToBounds (const Vector& v, Vector& mins, Vector& maxs)
+{
+	Assert( s_bMathlibInitialized );
+
+	for ( int iAxis = 0; iAxis < 3; ++iAxis )
+	{
+		const vec_t flCoord = v[iAxis];
+
+		if ( flCoord < mins[iAxis] )
+		{
+			mins[iAxis] = flCoord;
+		}
+		if ( flCoord > maxs[iAxis] )
+		{
+			maxs[iAxis] = flCoord;
+		}
+	}
+}
+
+// Solves a x^2 + b x + c = 0 for real roots.
+// Returns false (roots untouched) when there is no real solution. When
+// a == 0 the equation is treated as linear and both roots receive the same
+// value; when a, b and c are all zero both roots are set to 0.
+bool SolveQuadratic( float a, float b, float c, float &root1, float &root2 )
+{
+	Assert( s_bMathlibInitialized );
+
+	if ( a == 0 )
+	{
+		// no x^2 component, it's a linear system
+		if ( b != 0 )
+		{
+			root1 = root2 = -c / b;
+			return true;
+		}
+
+		// a and b are both zero: only "0 = 0" has a solution
+		if ( c != 0 )
+			return false;
+
+		root1 = root2 = 0;
+		return true;
+	}
+
+	const float flDiscriminant = b * b - 4.0f * a * c;
+	if ( flDiscriminant < 0 )
+	{
+		// the roots would be complex
+		return false;
+	}
+
+	const float flSqrtDiscriminant = sqrt( flDiscriminant );
+	root1 = (-b + flSqrtDiscriminant) / (2.0f * a);
+	root2 = (-b - flSqrtDiscriminant) / (2.0f * a);
+	return true;
+}
+
+// Fits a parabola through three points: finds "a, b, c" such that
+// "a x^2 + b x + c = y" holds for (x1,y1), (x2,y2) and (x3,y3).
+// Returns false, leaving a, b and c untouched, when two of the x values
+// coincide.
+bool SolveInverseQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c )
+{
+	const float flDenominator = (x1 - x2)*(x1 - x3)*(x2 - x3);
+
+	// FIXME: check with some sort of epsilon
+	if ( flDenominator == 0.0 )
+	{
+		return false;
+	}
+
+	a = (x3*(-y1 + y2) + x2*(y1 - y3) + x1*(-y2 + y3)) / flDenominator;
+	b = (x3*x3*(y1 - y2) + x1*x1*(y2 - y3) + x2*x2*(-y1 + y3)) / flDenominator;
+	c = (x1*x3*(-x1 + x3)*y2 + x2*x2*(x3*y1 - x1*y3) + x2*(-(x3*x3*y1) + x1*x1*y3)) / flDenominator;
+
+	return true;
+}
+
+// Fits "a x^2 + b x + c = y" through three points like SolveInverseQuadratic,
+// but when the samples (sorted by x) are strictly increasing or strictly
+// decreasing in y, the fitted curve is also kept monotonic between the
+// outer points. If the exact fit would turn around inside the interval, the
+// middle sample is blended toward the straight line through the end points,
+// in 5% steps, until the slope has the right sign at both ends.
+// Returns false only when SolveInverseQuadratic fails.
+bool SolveInverseQuadraticMonotonic( float x1, float y1, float x2, float y2, float x3, float y3, 
+									 float &a, float &b, float &c )
+{
+	// first, sort parameters so that x1 <= x2 <= x3
+	if (x1>x2)
+	{
+		V_swap(x1,x2);
+		V_swap(y1,y2);
+	}
+	if (x2>x3)
+	{
+		V_swap(x2,x3);
+		V_swap(y2,y3);
+	}
+	if (x1>x2)
+	{
+		V_swap(x1,x2);
+		V_swap(y1,y2);
+	}
+
+	const bool bIncreasing = ( (y1<y2) && (y2<y3) );
+	const bool bDecreasing = ( (y1>y2) && (y2>y3) );
+
+	// this code is not fast. what it does is when the curve would be non-monotonic, slowly shifts
+	// the center point closer to the linear line between the endpoints. Should anyone need this
+	// function to be actually fast, it would be fairly easy to change it to be so.
+	// An integer step counter guarantees that the last attempt uses a blend of
+	// exactly 1 (fully linear), which an accumulated float could miss.
+	const int nBlendSteps = 20;
+	for ( int iStep = 0; iStep <= nBlendSteps; ++iStep )
+	{
+		const float blend_to_linear_factor = (float)iStep / (float)nBlendSteps;
+		float tempy2=(1-blend_to_linear_factor)*y2+blend_to_linear_factor*FLerp(y1,y3,x1,x3,x2);
+		if (!SolveInverseQuadratic(x1,y1,x2,tempy2,x3,y3,a,b,c))
+			return false;
+
+		// samples that are not monotonic themselves impose no constraint
+		if ( !bIncreasing && !bDecreasing )
+			return true;
+
+		// d/dx (a x^2 + b x + c) = 2 a x + b. The slope of a parabola is linear in x,
+		// so the curve is monotonic on [x1,x3] exactly when the slope has the same
+		// sign at both end points.
+		const float slopeAtStart = 2.0f*a*x1 + b;
+		const float slopeAtEnd = 2.0f*a*x3 + b;
+		if ( bIncreasing )
+		{
+			if ( slopeAtStart>=0.0f && slopeAtEnd>=0.0f )
+				return true;
+		}
+		else
+		{
+			if ( slopeAtStart<=0.0f && slopeAtEnd<=0.0f )
+				return true;
+		}
+	}
+
+	// fall back to the last (fully linear) fit
+	return true;
+}
+
+
+// Fits a reciprocal parabola through three points: finds "a, b, c" such that
+// "1/(a x^2 + b x + c ) = y" holds for (x1,y1), (x2,y2) and (x3,y3).
+// Returns false, leaving a, b and c untouched, when two x values coincide
+// or any y is zero.
+bool SolveInverseReciprocalQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c )
+{
+	const float flDenominator = (x1 - x2)*(x1 - x3)*(x2 - x3)*y1*y2*y3;
+
+	// FIXME: check with some sort of epsilon
+	if ( flDenominator == 0.0 )
+	{
+		return false;
+	}
+
+	a = (x1*y1*(y2 - y3) + x3*(y1 - y2)*y3 + x2*y2*(-y1 + y3)) / flDenominator;
+	b = (x2*x2*y2*(y1 - y3) + x3*x3*(-y1 + y2)*y3 + x1*x1*y1*(-y2 + y3)) / flDenominator;
+	c = (x2*(x2 - x3)*x3*y2*y3 + x1*x1*y1*(x2*y2 - x3*y3) + x1*(-(x2*x2*y1*y2) + x3*x3*y1*y3)) / flDenominator;
+
+	return true;
+}
+
+
+// Rotate a vector around the Z axis (YAW) by flYaw degrees; z is copied
+// through unchanged. in and out may be the same vector: the input is read
+// completely before any component of out is written.
+void VectorYawRotate( const Vector &in, float flYaw, Vector &out)
+{
+	Assert( s_bMathlibInitialized );
+
+	// snapshot the input so that aliasing with out is harmless
+	const float flInX = in.x;
+	const float flInY = in.y;
+	const float flInZ = in.z;
+
+	float flSinYaw, flCosYaw;
+	SinCos( DEG2RAD(flYaw), &flSinYaw, &flCosYaw );
+
+	out.x = flInX * flCosYaw - flInY * flSinYaw;
+	out.y = flInX * flSinYaw + flInY * flCosYaw;
+	out.z = flInZ;
+}
+
+
+
+// Bias curve: returns x raised to log( biasAmt ) / log( 0.5 ), so that
+// an input of 0.5 maps to (approximately) biasAmt and a biasAmt of 0.5
+// leaves x (nearly) unchanged.
+// The exponent is computed on every call. The previous static cache never
+// recorded which biasAmt it belonged to, so it recomputed every time anyway,
+// could hand back a stale exponent for biasAmt == -1, and made the function
+// unsafe to call from several threads.
+float Bias( float x, float biasAmt )
+{
+	const float flExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
+	return pow( x, flExponent );
+}
+
+
+// Gain curve assembled from two mirrored Bias curves that meet at x = 0.5:
+// the lower half uses Bias directly, the upper half uses it reflected.
+// WARNING: this is only as thread safe as Bias is.
+float Gain( float x, float biasAmt )
+{
+	const bool bLowerHalf = ( x < 0.5 );
+
+	if ( bLowerHalf )
+	{
+		return 0.5f * Bias( 2*x, 1-biasAmt );
+	}
+
+	return 1 - 0.5f * Bias( 2 - 2*x, 1-biasAmt );
+}
+
+
+// Cosine ease curve: (1 - cos( x * pi )) / 2, which is 0 at x = 0,
+// 0.5 at x = 0.5 and 1 at x = 1.
+float SmoothCurve( float x )
+{
+	const double flCosine = cos( x * M_PI );
+	return (1 - flCosine) * 0.5f;
+}
+
+
+// Piecewise-linear remap that sends flPeakPos to 0.5: values below the peak
+// are scaled into [0, 0.5), values at or above it into [0.5, 1].
+inline float MovePeak( float x, float flPeakPos )
+{
+	// Todo: make this higher-order?
+	return ( x < flPeakPos )
+		? x * 0.5f / flPeakPos
+		: 0.5 + 0.5 * (x - flPeakPos) / (1 - flPeakPos);
+}
+
+
+// SmoothCurve with an adjustable peak: x is first remapped with MovePeak
+// (using flPeakPos), then passed through Gain (using flPeakSharpness), and
+// the result is fed to SmoothCurve.
+float SmoothCurve_Tweak( float x, float flPeakPos, float flPeakSharpness )
+{
+	const float flPeakMoved = MovePeak( x, flPeakPos );
+	const float flPeakSharpened = Gain( flPeakMoved, flPeakSharpness );
+
+	return SmoothCurve( flPeakSharpened );
+}
+
+//-----------------------------------------------------------------------------
+// make sure quaternions are within 180 degrees of one another, if not, reverse q
+// qt receives either q or -q, whichever is closer to p. qt may be the same
+// object as q.
+//-----------------------------------------------------------------------------
+
+void QuaternionAlign( const Quaternion &p, const Quaternion &q, Quaternion &qt )
+{
+	Assert( s_bMathlibInitialized );
+
+	// FIXME: can this be done with a quat dot product?
+
+	// Squared distance from p to q, and from p to -q.
+	float flDistSqToQ = 0;
+	float flDistSqToNegQ = 0;
+	for ( int i = 0; i < 4; ++i )
+	{
+		flDistSqToQ += (p[i]-q[i])*(p[i]-q[i]);
+		flDistSqToNegQ += (p[i]+q[i])*(p[i]+q[i]);
+	}
+
+	// q is "backwards" when its negation is the closer of the two
+	const bool bFlip = ( flDistSqToQ > flDistSqToNegQ );
+
+	// nothing to write when q stays as it is and already lives in qt
+	if ( bFlip || &qt != &q )
+	{
+		for ( int i = 0; i < 4; ++i )
+		{
+			qt[i] = bFlip ? -q[i] : q[i];
+		}
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Do a piecewise addition of the quaternion elements. This actually makes little 
+// mathematical sense, but it's a cheap way to simulate a slerp.
+// q is aligned to p before the blend; t = 0 gives p and t = 1 gives q.
+//-----------------------------------------------------------------------------
+void QuaternionBlend( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
+{
+	Assert( s_bMathlibInitialized );
+#if ALLOW_SIMD_QUATERNION_MATH
+	// SIMD path: load both quaternions, blend, store the result
+	fltx4 simdP = LoadUnalignedSIMD( p.Base() );
+	fltx4 simdQ = LoadUnalignedSIMD( q.Base() );
+	fltx4 simdBlended = QuaternionBlendSIMD( simdP, simdQ, t );
+	StoreUnalignedSIMD( qt.Base(), simdBlended );
+#else
+	// decide if one of the quaternions is backwards
+	Quaternion qAligned;
+	QuaternionAlign( p, q, qAligned );
+	QuaternionBlendNoAlign( p, qAligned, t, qt );
+#endif
+}
+
+
+// Component-wise linear blend of p and q followed by a normalize. No alignment
+// of q is performed here. t == 0.0 weights p fully, t == 1.0 weights q fully.
+void QuaternionBlendNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
+{
+	Assert( s_bMathlibInitialized );
+
+	const float flWeightP = 1.0f - t;
+	const float flWeightQ = t;
+
+	int n;
+	for ( n = 0; n < 4; n++ )
+	{
+		qt[n] = flWeightP * p[n] + flWeightQ * q[n];
+	}
+
+	QuaternionNormalize( qt );
+}
+
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Blends p with the quaternion (0, 0, 0, +-1) and normalizes the
+//          result. The x/y/z parts are scaled by (1 - t); w is scaled by
+//          (1 - t) and then moved by t away from zero, in the direction of
+//          p.w's sign.
+// Input  : p  - source quaternion
+//          t  - blend factor; 0.0 leaves p's components unscaled
+//          qt - receives the normalized result (may be the same object as p)
+// Note   : The sign test used to read qt.w, i.e. the output parameter before
+//          it had been written. Unless qt aliased p that value was
+//          unspecified, so the blend direction was arbitrary. It now reads
+//          p.w, which gives the same result in the aliased case.
+//-----------------------------------------------------------------------------
+void QuaternionIdentityBlend( const Quaternion &p, float t, Quaternion &qt )
+{
+	Assert( s_bMathlibInitialized );
+	float sclp;
+
+	sclp = 1.0f - t;
+
+	qt.x = p.x * sclp;
+	qt.y = p.y * sclp;
+	qt.z = p.z * sclp;
+	// p.w is still untouched at this point even when &qt == &p
+	if (p.w < 0.0)
+	{
+		qt.w = p.w * sclp - t;
+	}
+	else
+	{
+		qt.w = p.w * sclp + t;
+	}
+	QuaternionNormalize( qt );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Quaternion spherical linear interpolation. q is first aligned to
+//          p (negated if it is backwards), then the pair is handed to
+//          QuaternionSlerpNoAlign().
+// Input  : p, q - end points; t == 0.0 returns p, t == 1.0 returns q
+//          t    - interpolation factor
+//          qt   - receives the interpolated quaternion
+//-----------------------------------------------------------------------------
+
+void QuaternionSlerp( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
+{
+	// aligned copy of q, so the caller's q is never modified
+	Quaternion qAligned;
+	QuaternionAlign( p, q, qAligned );
+
+	QuaternionSlerpNoAlign( p, qAligned, t, qt );
+}
+
+
+// Spherical linear interpolation between p and q without aligning q first.
+// t == 0.0 returns p, t == 1.0 returns q.
+//  - When the 4D dot product of p and q is not close to -1, the result is a
+//    weighted sum of p and q: sine-based weights normally, plain linear
+//    weights when the dot product is within 0.000001 of 1.
+//  - Otherwise a quaternion built by permuting/negating q's components is
+//    written into qt and blended with p over a quarter turn. In that path
+//    qt must not be the same object as q.
+void QuaternionSlerpNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
+{
+	Assert( s_bMathlibInitialized );
+	float flAngle, flSinAngle, flWeightP, flWeightQ;
+	int n;
+
+	const float flCosAngle = p[0]*q[0] + p[1]*q[1] + p[2]*q[2] + p[3]*q[3];
+
+	if ( !( (1.0f + flCosAngle) > 0.000001f ) )
+	{
+		Assert( &qt != &q );
+
+		qt[0] = -q[1];
+		qt[1] = q[0];
+		qt[2] = -q[3];
+		qt[3] = q[2];
+		flWeightP = sin( (1.0f - t) * (0.5f * M_PI));
+		flWeightQ = sin( t * (0.5f * M_PI));
+		// only the first three components are blended; qt[3] keeps q[2]
+		for ( n = 0; n < 3; n++ )
+		{
+			qt[n] = flWeightP * p[n] + flWeightQ * qt[n];
+		}
+	}
+	else
+	{
+		if ( (1.0f - flCosAngle) > 0.000001f )
+		{
+			flAngle = acos( flCosAngle );
+			flSinAngle = sin( flAngle );
+			flWeightP = sin( (1.0f - t)*flAngle) / flSinAngle;
+			flWeightQ = sin( t*flAngle ) / flSinAngle;
+		}
+		else
+		{
+			// TODO: add short circuit for cosom == 1.0f?
+			flWeightP = 1.0f - t;
+			flWeightQ = t;
+		}
+
+		for ( n = 0; n < 4; n++ )
+		{
+			qt[n] = flWeightP * p[n] + flWeightQ * q[n];
+		}
+	}
+
+	Assert( qt.IsValid() );
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Returns the angular delta between the two normalized quaternions
+//          in degrees.
+// Input  : p, q - quaternions to compare
+// Output : angle in degrees, computed as 2 * asin( |xyz part of p * conj(q)| )
+//-----------------------------------------------------------------------------
+float QuaternionAngleDiff( const Quaternion &p, const Quaternion &q )
+{
+#if 1
+	// The asin-based path is used for 2 reasons:
+	// 1 - acos maps 1-epsilon to values much larger than epsilon (vs asin, which maps epsilon to itself)
+	//     this means that in floats, anything below ~0.05 degrees truncates to 0
+	// 2 - normalized quaternions are frequently slightly non-normalized due to float precision issues,
+	//     and the epsilon off of normalized can be several percents of a degree
+	Quaternion qConj;
+	QuaternionConjugate( q, qConj );
+
+	Quaternion qDelta;
+	QuaternionMult( p, qConj, qDelta );
+
+	// If the quaternion is slightly non-normalized the square root may exceed 1; clamp it
+	// so asin() is never handed an out-of-domain argument.
+	float flSinHalfAngle = MIN( 1.0f, sqrt( qDelta.x * qDelta.x + qDelta.y * qDelta.y + qDelta.z * qDelta.z ) );
+	return RAD2DEG( 2 * asin( flSinHalfAngle ) );
+#else
+	Quaternion q2;
+	QuaternionAlign( p, q, q2 );
+
+	Assert( s_bMathlibInitialized );
+	float cosom = p.x * q2.x + p.y * q2.y + p.z * q2.z + p.w * q2.w;
+
+	if ( cosom > -1.0f )
+	{
+		if ( cosom < 1.0f )
+		{
+			float omega = 2 * fabs( acos( cosom ) );
+			return RAD2DEG( omega );
+		}
+		return 0.0f;
+	}
+
+	return 180.0f;
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Writes the conjugate of p into q: x, y and z negated, w unchanged.
+// Input  : p - source quaternion
+//          q - receives the conjugate (may be the same object as p)
+// Note   : The validity assert used to check q, the output, before anything
+//          had been written to it; callers in this file pass a freshly
+//          declared Quaternion there. The assert now checks the input p.
+//-----------------------------------------------------------------------------
+void QuaternionConjugate( const Quaternion &p, Quaternion &q )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( p.IsValid() );
+
+	q.x = -p.x;
+	q.y = -p.y;
+	q.z = -p.z;
+	q.w = p.w;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Writes the inverse of p into q: the conjugate of p divided by p's
+//          squared magnitude.
+// Input  : p - source quaternion
+//          q - receives the inverse
+// Note   : If the squared magnitude of p is zero, the debug assert fires and
+//          q is left holding the unscaled conjugate. The validity assert used
+//          to check the not-yet-written output q; it now checks the input p.
+//-----------------------------------------------------------------------------
+void QuaternionInvert( const Quaternion &p, Quaternion &q )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( p.IsValid() );
+
+	QuaternionConjugate( p, q );
+
+	float magnitudeSqr = QuaternionDotProduct( p, p );
+	Assert( magnitudeSqr );
+	if ( magnitudeSqr )
+	{
+		float inv = 1.0f / magnitudeSqr;
+		q.x *= inv;
+		q.y *= inv;
+		q.z *= inv;
+		q.w *= inv;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Make sure the quaternion is of unit length.
+// Input  : q - quaternion scaled in place
+// Output : the length of q before scaling. A zero-length q is left untouched
+//          and 0 is returned.
+//-----------------------------------------------------------------------------
+float QuaternionNormalize( Quaternion &q )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( q.IsValid() );
+
+	float flLength = q[0]*q[0] + q[1]*q[1] + q[2]*q[2] + q[3]*q[3];
+
+	// Was once: > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON))
+	if ( !flLength )
+	{
+		return flLength;
+	}
+
+	flLength = sqrt(flLength);
+	const float flInvLength = 1.0f/flLength;
+	q[3] *= flInvLength;
+	q[2] *= flInvLength;
+	q[1] *= flInvLength;
+	q[0] *= flInvLength;
+
+	return flLength;
+}
+
+
+// Scales the rotation held in p by t and writes the result into q.
+// The x/y/z part is rescaled so its length becomes sin( asin( |xyz| ) * t ),
+// and w is rebuilt as the matching square root, keeping the sign of p.w.
+void QuaternionScale( const Quaternion &p, float t, Quaternion &q )
+{
+	Assert( s_bMathlibInitialized );
+
+#if 0
+	Quaternion p0;
+	Quaternion q;
+	p0.Init( 0.0, 0.0, 0.0, 1.0 );
+
+	// slerp in "reverse order" so that p doesn't get realigned
+	QuaternionSlerp( p, p0, 1.0 - fabs( t ), q );
+	if (t < 0.0)
+	{
+		q.w = -q.w;
+	}
+#else
+	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
+	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
+	float flSinOmega = sqrt( DotProduct( &p.x, &p.x ) );
+	flSinOmega = min( flSinOmega, 1.f );
+
+	const float flScaledSin = sin( asin( flSinOmega ) * t );
+
+	// FLT_EPSILON keeps the division defined when the vector part is zero
+	t = flScaledSin / (flSinOmega + FLT_EPSILON);
+	VectorScale( &p.x, t, &q.x );
+
+	// rescale rotation
+	float flCosSqr = 1.0f - flScaledSin * flScaledSin;
+
+	// Assert( flCosSqr >= 0 );
+	if (flCosSqr < 0.0f) 
+	{
+		flCosSqr = 0.0f;
+	}
+	const float flCosOmega = sqrt( flCosSqr );
+
+	// keep sign of rotation
+	q.w = (p.w < 0) ? -flCosOmega : flCosOmega;
+#endif
+
+	Assert( q.IsValid() );
+}
+
+
+// Component-wise sum of p and q, after q has been aligned to p (negated if
+// backwards). The result is written to qt and is not normalized.
+void QuaternionAdd( const Quaternion &p, const Quaternion &q, Quaternion &qt )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( p.IsValid() );
+	Assert( q.IsValid() );
+
+	// work on an aligned copy of q
+	Quaternion qAligned;
+	QuaternionAlign( p, q, qAligned );
+
+	// is this right???
+	int n;
+	for ( n = 0; n < 4; n++ )
+	{
+		qt[n] = p[n] + qAligned[n];
+	}
+}
+
+
+// Four-component dot product of p and q.
+float QuaternionDotProduct( const Quaternion &p, const Quaternion &q )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( p.IsValid() );
+	Assert( q.IsValid() );
+
+	float flDot = p.x * q.x;
+	flDot += p.y * q.y;
+	flDot += p.z * q.z;
+	flDot += p.w * q.w;
+	return flDot;
+}
+
+
+// qt = p * q
+// q is aligned to p (negated if backwards) before the product is formed.
+// qt may be the same object as p: p is then copied first so the product reads
+// unmodified components.
+void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( p.IsValid() );
+	Assert( q.IsValid() );
+
+	if (&p == &qt)
+	{
+		// output aliases the left operand: redo the multiply on a private copy
+		Quaternion pCopy = p;
+		QuaternionMult( pCopy, q, qt );
+		return;
+	}
+
+	// decide if one of the quaternions is backwards
+	Quaternion r;
+	QuaternionAlign( p, q, r );
+
+	qt.x =  p.x * r.w + p.y * r.z - p.z * r.y + p.w * r.x;
+	qt.y = -p.x * r.z + p.y * r.w + p.z * r.x + p.w * r.y;
+	qt.z =  p.x * r.y - p.y * r.x + p.z * r.w + p.w * r.z;
+	qt.w = -p.x * r.x - p.y * r.y - p.z * r.z + p.w * r.w;
+}
+
+
+// Builds a 3x4 matrix from a rotation quaternion and a position: the rotation
+// part comes from the two-argument QuaternionMatrix(), and pos is stored in
+// column 3.
+void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t& matrix )
+{
+	Assert( pos.IsValid() );
+
+	// rotation first; this also zeroes column 3
+	QuaternionMatrix( q, matrix );
+
+	// then overwrite column 3 with the position
+	matrix[2][3] = pos.z;
+	matrix[1][3] = pos.y;
+	matrix[0][3] = pos.x;
+}
+
+// Converts quaternion q into the rotation part of a 3x4 matrix. Column 3 is
+// set to zero.
+void QuaternionMatrix( const Quaternion &q, matrix3x4_t& matrix )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( q.IsValid() );
+
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "QuaternionMatrix", "Mathlib" );
+#endif
+
+// The active branch is the original code.
+// It should compile to the same thing as the alternative below once optimized, but
+// looking at the assembly it doesn't: the release build of the alternative has
+// 7 extra multiplies, go figure.
+#if 1
+	// row 0
+	matrix[0][0] = 1.0 - 2.0 * q.y * q.y - 2.0 * q.z * q.z;
+	matrix[0][1] = 2.0f * q.x * q.y - 2.0f * q.w * q.z;
+	matrix[0][2] = 2.0f * q.x * q.z + 2.0f * q.w * q.y;
+	matrix[0][3] = 0.0f;
+
+	// row 1
+	matrix[1][0] = 2.0 * q.x * q.y + 2.0 * q.w * q.z;
+	matrix[1][1] = 1.0f - 2.0f * q.x * q.x - 2.0f * q.z * q.z;
+	matrix[1][2] = 2.0f * q.y * q.z - 2.0f * q.w * q.x;
+	matrix[1][3] = 0.0f;
+
+	// row 2
+	matrix[2][0] = 2.0 * q.x * q.z - 2.0 * q.w * q.y;
+	matrix[2][1] = 2.0f * q.y * q.z + 2.0f * q.w * q.x;
+	matrix[2][2] = 1.0f - 2.0f * q.x * q.x - 2.0f * q.y * q.y;
+	matrix[2][3] = 0.0f;
+#else
+	float wx, wy, wz, xx, yy, yz, xy, xz, zz, x2, y2, z2;
+
+	// precalculate common multiplications
+	x2 = q.x + q.x;
+	y2 = q.y + q.y;
+	z2 = q.z + q.z;
+	xx = q.x * x2;
+	xy = q.x * y2;
+	xz = q.x * z2;
+	yy = q.y * y2;
+	yz = q.y * z2;
+	zz = q.z * z2;
+	wx = q.w * x2;
+	wy = q.w * y2;
+	wz = q.w * z2;
+
+	matrix[0][0] = 1.0 - (yy + zz);
+	matrix[0][1] = xy - wz;
+	matrix[0][2] = xz + wy;
+	matrix[0][3] = 0.0f;
+
+	matrix[1][0] = xy + wz;
+	matrix[1][1] = 1.0 - (xx + zz);
+	matrix[1][2] = yz - wx;
+	matrix[1][3] = 0.0f;
+
+	matrix[2][0] = xz - wy;
+	matrix[2][1] = yz + wx;
+	matrix[2][2] = 1.0 - (xx + yy);
+	matrix[2][3] = 0.0f;
+#endif
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Converts a quaternion into engine angles
+// Input  : q      - quaternion, q3 + q0.i + q1.j + q2.k
+//          angles - receives PITCH, YAW, ROLL
+// Note   : The active path converts through a full 3x4 matrix
+//          (QuaternionMatrix + MatrixAngles).
+//-----------------------------------------------------------------------------
+void QuaternionAngles( const Quaternion &q, QAngle &angles )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( q.IsValid() );
+
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "QuaternionAngles", "Mathlib" );
+#endif
+
+#if 1
+	// FIXME: doing it this way calculates too much data, needs to do an optimized version...
+	matrix3x4_t rotation;
+	QuaternionMatrix( q, rotation );
+	MatrixAngles( rotation, angles );
+#else
+	float m11, m12, m13, m23, m33;
+
+	m11 = ( 2.0f * q.w * q.w ) + ( 2.0f * q.x * q.x ) - 1.0f;
+	m12 = ( 2.0f * q.x * q.y ) + ( 2.0f * q.w * q.z );
+	m13 = ( 2.0f * q.x * q.z ) - ( 2.0f * q.w * q.y );
+	m23 = ( 2.0f * q.y * q.z ) + ( 2.0f * q.w * q.x );
+	m33 = ( 2.0f * q.w * q.w ) + ( 2.0f * q.z * q.z ) - 1.0f;
+
+	// FIXME: this code has a singularity near PITCH +-90
+	angles[YAW] = RAD2DEG( atan2(m12, m11) );
+	angles[PITCH] = RAD2DEG( asin(-m13) );
+	angles[ROLL] = RAD2DEG( atan2(m23, m33) );
+#endif
+
+	Assert( angles.IsValid() );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Converts a quaternion to an axis / angle in degrees
+//			(exponential map)
+// Input  : q     - source quaternion
+//          axis  - receives the normalized x/y/z part of q
+//          angle - receives 2 * acos(q.w) in degrees, wrapped into (-180, 180]
+// Note   : q.w is clamped to [-1, 1] before acos(). A quaternion that is only
+//          approximately unit length can carry a w slightly outside that
+//          range, which would otherwise make acos() return NaN. Inputs already
+//          inside the range produce exactly the same result as before.
+//-----------------------------------------------------------------------------
+void QuaternionAxisAngle( const Quaternion &q, Vector &axis, float &angle )
+{
+	float flCosHalfAngle = q.w;
+	if ( flCosHalfAngle > 1.0f )
+	{
+		flCosHalfAngle = 1.0f;
+	}
+	else if ( flCosHalfAngle < -1.0f )
+	{
+		flCosHalfAngle = -1.0f;
+	}
+
+	angle = RAD2DEG(2 * acos(flCosHalfAngle));
+	if ( angle > 180 )
+	{
+		angle -= 360;
+	}
+	axis.x = q.x;
+	axis.y = q.y;
+	axis.z = q.z;
+	VectorNormalize( axis );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Converts an exponential map (ang/axis) to a quaternion
+// Input  : axis  - rotation axis; used as given, it is not normalized here
+//          angle - rotation angle in degrees
+//          q     - receives ( axis * sin(angle/2), cos(angle/2) )
+//-----------------------------------------------------------------------------
+void AxisAngleQuaternion( const Vector &axis, float angle, Quaternion &q )
+{
+	float flSinHalf, flCosHalf;
+	SinCos( DEG2RAD(angle) * 0.5f, &flSinHalf, &flCosHalf );
+
+	q.w = flCosHalf;
+	q.x = axis.x * flSinHalf;
+	q.y = axis.y * flSinHalf;
+	q.z = axis.z * flSinHalf;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Converts radian-euler axis aligned angles to a quaternion
+// Input  : angles  - Right-handed Euler angles in radians
+//          outQuat - receives a quaternion of form (i,j,k,real)
+//-----------------------------------------------------------------------------
+void AngleQuaternion( const RadianEuler &angles, Quaternion &outQuat )
+{
+	Assert( s_bMathlibInitialized );
+//	Assert( angles.IsValid() );
+
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "AngleQuaternion", "Mathlib" );
+#endif
+
+	// sines and cosines of the three half angles
+	float sinX, sinY, sinZ, cosX, cosY, cosZ;
+
+#ifdef _X360
+	fltx4 radians, scale, sine, cosine;
+	radians = LoadUnaligned3SIMD( &angles.x );
+	scale = ReplicateX4( 0.5f );
+	radians = MulSIMD( radians, scale );
+	SinCos3SIMD( sine, cosine, radians );
+
+	// NOTE: The ordering here is *different* from the QAngle AngleQuaternion
+	// because p, y, r are not in the same locations in QAngle + RadianEuler. Yay!
+	sinX = SubFloat( sine, 0 );	sinY = SubFloat( sine, 1 );	sinZ = SubFloat( sine, 2 );
+	cosX = SubFloat( cosine, 0 );	cosY = SubFloat( cosine, 1 );	cosZ = SubFloat( cosine, 2 );
+#else
+	SinCos( angles.z * 0.5f, &sinZ, &cosZ );
+	SinCos( angles.y * 0.5f, &sinY, &cosY );
+	SinCos( angles.x * 0.5f, &sinX, &cosX );
+#endif
+
+	// NJS: for some reason VC6 wasn't recognizing the common subexpressions:
+	const float sinXcosY = sinX * cosY;
+	const float cosXsinY = cosX * sinY;
+	outQuat.x = sinXcosY*cosZ-cosXsinY*sinZ; // X
+	outQuat.y = cosXsinY*cosZ+sinXcosY*sinZ; // Y
+
+	const float cosXcosY = cosX * cosY;
+	const float sinXsinY = sinX * sinY;
+	outQuat.z = cosXcosY*sinZ-sinXsinY*cosZ; // Z
+	outQuat.w = cosXcosY*cosZ+sinXsinY*sinZ; // W (real component)
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Converts engine-format euler angles to a quaternion
+// Input  : angles - Right-handed Euler angles in degrees as follows:
+//				[0]: PITCH: Clockwise rotation around the Y axis.
+//				[1]: YAW:	Counterclockwise rotation around the Z axis.
+//				[2]: ROLL:	Counterclockwise rotation around the X axis.
+//			outQuat - receives a quaternion of form (i,j,k,real)
+//-----------------------------------------------------------------------------
+void AngleQuaternion( const QAngle &angles, Quaternion &outQuat )
+{
+#ifdef _VPROF_MATHLIB
+	VPROF_BUDGET( "AngleQuaternion", "Mathlib" );
+#endif
+
+	// sines and cosines of the half angles
+	float sinRoll, sinPitch, sinYaw, cosRoll, cosPitch, cosYaw;
+
+#ifdef _X360
+	fltx4 radians, scale, sine, cosine;
+	radians = LoadUnaligned3SIMD( angles.Base() );
+	scale = ReplicateX4( 0.5f * M_PI_F / 180.f );
+	radians = MulSIMD( radians, scale );
+	SinCos3SIMD( sine, cosine, radians );
+
+	// NOTE: The ordering here is *different* from the RadianEuler AngleQuaternion
+	// because p, y, r are not in the same locations in QAngle + RadianEuler. Yay!
+	sinPitch = SubFloat( sine, 0 );	sinYaw = SubFloat( sine, 1 );	sinRoll = SubFloat( sine, 2 );
+	cosPitch = SubFloat( cosine, 0 );	cosYaw = SubFloat( cosine, 1 );	cosRoll = SubFloat( cosine, 2 );
+#else
+	SinCos( DEG2RAD( angles.y ) * 0.5f, &sinYaw, &cosYaw );
+	SinCos( DEG2RAD( angles.x ) * 0.5f, &sinPitch, &cosPitch );
+	SinCos( DEG2RAD( angles.z ) * 0.5f, &sinRoll, &cosRoll );
+#endif
+
+	// NJS: for some reason VC6 wasn't recognizing the common subexpressions:
+	const float sinRollCosPitch = sinRoll * cosPitch;
+	const float cosRollSinPitch = cosRoll * sinPitch;
+	outQuat.x = sinRollCosPitch*cosYaw-cosRollSinPitch*sinYaw; // X
+	outQuat.y = cosRollSinPitch*cosYaw+sinRollCosPitch*sinYaw; // Y
+
+	const float cosRollCosPitch = cosRoll * cosPitch;
+	const float sinRollSinPitch = sinRoll * sinPitch;
+	outQuat.z = cosRollCosPitch*sinYaw-sinRollSinPitch*cosYaw; // Z
+	outQuat.w = cosRollCosPitch*cosYaw+sinRollSinPitch*sinYaw; // W (real component)
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Converts a basis to a quaternion
+// Input  : vecForward, vecRight, vecUp - basis vectors; each is asserted to
+//          have a squared length within 1e-3 of 1
+//          q - receives the quaternion
+// Note   : The conversion goes basis -> matrix -> QAngle -> quaternion. The
+//          matrix columns are forward, left (the negated right vector), up.
+//-----------------------------------------------------------------------------
+void BasisToQuaternion( const Vector &vecForward, const Vector &vecRight, const Vector &vecUp, Quaternion &q )
+{
+	Assert( fabs( vecForward.LengthSqr() - 1.0f ) < 1e-3 );
+	Assert( fabs( vecRight.LengthSqr() - 1.0f ) < 1e-3 );
+	Assert( fabs( vecUp.LengthSqr() - 1.0f ) < 1e-3 );
+
+	Vector vecLeft;
+	VectorMultiply( vecRight, -1.0f, vecLeft );
+
+	// FIXME: Don't know why, but this doesn't match at all with other result
+	// so we can't use this super-fast way.
+	/*
+	// Find the trace of the matrix:
+	float flTrace = vecForward.x + vecLeft.y + vecUp.z + 1.0f;
+	if ( flTrace > 1e-6 )
+	{
+		float flSqrtTrace = FastSqrt( flTrace );
+		float s = 0.5f / flSqrtTrace;
+		q.x = ( vecUp.y - vecLeft.z ) * s;
+		q.y = ( vecForward.z - vecUp.x ) * s;
+		q.z = ( vecLeft.x - vecForward.y ) * s;
+		q.w = 0.5f * flSqrtTrace;
+	}
+	else
+	{
+		if (( vecForward.x > vecLeft.y ) && ( vecForward.x > vecUp.z ) )
+		{
+			float flSqrtTrace = FastSqrt( 1.0f + vecForward.x - vecLeft.y - vecUp.z );
+			float s = 0.5f / flSqrtTrace;
+			q.x = 0.5f * flSqrtTrace;
+			q.y = ( vecForward.y + vecLeft.x ) * s;
+			q.z = ( vecUp.x + vecForward.z ) * s;
+			q.w = ( vecUp.y - vecLeft.z ) * s;
+		}
+		else if ( vecLeft.y > vecUp.z )
+		{
+			float flSqrtTrace = FastSqrt( 1.0f + vecLeft.y - vecForward.x - vecUp.z );
+			float s = 0.5f / flSqrtTrace;
+			q.x = ( vecForward.y + vecLeft.x ) * s;
+			q.y = 0.5f * flSqrtTrace;
+			q.z = ( vecUp.y + vecLeft.z ) * s;
+			q.w = ( vecForward.z - vecUp.x ) * s;
+		}
+		else
+		{
+			float flSqrtTrace = FastSqrt( 1.0 + vecUp.z - vecForward.x - vecLeft.y );
+			float s = 0.5f / flSqrtTrace;
+			q.x = ( vecUp.x + vecForward.z ) * s;
+			q.y = ( vecUp.y + vecLeft.z ) * s;
+			q.z = 0.5f * flSqrtTrace;
+			q.w = ( vecLeft.x - vecForward.y ) * s;
+		}
+	}
+	QuaternionNormalize( q );
+	*/
+
+	// Version 2: Go through angles
+
+	matrix3x4_t basis;
+	MatrixSetColumn( vecForward, 0, basis );
+	MatrixSetColumn( vecLeft, 1, basis );
+	MatrixSetColumn( vecUp, 2, basis );
+
+	QAngle basisAngles;
+	MatrixAngles( basis, basisAngles );
+
+//	Quaternion q2;
+	AngleQuaternion( basisAngles, q );
+
+//	Assert( fabs(q.x - q2.x) < 1e-3 );
+//	Assert( fabs(q.y - q2.y) < 1e-3 );
+//	Assert( fabs(q.z - q2.z) < 1e-3 );
+//	Assert( fabs(q.w - q2.w) < 1e-3 );
+}
+
+// Converts the rotation stored in a 3x4 matrix to a quaternion by going
+// through QAngle (MatrixAngles followed by AngleQuaternion).
+// FIXME: Optimize!
+void MatrixQuaternion( const matrix3x4_t &mat, Quaternion &q )
+{
+	QAngle matAngles;
+	MatrixAngles( mat, matAngles );
+
+	AngleQuaternion( matAngles, q );
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Converts a quaternion into radian euler angles
+// Input  : q      - quaternion, q3 + q0.i + q1.j + q2.k
+//          angles - receives the RadianEuler produced by MatrixAngles()
+//-----------------------------------------------------------------------------
+void QuaternionAngles( const Quaternion &q, RadianEuler &angles )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( q.IsValid() );
+
+	// FIXME: doing it this way calculates too much data, needs to do an optimized version...
+	matrix3x4_t rotation;
+	QuaternionMatrix( q, rotation );
+	MatrixAngles( rotation, angles );
+
+	Assert( angles.IsValid() );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: A helper function to normalize p2.x->p1.x and p3.x->p4.x to 
+//  be the same length as p2.x->p3.x
+// Input  : p1..p4 - the four control points
+//          p1n    - receives the adjusted p1 (a plain copy of p1 when no
+//                   adjustment applies)
+//          p4n    - receives the adjusted p4 (a plain copy of p4 when no
+//                   adjustment applies)
+// Note   : No adjustment is made when p2.x == p3.x, and each side is skipped
+//          when its own x delta is zero.
+//-----------------------------------------------------------------------------
+void Spline_Normalize( 
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	Vector& p1n,
+	Vector& p4n )
+{
+	const float flSpanX = p3.x - p2.x;
+
+	p1n = p1;
+	p4n = p4;
+
+	if ( flSpanX == 0.0 )
+	{
+		return;
+	}
+
+	if (p1.x != p2.x)
+	{
+		// Equivalent to p1n = p2 - (p2 - p1) * (dt / (p2.x - p1.x));
+		VectorLerp( p2, p1, flSpanX / (p2.x - p1.x), p1n );
+	}
+
+	if (p4.x != p3.x)
+	{
+		// Equivalent to p4n = p3 + (p4 - p3) * (dt / (p4.x - p3.x));
+		VectorLerp( p3, p4, flSpanX / (p4.x - p3.x), p4n );
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Evaluates a Catmull-Rom spline segment.
+// Input  : p1..p4 - control points
+//          t      - curve parameter
+//          output - receives the point; must not alias any control point
+// Note   : The result is accumulated as
+//            0.5 t^3 * ( -p1 + 3*p2 - 3*p3 + p4 )
+//          + 0.5 t^2 * ( 2*p1 - 5*p2 + 4*p3 - p4 )
+//          + 0.5 t   * ( -p1 + p3 )
+//          + p2
+//-----------------------------------------------------------------------------
+
+void Catmull_Rom_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Assert( s_bMathlibInitialized );
+
+	// every power of t carries the 0.5 that scales the basis
+	const float flHalfT2 = t*t*0.5f;
+	const float flHalfT3 = t*flHalfT2;
+	const float flHalfT = t * 0.5f;
+
+	Assert( &output != &p1 );
+	Assert( &output != &p2 );
+	Assert( &output != &p3 );
+	Assert( &output != &p4 );
+
+	output.Init();
+
+	Vector term;
+
+	// cubic row
+	VectorScale( p1, -flHalfT3, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flHalfT3*3, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flHalfT3*-3, term );
+	VectorAdd( term, output, output );
+	VectorScale( p4, flHalfT3, term );
+	VectorAdd( term, output, output );
+
+	// quadratic row
+	VectorScale( p1, flHalfT2*2, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flHalfT2*-5, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flHalfT2*4, term );
+	VectorAdd( term, output, output );
+	VectorScale( p4, -flHalfT2, term );
+	VectorAdd( term, output, output );
+
+	// linear row
+	VectorScale( p1, -flHalfT, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flHalfT, term );
+	VectorAdd( term, output, output );
+
+	// constant row: p2
+	VectorAdd( p2, output, output );
+}
+
+// Evaluates the weighted sum
+//     1.5 t^2 * ( -p1 + 3*p2 - 3*p3 + p4 )
+//   + t       * ( 2*p1 - 5*p2 + 4*p3 - p4 )
+//   + 0.5     * ( -p1 + p3 )
+// i.e. the same rows Catmull_Rom_Spline() uses, with the weights 0.5 t^3,
+// 0.5 t^2 and 0.5 t replaced by their derivatives with respect to t.
+// output must not alias any control point.
+void Catmull_Rom_Spline_Tangent(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Assert( s_bMathlibInitialized );
+	const float flCubicWeight = 3*t*t*0.5f;
+	const float flQuadWeight = 2*t*0.5f;
+	const float flLinearWeight = 0.5;
+
+	Assert( &output != &p1 );
+	Assert( &output != &p2 );
+	Assert( &output != &p3 );
+	Assert( &output != &p4 );
+
+	output.Init();
+
+	Vector term;
+
+	// row 1: ( -p1 + 3*p2 - 3*p3 + p4 )
+	VectorScale( p1, -flCubicWeight, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flCubicWeight*3, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flCubicWeight*-3, term );
+	VectorAdd( term, output, output );
+	VectorScale( p4, flCubicWeight, term );
+	VectorAdd( term, output, output );
+
+	// row 2: ( 2*p1 - 5*p2 + 4*p3 - p4 )
+	VectorScale( p1, flQuadWeight*2, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flQuadWeight*-5, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flQuadWeight*4, term );
+	VectorAdd( term, output, output );
+	VectorScale( p4, -flQuadWeight, term );
+	VectorAdd( term, output, output );
+
+	// row 3: ( -p1 + p3 )
+	VectorScale( p1, -flLinearWeight, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flLinearWeight, term );
+	VectorAdd( term, output, output );
+}
+
+// area under the curve [0..t]
+// Sums four polynomial terms in t (linear through quartic), each a fixed
+// combination of the control points.
+void Catmull_Rom_Spline_Integral( 
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	const Vector vecLinear = p2*t;
+	const Vector vecQuadratic = 0.25f*(p1 - p3)*t*t;
+	const Vector vecCubic = (1.0f/6.0f)*(2.0f*p1 - 5.0f*p2 + 4.0f*p3 - p4)*t*t*t;
+	const Vector vecQuartic = 0.125f*(p1 - 3.0f*p2 + 3.0f*p3 - p4)*t*t*t*t;
+
+	output = vecLinear - vecQuadratic + vecCubic - vecQuartic;
+}
+
+
+// area under the curve [0..1]
+// Closed form: ( -0.25*p1 + 3.25*p2 + 3.25*p3 - 0.25*p4 ) / 6.
+void Catmull_Rom_Spline_Integral( 
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	Vector& output )
+{
+	const Vector vecWeighted = -0.25f * p1 + 3.25f * p2 + 3.25f * p3 - 0.25f * p4;
+	output = vecWeighted * (1.0f / 6.0f);
+}
+
+
+// Catmull-Rom evaluation with the outer control points repositioned first:
+// p1 and p4 are moved along their original directions from p2 and p3 so that
+// both outer segments have the same length as p2->p3.
+void Catmull_Rom_Spline_Normalize(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	// length of the middle segment
+	const float flMidLength = p3.DistTo(p2);
+
+	// incoming side: unit direction p2->p1, then step flMidLength from p2
+	Vector vecIn;
+	VectorSubtract( p1, p2, vecIn );
+	VectorNormalize( vecIn );
+	VectorMA( p2, flMidLength, vecIn, vecIn );
+
+	// outgoing side: unit direction p3->p4, then step flMidLength from p3
+	Vector vecOut;
+	VectorSubtract( p4, p3, vecOut );
+	VectorNormalize( vecOut );
+	VectorMA( p3, flMidLength, vecOut, vecOut );
+
+	Catmull_Rom_Spline( vecIn, p2, p3, vecOut, t, output );
+}
+
+
+// Catmull_Rom_Spline_Integral() over [0..t], with the outer control points
+// repositioned first: p1 and p4 are moved along their original directions
+// from p2 and p3 so that both outer segments have the same length as p2->p3.
+void Catmull_Rom_Spline_Integral_Normalize(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	// length of the middle segment
+	const float flMidLength = p3.DistTo(p2);
+
+	// incoming side: unit direction p2->p1, then step flMidLength from p2
+	Vector vecIn;
+	VectorSubtract( p1, p2, vecIn );
+	VectorNormalize( vecIn );
+	VectorMA( p2, flMidLength, vecIn, vecIn );
+
+	// outgoing side: unit direction p3->p4, then step flMidLength from p3
+	Vector vecOut;
+	VectorSubtract( p4, p3, vecOut );
+	VectorNormalize( vecOut );
+	VectorMA( p3, flMidLength, vecOut, vecOut );
+
+	Catmull_Rom_Spline_Integral( vecIn, p2, p3, vecOut, t, output );
+}
+
+
+// Catmull_Rom_Spline() evaluated after the outer control points have been
+// adjusted by Spline_Normalize().
+void Catmull_Rom_Spline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Vector vecFirst, vecLast;
+	Spline_Normalize( p1, p2, p3, p4, vecFirst, vecLast );
+
+	Catmull_Rom_Spline( vecFirst, p2, p3, vecLast, t, output );
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: basic hermite spline.  t = 0 returns p1, t = 1 returns p2, 
+//			d1 and d2 are used to entry and exit slope of curve
+// Input  : p1, p2 - end points
+//          d1, d2 - entry and exit slopes
+//          t      - curve parameter
+//          output - receives the point; must not alias any input vector
+//-----------------------------------------------------------------------------
+
+void Hermite_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &d1,
+	const Vector &d2,
+	float t, 
+	Vector& output )
+{
+	Assert( s_bMathlibInitialized );
+	const float t2 = t*t;
+	const float t3 = t*t2;
+
+	Assert( &output != &p1 );
+	Assert( &output != &p2 );
+	Assert( &output != &d1 );
+	Assert( &output != &d2 );
+
+	// cubic weights for the two points and the two slopes
+	const float flWeightP1 = 2.0f*t3-3.0f*t2+1.0f;
+	const float flWeightP2 = 1.0f - flWeightP1; // -2*t3+3*t2;
+	const float flWeightD1 = t3-2*t2+t;
+	const float flWeightD2 = t3-t2;
+
+	VectorScale( p1, flWeightP1, output );
+	VectorMA( output, flWeightP2, p2, output );
+	VectorMA( output, flWeightD1, d1, output );
+	VectorMA( output, flWeightD2, d2, output );
+}
+
+// Scalar hermite spline: blends end values p1 and p2 and slopes d1 and d2
+// using the cubic weights for parameter t. t = 0 returns p1, t = 1 returns p2.
+float Hermite_Spline(
+	float p1,
+	float p2,
+	float d1,
+	float d2,
+	float t )
+{
+	Assert( s_bMathlibInitialized );
+	const float t2 = t*t;
+	const float t3 = t*t2;
+
+	const float flWeightP1 = 2.0f*t3-3.0f*t2+1.0f;
+	const float flWeightP2 = 1.0f - flWeightP1; // -2*t3+3*t2;
+	const float flWeightD1 = t3-2*t2+t;
+	const float flWeightD2 = t3-t2;
+
+	float flResult = p1 * flWeightP1;
+	flResult += p2 * flWeightP2;
+	flResult += d1 * flWeightD1;
+	flResult += d2 * flWeightD2;
+
+	return flResult;
+}
+
+
+// Fills basis[0..3] with the four cubic weights for parameter t, in the
+// order the other Hermite_Spline overloads apply them:
+// p1 weight, p2 weight, d1 weight, d2 weight.
+void Hermite_SplineBasis( float t, float basis[4] )
+{
+	const float t2 = t*t;
+	const float t3 = t*t2;
+
+	const float flWeightP1 = 2.0f*t3-3.0f*t2+1.0f;
+
+	basis[0] = flWeightP1;
+	basis[1] = 1.0f - basis[0]; // -2*t3+3*t2;
+	basis[2] = t3-2*t2+t;
+	basis[3] = t3-t2;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: simple three data point hermite spline.  
+//			t = 0 returns p1, t = 1 returns p2, 
+//			slopes are generated from the p0->p1 and p1->p2 segments
+//			this is reasonable C1 method when there's no "p3" data yet.
+// Input  : p0, p1, p2 - consecutive data points
+//          t          - curve parameter
+//          output     - receives the point
+//-----------------------------------------------------------------------------
+
+// BUG: the VectorSubtract()'s calls go away if the global optimizer is enabled
+#pragma optimize( "g", off )
+
+void Hermite_Spline( const Vector &p0, const Vector &p1, const Vector &p2, float t, Vector& output )
+{
+	// entry slope: p0->p1 segment
+	Vector vecSlopeIn;
+	VectorSubtract( p1, p0, vecSlopeIn );
+
+	// exit slope: p1->p2 segment
+	Vector vecSlopeOut;
+	VectorSubtract( p2, p1, vecSlopeOut );
+
+	Hermite_Spline( p1, p2, vecSlopeIn, vecSlopeOut, t, output );
+}
+
+#pragma optimize( "", on )
+
+// Scalar three-point hermite spline: interpolates from p1 to p2 with the
+// entry slope taken from p0->p1 and the exit slope from p1->p2.
+float Hermite_Spline( float p0, float p1, float p2,	float t )
+{
+	const float flSlopeIn = p1 - p0;
+	const float flSlopeOut = p2 - p1;
+	return Hermite_Spline( p1, p2, flSlopeIn, flSlopeOut, t );
+}
+
+
+// Cheap, hacked quaternion version of the three-point hermite spline: q0 and
+// q1 are aligned to q2, each component is run through the scalar three-point
+// Hermite_Spline() independently, and the result is normalized.
+void Hermite_Spline( const Quaternion &q0, const Quaternion &q1, const Quaternion &q2, float t, Quaternion &output )
+{
+	Quaternion qFirst;
+	QuaternionAlign( q2, q0, qFirst );
+
+	Quaternion qSecond;
+	QuaternionAlign( q2, q1, qSecond );
+
+	output.x = Hermite_Spline( qFirst.x, qSecond.x, q2.x, t );
+	output.y = Hermite_Spline( qFirst.y, qSecond.y, q2.y, t );
+	output.z = Hermite_Spline( qFirst.z, qSecond.z, q2.z, t );
+	output.w = Hermite_Spline( qFirst.w, qSecond.w, q2.w, t );
+
+	QuaternionNormalize( output );
+}
+
+// See http://en.wikipedia.org/wiki/Kochanek-Bartels_curves
+// 
+// Tension:  -1 = Round -> 1 = Tight
+// Bias:     -1 = Pre-shoot (bias left) -> 1 = Post-shoot (bias right)
+// Continuity: -1 = Box corners -> 1 = Inverted corners
+//
+// If T=B=C=0 it's the same matrix as Catmull-Rom.
+// If T=1 & B=C=0 it's the same as Cubic.
+// If T=B=0 & C=-1 it's just linear interpolation
+// 
+// See http://news.povray.org/povray.binaries.tutorials/attachment/%[email protected]%3E/Splines.bas.txt
+// for example code and descriptions of various spline types...
+// 
+void Kochanek_Bartels_Spline(
+	float tension, 
+	float bias, 
+	float continuity,
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Assert( s_bMathlibInitialized );
+
+	// The four tension/continuity/bias products that parameterize the basis matrix.
+	const float flOneMinusTension = 1.0f - tension;
+	const float ffa = flOneMinusTension * ( 1.0f + continuity ) * ( 1.0f + bias );
+	const float ffb = flOneMinusTension * ( 1.0f - continuity ) * ( 1.0f - bias );
+	const float ffc = flOneMinusTension * ( 1.0f - continuity ) * ( 1.0f + bias );
+	const float ffd = flOneMinusTension * ( 1.0f + continuity ) * ( 1.0f - bias );
+
+	// every power of t carries the 0.5 that scales the basis
+	const float flHalfT2 = t*t*0.5f;
+	const float flHalfT3 = t*flHalfT2;
+	const float flHalfT = t * 0.5f;
+
+	// output is accumulated in place, so it must not alias a control point
+	Assert( &output != &p1 );
+	Assert( &output != &p2 );
+	Assert( &output != &p3 );
+	Assert( &output != &p4 );
+
+	output.Init();
+
+	Vector term;
+
+	// matrix row 1 (cubic)
+	VectorScale( p1, flHalfT3 * -ffa, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flHalfT3 * ( 4.0f + ffa - ffb - ffc ), term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flHalfT3 * ( -4.0f + ffb + ffc - ffd ), term );
+	VectorAdd( term, output, output );
+	VectorScale( p4, flHalfT3 * ffd, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 2 (quadratic)
+	VectorScale( p1, flHalfT2* 2 * ffa, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flHalfT2 * ( -6 - 2 * ffa + 2 * ffb + ffc ), term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flHalfT2 * ( 6 - 2 * ffb - ffc + ffd ), term );
+	VectorAdd( term, output, output );
+	VectorScale( p4, flHalfT2 * -ffd, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 3 (linear); p4 does not contribute
+	VectorScale( p1, flHalfT * -ffa, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flHalfT * ( ffa - ffb ), term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flHalfT * ffb, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 4 (constant); only p2 contributes, with weight 1,
+	// so it is added directly
+	VectorAdd( p2, output, output );
+}
+
+// Kochanek_Bartels_Spline() evaluated after the outer control points have
+// been adjusted by Spline_Normalize().
+void Kochanek_Bartels_Spline_NormalizeX(
+	float tension, 
+	float bias, 
+	float continuity,
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Vector vecFirst, vecLast;
+	Spline_Normalize( p1, p2, p3, p4, vecFirst, vecLast );
+
+	Kochanek_Bartels_Spline( tension, bias, continuity, vecFirst, p2, p3, vecLast, t, output );
+}
+
+// Cubic ease between p2 and p3:
+//     t^3 * ( 2*p2 - 2*p3 ) + t^2 * ( -3*p2 + 3*p3 ) + p2
+// p1 and p4 do not influence the result. output must not alias any control
+// point.
+void Cubic_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Assert( s_bMathlibInitialized );
+
+	const float t2 = t*t;
+	const float t3 = t*t2;
+
+	Assert( &output != &p1 );
+	Assert( &output != &p2 );
+	Assert( &output != &p3 );
+	Assert( &output != &p4 );
+
+	output.Init();
+
+	Vector term;
+
+	// matrix row 1 (cubic)
+	VectorScale( p2, t3 * 2, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, t3 * -2, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 2 (quadratic)
+	VectorScale( p2, t2 * -3, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, t2 * 3, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 3 (linear): no influence
+
+	// matrix row 4 (constant): p2 only
+	VectorAdd( p2, output, output );
+}
+
+// Cubic_Spline() evaluated after the outer control points have been adjusted
+// by Spline_Normalize().
+void Cubic_Spline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Vector vecFirst, vecLast;
+	Spline_Normalize( p1, p2, p3, p4, vecFirst, vecLast );
+
+	Cubic_Spline( vecFirst, p2, p3, vecLast, t, output );
+}
+
+// B-spline segment evaluation. The result is accumulated as
+//     t^3/6 * ( -p1 + 3*p2 - 3*p3 + p4 )
+//   + t^2/6 * ( 3*p1 - 6*p2 + 3*p3 )
+//   + t/6   * ( -3*p1 + 3*p3 )
+//   + 1/6   * ( p1 + 4*p2 + p3 )
+// output must not alias any control point.
+void BSpline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Assert( s_bMathlibInitialized );
+
+	const float flSixth = 1.0f / 6.0f;
+
+	// every power of t carries the 1/6 that scales the basis
+	const float flT2 = t * t * flSixth;
+	const float flT3 = t*flT2;
+	const float flT1 = t * flSixth;
+
+	Assert( &output != &p1 );
+	Assert( &output != &p2 );
+	Assert( &output != &p3 );
+	Assert( &output != &p4 );
+
+	output.Init();
+
+	Vector term;
+
+	// matrix row 1 (cubic)
+	VectorScale( p1, -flT3, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flT3 * 3.0f, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flT3 * -3.0f, term );
+	VectorAdd( term, output, output );
+	VectorScale( p4, flT3, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 2 (quadratic)
+	VectorScale( p1, flT2 * 3.0f, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, flT2 * -6.0f, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flT2 * 3.0f, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 3 (linear); p2 and p4 do not contribute
+	VectorScale( p1, flT1 * -3.0f, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flT1 * 3.0f, term );
+	VectorAdd( term, output, output );
+
+	// matrix row 4 (constant)
+	VectorScale( p1, flSixth, term );
+	VectorAdd( term, output, output );
+	VectorScale( p2, 4.0f * flSixth, term );
+	VectorAdd( term, output, output );
+	VectorScale( p3, flSixth, term );
+	VectorAdd( term, output, output );
+}
+
+// Runs the four control points through Spline_Normalize, then evaluates
+// BSpline with the adjusted end points (p1n, p4n) standing in for p1 and p4.
+// p2, p3 and t are passed through unchanged.
+void BSpline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Vector p1n, p4n;
+	Spline_Normalize( p1, p2, p3, p4, p1n, p4n );
+	BSpline( p1n, p2, p3, p4n, t, output );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Evaluates a quadratic (parabolic) blend of p1, p2 and p3 at t.
+// The weights applied are (each row already divided by 2):
+//   t^2 :  1  -2   1
+//   t   : -2   2   0
+//   1   :  1   1   0
+// p4 is only referenced by the aliasing assert; it carries no weight.
+// output must not be one of the input vectors.
+//-----------------------------------------------------------------------------
+void Parabolic_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Assert( s_bMathlibInitialized );
+
+	Assert( &output != &p1 );
+	Assert( &output != &p2 );
+	Assert( &output != &p3 );
+	Assert( &output != &p4 );
+
+	// Fold the shared 1/2 factor into both powers of t.
+	const float t2 = t*t*0.5f;
+	t *= 0.5f;
+
+	// Per-control-point scratch terms.
+	Vector w1, w2, w3;
+
+	output.Init();
+
+	// There is no t^3 term.
+
+	// t^2 row
+	VectorScale( p1, t2,  w1 );
+	VectorScale( p2, t2 * -2.0f, w2 );
+	VectorScale( p3, t2,  w3 );
+	VectorAdd( w1, output, output );
+	VectorAdd( w2, output, output );
+	VectorAdd( w3, output, output );
+
+	// t row (p3 has no weight)
+	VectorScale( p1, t * -2.0f,  w1 );
+	VectorScale( p2, t * 2.0f,  w2 );
+	VectorAdd( w1, output, output );
+	VectorAdd( w2, output, output );
+
+	// constant row (p3 has no weight)
+	VectorScale( p1, 0.5f,  w1 );
+	VectorScale( p2, 0.5f,  w2 );
+	VectorAdd( w1, output, output );
+	VectorAdd( w2, output, output );
+}
+
+// Runs the four control points through Spline_Normalize, then evaluates
+// Parabolic_Spline with the adjusted end points (p1n, p4n) standing in for
+// p1 and p4. p2, p3 and t are passed through unchanged.
+void Parabolic_Spline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output )
+{
+	Vector p1n, p4n;
+	Spline_Normalize( p1, p2, p3, p4, p1n, p4n );
+	Parabolic_Spline( p1n, p2, p3, p4n, t, output );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Compress the input value for a ranged result such that 75% to 200% of the range maps smoothly onto 75% to 100%
+//-----------------------------------------------------------------------------
+
+float RangeCompressor( float flValue, float flMin, float flMax, float flBase )
+{
+	// Pull the base into [flMin, flMax] (lower bound first, then upper).
+	flBase = (flBase < flMin) ? flMin : flBase;
+	flBase = (flBase > flMax) ? flMax : flBase;
+
+	// Work on the absolute value (offset + base).
+	flValue += flBase;
+
+	// Position inside the range: 0 at flMin, 1 at flMax ...
+	float flMid = (flValue - flMin) / (flMax - flMin);
+	// ... re-centered so -1 is flMin and +1 is flMax.
+	float flTarget = flMid * 2 - 1;
+
+	// Inside +/-0.75 the value passes through untouched. Beyond that the
+	// magnitude is eased toward 1 over the span 0.75..2.0, and pinned at 1
+	// past the end of that span. The sign is restored afterwards.
+	if (fabs(flTarget) > 0.75)
+	{
+		const float flSign = (flTarget > 0) ? 1.0f : -1.0f;
+		float t = (fabs(flTarget) - 0.75) / (1.25);
+
+		float flMagnitude = 1.0f;
+		if (t < 1.0)
+		{
+			flMagnitude = Hermite_Spline( 0.75, 1, 0.75, 0, t );
+		}
+
+		flTarget = flSign * flMagnitude;
+	}
+
+	// Map back from -1..1 into flMin..flMax, then remove the base again.
+	flMid = (flTarget + 1 ) / 2.0;
+	float flResult = flMin * (1 - flMid) + flMax * flMid;
+	flResult -= flBase;
+
+	return flResult;
+}
+
+
+//#pragma optimize( "", on )
+
+//-----------------------------------------------------------------------------
+// Transforms a AABB into another space; which will inherently grow the box.
+//-----------------------------------------------------------------------------
+void TransformAABB( const matrix3x4_t& transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
+{
+	// Describe the input box as center + half-size.
+	Vector vecCenterIn;
+	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
+	vecCenterIn *= 0.5f;
+
+	Vector vecHalfIn;
+	VectorSubtract( vecMaxsIn, vecCenterIn, vecHalfIn );
+
+	// The center goes through the full transform.
+	Vector vecCenterOut;
+	VectorTransform( vecCenterIn, transform, vecCenterOut );
+
+	// Each output half-size is the half-size dotted with the absolute values
+	// of the matching matrix row.
+	Vector vecHalfOut;
+	for ( int iAxis = 0; iAxis < 3; iAxis++ )
+	{
+		vecHalfOut[iAxis] = DotProductAbs( vecHalfIn, transform[iAxis] );
+	}
+
+	VectorSubtract( vecCenterOut, vecHalfOut, vecMinsOut );
+	VectorAdd( vecCenterOut, vecHalfOut, vecMaxsOut );
+}
+
+
+//-----------------------------------------------------------------------------
+// Same as TransformAABB, but applies the inverse of 'transform'
+//-----------------------------------------------------------------------------
+void ITransformAABB( const matrix3x4_t& transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
+{
+	// Describe the input box as center + half-size.
+	Vector vecCenterIn;
+	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
+	vecCenterIn *= 0.5f;
+
+	Vector vecHalfIn;
+	VectorSubtract( vecMaxsIn, vecCenterIn, vecHalfIn );
+
+	// The center goes through the inverse transform.
+	Vector vecCenterOut;
+	VectorITransform( vecCenterIn, transform, vecCenterOut );
+
+	// Output half-sizes use matrix columns rather than rows: entry
+	// [row][iAxis] weights input axis 'row' in output axis 'iAxis'.
+	Vector vecHalfOut;
+	for ( int iAxis = 0; iAxis < 3; iAxis++ )
+	{
+		vecHalfOut[iAxis] =	FloatMakePositive( vecHalfIn.x * transform[0][iAxis] ) + 
+							FloatMakePositive( vecHalfIn.y * transform[1][iAxis] ) + 
+							FloatMakePositive( vecHalfIn.z * transform[2][iAxis] );
+	}
+
+	VectorSubtract( vecCenterOut, vecHalfOut, vecMinsOut );
+	VectorAdd( vecCenterOut, vecHalfOut, vecMaxsOut );
+}
+
+
+//-----------------------------------------------------------------------------
+// Rotates a AABB into another space; which will inherently grow the box. 
+// (same as TransformAABB, but doesn't take the translation into account)
+//-----------------------------------------------------------------------------
+void RotateAABB( const matrix3x4_t &transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
+{
+	// Describe the input box as center + half-size.
+	Vector vecCenterIn;
+	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
+	vecCenterIn *= 0.5f;
+
+	Vector vecHalfIn;
+	VectorSubtract( vecMaxsIn, vecCenterIn, vecHalfIn );
+
+	// The center is rotated only (VectorRotate rather than VectorTransform).
+	Vector vecCenterOut;
+	VectorRotate( vecCenterIn, transform, vecCenterOut );
+
+	// Each output half-size is the half-size dotted with the absolute values
+	// of the matching matrix row.
+	Vector vecHalfOut;
+	for ( int iAxis = 0; iAxis < 3; iAxis++ )
+	{
+		vecHalfOut[iAxis] = DotProductAbs( vecHalfIn, transform[iAxis] );
+	}
+
+	VectorSubtract( vecCenterOut, vecHalfOut, vecMinsOut );
+	VectorAdd( vecCenterOut, vecHalfOut, vecMaxsOut );
+}
+
+
+//-----------------------------------------------------------------------------
+// Same as RotateAABB, but applies the inverse of the rotation in 'transform'
+//-----------------------------------------------------------------------------
+void IRotateAABB( const matrix3x4_t &transform, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut )
+{
+	// Describe the input box as center + half-size.
+	Vector vecCenterIn;
+	VectorAdd( vecMinsIn, vecMaxsIn, vecCenterIn );
+	vecCenterIn *= 0.5f;
+
+	Vector vecHalfIn;
+	VectorSubtract( vecMaxsIn, vecCenterIn, vecHalfIn );
+
+	// The center goes through the inverse rotation.
+	Vector vecCenterOut;
+	VectorIRotate( vecCenterIn, transform, vecCenterOut );
+
+	// Output half-sizes use matrix columns rather than rows: entry
+	// [row][iAxis] weights input axis 'row' in output axis 'iAxis'.
+	Vector vecHalfOut;
+	for ( int iAxis = 0; iAxis < 3; iAxis++ )
+	{
+		vecHalfOut[iAxis] =	FloatMakePositive( vecHalfIn.x * transform[0][iAxis] ) + 
+							FloatMakePositive( vecHalfIn.y * transform[1][iAxis] ) + 
+							FloatMakePositive( vecHalfIn.z * transform[2][iAxis] );
+	}
+
+	VectorSubtract( vecCenterOut, vecHalfOut, vecMinsOut );
+	VectorAdd( vecCenterOut, vecHalfOut, vecMaxsOut );
+}
+
+
+//-----------------------------------------------------------------------------
+// Returns the squared distance from 'point' to the box [mins, maxs];
+// 0 when the point lies inside or on the box.
+//-----------------------------------------------------------------------------
+float CalcSqrDistanceToAABB( const Vector &mins, const Vector &maxs, const Vector &point )
+{
+	float flDistSqr = 0.0f;
+
+	// Accumulate, per axis, how far the point sits outside the slab.
+	for ( int iAxis = 0; iAxis < 3; iAxis++ )
+	{
+		float flOutside;
+		if ( point[iAxis] < mins[iAxis] )
+		{
+			flOutside = (mins[iAxis] - point[iAxis]);
+		}
+		else if ( point[iAxis] > maxs[iAxis] )
+		{
+			flOutside = (point[iAxis] - maxs[iAxis]);
+		}
+		else
+		{
+			// Within the slab on this axis: contributes nothing.
+			continue;
+		}
+
+		flDistSqr += flOutside * flOutside;
+	}
+
+	return flDistSqr;
+}
+
+
+//-----------------------------------------------------------------------------
+// Writes to closestOut the point of the box [mins, maxs] nearest to 'point',
+// obtained by clamping each coordinate into the box.
+//-----------------------------------------------------------------------------
+void CalcClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut )
+{
+	for ( int iAxis = 0; iAxis < 3; iAxis++ )
+	{
+		closestOut[iAxis] = clamp( point[iAxis], mins[iAxis], maxs[iAxis] );
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Computes in one pass the point of the box [mins, maxs] nearest to 'point'
+// and the squared distance from 'point' to that nearest point.
+//   closestOut - 'point' clamped per axis into the box
+//   distSqrOut - squared distance; 0 when 'point' is inside or on the box
+//-----------------------------------------------------------------------------
+void CalcSqrDistAndClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut, float &distSqrOut )
+{
+	distSqrOut = 0.0f;
+	for ( int i = 0; i < 3; i++ )
+	{
+		if ( point[i] < mins[i] )
+		{
+			// Measure from the query point to the box face. Taking the
+			// difference after storing the face into closestOut would always
+			// yield zero.
+			float flDelta = mins[i] - point[i];
+			closestOut[i] = mins[i];
+			distSqrOut += flDelta * flDelta;
+		}
+		else if ( point[i] > maxs[i] )
+		{
+			float flDelta = point[i] - maxs[i];
+			closestOut[i] = maxs[i];
+			distSqrOut += flDelta * flDelta;
+		}
+		else
+		{
+			// Inside the slab on this axis: no distance contribution.
+			closestOut[i] = point[i];
+		}
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Returns the parameter t at which the line A + t * (B - A) comes closest to
+// P, and writes (B - A) to vDir. t is not clamped. Returns 0 when the squared
+// length of (B - A) is below 0.00001.
+//-----------------------------------------------------------------------------
+float CalcClosestPointToLineT( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vDir )
+{
+	Assert( s_bMathlibInitialized );
+	VectorSubtract( vLineB, vLineA, vDir );
+
+	// Solve  D dot [P - (A + D*t)] = 0  for t:
+	//   t = ( D.P - D.A ) / D.D
+	const float flDirLenSqr = vDir.Dot( vDir );
+	if( flDirLenSqr < 0.00001f )
+	{
+		// Degenerate line (A and B nearly coincide).
+		return 0;
+	}
+
+	return (vDir.Dot( P ) - vDir.Dot( vLineA )) / flDirLenSqr;
+}
+
+//-----------------------------------------------------------------------------
+// Finds the point on the infinite line through vLineA and vLineB closest to
+// P. The parameter along the line is also stored in *outT when outT is
+// non-NULL.
+//-----------------------------------------------------------------------------
+void CalcClosestPointOnLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+
+	Vector vLineDir;
+	const float flParam = CalcClosestPointToLineT( P, vLineA, vLineB, vLineDir );
+	if ( outT )
+	{
+		*outT = flParam;
+	}
+
+	// vClosest = vLineA + vLineDir * flParam
+	vClosest.MulAdd( vLineA, vLineDir, flParam );
+}
+
+
+// Distance from P to the closest point found by CalcClosestPointOnLine.
+// outT is forwarded to that call unchanged.
+float CalcDistanceToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector vClosest;
+	CalcClosestPointOnLine( P, vLineA, vLineB, vClosest, outT );
+	return P.DistTo(vClosest);
+}
+
+// Squared distance from P to the closest point found by
+// CalcClosestPointOnLine. outT is forwarded to that call unchanged.
+float CalcDistanceSqrToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector vClosest;
+	CalcClosestPointOnLine( P, vLineA, vLineB, vClosest, outT );
+	return P.DistToSqr(vClosest);
+}
+
+//-----------------------------------------------------------------------------
+// Finds the point on the segment vLineA..vLineB closest to P. The line
+// parameter is clamped to [0, 1] before use; the clamped value is also stored
+// in *outT when outT is non-NULL.
+//-----------------------------------------------------------------------------
+void CalcClosestPointOnLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *outT )
+{
+	Vector vSegDir;
+	float flParam = CalcClosestPointToLineT( P, vLineA, vLineB, vSegDir );
+	flParam = clamp( flParam, 0.f, 1.f );
+
+	if ( outT ) *outT = flParam;
+
+	// vClosest = vLineA + vSegDir * flParam
+	vClosest.MulAdd( vLineA, vSegDir, flParam );
+}
+
+
+// Distance from P to the closest point found by
+// CalcClosestPointOnLineSegment. outT is forwarded to that call unchanged.
+float CalcDistanceToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector vClosest;
+	CalcClosestPointOnLineSegment( P, vLineA, vLineB, vClosest, outT );
+	return P.DistTo( vClosest );
+}
+
+// Squared distance from P to the closest point found by
+// CalcClosestPointOnLineSegment. outT is forwarded to that call unchanged.
+float CalcDistanceSqrToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector vClosest;
+	CalcClosestPointOnLineSegment( P, vLineA, vLineB, vClosest, outT );
+	return P.DistToSqr(vClosest);
+}
+
+//-----------------------------------------------------------------------------
+// 2D form of CalcClosestPointToLineT: returns the parameter t at which the
+// line A + t * (B - A) comes closest to P, and writes (B - A) to vDir.
+// t is not clamped. Returns 0 when the squared length of (B - A) is below
+// 0.00001.
+//-----------------------------------------------------------------------------
+float CalcClosestPointToLineT2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vDir )
+{
+	Assert( s_bMathlibInitialized );
+	Vector2DSubtract( vLineB, vLineA, vDir );
+
+	// Solve  D dot [P - (A + D*t)] = 0  for t:
+	//   t = ( D.P - D.A ) / D.D
+	const float flDirLenSqr = vDir.Dot( vDir );
+	if( flDirLenSqr < 0.00001f )
+	{
+		// Degenerate line (A and B nearly coincide).
+		return 0;
+	}
+
+	return (vDir.Dot( P ) - vDir.Dot( vLineA )) / flDirLenSqr;
+}
+
+//-----------------------------------------------------------------------------
+// 2D form of CalcClosestPointOnLine: finds the point on the infinite line
+// through vLineA and vLineB closest to P. The parameter along the line is
+// also stored in *outT when outT is non-NULL.
+//-----------------------------------------------------------------------------
+void CalcClosestPointOnLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vClosest, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+
+	Vector2D vLineDir;
+	const float flParam = CalcClosestPointToLineT2D( P, vLineA, vLineB, vLineDir );
+	if ( outT )
+	{
+		*outT = flParam;
+	}
+
+	// vClosest = vLineA + vLineDir * flParam
+	vClosest.MulAdd( vLineA, vLineDir, flParam );
+}
+
+// Distance from P to the closest point found by CalcClosestPointOnLine2D.
+// outT is forwarded to that call unchanged.
+float CalcDistanceToLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector2D vClosest;
+	CalcClosestPointOnLine2D( P, vLineA, vLineB, vClosest, outT );
+	return P.DistTo( vClosest );
+}
+
+// Squared distance from P to the closest point found by
+// CalcClosestPointOnLine2D. outT is forwarded to that call unchanged.
+float CalcDistanceSqrToLine2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector2D vClosest;
+	CalcClosestPointOnLine2D( P, vLineA, vLineB, vClosest, outT );
+	return P.DistToSqr(vClosest);
+}
+
+//-----------------------------------------------------------------------------
+// 2D form of CalcClosestPointOnLineSegment: finds the point on the segment
+// vLineA..vLineB closest to P. The line parameter is clamped to [0, 1] before
+// use; the clamped value is also stored in *outT when outT is non-NULL.
+//-----------------------------------------------------------------------------
+void CalcClosestPointOnLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, Vector2D &vClosest, float *outT )
+{
+	Vector2D vSegDir;
+	float flParam = CalcClosestPointToLineT2D( P, vLineA, vLineB, vSegDir );
+	flParam = clamp( flParam, 0.f, 1.f );
+
+	if ( outT ) *outT = flParam;
+
+	// vClosest = vLineA + vSegDir * flParam
+	vClosest.MulAdd( vLineA, vSegDir, flParam );
+}
+
+// Distance from P to the closest point found by
+// CalcClosestPointOnLineSegment2D. outT is forwarded to that call unchanged.
+float CalcDistanceToLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector2D vClosest;
+	CalcClosestPointOnLineSegment2D( P, vLineA, vLineB, vClosest, outT );
+	return P.DistTo( vClosest );
+}
+
+// Squared distance from P to the closest point found by
+// CalcClosestPointOnLineSegment2D. outT is forwarded to that call unchanged.
+float CalcDistanceSqrToLineSegment2D( const Vector2D &P, const Vector2D &vLineA, const Vector2D &vLineB, float *outT )
+{
+	Assert( s_bMathlibInitialized );
+	Vector2D vClosest;
+	CalcClosestPointOnLineSegment2D( P, vLineA, vLineB, vClosest, outT );
+	return P.DistToSqr( vClosest );
+}
+
+// Do we have another epsilon we could use
+#define LINE_EPS ( 0.000001f )
+
+//-----------------------------------------------------------------------------
+// Purpose: Given lines p1->p2 and p3->p4, computes the segment (s1->s2) that
+//  joins the two lines where they come closest, plus the parameter of each end
+//  point along its own line. The parameters are not clamped to 0..1.
+// Input  : p1, p2 - two points defining the first line
+//			p3, p4 - two points defining the second line
+//			*s1 - receives the point on the first line  ( p1 + t1 * (p2 - p1) )
+//			*s2 - receives the point on the second line ( p3 + t2 * (p4 - p3) )
+//			*t1 - receives the parameter along p1->p2
+//			*t2 - receives the parameter along p3->p4
+//			All four output pointers are dereferenced unconditionally.
+// Output : Returns true on success. Returns false, with no outputs written,
+//			when every component of either line's direction is below LINE_EPS
+//			or when the solve's denominator is below LINE_EPS.
+//-----------------------------------------------------------------------------
+bool CalcLineToLineIntersectionSegment(
+   const Vector& p1,const Vector& p2,const Vector& p3,const Vector& p4,Vector *s1,Vector *s2,
+   float *t1, float *t2)
+{
+   // Direction of the second line; reject if it has collapsed to a point.
+   Vector dirB;
+   dirB.x = p4.x - p3.x;
+   dirB.y = p4.y - p3.y;
+   dirB.z = p4.z - p3.z;
+   if (fabs(dirB.x)  < LINE_EPS && fabs(dirB.y)  < LINE_EPS && fabs(dirB.z)  < LINE_EPS)
+   {
+      return false;
+   }
+
+   // Direction of the first line; same rejection.
+   Vector dirA;
+   dirA.x = p2.x - p1.x;
+   dirA.y = p2.y - p1.y;
+   dirA.z = p2.z - p1.z;
+   if (fabs(dirA.x)  < LINE_EPS && fabs(dirA.y)  < LINE_EPS && fabs(dirA.z)  < LINE_EPS)
+   {
+      return false;
+   }
+
+   // Offset between the two line origins.
+   Vector offset;
+   offset.x = p1.x - p3.x;
+   offset.y = p1.y - p3.y;
+   offset.z = p1.z - p3.z;
+
+   // Pairwise dot products feeding the 2x2 solve.
+   const float offsetDotB = offset.x * dirB.x + offset.y * dirB.y + offset.z * dirB.z;
+   const float bDotA      = dirB.x * dirA.x + dirB.y * dirA.y + dirB.z * dirA.z;
+   const float offsetDotA = offset.x * dirA.x + offset.y * dirA.y + offset.z * dirA.z;
+   const float bDotB      = dirB.x * dirB.x + dirB.y * dirB.y + dirB.z * dirB.z;
+   const float aDotA      = dirA.x * dirA.x + dirA.y * dirA.y + dirA.z * dirA.z;
+
+   const float denom = aDotA * bDotB - bDotA * bDotA;
+   if (fabs(denom) < LINE_EPS)
+   {
+      return false;
+   }
+   const float numer = offsetDotB * bDotA - offsetDotA * bDotB;
+
+   *t1 = numer / denom;
+   *t2 = (offsetDotB + bDotA * (*t1)) / bDotB;
+
+   // Evaluate both lines at their parameters.
+   s1->x = p1.x + *t1 * dirA.x;
+   s1->y = p1.y + *t1 * dirA.y;
+   s1->z = p1.z + *t1 * dirA.z;
+   s2->x = p3.x + *t2 * dirB.x;
+   s2->y = p3.y + *t2 * dirB.y;
+   s2->z = p3.z + *t2 * dirB.z;
+
+   return true;
+}
+
+// Optimization pragma pair. Only preprocessor lines sit between the "off"
+// and the matching "on" here, so no function definition falls inside it.
+#pragma optimize( "", off )
+
+// Fallback definition, used only when nothing has defined the macro already.
+#ifndef EXCEPTION_EXECUTE_HANDLER
+#define EXCEPTION_EXECUTE_HANDLER       1
+#endif
+
+#pragma optimize( "", on )
+
+// Instruction-set flags. They start out false, are assigned by MathLib_Init,
+// and are reported by the MathLib_*Enabled() accessors.
+static bool s_b3DNowEnabled = false;
+static bool s_bMMXEnabled = false;
+static bool s_bSSEEnabled = false;
+static bool s_bSSE2Enabled = false;
+
+//-----------------------------------------------------------------------------
+// Purpose: One-time mathlib setup. Does nothing if s_bMathlibInitialized is
+//  already set. Otherwise it picks the implementation behind each pf*
+//  function pointer, records which instruction sets were enabled, marks the
+//  library initialized, and then builds the sin/cos and gamma tables.
+// Input  : gamma, texGamma, brightness, overbright - forwarded unchanged to
+//				BuildGammaTable
+//			bAllow3DNow, bAllowSSE, bAllowSSE2, bAllowMMX - each instruction
+//				set is enabled only when its flag is true AND the CPU
+//				information reports support for it
+// On _X360 builds the whole selection section is compiled out, so the four
+// enable flags keep their initial value (false).
+//-----------------------------------------------------------------------------
+void MathLib_Init( float gamma, float texGamma, float brightness, int overbright, bool bAllow3DNow, bool bAllowSSE, bool bAllowSSE2, bool bAllowMMX )
+{
+	// Only the first call does any work.
+	if ( s_bMathlibInitialized )
+		return;
+
+	// FIXME: Hook SSE into VectorAligned + Vector4DAligned
+
+#if !defined( _X360 )
+	// Grab the processor information:
+	const CPUInformation& pi = *GetCPUInformation();
+
+	// Select the default generic routines.
+	pfSqrt = _sqrtf;
+	pfRSqrt = _rsqrtf;
+	pfRSqrtFast = _rsqrtf;
+	pfVectorNormalize = _VectorNormalize;
+	pfVectorNormalizeFast = _VectorNormalizeFast;
+	pfInvRSquared = _InvRSquared;
+	pfFastSinCos = SinCos;
+	pfFastCos = cosf;
+
+	if ( bAllowMMX && pi.m_bMMX )
+	{
+		// Select the MMX specific routines if available
+		// (MMX routines were used by SW span fillers - not currently used for HW)
+		// Only the flag is recorded; no function pointer is changed here.
+		s_bMMXEnabled = true;
+	}
+	else
+	{
+		s_bMMXEnabled = false;
+	}
+
+	// SSE Generally performs better than 3DNow when present, so this is placed
+	// first to allow SSE to override these settings.
+	// When this #if is false (OSX, 64-bit Windows PC, Linux) the if/else
+	// below is compiled out and the bare braces that follow run
+	// unconditionally, leaving 3DNow disabled.
+#if !defined( OSX ) && !defined( PLATFORM_WINDOWS_PC64 ) && !defined(LINUX)
+	if ( bAllow3DNow && pi.m_b3DNow )
+	{
+		s_b3DNowEnabled = true;
+
+		// Select the 3DNow specific routines if available;
+		pfVectorNormalize = _3DNow_VectorNormalize;
+		pfVectorNormalizeFast = _3DNow_VectorNormalizeFast;
+		pfInvRSquared = _3DNow_InvRSquared;
+		pfSqrt = _3DNow_Sqrt;
+		pfRSqrt = _3DNow_RSqrt;
+		pfRSqrtFast = _3DNow_RSqrt;
+	}
+	else
+#endif
+	{
+		s_b3DNowEnabled = false;
+	}
+
+	if ( bAllowSSE && pi.m_bSSE )
+	{
+		s_bSSEEnabled = true;
+
+#ifndef PLATFORM_WINDOWS_PC64
+		// These are not yet available.
+		// NOTE(review): the line above presumably refers to 64-bit Windows,
+		// where this section is skipped - confirm.
+		// Select the SSE specific routines if available
+		// pfVectorNormalize is pointed back at the generic _VectorNormalize,
+		// which also undoes any 3DNow choice made above.
+		pfVectorNormalize = _VectorNormalize;
+		pfVectorNormalizeFast = _SSE_VectorNormalizeFast;
+		pfInvRSquared = _SSE_InvRSquared;
+		pfSqrt = _SSE_Sqrt;
+		pfRSqrt = _SSE_RSqrtAccurate;
+		pfRSqrtFast = _SSE_RSqrtFast;
+#endif
+		// The SSE sin/cos routines are installed on 32-bit Windows PC only.
+#ifdef PLATFORM_WINDOWS_PC32
+		pfFastSinCos = _SSE_SinCos;
+		pfFastCos = _SSE_cos;
+#endif
+	}
+	else
+	{
+		s_bSSEEnabled = false;
+	}
+
+	if ( bAllowSSE2 && pi.m_bSSE2 )
+	{
+		s_bSSE2Enabled = true;
+		// SSE2 runs last, so its sin/cos choice replaces the SSE one
+		// (again on 32-bit Windows PC only).
+#ifdef PLATFORM_WINDOWS_PC32
+		pfFastSinCos = _SSE2_SinCos;
+		pfFastCos = _SSE2_cos;
+#endif
+	} 
+	else
+	{
+		s_bSSE2Enabled = false;
+	}
+#endif
+
+	// The flag is set before the table builders are called.
+	s_bMathlibInitialized = true;
+
+	InitSinCosTable();
+	BuildGammaTable( gamma, texGamma, brightness, overbright );
+}
+
+// Reports the 3DNow flag recorded by MathLib_Init (asserts that init has run).
+bool MathLib_3DNowEnabled( void )
+{
+	Assert( s_bMathlibInitialized );
+	return s_b3DNowEnabled;
+}
+
+// Reports the MMX flag recorded by MathLib_Init (asserts that init has run).
+bool MathLib_MMXEnabled( void )
+{
+	Assert( s_bMathlibInitialized );
+	return s_bMMXEnabled;
+}
+
+// Reports the SSE flag recorded by MathLib_Init (asserts that init has run).
+bool MathLib_SSEEnabled( void )
+{
+	Assert( s_bMathlibInitialized );
+	return s_bSSEEnabled;
+}
+
+// Reports the SSE2 flag recorded by MathLib_Init (asserts that init has run).
+bool MathLib_SSE2Enabled( void )
+{
+	Assert( s_bMathlibInitialized );
+	return s_bSSE2Enabled;
+}
+
+//-----------------------------------------------------------------------------
+// Moves 'value' toward 'target' by at most 'speed' and returns the result.
+// When the remaining gap is within +/-speed the target itself is returned.
+//-----------------------------------------------------------------------------
+float Approach( float target, float value, float speed )
+{
+	const float flGap = target - value;
+
+	if ( flGap > speed )
+	{
+		// Target is still more than one step above.
+		return value + speed;
+	}
+
+	if ( flGap < -speed )
+	{
+		// Target is still more than one step below.
+		return value - speed;
+	}
+
+	// Close enough to land exactly on the target.
+	return target;
+}
+
+// BUGBUG: Why doesn't this call angle diff?!?!?
+float ApproachAngle( float target, float value, float speed )
+{
+	// Both angles go through anglemod first.
+	target = anglemod( target );
+	value = anglemod( value );
+
+	// Speed is treated as a magnitude.
+	const float flStep = ( speed < 0 ) ? -speed : speed;
+
+	// Signed gap, wrapped so the turn goes the short way around.
+	float flGap = target - value;
+	if ( flGap < -180 )
+	{
+		flGap += 360;
+	}
+	else if ( flGap > 180 )
+	{
+		flGap -= 360;
+	}
+
+	// The stepped result is returned as-is (it is not wrapped again).
+	if ( flGap > flStep )
+		return value + flStep;
+
+	if ( flGap < -flStep )
+		return value - flStep;
+
+	return target;
+}
+
+
+// BUGBUG: Why do we need both of these?
+float AngleDiff( float destAngle, float srcAngle )
+{
+	// Difference reduced to (-360, 360) by fmodf.
+	const float flWrapped = fmodf(destAngle - srcAngle, 360.0f);
+
+	if ( destAngle > srcAngle )
+	{
+		// Difference of half a turn or more: go the other way around.
+		return ( flWrapped >= 180 ) ? flWrapped - 360 : flWrapped;
+	}
+
+	// Difference of minus half a turn or less: go the other way around.
+	return ( flWrapped <= -180 ) ? flWrapped + 360 : flWrapped;
+}
+
+
+//-----------------------------------------------------------------------------
+// Returns next - cur, adjusted by a single +/-360 step when the raw
+// difference lies outside [-180, 180]. Inputs are not reduced first.
+//-----------------------------------------------------------------------------
+float AngleDistance( float next, float cur )
+{
+	const float flRaw = next - cur;
+
+	if ( flRaw < -180 )
+		return flRaw + 360;
+
+	if ( flRaw > 180 )
+		return flRaw - 360;
+
+	return flRaw;
+}
+
+
+//-----------------------------------------------------------------------------
+// Wraps an angle into [-180, 180].
+//-----------------------------------------------------------------------------
+float AngleNormalize( float angle )
+{
+	// fmodf leaves the value in (-360, 360); one conditional step of
+	// 360 then brings it into range.
+	const float flWrapped = fmodf(angle, 360.0f);
+
+	if ( flWrapped > 180 )
+		return flWrapped - 360;
+
+	if ( flWrapped < -180 )
+		return flWrapped + 360;
+
+	return flWrapped;
+}
+
+//--------------------------------------------------------------------------------------------------------------
+// ensure that 0 <= angle <= 360
+float AngleNormalizePositive( float angle )
+{
+	// fmodf keeps the sign of its first argument, so negative remainders
+	// are shifted up by one full turn.
+	const float flWrapped = fmodf( angle, 360.0f );
+	return ( flWrapped < 0.0f ) ? flWrapped + 360.0f : flWrapped;
+}
+
+//--------------------------------------------------------------------------------------------------------------
+// True when the magnitude of AngleDiff( a, b ) is strictly below 'tolerance'.
+bool AnglesAreEqual( float a, float b, float tolerance )
+{
+	return (fabs( AngleDiff( a, b ) ) < tolerance);
+}
+
+//-----------------------------------------------------------------------------
+// Expresses the rotation between two orientations as an axis and an angle,
+// written to deltaAxis and deltaAngle.
+//-----------------------------------------------------------------------------
+void RotationDeltaAxisAngle( const QAngle &srcAngles, const QAngle &destAngles, Vector &deltaAxis, float &deltaAngle )
+{
+	// Both orientations as quaternions.
+	Quaternion qSrc, qDest;
+	AngleQuaternion( srcAngles, qSrc );
+	AngleQuaternion( destAngles, qDest );
+
+	// Source quaternion scaled by -1; used as the source's inverse below.
+	Quaternion qSrcInverse;
+	QuaternionScale( qSrc, -1, qSrcInverse );
+
+	// Combine destination with the inverted source, then normalize.
+	Quaternion qDelta;
+	QuaternionMult( qDest, qSrcInverse, qDelta );
+	QuaternionNormalize( qDelta );
+
+	QuaternionAxisAngle( qDelta, deltaAxis, deltaAngle );
+}
+
+//-----------------------------------------------------------------------------
+// Computes the rotation between two orientations as a QAngle and stores it
+// in *out. All of the work is performed even when out is NULL; only the
+// final store is skipped.
+//-----------------------------------------------------------------------------
+void RotationDelta( const QAngle &srcAngles, const QAngle &destAngles, QAngle *out )
+{
+	// Both orientations as matrices.
+	matrix3x4_t matSrc, matDest;
+	AngleMatrix( srcAngles, matSrc );
+	AngleMatrix( destAngles, matDest );
+
+	matrix3x4_t matSrcInverse;
+	MatrixInvert( matSrc, matSrcInverse );
+
+	// NOTE(review): this step was previously annotated
+	// "xform = src(-1) * dest", but dest is the first argument and the
+	// inverted source the second - confirm ConcatTransforms' operand order.
+	matrix3x4_t matDelta;
+	ConcatTransforms( matDest, matSrcInverse, matDelta );
+
+	QAngle angDelta;
+	MatrixAngles( matDelta, angDelta );
+
+	if ( out )
+		*out = angDelta;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Computes a triangle normal
+//-----------------------------------------------------------------------------
+void ComputeTrianglePlane( const Vector& v1, const Vector& v2, const Vector& v3, Vector& normal, float& intercept )
+{
+	// Two edges leaving v1.
+	Vector vecEdge12, vecEdge13;
+	VectorSubtract( v2, v1, vecEdge12 );
+	VectorSubtract( v3, v1, vecEdge13 );
+
+	// Their cross product, normalized, is the plane normal.
+	CrossProduct( vecEdge12, vecEdge13, normal );
+	VectorNormalize( normal );
+
+	// Plane constant: the normal dotted with a vertex on the plane.
+	intercept = DotProduct( normal, v1 );
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: This is a clone of BaseWindingForPlane()
+// Input  : *outVerts - an array of preallocated verts to build the polygon in
+//			normal - the plane normal
+//			dist - the plane constant
+// Output : int - vert count (always 4)
+//-----------------------------------------------------------------------------
+int PolyFromPlane( Vector *outVerts, const Vector& normal, float dist, float fHalfScale )
+{
+	// Find the axis along which the normal has the largest magnitude.
+	int iMajorAxis = -1;
+	vec_t flLargest = -16384; //MAX_COORD_INTEGER
+	for ( int iAxis = 0; iAxis < 3; iAxis++ )
+	{
+		const vec_t flMagnitude = fabs(normal[iAxis]);
+		if ( flMagnitude > flLargest )
+		{
+			iMajorAxis = iAxis;
+			flLargest = flMagnitude;
+		}
+	}
+
+	// No axis was selected (no comparison above succeeded).
+	if ( iMajorAxis == -1 )
+		return 0;
+
+	// Start "up" as a unit vector along an axis other than the major one:
+	// +Z when the major axis is X or Y, +X when it is Z.
+	Vector vup;
+	VectorCopy (vec3_origin, vup);
+	if ( iMajorAxis == 2 )
+	{
+		vup[0] = 1;
+	}
+	else
+	{
+		vup[2] = 1;
+	}
+
+	// Remove the part of "up" that lies along the normal, then renormalize
+	// so it is a unit vector in the plane.
+	const vec_t flAlongNormal = DotProduct (vup, normal);
+	VectorMA (vup, -flAlongNormal, normal, vup);
+	VectorNormalize (vup);
+
+	// Second in-plane basis vector.
+	Vector vright;
+	CrossProduct (vup, normal, vright);
+
+	// Center of the poly is at normal * dist.
+	Vector org;
+	VectorScale (normal, dist, org);
+
+	// Stretch both basis vectors to the requested half-side length.
+	VectorScale (vup, fHalfScale, vup);
+	VectorScale (vright, fHalfScale, vright);
+
+	// Midpoints of the left and right edges.
+	Vector vecLeftMid, vecRightMid;
+	VectorSubtract (org, vright, vecLeftMid);
+	VectorAdd (org, vright, vecRightMid);
+
+	// Corners, in order: left-up, right-up, right-down, left-down.
+	VectorAdd (vecLeftMid, vup, outVerts[0]);
+	VectorAdd (vecRightMid, vup, outVerts[1]);
+	VectorSubtract (vecRightMid, vup, outVerts[2]);
+	VectorSubtract (vecLeftMid, vup, outVerts[3]);
+
+	// The four corners form a planar quadrilateral normal to "normal"
+	return 4;
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: clip a poly to the plane and return the poly on the front side of the plane
+// Input  : *inVerts - input polygon
+//			vertCount - # verts in input poly
+//			*outVerts - destination poly
+//			normal - plane normal
+//			dist - plane constant
+// Output : int - # verts in output poly
+//-----------------------------------------------------------------------------
+
+int ClipPolyToPlane( Vector *inVerts, int vertCount, Vector *outVerts, const Vector& normal, float dist, float fOnPlaneEpsilon )
+{
+	// Nothing to clip. Returning here also keeps the wrap-around copy below
+	// from reading scratch entries that were never written.
+	if ( vertCount <= 0 )
+		return 0;
+
+	// Scratch arrays hold one entry per vertex plus one: entry [vertCount]
+	// mirrors entry [0] so the edge i -> i+1 never needs a wrap test.
+	vec_t	*dists = (vec_t *)stackalloc( sizeof(vec_t) * (vertCount + 1) );
+	int		*sides = (int *)stackalloc( sizeof(int) * (vertCount + 1) );
+	int		counts[3];
+	vec_t	dot;
+	int		i, j;
+	Vector	mid = vec3_origin;
+	int		outCount;
+
+	counts[0] = counts[1] = counts[2] = 0;
+
+	// determine sides for each point
+	for ( i = 0; i < vertCount; i++ )
+	{
+		dot = DotProduct( inVerts[i], normal) - dist;
+		dists[i] = dot;
+		if ( dot > fOnPlaneEpsilon )
+		{
+			sides[i] = SIDE_FRONT;
+		}
+		else if ( dot < -fOnPlaneEpsilon )
+		{
+			sides[i] = SIDE_BACK;
+		}
+		else
+		{
+			sides[i] = SIDE_ON;
+		}
+		counts[sides[i]]++;
+	}
+	sides[i] = sides[0];
+	dists[i] = dists[0];
+
+	// counts[] is indexed by the SIDE_* value; slots 0 and 1 are presumably
+	// the front and back tallies - confirm against the SIDE_* definitions.
+	if (!counts[0])
+		return 0;
+
+	if (!counts[1])
+	{
+		// Nothing on the other side: the polygon is kept whole.
+		for ( i = 0; i < vertCount; i++ )
+		{
+			VectorCopy( inVerts[i], outVerts[i] );
+		}
+		return vertCount;
+	}
+
+	// Walk the edges. Each input vertex adds at most two output vertices
+	// (itself, plus a split point on the edge leaving it).
+	outCount = 0;
+	for ( i = 0; i < vertCount; i++ )
+	{
+		Vector& p1 = inVerts[i];
+
+		if (sides[i] == SIDE_ON)
+		{
+			VectorCopy( p1, outVerts[outCount]);
+			outCount++;
+			continue;
+		}
+
+		if (sides[i] == SIDE_FRONT)
+		{
+			VectorCopy( p1, outVerts[outCount]);
+			outCount++;
+		}
+
+		// No split when the next vertex is on the plane or on the same side.
+		if (sides[i+1] == SIDE_ON || sides[i+1] == sides[i])
+			continue;
+
+		// generate a split point
+		Vector& p2 = inVerts[(i+1)%vertCount];
+
+		// Fraction of the way from p1 to p2 at which the edge meets the plane.
+		dot = dists[i] / (dists[i]-dists[i+1]);
+		for (j=0 ; j<3 ; j++)
+		{	// avoid round off error when possible
+			if (normal[j] == 1)
+				mid[j] = dist;
+			else if (normal[j] == -1)
+				mid[j] = -dist;
+			else
+				mid[j] = p1[j] + dot*(p2[j]-p1[j]);
+		}
+
+		VectorCopy (mid, outVerts[outCount]);
+		outCount++;
+	}
+
+	return outCount;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Double-precision variant of ClipPolyToPlane working on raw arrays.
+// Input  : *inVerts - input polygon, three doubles (x, y, z) per vertex
+//			vertCount - # verts in input poly
+//			*outVerts - destination poly, three doubles per vertex; each input
+//				vertex can add at most two output vertices
+//			*normal - plane normal, three doubles
+//			dist - plane constant
+//			fOnPlaneEpsilon - vertices within this distance of the plane are
+//				treated as lying on it
+// Output : int - # verts in output poly (0 when vertCount <= 0)
+//-----------------------------------------------------------------------------
+int ClipPolyToPlane_Precise( double *inVerts, int vertCount, double *outVerts, const double *normal, double dist, double fOnPlaneEpsilon )
+{
+	// Nothing to clip. Returning here also keeps the wrap-around copy below
+	// from reading scratch entries that were never written.
+	if ( vertCount <= 0 )
+		return 0;
+
+	// Scratch arrays hold one entry per vertex plus one: entry [vertCount]
+	// mirrors entry [0] so the edge i -> i+1 never needs a wrap test.
+	double	*dists = (double *)stackalloc( sizeof(double) * (vertCount + 1) );
+	int		*sides = (int *)stackalloc( sizeof(int) * (vertCount + 1) );
+	int		counts[3];
+	double	dot;
+	int		i, j;
+	double	mid[3];
+	int		outCount;
+
+	mid[0] = 0.0;
+	mid[1] = 0.0;
+	mid[2] = 0.0;
+
+	counts[0] = counts[1] = counts[2] = 0;
+
+	// determine sides for each point
+	for ( i = 0; i < vertCount; i++ )
+	{
+		dot = ((inVerts[i*3 + 0] * normal[0]) + (inVerts[i*3 + 1] * normal[1]) + (inVerts[i*3 + 2] * normal[2])) - dist;
+		dists[i] = dot;
+		if ( dot > fOnPlaneEpsilon )
+		{
+			sides[i] = SIDE_FRONT;
+		}
+		else if ( dot < -fOnPlaneEpsilon )
+		{
+			sides[i] = SIDE_BACK;
+		}
+		else
+		{
+			sides[i] = SIDE_ON;
+		}
+		counts[sides[i]]++;
+	}
+	sides[i] = sides[0];
+	dists[i] = dists[0];
+
+	// counts[] is indexed by the SIDE_* value; slots 0 and 1 are presumably
+	// the front and back tallies - confirm against the SIDE_* definitions.
+	if (!counts[0])
+		return 0;
+
+	if (!counts[1])
+	{
+		// Nothing on the other side: copy all vertCount * 3 doubles through.
+		for ( i = 0; i < vertCount * 3; i++ )
+		{
+			outVerts[i] = inVerts[i];
+		}
+		return vertCount;
+	}
+
+	outCount = 0;
+	for ( i = 0; i < vertCount; i++ )
+	{
+		double *p1 = &inVerts[i*3];
+
+		if (sides[i] == SIDE_ON)
+		{
+			outVerts[outCount*3 + 0] = p1[0];
+			outVerts[outCount*3 + 1] = p1[1];
+			outVerts[outCount*3 + 2] = p1[2];
+			outCount++;
+			continue;
+		}
+
+		if (sides[i] == SIDE_FRONT)
+		{
+			outVerts[outCount*3 + 0] = p1[0];
+			outVerts[outCount*3 + 1] = p1[1];
+			outVerts[outCount*3 + 2] = p1[2];
+			outCount++;
+		}
+
+		// No split when the next vertex is on the plane or on the same side.
+		if (sides[i+1] == SIDE_ON || sides[i+1] == sides[i])
+			continue;
+
+		// generate a split point
+		int wrappedindex = (i+1)%vertCount;
+		double *p2 = &inVerts[wrappedindex*3];
+
+		// Fraction of the way from p1 to p2 at which the edge meets the plane.
+		dot = dists[i] / (dists[i]-dists[i+1]);
+		for (j=0 ; j<3 ; j++)
+		{
+			mid[j] = (double)p1[j] + dot*((double)p2[j]-(double)p1[j]);
+		}
+
+		outVerts[outCount*3 + 0] = mid[0];
+		outVerts[outCount*3 + 1] = mid[1];
+		outVerts[outCount*3 + 2] = mid[2];
+		outCount++;
+	}
+
+	return outCount;
+}
+
+//-----------------------------------------------------------------------------
+// Returns the smallest power of two that is >= in. Inputs <= 1 return 1.
+// Inputs above 2^30 have no representable answer in an int; they return
+// 2^30 rather than overflowing the shift.
+//-----------------------------------------------------------------------------
+int CeilPow2( int in )
+{
+	// Largest power of two an int can hold.
+	const int nLargestPow2 = 0x40000000;
+	if ( in >= nLargestPow2 )
+		return nLargestPow2;
+
+	// in < 2^30 from here on, so retval never exceeds 2^30.
+	int retval = 1;
+	while( retval < in )
+		retval <<= 1;
+	return retval;
+}
+
+//-----------------------------------------------------------------------------
+// Returns the largest power of two that is <= in. An exact power of two
+// returns itself. Inputs below 1 return 0.
+//-----------------------------------------------------------------------------
+int FloorPow2( int in )
+{
+	if ( in < 1 )
+		return 0;
+
+	// Double while the doubled value would still fit under 'in'. Comparing
+	// against in/2 keeps retval from overflowing for large inputs.
+	int retval = 1;
+	while( retval <= ( in >> 1 ) )
+		retval <<= 1;
+	return retval;
+}
+
+
+//-----------------------------------------------------------------------------
+// Computes Y fov from an X fov and a screen aspect ratio
+//-----------------------------------------------------------------------------
+float CalcFovY( float flFovX, float flAspect )
+{
+	// Horizontal FOVs outside [1, 179] degrees are treated as an error and
+	// replaced with 90.
+	if ( flFovX < 1 || flFovX > 179)
+	{
+		flFovX = 90;
+	}
+
+	// Derivation (closely matches CShaderAPIDX8::PerspectiveX, which this is
+	// based on):
+	//
+	//   width    = 2 * zNear * tan( fov_x / 2 )
+	//   height   = width / aspect
+	//   yRadians = atan( (height / 2) / zNear )
+	//   fov_y    = 2 * yRadians
+	//
+	// zNear cancels, leaving the form used below.
+	const float flHalfFovYRadians = atan( tan( DEG2RAD( flFovX ) * 0.5f ) / flAspect );
+	return RAD2DEG( flHalfFovYRadians ) * 2.0f;
+}
+
+// Computes X fov from a Y fov and a screen aspect ratio: the counterpart of
+// CalcFovY, multiplying by the aspect instead of dividing. Unlike CalcFovY,
+// the input is not range-checked.
+float CalcFovX( float flFovY, float flAspect )
+{
+	return RAD2DEG( atan( tan( DEG2RAD( flFovY ) * 0.5f ) * flAspect ) ) * 2.0f;
+}
+
+
+//-----------------------------------------------------------------------------
+// Generate a frustum based on perspective view parameters
+//-----------------------------------------------------------------------------
+void GeneratePerspectiveFrustum( const Vector& origin, const Vector &forward, 
+	const Vector &right, const Vector &up, float flZNear, float flZFar, 
+	float flFovX, float flFovY, Frustum_t &frustum )
+{
+	// Near and far planes: offsets along 'forward' measured from the origin.
+	const float flOriginAlongForward = DotProduct( origin, forward );
+	frustum.SetPlane( FRUSTUM_FARZ, PLANE_ANYZ, -forward, -flZFar - flOriginAlongForward );
+	frustum.SetPlane( FRUSTUM_NEARZ, PLANE_ANYZ, forward, flZNear + flOriginAlongForward );
+
+	// Tangents of the half angles.
+	const float flHalfFovX = flFovX * 0.5f;
+	const float flHalfFovY = flFovY * 0.5f;
+	const float flTanHalfX = tan( DEG2RAD( flHalfFovX ) );
+	const float flTanHalfY = tan( DEG2RAD( flHalfFovY ) );
+
+	// OPTIMIZE: Normalizing these planes is not necessary for culling
+	Vector vecNormalA, vecNormalB;
+
+	// Side planes: A = right + tanX * forward, B = A - 2 * right.
+	// Both pass through the origin.
+	VectorMA( right, flTanHalfX, forward, vecNormalA );
+	VectorMA( vecNormalA, -2.0f, right, vecNormalB );
+	VectorNormalize( vecNormalA );
+	VectorNormalize( vecNormalB );
+	frustum.SetPlane( FRUSTUM_LEFT, PLANE_ANYZ, vecNormalA, vecNormalA.Dot( origin ) );
+	frustum.SetPlane( FRUSTUM_RIGHT, PLANE_ANYZ, vecNormalB, vecNormalB.Dot( origin ) );
+
+	// Bottom/top planes: same construction using 'up' and tanY.
+	VectorMA( up, flTanHalfY, forward, vecNormalA );
+	VectorMA( vecNormalA, -2.0f, up, vecNormalB );
+	VectorNormalize( vecNormalA );
+	VectorNormalize( vecNormalB );
+	frustum.SetPlane( FRUSTUM_BOTTOM, PLANE_ANYZ, vecNormalA, vecNormalA.Dot( origin ) );
+	frustum.SetPlane( FRUSTUM_TOP, PLANE_ANYZ, vecNormalB, vecNormalB.Dot( origin ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Version that accepts angles instead of vectors. The vertical field of view
+// is derived from flFovX and flAspectRatio with CalcFovY.
+//-----------------------------------------------------------------------------
+void GeneratePerspectiveFrustum( const Vector& origin, const QAngle &angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t &frustum )
+{
+	// Expand the view angles into the basis vectors the vector overload takes.
+	Vector vForward, vRight, vUp;
+	AngleVectors( angles, &vForward, &vRight, &vUp );
+
+	GeneratePerspectiveFrustum( origin, vForward, vRight, vUp, flZNear, flZFar,
+		flFovX, CalcFovY( flFovX, flAspectRatio ), frustum );
+}
+
+//-----------------------------------------------------------------------------
+// Returns true when BoxOnPlaneSide reports side 2 for the box against any of
+// the six frustum planes, false otherwise.
+// NOTE(review): side 2 presumably means "entirely behind the plane" - confirm
+// against BoxOnPlaneSide.
+//-----------------------------------------------------------------------------
+bool R_CullBox( const Vector& mins, const Vector& maxs, const Frustum_t &frustum )
+{
+	// Test order: right, left, top, bottom, near, far. The first hit decides.
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT) ) == 2 )
+		return true;
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_LEFT) ) == 2 )
+		return true;
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_TOP) ) == 2 )
+		return true;
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM) ) == 2 )
+		return true;
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_NEARZ) ) == 2 )
+		return true;
+
+	return ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_FARZ) ) == 2 );
+}
+
+//-----------------------------------------------------------------------------
+// Same test as R_CullBox, except that the near plane is not consulted:
+// returns true when BoxOnPlaneSide reports side 2 for the box against the
+// right, left, top, bottom or far plane.
+//-----------------------------------------------------------------------------
+bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t &frustum )
+{
+	// Test order: right, left, top, bottom, far. The first hit decides.
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_RIGHT) ) == 2 )
+		return true;
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_LEFT) ) == 2 )
+		return true;
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_TOP) ) == 2 )
+		return true;
+	if ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_BOTTOM) ) == 2 )
+		return true;
+
+	return ( BoxOnPlaneSide( mins, maxs, frustum.GetPlane(FRUSTUM_FARZ) ) == 2 );
+}
+
+
+// NOTE: This routine was taken (and modified) from NVidia's BlinnReflection demo
+// Creates basis vectors, based on a vertex and index list.
+// See the NVidia white paper 'GDC2K PerPixel Lighting' for a description
+// of how this computation works
+//
+// p0, p1, p2   - triangle vertex positions
+// t0, t1, t2   - texture coordinates (s in .x, t in .y) of those vertices
+// sVect, tVect - receive the normalized derivatives of position with respect
+//                to s and to t
+#define SMALL_FLOAT 1e-12
+
+void CalcTriangleTangentSpace( const Vector &p0, const Vector &p1, const Vector &p2,
+							   const Vector2D &t0, const Vector2D &t1, const Vector2D& t2,
+							   Vector &sVect, Vector &tVect )
+{
+	// Texture-coordinate deltas along the two edges that leave vertex 0.
+	// All three per-axis solves below share them.
+	const float flDeltaS01 = t1.x - t0.x;
+	const float flDeltaT01 = t1.y - t0.y;
+	const float flDeltaS02 = t2.x - t0.x;
+	const float flDeltaT02 = t2.y - t0.y;
+
+	/* Compute the partial derivatives of X, Y, and Z with respect to S and T. */
+	// An axis whose solve is degenerate (|cross.x| <= SMALL_FLOAT) is left at 0.
+	sVect.Init( 0.0f, 0.0f, 0.0f );
+	tVect.Init( 0.0f, 0.0f, 0.0f );
+
+	Vector edge01, edge02, cross;
+
+	// x, s, t
+	edge01.Init( p1.x - p0.x, flDeltaS01, flDeltaT01 );
+	edge02.Init( p2.x - p0.x, flDeltaS02, flDeltaT02 );
+	CrossProduct( edge01, edge02, cross );
+	if ( fabs( cross.x ) > SMALL_FLOAT )
+	{
+		sVect.x += -cross.y / cross.x;
+		tVect.x += -cross.z / cross.x;
+	}
+
+	// y, s, t
+	edge01.Init( p1.y - p0.y, flDeltaS01, flDeltaT01 );
+	edge02.Init( p2.y - p0.y, flDeltaS02, flDeltaT02 );
+	CrossProduct( edge01, edge02, cross );
+	if ( fabs( cross.x ) > SMALL_FLOAT )
+	{
+		sVect.y += -cross.y / cross.x;
+		tVect.y += -cross.z / cross.x;
+	}
+
+	// z, s, t
+	edge01.Init( p1.z - p0.z, flDeltaS01, flDeltaT01 );
+	edge02.Init( p2.z - p0.z, flDeltaS02, flDeltaT02 );
+	CrossProduct( edge01, edge02, cross );
+	if ( fabs( cross.x ) > SMALL_FLOAT )
+	{
+		sVect.z += -cross.y / cross.x;
+		tVect.z += -cross.z / cross.x;
+	}
+
+	// Only the directions are returned.
+	VectorNormalize( sVect );
+	VectorNormalize( tVect );
+}
+
+
+//-----------------------------------------------------------------------------
+// Convert RGB to HSV
+//
+// rgb - input color: x = red, y = green, z = blue
+// hsv - output: x = hue in degrees (0 to 360), or -1 when the saturation is
+//       zero and the hue is therefore undefined; y = saturation;
+//       z = value (the largest input component)
+//
+// The input components are copied before hsv is written, so rgb and hsv may
+// refer to the same Vector (in-place conversion).
+//-----------------------------------------------------------------------------
+void RGBtoHSV( const Vector &rgb, Vector &hsv )
+{
+	// Snapshot the input first: writing hsv must not disturb the values the
+	// hue computation reads when both references name the same object.
+	const float flRed = rgb.x;
+	const float flGreen = rgb.y;
+	const float flBlue = rgb.z;
+
+	float flMax = max( flRed, flGreen );
+	flMax = max( flMax, flBlue );
+	float flMin = min( flRed, flGreen );
+	flMin = min( flMin, flBlue );
+
+	// hsv.z is the value
+	hsv.z = flMax;
+
+	// hsv.y is the saturation
+	if (flMax != 0.0F)
+	{
+		hsv.y = (flMax - flMin) / flMax;
+	}
+	else
+	{
+		hsv.y = 0.0F;
+	}
+
+	// hsv.x is the hue
+	if (hsv.y == 0.0F)
+	{
+		// Achromatic: no hue can be derived.
+		hsv.x = -1.0f;
+		return;
+	}
+
+	// Pick the 120 degree sector from the dominant channel, then offset
+	// within it by the difference of the other two channels.
+	const float32 d = flMax - flMin;
+	float32 flHue;
+	if (flRed == flMax)
+	{
+		flHue = (flGreen - flBlue) / d;
+	}
+	else if (flGreen == flMax)
+	{
+		flHue = 2.0F + (flBlue - flRed) / d;
+	}
+	else
+	{
+		flHue = 4.0F + (flRed - flGreen) / d;
+	}
+
+	flHue *= 60.0F;
+	if ( flHue < 0.0F )
+	{
+		flHue += 360.0F;
+	}
+	hsv.x = flHue;
+}
+
+
+//-----------------------------------------------------------------------------
+// Convert HSV to RGB
+//
+// hsv - input: x = hue in degrees, y = saturation, z = value
+// rgb - output: x = red, y = green, z = blue
+//
+// With zero saturation the hue is ignored and a grey of brightness hsv.z is
+// returned. Otherwise the hue is wrapped into [0, 360) first, so values such
+// as -30 or 720 select the equivalent color instead of leaving rgb unwritten.
+//-----------------------------------------------------------------------------
+void HSVtoRGB( const Vector &hsv, Vector &rgb )
+{
+	if ( hsv.y == 0.0F )
+	{
+		rgb.Init( hsv.z, hsv.z, hsv.z );
+		return;
+	}
+
+	// Wrap the hue into [0, 360). Hues already in that range are unchanged
+	// by fmodf, and 360 maps to 0.
+	float32 hue = fmodf( hsv.x, 360.0F );
+	if ( hue < 0.0F )
+	{
+		hue += 360.0F;
+	}
+	if ( hue >= 360.0F )
+	{
+		// A tiny negative hue can round up to exactly 360 after the add.
+		hue = 0.0F;
+	}
+
+	hue /= 60.0F;
+	int     i = (int)hue;   // integer part: which 60 degree sector
+	float32 f = hue - i;    // fractional part: position inside the sector
+	float32 p = hsv.z * (1.0F - hsv.y);
+	float32 q = hsv.z * (1.0F - hsv.y * f);
+	float32 t = hsv.z * (1.0F - hsv.y * (1.0F - f));
+	switch(i)
+	{
+	case 0: rgb.Init( hsv.z, t, p ); break;
+	case 1: rgb.Init( q, hsv.z, p ); break;
+	case 2: rgb.Init( p, hsv.z, t ); break;
+	case 3: rgb.Init( p, q, hsv.z ); break;
+	case 4: rgb.Init( t, p, hsv.z ); break;
+	case 5: rgb.Init( hsv.z, p, q ); break;
+	default:
+		// Not expected once the hue has been wrapped; make sure the output
+		// is still written.
+		rgb.Init( hsv.z, hsv.z, hsv.z );
+		break;
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Finds the two knots that bracket flPositionToInterpolateAt and reports
+// their values together with the blend factor between them.
+//
+// pKnotPositions            - position of each knot; the search expects them
+//                             in ascending order (NOTE(review): not checked)
+// pKnotValues               - value stored at each knot
+// nNumValuesinList          - number of knots in both arrays
+// nInterpolationRange       - length of the whole range; only used to measure
+//                             the gap across the wrap point
+// flPositionToInterpolateAt - position to evaluate
+// bWrap                     - true: a position before the first or after the
+//                             last knot blends between the last and the first
+//                             knot; false: it takes the value of that end knot
+// pValueA, pValueB          - receive the values of the bracketing knots
+// pInterpolationValue       - receives FLerp( 0, 1, 0, gap, offset ) for the
+//                             bracketing pair, or 1.0 in the non-wrapping end
+//                             cases (where A and B are the same value)
+//-----------------------------------------------------------------------------
+void GetInterpolationData( float const *pKnotPositions, 
+						   float const *pKnotValues,
+						   int nNumValuesinList,
+						   int nInterpolationRange,
+						   float flPositionToInterpolateAt,
+						   bool bWrap,
+						   float *pValueA, 
+						   float *pValueB,
+						   float *pInterpolationValue)
+{
+	// First, find the bracketing knots by looking for the first knot >= our
+	// position. iKnot ends at nNumValuesinList when there is no such knot.
+	int iKnot = 0;
+	while ( ( iKnot < nNumValuesinList ) &&
+			!( pKnotPositions[iKnot] >= flPositionToInterpolateAt ) )
+	{
+		iKnot++;
+	}
+
+	const bool bBeforeFirstKnot = ( iKnot == 0 );
+	const bool bPastLastKnot = !bBeforeFirstKnot && ( iKnot == nNumValuesinList );
+
+	int nLowerKnot, nUpperKnot;
+	float flGapSize, flOffsetIntoGap;
+	if ( bBeforeFirstKnot || bPastLastKnot )
+	{
+		if ( !bWrap )
+		{
+			// No wrapping: both outputs take the value of the end knot.
+			const int nEndKnot = bBeforeFirstKnot ? 0 : ( nNumValuesinList-1 );
+			*pValueA = *pValueB = pKnotValues[nEndKnot];
+			*pInterpolationValue = 1.0;
+			return;
+		}
+
+		// Wrapping: blend from the last knot, across the end of the range,
+		// to the first knot.
+		nLowerKnot = nNumValuesinList-1;
+		nUpperKnot = 0;
+		flGapSize =
+			( pKnotPositions[nUpperKnot] + ( nInterpolationRange-pKnotPositions[nLowerKnot] ) );
+		if ( bBeforeFirstKnot )
+		{
+			// The position lies after the wrap point, before the first knot.
+			flOffsetIntoGap =
+				flPositionToInterpolateAt + ( nInterpolationRange-pKnotPositions[nLowerKnot] );
+		}
+		else
+		{
+			// The position lies past the last knot, before the wrap point.
+			flOffsetIntoGap = flPositionToInterpolateAt - pKnotPositions[nLowerKnot];
+		}
+	}
+	else
+	{
+		// Ordinary case: the position falls between two neighbouring knots.
+		nLowerKnot = iKnot-1;
+		nUpperKnot = iKnot;
+		flGapSize = pKnotPositions[nUpperKnot]-pKnotPositions[nLowerKnot];
+		flOffsetIntoGap = flPositionToInterpolateAt-pKnotPositions[nLowerKnot];
+	}
+
+	*pValueA = pKnotValues[nLowerKnot];
+	*pValueB = pKnotValues[nUpperKnot];
+	*pInterpolationValue = FLerp( 0, 1, 0, flGapSize, flOffsetIntoGap );
+}
+
+//-----------------------------------------------------------------------------
+// Writes a random point inside the unit sphere to *pVector and returns its
+// distance from the center. Uses three rand() draws per call.
+//-----------------------------------------------------------------------------
+float RandomVectorInUnitSphere( Vector *pVector )
+{
+	// Guarantee uniform random distribution within a sphere
+	// Graphics gems III contains this algorithm ("Nonuniform random point sets via warping")
+
+	// Three samples, drawn in this order.
+	// NOTE(review): presumably each lies in [0,1] - depends on VALVE_RAND_MAX.
+	const float flSampleU = ((float)rand() / VALVE_RAND_MAX);
+	const float flSampleV = ((float)rand() / VALVE_RAND_MAX);
+	const float flSampleW = ((float)rand() / VALVE_RAND_MAX);
+
+	// Warp the samples into spherical coordinates: polar angle from U,
+	// azimuth from V, and the cube root of W as radius.
+	const float flPolarAngle = acos( 1 - 2 * flSampleU );
+	const float flAzimuth = 2 * M_PI * flSampleV;
+	const float flRadius = powf( flSampleW, 1.0f / 3.0f );
+
+	float flSinPolar, flCosPolar;
+	float flSinAzimuth, flCosAzimuth;
+	SinCos( flPolarAngle, &flSinPolar, &flCosPolar );
+	SinCos( flAzimuth, &flSinAzimuth, &flCosAzimuth );
+
+	// Spherical -> cartesian.
+	pVector->Init( flRadius * flSinPolar * flCosAzimuth,
+				   flRadius * flSinPolar * flSinAzimuth,
+				   flRadius * flCosPolar );
+	return flRadius;
+}
+
+//-----------------------------------------------------------------------------
+// Writes a random point inside the unit circle to *pVector and returns its
+// distance from the center. Uses two rand() draws per call.
+//-----------------------------------------------------------------------------
+float RandomVectorInUnitCircle( Vector2D *pVector )
+{
+	// Guarantee uniform random distribution within a circle
+	// Graphics gems III contains this algorithm ("Nonuniform random point sets via warping")
+
+	// Two samples, drawn in this order: the first feeds the radius, the
+	// second the angle.
+	const float flRadiusSample = ((float)rand() / VALVE_RAND_MAX);
+	const float flAngleSample = ((float)rand() / VALVE_RAND_MAX);
+
+	const float flAngle = 2 * M_PI * flAngleSample;
+	const float flRadius = powf( flRadiusSample, 1.0f / 2.0f );
+
+	float flSinAngle, flCosAngle;
+	SinCos( flAngle, &flSinAngle, &flCosAngle );
+
+	// Polar -> cartesian.
+	pVector->x = flRadius * flCosAngle;
+	pVector->y = flRadius * flSinAngle;
+	return flRadius;
+}
+#ifdef FP_EXCEPTIONS_ENABLED
+#include <float.h> // For _clearfp and _controlfp_s
+#endif
+
+// FPExceptionDisabler and FPExceptionEnabler taken from my blog post
+// at http://www.altdevblogaday.com/2012/04/20/exceptional-floating-point/
+
+#ifdef FP_EXCEPTIONS_ENABLED
+// These functions are all inlined NOPs if FP_EXCEPTIONS_ENABLED is not defined.
+// Saves the current floating-point control word in mOldValues, then masks
+// every floating-point exception.
+FPExceptionDisabler::FPExceptionDisabler()
+{
+	// Retrieve the current state of the exception flags. This
+	// must be done before changing them. A mask of 0 makes
+	// _controlfp_s read the control word without modifying it;
+	// passing _MCW_EM as the mask would first mask every exception
+	// and then store that already-modified word, losing the state
+	// that has to be restored later.
+	_controlfp_s(&mOldValues, 0, 0);
+	// Set all of the exception flags, which suppresses FP
+	// exceptions on the x87 and SSE units. _MCW_EM is a bit
+	// mask representing all available exception masks.
+	_controlfp_s(0, _MCW_EM, _MCW_EM);
+}
+
+// Puts back the exception masks that the constructor stored in mOldValues.
+FPExceptionDisabler::~FPExceptionDisabler()
+{
+	// Exception masks recorded at construction time.
+	const unsigned int nSavedMasks = mOldValues;
+
+	// Discard any FP exceptions that are still pending. If they were
+	// left in place, re-enabling exceptions below could trigger a
+	// 'deferred crash' on an operation that has already happened.
+	_clearfp();
+
+	// Reset (possibly enabling) the exception status.
+	_controlfp_s(0, nSavedMasks, _MCW_EM);
+}
+
+// Saves the current floating-point control word in mOldValues, then enables
+// the exceptions selected by enableBits and leaves all others as they were.
+// Overflow, divide-by-zero, and invalid-operation are the FP
+// exceptions most frequently associated with bugs.
+FPExceptionEnabler::FPExceptionEnabler(unsigned int enableBits /*= _EM_OVERFLOW | _EM_ZERODIVIDE | _EM_INVALID*/)
+{
+	// Retrieve the current state of the exception flags. This
+	// must be done before changing them. A mask of 0 makes
+	// _controlfp_s read the control word without modifying it;
+	// passing _MCW_EM as the mask would mask every exception as a
+	// side effect and store that modified word instead of the
+	// state that has to be restored later.
+	_controlfp_s(&mOldValues, 0, 0);
+
+	// Make sure no non-exception flags have been specified,
+	// to avoid accidental changing of rounding modes, etc.
+	// _MCW_EM is a bit mask representing all available exception masks.
+	enableBits &= _MCW_EM;
+
+	// Clear any pending FP exceptions. This must be done
+	// prior to enabling FP exceptions since otherwise there
+	// may be a 'deferred crash' as soon the exceptions are
+	// enabled.
+	_clearfp();
+
+	// Zero out the specified bits, leaving other bits alone.
+	_controlfp_s(0, ~enableBits, enableBits);
+}
+
+// Puts back the exception masks that the constructor stored in mOldValues.
+FPExceptionEnabler::~FPExceptionEnabler()
+{
+	// Exception masks recorded at construction time.
+	const unsigned int nSavedMasks = mOldValues;
+
+	// Reset the exception state.
+	_controlfp_s(0, nSavedMasks, _MCW_EM);
+}
+#endif
diff --git a/mp/src/mathlib/noisedata.h b/mp/src/mathlib/noisedata.h
index 3bef9cd6..6d5cd3d0 100644
--- a/mp/src/mathlib/noisedata.h
+++ b/mp/src/mathlib/noisedata.h
@@ -1,180 +1,180 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: static data for noise() primitives.
-//
-// $Workfile:     $
-// $NoKeywords: $
-//=============================================================================//
-//
-//    **** DO NOT EDIT THIS FILE. GENERATED BY DATAGEN.PL ****
-//
-
-static int perm_a[]={
-    66,147,106,213,89,115,239,25,171,175,9,114,141,226,118,128,41,208,4,56,
-   180,248,43,82,246,219,94,245,133,131,222,103,160,130,168,145,238,38,23,6,
-   236,67,99,2,70,232,80,209,1,3,68,65,102,210,13,73,55,252,187,170,22,36,
-   52,181,117,163,46,79,166,224,148,75,113,95,156,185,220,164,51,142,161,35,
-   206,251,45,136,197,190,132,32,218,127,63,27,137,93,242,20,189,108,183,
-   122,139,191,249,253,87,98,69,0,144,64,24,214,97,116,158,42,107,15,53,212,
-   83,111,152,240,74,237,62,77,205,149,26,151,178,204,91,176,234,49,154,203,
-   33,221,125,134,165,124,86,39,37,60,150,157,179,109,110,44,159,153,5,100,
-   10,207,40,186,96,215,143,162,230,184,101,54,174,247,76,59,241,223,192,84,
-   104,78,169,146,138,30,48,85,233,19,29,92,126,17,199,250,31,81,188,225,28,
-   112,88,11,182,173,211,129,194,172,14,120,200,167,135,12,177,227,229,155,
-   201,61,105,195,193,244,235,58,8,196,123,254,16,18,50,121,71,243,90,57,
-   202,119,255,47,7,198,228,21,217,216,231,140,72,34
-};
-
-static int perm_b[]={
-    123,108,201,64,40,75,24,221,137,110,191,142,9,69,230,83,7,247,51,54,115,
-    133,180,248,109,116,62,99,251,55,89,253,65,106,228,167,131,132,58,143,
-    97,102,163,202,149,234,12,117,174,94,121,74,32,113,20,60,159,182,204,29,
-    244,118,3,178,255,38,6,114,36,93,30,134,213,90,245,209,88,232,162,125,
-    84,166,70,136,208,231,27,71,157,80,76,0,170,225,203,176,33,161,196,128,
-    252,236,246,2,138,1,250,197,77,243,218,242,19,164,68,212,14,237,144,63,
-    46,103,177,188,85,223,8,160,222,4,216,219,35,15,44,23,126,127,100,226,
-    235,37,168,101,49,22,11,73,61,135,111,183,72,96,185,239,82,18,50,155,
-    186,153,17,233,146,156,107,5,254,10,192,198,148,207,104,13,124,48,95,
-    129,120,206,199,81,249,91,150,210,119,240,122,194,92,34,28,205,175,227,
-    179,220,140,152,79,26,195,47,66,173,169,241,53,184,187,145,112,238,214,
-    147,98,171,229,200,151,25,67,78,189,217,130,224,57,172,59,41,43,16,105,
-    158,165,21,45,56,141,139,215,190,86,42,52,39,87,181,31,154,193,211
-};
-
-static int perm_c[]={
-    97,65,96,25,122,26,219,85,148,251,102,0,140,130,136,213,138,60,236,52,
-    178,131,115,183,144,78,147,168,39,45,169,70,57,146,67,142,252,216,28,54,
-    86,222,194,200,48,5,205,125,214,56,181,255,196,155,37,218,153,208,66,
-    242,73,248,206,61,62,246,177,2,197,107,162,152,89,41,6,160,94,8,201,38,
-    235,228,165,93,111,239,74,231,121,47,166,221,157,64,77,244,29,105,150,
-    123,190,191,225,118,133,42,10,84,185,159,124,132,240,180,44,1,9,19,99,
-    254,12,207,186,71,234,184,11,20,16,193,139,175,98,59,113,27,170,230,91,
-    187,46,156,249,108,195,171,114,14,188,82,192,233,24,32,241,87,164,90,43,
-    163,245,92,40,215,55,226,15,3,112,158,250,172,22,227,137,35,128,145,247,
-    161,119,80,217,189,81,7,63,202,120,223,83,179,4,106,199,229,95,53,50,33,
-    182,72,143,23,243,75,18,173,141,167,198,204,58,174,237,17,129,238,127,
-    31,101,176,36,30,110,209,34,203,135,232,68,149,49,134,126,212,79,76,117,
-    104,210,211,224,253,100,220,109,116,88,13,151,154,69,21,51,103
-};
-
-static int perm_d[]={
-    94,234,145,235,151,166,187,238,4,5,128,115,87,107,229,175,190,108,218,
-    32,17,220,97,90,122,121,71,109,64,227,225,75,81,19,27,162,3,89,139,69,
-    92,26,48,215,116,191,114,2,104,157,66,39,1,127,96,124,30,0,82,233,219,
-    42,131,173,35,201,182,144,14,98,148,244,160,159,179,91,31,68,119,154,
-    205,113,149,167,44,60,18,228,251,245,43,10,80,15,129,67,181,174,6,45,
-    194,237,213,52,99,232,211,212,164,217,57,153,156,102,134,20,249,132,55,
-    204,65,33,231,85,61,37,163,193,189,170,226,63,168,236,165,224,242,195,
-    41,200,40,70,112,100,36,172,130,74,137,252,243,135,230,161,207,16,146,
-    198,118,150,24,29,250,188,25,209,103,23,105,47,7,46,133,83,184,50,79,
-    110,120,53,253,206,214,9,240,101,147,152,183,254,59,126,216,197,171,51,
-    208,248,202,58,176,28,72,177,185,141,12,11,56,222,86,178,155,223,88,111,
-    73,142,210,138,239,221,199,192,84,93,241,125,76,77,255,95,8,78,247,186,
-    123,196,13,140,180,143,54,106,136,34,62,169,38,117,22,21,49,203,158,246
-};
-
-static float impulse_xcoords[]={
-    0.788235,0.541176,0.972549,0.082353,0.352941,0.811765,0.286275,0.752941,
-    0.203922,0.705882,0.537255,0.886275,0.580392,0.137255,0.800000,0.533333,
-    0.117647,0.447059,0.129412,0.925490,0.086275,0.478431,0.666667,0.568627,
-    0.678431,0.313725,0.321569,0.349020,0.988235,0.419608,0.898039,0.219608,
-    0.243137,0.623529,0.501961,0.772549,0.952941,0.517647,0.949020,0.701961,
-    0.454902,0.505882,0.564706,0.960784,0.207843,0.007843,0.831373,0.184314,
-    0.576471,0.462745,0.572549,0.247059,0.262745,0.694118,0.615686,0.121569,
-    0.384314,0.749020,0.145098,0.717647,0.415686,0.607843,0.105882,0.101961,
-    0.200000,0.807843,0.521569,0.780392,0.466667,0.552941,0.996078,0.627451,
-    0.992157,0.529412,0.407843,0.011765,0.709804,0.458824,0.058824,0.819608,
-    0.176471,0.317647,0.392157,0.223529,0.156863,0.490196,0.325490,0.074510,
-    0.239216,0.164706,0.890196,0.603922,0.921569,0.839216,0.854902,0.098039,
-    0.686275,0.843137,0.152941,0.372549,0.062745,0.474510,0.486275,0.227451,
-    0.400000,0.298039,0.309804,0.274510,0.054902,0.815686,0.647059,0.635294,
-    0.662745,0.976471,0.094118,0.509804,0.650980,0.211765,0.180392,0.003922,
-    0.827451,0.278431,0.023529,0.525490,0.450980,0.725490,0.690196,0.941176,
-    0.639216,0.560784,0.196078,0.364706,0.043137,0.494118,0.796078,0.113725,
-    0.760784,0.729412,0.258824,0.290196,0.584314,0.674510,0.823529,0.905882,
-    0.917647,0.070588,0.862745,0.345098,0.913725,0.937255,0.031373,0.215686,
-    0.768627,0.333333,0.411765,0.423529,0.945098,0.721569,0.039216,0.792157,
-    0.956863,0.266667,0.254902,0.047059,0.294118,0.658824,0.250980,1.000000,
-    0.984314,0.756863,0.027451,0.305882,0.835294,0.513725,0.360784,0.776471,
-    0.611765,0.192157,0.866667,0.858824,0.592157,0.803922,0.141176,0.435294,
-    0.588235,0.619608,0.341176,0.109804,0.356863,0.270588,0.737255,0.847059,
-    0.050980,0.764706,0.019608,0.870588,0.933333,0.784314,0.549020,0.337255,
-    0.631373,0.929412,0.231373,0.427451,0.078431,0.498039,0.968627,0.654902,
-    0.125490,0.698039,0.015686,0.878431,0.713725,0.368627,0.431373,0.874510,
-    0.403922,0.556863,0.443137,0.964706,0.909804,0.301961,0.035294,0.850980,
-    0.882353,0.741176,0.380392,0.133333,0.470588,0.643137,0.282353,0.396078,
-    0.980392,0.168627,0.149020,0.235294,0.670588,0.596078,0.733333,0.160784,
-    0.376471,0.682353,0.545098,0.482353,0.745098,0.894118,0.188235,0.329412,
-    0.439216,0.901961,0.000000,0.600000,0.388235,0.172549,0.090196,0.066667
-};
-
-static float impulse_ycoords[]={
-    0.827451,0.337255,0.941176,0.886275,0.878431,0.239216,0.400000,0.164706,
-    0.490196,0.411765,0.964706,0.349020,0.803922,0.317647,0.647059,0.431373,
-    0.933333,0.156863,0.094118,0.219608,0.039216,0.521569,0.498039,0.705882,
-    0.717647,0.047059,0.631373,0.517647,0.984314,0.847059,0.482353,0.439216,
-    0.250980,0.862745,0.690196,0.913725,0.270588,0.070588,0.027451,0.694118,
-    0.811765,0.000000,0.494118,0.823529,0.800000,0.600000,0.003922,0.443137,
-    0.639216,0.376471,0.031373,0.035294,0.552941,0.215686,0.305882,0.133333,
-    0.564706,0.176471,0.211765,0.874510,0.360784,0.654902,0.223529,0.807843,
-    0.372549,0.137255,0.321569,0.015686,0.007843,0.262745,0.125490,0.078431,
-    0.396078,0.976471,0.929412,1.000000,0.937255,0.509804,0.188235,0.850980,
-    0.831373,0.392157,0.741176,0.541176,0.592157,0.286275,0.345098,0.572549,
-    0.537255,0.725490,0.839216,0.184314,0.772549,0.149020,0.505882,0.423529,
-    0.780392,0.011765,0.890196,0.086275,0.427451,0.023529,0.788235,0.050980,
-    0.760784,0.603922,0.066667,0.643137,0.623529,0.960784,0.172549,0.333333,
-    0.082353,0.290196,0.992157,0.709804,0.894118,0.596078,0.243137,0.752941,
-    0.486275,0.670588,0.949020,0.784314,0.145098,0.560784,0.513725,0.180392,
-    0.580392,0.996078,0.380392,0.556863,0.407843,0.945098,0.117647,0.058824,
-    0.678431,0.129412,0.192157,0.105882,0.968627,0.545098,0.462745,0.227451,
-    0.019608,0.866667,0.674510,0.207843,0.627451,0.819608,0.921569,0.356863,
-    0.447059,0.533333,0.435294,0.341176,0.054902,0.529412,0.235294,0.764706,
-    0.615686,0.043137,0.745098,0.266667,0.501961,0.619608,0.776471,0.450980,
-    0.309804,0.325490,0.200000,0.635294,0.247059,0.698039,0.721569,0.168627,
-    0.854902,0.141176,0.611765,0.525490,0.415686,0.298039,0.254902,0.858824,
-    0.568627,0.329412,0.062745,0.843137,0.588235,0.733333,0.607843,0.478431,
-    0.576471,0.662745,0.470588,0.666667,0.980392,0.113725,0.898039,0.203922,
-    0.294118,0.152941,0.098039,0.909804,0.796078,0.768627,0.713725,0.196078,
-    0.368627,0.419608,0.352941,0.090196,0.749020,0.121569,0.882353,0.278431,
-    0.388235,0.917647,0.701961,0.729412,0.835294,0.258824,0.301961,0.101961,
-    0.792157,0.474510,0.686275,0.658824,0.364706,0.682353,0.458824,0.815686,
-    0.282353,0.160784,0.870588,0.988235,0.756863,0.549020,0.274510,0.384314,
-    0.650980,0.737255,0.901961,0.956863,0.972549,0.584314,0.925490,0.403922,
-    0.074510,0.454902,0.952941,0.109804,0.313725,0.905882,0.231373,0.466667
-};
-
-static float impulse_zcoords[]={
-    0.082353,0.643137,0.415686,0.929412,0.568627,0.509804,0.537255,0.815686,
-    0.698039,0.941176,0.776471,0.752941,0.737255,0.525490,0.498039,0.423529,
-    0.792157,0.125490,0.619608,0.164706,0.368627,0.870588,0.137255,0.372549,
-    0.466667,0.486275,0.501961,0.513725,0.709804,0.576471,0.203922,0.258824,
-    0.152941,0.556863,0.223529,0.047059,0.235294,0.474510,0.764706,0.552941,
-    0.847059,0.145098,0.176471,0.937255,0.654902,0.894118,0.729412,0.054902,
-    0.666667,0.749020,0.262745,0.560784,0.431373,0.286275,0.352941,0.239216,
-    0.156863,0.839216,0.427451,0.949020,0.384314,0.227451,0.180392,0.074510,
-    0.172549,0.356863,0.066667,0.517647,0.447059,0.184314,0.062745,0.670588,
-    0.603922,0.219608,0.270588,0.976471,0.505882,0.627451,0.819608,0.854902,
-    0.843137,0.019608,0.713725,0.035294,0.925490,0.349020,0.866667,0.701961,
-    0.909804,0.811765,0.717647,0.141176,0.917647,0.023529,0.098039,0.803922,
-    0.733333,0.658824,0.827451,0.133333,0.858824,0.800000,0.635294,1.000000,
-    0.078431,0.450980,0.835294,0.321569,0.360784,0.529412,0.725490,0.572549,
-    0.639216,0.341176,0.533333,0.094118,0.149020,0.545098,0.101961,0.901961,
-    0.278431,0.694118,0.521569,0.490196,0.454902,0.329412,0.274510,0.027451,
-    0.745098,0.933333,0.443137,0.168627,0.192157,0.988235,0.070588,0.972549,
-    0.768627,0.400000,0.470588,0.207843,0.215686,0.388235,0.439216,0.780392,
-    0.482353,0.121569,0.964706,0.086275,0.890196,0.337255,0.109804,0.305882,
-    0.113725,0.435294,0.721569,0.772549,0.807843,0.741176,0.254902,0.596078,
-    0.494118,0.317647,0.419608,0.000000,0.188235,0.031373,0.376471,0.380392,
-    0.611765,0.945098,0.411765,0.313725,0.874510,0.588235,0.678431,0.160784,
-    0.007843,0.090196,0.850980,0.788235,0.705882,0.266667,0.309804,0.541176,
-    0.231373,0.129412,0.294118,0.243137,0.913725,0.996078,0.117647,0.478431,
-    0.290196,0.549020,0.682353,0.784314,0.396078,0.831373,0.984314,0.584314,
-    0.039216,0.250980,0.600000,0.392157,0.298039,0.050980,0.364706,0.105882,
-    0.623529,0.886275,0.980392,0.325490,0.247059,0.690196,0.674510,0.960784,
-    0.647059,0.211765,0.882353,0.686275,0.823529,0.058824,0.956863,0.043137,
-    0.345098,0.301961,0.592157,0.862745,0.607843,0.458824,0.282353,0.003922,
-    0.580392,0.760784,0.564706,0.011765,0.968627,0.905882,0.756863,0.952941,
-    0.662745,0.015686,0.898039,0.196078,0.333333,0.992157,0.650980,0.407843,
-    0.796078,0.615686,0.878431,0.921569,0.631373,0.200000,0.403922,0.462745
-};
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: static data for noise() primitives.
+//
+// $Workfile:     $
+// $NoKeywords: $
+//=============================================================================//
+//
+//    **** DO NOT EDIT THIS FILE. GENERATED BY DATAGEN.PL ****
+//
+
+static int perm_a[]={
+    66,147,106,213,89,115,239,25,171,175,9,114,141,226,118,128,41,208,4,56,
+   180,248,43,82,246,219,94,245,133,131,222,103,160,130,168,145,238,38,23,6,
+   236,67,99,2,70,232,80,209,1,3,68,65,102,210,13,73,55,252,187,170,22,36,
+   52,181,117,163,46,79,166,224,148,75,113,95,156,185,220,164,51,142,161,35,
+   206,251,45,136,197,190,132,32,218,127,63,27,137,93,242,20,189,108,183,
+   122,139,191,249,253,87,98,69,0,144,64,24,214,97,116,158,42,107,15,53,212,
+   83,111,152,240,74,237,62,77,205,149,26,151,178,204,91,176,234,49,154,203,
+   33,221,125,134,165,124,86,39,37,60,150,157,179,109,110,44,159,153,5,100,
+   10,207,40,186,96,215,143,162,230,184,101,54,174,247,76,59,241,223,192,84,
+   104,78,169,146,138,30,48,85,233,19,29,92,126,17,199,250,31,81,188,225,28,
+   112,88,11,182,173,211,129,194,172,14,120,200,167,135,12,177,227,229,155,
+   201,61,105,195,193,244,235,58,8,196,123,254,16,18,50,121,71,243,90,57,
+   202,119,255,47,7,198,228,21,217,216,231,140,72,34
+};
+
+static int perm_b[]={
+    123,108,201,64,40,75,24,221,137,110,191,142,9,69,230,83,7,247,51,54,115,
+    133,180,248,109,116,62,99,251,55,89,253,65,106,228,167,131,132,58,143,
+    97,102,163,202,149,234,12,117,174,94,121,74,32,113,20,60,159,182,204,29,
+    244,118,3,178,255,38,6,114,36,93,30,134,213,90,245,209,88,232,162,125,
+    84,166,70,136,208,231,27,71,157,80,76,0,170,225,203,176,33,161,196,128,
+    252,236,246,2,138,1,250,197,77,243,218,242,19,164,68,212,14,237,144,63,
+    46,103,177,188,85,223,8,160,222,4,216,219,35,15,44,23,126,127,100,226,
+    235,37,168,101,49,22,11,73,61,135,111,183,72,96,185,239,82,18,50,155,
+    186,153,17,233,146,156,107,5,254,10,192,198,148,207,104,13,124,48,95,
+    129,120,206,199,81,249,91,150,210,119,240,122,194,92,34,28,205,175,227,
+    179,220,140,152,79,26,195,47,66,173,169,241,53,184,187,145,112,238,214,
+    147,98,171,229,200,151,25,67,78,189,217,130,224,57,172,59,41,43,16,105,
+    158,165,21,45,56,141,139,215,190,86,42,52,39,87,181,31,154,193,211
+};
+
+static int perm_c[]={
+    97,65,96,25,122,26,219,85,148,251,102,0,140,130,136,213,138,60,236,52,
+    178,131,115,183,144,78,147,168,39,45,169,70,57,146,67,142,252,216,28,54,
+    86,222,194,200,48,5,205,125,214,56,181,255,196,155,37,218,153,208,66,
+    242,73,248,206,61,62,246,177,2,197,107,162,152,89,41,6,160,94,8,201,38,
+    235,228,165,93,111,239,74,231,121,47,166,221,157,64,77,244,29,105,150,
+    123,190,191,225,118,133,42,10,84,185,159,124,132,240,180,44,1,9,19,99,
+    254,12,207,186,71,234,184,11,20,16,193,139,175,98,59,113,27,170,230,91,
+    187,46,156,249,108,195,171,114,14,188,82,192,233,24,32,241,87,164,90,43,
+    163,245,92,40,215,55,226,15,3,112,158,250,172,22,227,137,35,128,145,247,
+    161,119,80,217,189,81,7,63,202,120,223,83,179,4,106,199,229,95,53,50,33,
+    182,72,143,23,243,75,18,173,141,167,198,204,58,174,237,17,129,238,127,
+    31,101,176,36,30,110,209,34,203,135,232,68,149,49,134,126,212,79,76,117,
+    104,210,211,224,253,100,220,109,116,88,13,151,154,69,21,51,103
+};
+
+static int perm_d[]={
+    94,234,145,235,151,166,187,238,4,5,128,115,87,107,229,175,190,108,218,
+    32,17,220,97,90,122,121,71,109,64,227,225,75,81,19,27,162,3,89,139,69,
+    92,26,48,215,116,191,114,2,104,157,66,39,1,127,96,124,30,0,82,233,219,
+    42,131,173,35,201,182,144,14,98,148,244,160,159,179,91,31,68,119,154,
+    205,113,149,167,44,60,18,228,251,245,43,10,80,15,129,67,181,174,6,45,
+    194,237,213,52,99,232,211,212,164,217,57,153,156,102,134,20,249,132,55,
+    204,65,33,231,85,61,37,163,193,189,170,226,63,168,236,165,224,242,195,
+    41,200,40,70,112,100,36,172,130,74,137,252,243,135,230,161,207,16,146,
+    198,118,150,24,29,250,188,25,209,103,23,105,47,7,46,133,83,184,50,79,
+    110,120,53,253,206,214,9,240,101,147,152,183,254,59,126,216,197,171,51,
+    208,248,202,58,176,28,72,177,185,141,12,11,56,222,86,178,155,223,88,111,
+    73,142,210,138,239,221,199,192,84,93,241,125,76,77,255,95,8,78,247,186,
+    123,196,13,140,180,143,54,106,136,34,62,169,38,117,22,21,49,203,158,246
+};
+
+static float impulse_xcoords[]={
+    0.788235,0.541176,0.972549,0.082353,0.352941,0.811765,0.286275,0.752941,
+    0.203922,0.705882,0.537255,0.886275,0.580392,0.137255,0.800000,0.533333,
+    0.117647,0.447059,0.129412,0.925490,0.086275,0.478431,0.666667,0.568627,
+    0.678431,0.313725,0.321569,0.349020,0.988235,0.419608,0.898039,0.219608,
+    0.243137,0.623529,0.501961,0.772549,0.952941,0.517647,0.949020,0.701961,
+    0.454902,0.505882,0.564706,0.960784,0.207843,0.007843,0.831373,0.184314,
+    0.576471,0.462745,0.572549,0.247059,0.262745,0.694118,0.615686,0.121569,
+    0.384314,0.749020,0.145098,0.717647,0.415686,0.607843,0.105882,0.101961,
+    0.200000,0.807843,0.521569,0.780392,0.466667,0.552941,0.996078,0.627451,
+    0.992157,0.529412,0.407843,0.011765,0.709804,0.458824,0.058824,0.819608,
+    0.176471,0.317647,0.392157,0.223529,0.156863,0.490196,0.325490,0.074510,
+    0.239216,0.164706,0.890196,0.603922,0.921569,0.839216,0.854902,0.098039,
+    0.686275,0.843137,0.152941,0.372549,0.062745,0.474510,0.486275,0.227451,
+    0.400000,0.298039,0.309804,0.274510,0.054902,0.815686,0.647059,0.635294,
+    0.662745,0.976471,0.094118,0.509804,0.650980,0.211765,0.180392,0.003922,
+    0.827451,0.278431,0.023529,0.525490,0.450980,0.725490,0.690196,0.941176,
+    0.639216,0.560784,0.196078,0.364706,0.043137,0.494118,0.796078,0.113725,
+    0.760784,0.729412,0.258824,0.290196,0.584314,0.674510,0.823529,0.905882,
+    0.917647,0.070588,0.862745,0.345098,0.913725,0.937255,0.031373,0.215686,
+    0.768627,0.333333,0.411765,0.423529,0.945098,0.721569,0.039216,0.792157,
+    0.956863,0.266667,0.254902,0.047059,0.294118,0.658824,0.250980,1.000000,
+    0.984314,0.756863,0.027451,0.305882,0.835294,0.513725,0.360784,0.776471,
+    0.611765,0.192157,0.866667,0.858824,0.592157,0.803922,0.141176,0.435294,
+    0.588235,0.619608,0.341176,0.109804,0.356863,0.270588,0.737255,0.847059,
+    0.050980,0.764706,0.019608,0.870588,0.933333,0.784314,0.549020,0.337255,
+    0.631373,0.929412,0.231373,0.427451,0.078431,0.498039,0.968627,0.654902,
+    0.125490,0.698039,0.015686,0.878431,0.713725,0.368627,0.431373,0.874510,
+    0.403922,0.556863,0.443137,0.964706,0.909804,0.301961,0.035294,0.850980,
+    0.882353,0.741176,0.380392,0.133333,0.470588,0.643137,0.282353,0.396078,
+    0.980392,0.168627,0.149020,0.235294,0.670588,0.596078,0.733333,0.160784,
+    0.376471,0.682353,0.545098,0.482353,0.745098,0.894118,0.188235,0.329412,
+    0.439216,0.901961,0.000000,0.600000,0.388235,0.172549,0.090196,0.066667
+};
+
+static float impulse_ycoords[]={
+    0.827451,0.337255,0.941176,0.886275,0.878431,0.239216,0.400000,0.164706,
+    0.490196,0.411765,0.964706,0.349020,0.803922,0.317647,0.647059,0.431373,
+    0.933333,0.156863,0.094118,0.219608,0.039216,0.521569,0.498039,0.705882,
+    0.717647,0.047059,0.631373,0.517647,0.984314,0.847059,0.482353,0.439216,
+    0.250980,0.862745,0.690196,0.913725,0.270588,0.070588,0.027451,0.694118,
+    0.811765,0.000000,0.494118,0.823529,0.800000,0.600000,0.003922,0.443137,
+    0.639216,0.376471,0.031373,0.035294,0.552941,0.215686,0.305882,0.133333,
+    0.564706,0.176471,0.211765,0.874510,0.360784,0.654902,0.223529,0.807843,
+    0.372549,0.137255,0.321569,0.015686,0.007843,0.262745,0.125490,0.078431,
+    0.396078,0.976471,0.929412,1.000000,0.937255,0.509804,0.188235,0.850980,
+    0.831373,0.392157,0.741176,0.541176,0.592157,0.286275,0.345098,0.572549,
+    0.537255,0.725490,0.839216,0.184314,0.772549,0.149020,0.505882,0.423529,
+    0.780392,0.011765,0.890196,0.086275,0.427451,0.023529,0.788235,0.050980,
+    0.760784,0.603922,0.066667,0.643137,0.623529,0.960784,0.172549,0.333333,
+    0.082353,0.290196,0.992157,0.709804,0.894118,0.596078,0.243137,0.752941,
+    0.486275,0.670588,0.949020,0.784314,0.145098,0.560784,0.513725,0.180392,
+    0.580392,0.996078,0.380392,0.556863,0.407843,0.945098,0.117647,0.058824,
+    0.678431,0.129412,0.192157,0.105882,0.968627,0.545098,0.462745,0.227451,
+    0.019608,0.866667,0.674510,0.207843,0.627451,0.819608,0.921569,0.356863,
+    0.447059,0.533333,0.435294,0.341176,0.054902,0.529412,0.235294,0.764706,
+    0.615686,0.043137,0.745098,0.266667,0.501961,0.619608,0.776471,0.450980,
+    0.309804,0.325490,0.200000,0.635294,0.247059,0.698039,0.721569,0.168627,
+    0.854902,0.141176,0.611765,0.525490,0.415686,0.298039,0.254902,0.858824,
+    0.568627,0.329412,0.062745,0.843137,0.588235,0.733333,0.607843,0.478431,
+    0.576471,0.662745,0.470588,0.666667,0.980392,0.113725,0.898039,0.203922,
+    0.294118,0.152941,0.098039,0.909804,0.796078,0.768627,0.713725,0.196078,
+    0.368627,0.419608,0.352941,0.090196,0.749020,0.121569,0.882353,0.278431,
+    0.388235,0.917647,0.701961,0.729412,0.835294,0.258824,0.301961,0.101961,
+    0.792157,0.474510,0.686275,0.658824,0.364706,0.682353,0.458824,0.815686,
+    0.282353,0.160784,0.870588,0.988235,0.756863,0.549020,0.274510,0.384314,
+    0.650980,0.737255,0.901961,0.956863,0.972549,0.584314,0.925490,0.403922,
+    0.074510,0.454902,0.952941,0.109804,0.313725,0.905882,0.231373,0.466667
+};
+
+static float impulse_zcoords[]={
+    0.082353,0.643137,0.415686,0.929412,0.568627,0.509804,0.537255,0.815686,
+    0.698039,0.941176,0.776471,0.752941,0.737255,0.525490,0.498039,0.423529,
+    0.792157,0.125490,0.619608,0.164706,0.368627,0.870588,0.137255,0.372549,
+    0.466667,0.486275,0.501961,0.513725,0.709804,0.576471,0.203922,0.258824,
+    0.152941,0.556863,0.223529,0.047059,0.235294,0.474510,0.764706,0.552941,
+    0.847059,0.145098,0.176471,0.937255,0.654902,0.894118,0.729412,0.054902,
+    0.666667,0.749020,0.262745,0.560784,0.431373,0.286275,0.352941,0.239216,
+    0.156863,0.839216,0.427451,0.949020,0.384314,0.227451,0.180392,0.074510,
+    0.172549,0.356863,0.066667,0.517647,0.447059,0.184314,0.062745,0.670588,
+    0.603922,0.219608,0.270588,0.976471,0.505882,0.627451,0.819608,0.854902,
+    0.843137,0.019608,0.713725,0.035294,0.925490,0.349020,0.866667,0.701961,
+    0.909804,0.811765,0.717647,0.141176,0.917647,0.023529,0.098039,0.803922,
+    0.733333,0.658824,0.827451,0.133333,0.858824,0.800000,0.635294,1.000000,
+    0.078431,0.450980,0.835294,0.321569,0.360784,0.529412,0.725490,0.572549,
+    0.639216,0.341176,0.533333,0.094118,0.149020,0.545098,0.101961,0.901961,
+    0.278431,0.694118,0.521569,0.490196,0.454902,0.329412,0.274510,0.027451,
+    0.745098,0.933333,0.443137,0.168627,0.192157,0.988235,0.070588,0.972549,
+    0.768627,0.400000,0.470588,0.207843,0.215686,0.388235,0.439216,0.780392,
+    0.482353,0.121569,0.964706,0.086275,0.890196,0.337255,0.109804,0.305882,
+    0.113725,0.435294,0.721569,0.772549,0.807843,0.741176,0.254902,0.596078,
+    0.494118,0.317647,0.419608,0.000000,0.188235,0.031373,0.376471,0.380392,
+    0.611765,0.945098,0.411765,0.313725,0.874510,0.588235,0.678431,0.160784,
+    0.007843,0.090196,0.850980,0.788235,0.705882,0.266667,0.309804,0.541176,
+    0.231373,0.129412,0.294118,0.243137,0.913725,0.996078,0.117647,0.478431,
+    0.290196,0.549020,0.682353,0.784314,0.396078,0.831373,0.984314,0.584314,
+    0.039216,0.250980,0.600000,0.392157,0.298039,0.050980,0.364706,0.105882,
+    0.623529,0.886275,0.980392,0.325490,0.247059,0.690196,0.674510,0.960784,
+    0.647059,0.211765,0.882353,0.686275,0.823529,0.058824,0.956863,0.043137,
+    0.345098,0.301961,0.592157,0.862745,0.607843,0.458824,0.282353,0.003922,
+    0.580392,0.760784,0.564706,0.011765,0.968627,0.905882,0.756863,0.952941,
+    0.662745,0.015686,0.898039,0.196078,0.333333,0.992157,0.650980,0.407843,
+    0.796078,0.615686,0.878431,0.921569,0.631373,0.200000,0.403922,0.462745
+};
+
diff --git a/mp/src/mathlib/polyhedron.cpp b/mp/src/mathlib/polyhedron.cpp
index 067ffc57..5a858f19 100644
--- a/mp/src/mathlib/polyhedron.cpp
+++ b/mp/src/mathlib/polyhedron.cpp
@@ -1,2293 +1,2293 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-// $NoKeywords: $
-//
-//=============================================================================//
-
-#include "mathlib/polyhedron.h"
-#include "mathlib/vmatrix.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include "tier1/utlvector.h"
-
-
-
-struct GeneratePolyhedronFromPlanes_Point;
-struct GeneratePolyhedronFromPlanes_PointLL;
-struct GeneratePolyhedronFromPlanes_Line;
-struct GeneratePolyhedronFromPlanes_LineLL;
-struct GeneratePolyhedronFromPlanes_Polygon;
-struct GeneratePolyhedronFromPlanes_PolygonLL;
-
-struct GeneratePolyhedronFromPlanes_UnorderedPointLL;
-struct GeneratePolyhedronFromPlanes_UnorderedLineLL;
-struct GeneratePolyhedronFromPlanes_UnorderedPolygonLL;
-
-Vector FindPointInPlanes( const float *pPlanes, int planeCount );
-bool FindConvexShapeLooseAABB( const float *pInwardFacingPlanes, int iPlaneCount, Vector *pAABBMins, Vector *pAABBMaxs );
-CPolyhedron *ClipLinkedGeometry( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory );
-CPolyhedron *ConvertLinkedGeometryToPolyhedron( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints, bool bUseTemporaryMemory );
-
-//#define ENABLE_DEBUG_POLYHEDRON_DUMPS //Dumps debug information to disk for use with glview. Requires that tier2 also be in all projects using debug mathlib
-//#define DEBUG_DUMP_POLYHEDRONS_TO_NUMBERED_GLVIEWS //dumps successfully generated polyhedrons
-
-#ifdef _DEBUG
-void DumpPolyhedronToGLView( const CPolyhedron *pPolyhedron, const char *pFilename, const VMatrix *pTransform );
-void DumpPlaneToGlView( const float *pPlane, float fGrayScale, const char *pszFileName, const VMatrix *pTransform );
-void DumpLineToGLView( const Vector &vPoint1, const Vector &vColor1, const Vector &vPoint2, const Vector &vColor2, float fThickness, FILE *pFile );
-void DumpAABBToGLView( const Vector &vCenter, const Vector &vExtents, const Vector &vColor, FILE *pFile );
-
-#if defined( ENABLE_DEBUG_POLYHEDRON_DUMPS ) && defined( WIN32 )
-#include "winlite.h"
-#endif
-
-static VMatrix s_matIdentity( 1.0f, 0.0f, 0.0f, 0.0f, 
-							 0.0f, 1.0f, 0.0f, 0.0f, 
-							 0.0f, 0.0f, 1.0f, 0.0f, 
-							 0.0f, 0.0f, 0.0f, 1.0f );
-#endif
-
-#if defined( DEBUG_DUMP_POLYHEDRONS_TO_NUMBERED_GLVIEWS )
-static int g_iPolyhedronDumpCounter = 0;
-#endif
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-#if defined( _DEBUG ) && defined( ENABLE_DEBUG_POLYHEDRON_DUMPS )
-void CreateDumpDirectory( const char *szDirectoryName )
-{
-#if defined( WIN32 )
-	CreateDirectory( szDirectoryName, NULL );
-#else
-	Assert( false ); //TODO: create directories in linux
-#endif
-}
-#endif
-
-
-
-void CPolyhedron_AllocByNew::Release( void )
-{
-	delete this;
-}
-
-CPolyhedron_AllocByNew *CPolyhedron_AllocByNew::Allocate( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ) //creates the polyhedron along with enough memory to hold all it's data in a single allocation
-{
-	void *pMemory = new unsigned char [ sizeof( CPolyhedron_AllocByNew ) +
-										(iVertices * sizeof(Vector)) + 
-										(iLines * sizeof(Polyhedron_IndexedLine_t)) + 
-										(iIndices * sizeof( Polyhedron_IndexedLineReference_t )) + 
-										(iPolygons * sizeof( Polyhedron_IndexedPolygon_t ))];
-
-#include "tier0/memdbgoff.h" //the following placement new doesn't compile with memory debugging
-	CPolyhedron_AllocByNew *pAllocated = new ( pMemory ) CPolyhedron_AllocByNew;
-#include "tier0/memdbgon.h"
-
-	pAllocated->iVertexCount = iVertices;
-	pAllocated->iLineCount = iLines;
-	pAllocated->iIndexCount = iIndices;
-	pAllocated->iPolygonCount = iPolygons;
-	pAllocated->pVertices = (Vector *)(pAllocated + 1); //start vertex memory at the end of the class
-	pAllocated->pLines = (Polyhedron_IndexedLine_t *)(pAllocated->pVertices + iVertices);
-	pAllocated->pIndices = (Polyhedron_IndexedLineReference_t *)(pAllocated->pLines + iLines);
-	pAllocated->pPolygons = (Polyhedron_IndexedPolygon_t *)(pAllocated->pIndices + iIndices);
-
-	return pAllocated;
-}
-
-
-class CPolyhedron_TempMemory : public CPolyhedron
-{
-public:
-#ifdef DBGFLAG_ASSERT
-	int iReferenceCount;
-#endif
-
-	virtual void Release( void )
-	{
-#ifdef DBGFLAG_ASSERT
-		--iReferenceCount;
-#endif
-	}
-
-	CPolyhedron_TempMemory( void )
-#ifdef DBGFLAG_ASSERT
-		: iReferenceCount( 0 )
-#endif
-	{ };
-};
-
-
-static CUtlVector<unsigned char> s_TempMemoryPolyhedron_Buffer;
-static CPolyhedron_TempMemory s_TempMemoryPolyhedron;
-
-CPolyhedron *GetTempPolyhedron( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ) //grab the temporary polyhedron. Avoids new/delete for quick work. Can only be in use by one chunk of code at a time
-{
-	AssertMsg( s_TempMemoryPolyhedron.iReferenceCount == 0, "Temporary polyhedron memory being rewritten before released" );
-#ifdef DBGFLAG_ASSERT
-	++s_TempMemoryPolyhedron.iReferenceCount;
-#endif
-	s_TempMemoryPolyhedron_Buffer.SetCount( (sizeof( Vector ) * iVertices) +
-											(sizeof( Polyhedron_IndexedLine_t ) * iLines) +
-											(sizeof( Polyhedron_IndexedLineReference_t ) * iIndices) +
-											(sizeof( Polyhedron_IndexedPolygon_t ) * iPolygons) );
-
-	s_TempMemoryPolyhedron.iVertexCount = iVertices;
-	s_TempMemoryPolyhedron.iLineCount = iLines;
-	s_TempMemoryPolyhedron.iIndexCount = iIndices;
-	s_TempMemoryPolyhedron.iPolygonCount = iPolygons;
-
-	s_TempMemoryPolyhedron.pVertices = (Vector *)s_TempMemoryPolyhedron_Buffer.Base();
-	s_TempMemoryPolyhedron.pLines = (Polyhedron_IndexedLine_t *)(&s_TempMemoryPolyhedron.pVertices[s_TempMemoryPolyhedron.iVertexCount]);
-	s_TempMemoryPolyhedron.pIndices = (Polyhedron_IndexedLineReference_t *)(&s_TempMemoryPolyhedron.pLines[s_TempMemoryPolyhedron.iLineCount]);
-	s_TempMemoryPolyhedron.pPolygons = (Polyhedron_IndexedPolygon_t *)(&s_TempMemoryPolyhedron.pIndices[s_TempMemoryPolyhedron.iIndexCount]);
-
-	return &s_TempMemoryPolyhedron;
-}
-
-
-Vector CPolyhedron::Center( void )
-{
-	if( iVertexCount == 0 )
-		return vec3_origin;
-
-	Vector vAABBMin, vAABBMax;
-	vAABBMin = vAABBMax = pVertices[0];
-	for( int i = 1; i != iVertexCount; ++i )
-	{
-		Vector &vPoint = pVertices[i];
-		if( vPoint.x < vAABBMin.x )
-			vAABBMin.x = vPoint.x;
-		if( vPoint.y < vAABBMin.y )
-			vAABBMin.y = vPoint.y;
-		if( vPoint.z < vAABBMin.z )
-			vAABBMin.z = vPoint.z;
-
-		if( vPoint.x > vAABBMax.x )
-			vAABBMax.x = vPoint.x;
-		if( vPoint.y > vAABBMax.y )
-			vAABBMax.y = vPoint.y;
-		if( vPoint.z > vAABBMax.z )
-			vAABBMax.z = vPoint.z;
-	}
-	return ((vAABBMin + vAABBMax) * 0.5f);
-}
-
-enum PolyhedronPointPlanarity
-{
-	POINT_DEAD,
-	POINT_ONPLANE,
-	POINT_ALIVE	
-};
-
-struct GeneratePolyhedronFromPlanes_Point
-{
-	Vector ptPosition;
-	GeneratePolyhedronFromPlanes_LineLL *pConnectedLines; //keep these in a clockwise order, circular linking
-	float fPlaneDist; //used in plane cutting
-	PolyhedronPointPlanarity planarity;
-	int iSaveIndices;
-};
-
-struct GeneratePolyhedronFromPlanes_Line
-{
-	GeneratePolyhedronFromPlanes_Point *pPoints[2]; //the 2 connecting points in no particular order
-	GeneratePolyhedronFromPlanes_Polygon *pPolygons[2]; //viewing from the outside with the point connections going up, 0 is the left polygon, 1 is the right
-	int iSaveIndices;
-	bool bAlive; //connected to at least one living point
-	bool bCut; //connected to at least one dead point
-
-	GeneratePolyhedronFromPlanes_LineLL *pPointLineLinks[2]; //rather than going into a point and searching for its link to this line, lets just cache it to eliminate searching
-	GeneratePolyhedronFromPlanes_LineLL *pPolygonLineLinks[2]; //rather than going into a polygon and searching for its link to this line, lets just cache it to eliminate searching
-#ifdef POLYHEDRON_EXTENSIVE_DEBUGGING
-	int iDebugFlags;
-#endif
-};
-
-struct GeneratePolyhedronFromPlanes_LineLL
-{
-	GeneratePolyhedronFromPlanes_Line *pLine;
-	int iReferenceIndex; //whatever is referencing the line should know which side of the line it's on (points and polygons), for polygons, it's which point to follow to continue going clockwise, which makes polygon 0 the one on the left side of an upward facing line vector, for points, it's the OTHER point's index
-	GeneratePolyhedronFromPlanes_LineLL *pPrev;
-	GeneratePolyhedronFromPlanes_LineLL *pNext;
-};
-
-struct GeneratePolyhedronFromPlanes_Polygon
-{
-	Vector vSurfaceNormal; 
-	GeneratePolyhedronFromPlanes_LineLL *pLines; //keep these in a clockwise order, circular linking
-	
-	bool bMissingASide;
-};
-
-struct GeneratePolyhedronFromPlanes_UnorderedPolygonLL //an unordered collection of polygons
-{
-	GeneratePolyhedronFromPlanes_Polygon *pPolygon;
-	GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pNext;
-	GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPrev;
-};
-
-struct GeneratePolyhedronFromPlanes_UnorderedLineLL //an unordered collection of lines
-{
-	GeneratePolyhedronFromPlanes_Line *pLine;
-	GeneratePolyhedronFromPlanes_UnorderedLineLL *pNext;
-	GeneratePolyhedronFromPlanes_UnorderedLineLL *pPrev;
-};
-
-struct GeneratePolyhedronFromPlanes_UnorderedPointLL //an unordered collection of points
-{
-	GeneratePolyhedronFromPlanes_Point *pPoint;
-	GeneratePolyhedronFromPlanes_UnorderedPointLL *pNext;
-	GeneratePolyhedronFromPlanes_UnorderedPointLL *pPrev;
-};
-
-
-
-
-CPolyhedron *ClipPolyhedron( const CPolyhedron *pExistingPolyhedron, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory )
-{
-	if( pExistingPolyhedron == NULL )
-		return NULL;
-
-	AssertMsg( (pExistingPolyhedron->iVertexCount >= 3) && (pExistingPolyhedron->iPolygonCount >= 2), "Polyhedron doesn't meet absolute minimum spec" );
-
-	float *pUsefulPlanes = (float *)stackalloc( sizeof( float ) * 4 * iPlaneCount );
-	int iUsefulPlaneCount = 0;
-	Vector *pExistingVertices = pExistingPolyhedron->pVertices;
-
-	//A large part of clipping will either eliminate the polyhedron entirely, or clip nothing at all, so lets just check for those first and throw away useless planes
-	{
-		int iLiveCount = 0;
-		int iDeadCount = 0;
-		const float fNegativeOnPlaneEpsilon = -fOnPlaneEpsilon;
-
-		for( int i = 0; i != iPlaneCount; ++i )
-		{
-			Vector vNormal = *((Vector *)&pOutwardFacingPlanes[(i * 4) + 0]);
-			float fPlaneDist = pOutwardFacingPlanes[(i * 4) + 3];
-
-			for( int j = 0; j != pExistingPolyhedron->iVertexCount; ++j )
-			{
-				float fPointDist = vNormal.Dot( pExistingVertices[j] ) - fPlaneDist;
-				
-				if( fPointDist <= fNegativeOnPlaneEpsilon )
-					++iLiveCount;
-				else if( fPointDist > fOnPlaneEpsilon )
-					++iDeadCount;
-			}
-
-			if( iLiveCount == 0 )
-			{
-				//all points are dead or on the plane, so the polyhedron is dead
-				return NULL;
-			}
-
-			if( iDeadCount != 0 )
-			{
-				//at least one point died, this plane yields useful results
-				pUsefulPlanes[(iUsefulPlaneCount * 4) + 0] = vNormal.x;
-				pUsefulPlanes[(iUsefulPlaneCount * 4) + 1] = vNormal.y;
-				pUsefulPlanes[(iUsefulPlaneCount * 4) + 2] = vNormal.z;
-				pUsefulPlanes[(iUsefulPlaneCount * 4) + 3] = fPlaneDist;
-				++iUsefulPlaneCount;
-			}
-		}
-	}
-
-	if( iUsefulPlaneCount == 0 )
-	{
-		//testing shows that the polyhedron won't even be cut, clone the existing polyhedron and return that
-
-		CPolyhedron *pReturn;
-		if( bUseTemporaryMemory )
-		{
-			pReturn = GetTempPolyhedron( pExistingPolyhedron->iVertexCount, 
-											pExistingPolyhedron->iLineCount, 
-											pExistingPolyhedron->iIndexCount, 
-											pExistingPolyhedron->iPolygonCount );
-		}
-		else
-		{
-			pReturn = CPolyhedron_AllocByNew::Allocate( pExistingPolyhedron->iVertexCount, 
-														pExistingPolyhedron->iLineCount, 
-														pExistingPolyhedron->iIndexCount, 
-														pExistingPolyhedron->iPolygonCount );
-		}
-
-		memcpy( pReturn->pVertices, pExistingPolyhedron->pVertices, sizeof( Vector ) * pReturn->iVertexCount );
-		memcpy( pReturn->pLines, pExistingPolyhedron->pLines, sizeof( Polyhedron_IndexedLine_t ) * pReturn->iLineCount );
-		memcpy( pReturn->pIndices, pExistingPolyhedron->pIndices, sizeof( Polyhedron_IndexedLineReference_t ) * pReturn->iIndexCount );
-		memcpy( pReturn->pPolygons, pExistingPolyhedron->pPolygons, sizeof( Polyhedron_IndexedPolygon_t ) * pReturn->iPolygonCount );
-
-		return pReturn;
-	}
-
-
-
-	//convert the polyhedron to linked geometry
-	GeneratePolyhedronFromPlanes_Point *pStartPoints = (GeneratePolyhedronFromPlanes_Point *)stackalloc( pExistingPolyhedron->iVertexCount * sizeof( GeneratePolyhedronFromPlanes_Point ) );
-	GeneratePolyhedronFromPlanes_Line *pStartLines = (GeneratePolyhedronFromPlanes_Line *)stackalloc( pExistingPolyhedron->iLineCount * sizeof( GeneratePolyhedronFromPlanes_Line ) );
-	GeneratePolyhedronFromPlanes_Polygon *pStartPolygons = (GeneratePolyhedronFromPlanes_Polygon *)stackalloc( pExistingPolyhedron->iPolygonCount * sizeof( GeneratePolyhedronFromPlanes_Polygon ) );
-
-	GeneratePolyhedronFromPlanes_LineLL *pStartLineLinks = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( pExistingPolyhedron->iLineCount * 4 * sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
-	
-	int iCurrentLineLinkIndex = 0;
-
-	//setup points
-	for( int i = 0; i != pExistingPolyhedron->iVertexCount; ++i )
-	{
-		pStartPoints[i].ptPosition = pExistingPolyhedron->pVertices[i];
-		pStartPoints[i].pConnectedLines = NULL; //we won't be circular linking until later
-	}
-
-	//setup lines and interlink to points (line links are not yet circularly linked, and are unordered)
-	for( int i = 0; i != pExistingPolyhedron->iLineCount; ++i )
-	{
-		for( int j = 0; j != 2; ++j )
-		{
-			pStartLines[i].pPoints[j] = &pStartPoints[pExistingPolyhedron->pLines[i].iPointIndices[j]];
-
-			GeneratePolyhedronFromPlanes_LineLL *pLineLink = &pStartLineLinks[iCurrentLineLinkIndex++];
-			pStartLines[i].pPointLineLinks[j] = pLineLink;
-			pLineLink->pLine = &pStartLines[i];
-			pLineLink->iReferenceIndex = 1 - j;
-			//pLineLink->pPrev = NULL;
-			pLineLink->pNext = pStartLines[i].pPoints[j]->pConnectedLines;
-			pStartLines[i].pPoints[j]->pConnectedLines = pLineLink;
-		}
-	}
-
-
-
-	//setup polygons
-	for( int i = 0; i != pExistingPolyhedron->iPolygonCount; ++i )
-	{
-		pStartPolygons[i].vSurfaceNormal = pExistingPolyhedron->pPolygons[i].polyNormal;
-		Polyhedron_IndexedLineReference_t *pOffsetPolyhedronLines = &pExistingPolyhedron->pIndices[pExistingPolyhedron->pPolygons[i].iFirstIndex];
-
-		
-		GeneratePolyhedronFromPlanes_LineLL *pFirstLink = &pStartLineLinks[iCurrentLineLinkIndex];
-		pStartPolygons[i].pLines = pFirstLink; //technically going to link to itself on first pass, then get linked properly immediately afterward
-		for( int j = 0; j != pExistingPolyhedron->pPolygons[i].iIndexCount; ++j )
-		{
-			GeneratePolyhedronFromPlanes_LineLL *pLineLink = &pStartLineLinks[iCurrentLineLinkIndex++];
-			pLineLink->pLine = &pStartLines[pOffsetPolyhedronLines[j].iLineIndex];
-			pLineLink->iReferenceIndex = pOffsetPolyhedronLines[j].iEndPointIndex;
-			
-			pLineLink->pLine->pPolygons[pLineLink->iReferenceIndex] = &pStartPolygons[i];
-			pLineLink->pLine->pPolygonLineLinks[pLineLink->iReferenceIndex] = pLineLink;			
-
-			pLineLink->pPrev = pStartPolygons[i].pLines;
-			pStartPolygons[i].pLines->pNext = pLineLink;
-			pStartPolygons[i].pLines = pLineLink;
-		}
-		
-		pFirstLink->pPrev = pStartPolygons[i].pLines;
-		pStartPolygons[i].pLines->pNext = pFirstLink;
-	}
-
-	Assert( iCurrentLineLinkIndex == (pExistingPolyhedron->iLineCount * 4) );
-
-	//go back to point line links so we can circularly link them as well as order them now that every point has all its line links
-	for( int i = 0; i != pExistingPolyhedron->iVertexCount; ++i )
-	{
-		//interlink the points
-		{
-			GeneratePolyhedronFromPlanes_LineLL *pLastVisitedLink = pStartPoints[i].pConnectedLines;
-			GeneratePolyhedronFromPlanes_LineLL *pCurrentLink = pLastVisitedLink;
-			
-			do
-			{
-				pCurrentLink->pPrev = pLastVisitedLink;
-				pLastVisitedLink = pCurrentLink;
-				pCurrentLink = pCurrentLink->pNext;
-			} while( pCurrentLink );
-
-			//circular link
-			pLastVisitedLink->pNext = pStartPoints[i].pConnectedLines;
-			pStartPoints[i].pConnectedLines->pPrev = pLastVisitedLink;
-		}
-
-
-		//fix ordering
-		GeneratePolyhedronFromPlanes_LineLL *pFirstLink = pStartPoints[i].pConnectedLines;
-		GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pFirstLink;
-		GeneratePolyhedronFromPlanes_LineLL *pSearchLink;
-		GeneratePolyhedronFromPlanes_Polygon *pLookingForPolygon;
-		Assert( pFirstLink->pNext != pFirstLink );
-		do
-		{
-			pLookingForPolygon = pWorkLink->pLine->pPolygons[1 - pWorkLink->iReferenceIndex]; //grab pointer to left polygon
-			pSearchLink = pWorkLink->pPrev;
-
-			while( pSearchLink->pLine->pPolygons[pSearchLink->iReferenceIndex] != pLookingForPolygon )
-				pSearchLink = pSearchLink->pPrev;
-
-			Assert( pSearchLink->pLine->pPolygons[pSearchLink->iReferenceIndex] == pWorkLink->pLine->pPolygons[1 - pWorkLink->iReferenceIndex] );
-
-			//pluck the search link from wherever it is
-			pSearchLink->pPrev->pNext = pSearchLink->pNext;
-			pSearchLink->pNext->pPrev = pSearchLink->pPrev;
-
-			//insert the search link just before the work link			
-			pSearchLink->pPrev = pWorkLink->pPrev;
-			pSearchLink->pNext = pWorkLink;
-			
-			pSearchLink->pPrev->pNext = pSearchLink;
-			pWorkLink->pPrev = pSearchLink;
-
-			pWorkLink = pSearchLink;
-		} while( pWorkLink != pFirstLink );
-	}
-
-	GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints = (GeneratePolyhedronFromPlanes_UnorderedPointLL *)stackalloc( pExistingPolyhedron->iVertexCount * sizeof( GeneratePolyhedronFromPlanes_UnorderedPointLL ) );
-	GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines = (GeneratePolyhedronFromPlanes_UnorderedLineLL *)stackalloc( pExistingPolyhedron->iLineCount * sizeof( GeneratePolyhedronFromPlanes_UnorderedLineLL ) );
-	GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons = (GeneratePolyhedronFromPlanes_UnorderedPolygonLL *)stackalloc( pExistingPolyhedron->iPolygonCount * sizeof( GeneratePolyhedronFromPlanes_UnorderedPolygonLL ) );
-
-	//setup point collection
-	{
-		pPoints[0].pPrev = NULL;
-		pPoints[0].pPoint = &pStartPoints[0];
-		pPoints[0].pNext = &pPoints[1];
-		int iLastPoint = pExistingPolyhedron->iVertexCount - 1;
-		for( int i = 1; i != iLastPoint; ++i )
-		{
-			pPoints[i].pPrev = &pPoints[i - 1];
-			pPoints[i].pPoint = &pStartPoints[i];
-			pPoints[i].pNext = &pPoints[i + 1];
-		}
-		pPoints[iLastPoint].pPrev = &pPoints[iLastPoint - 1];
-		pPoints[iLastPoint].pPoint = &pStartPoints[iLastPoint];
-		pPoints[iLastPoint].pNext = NULL;
-	}
-
-	//setup line collection
-	{
-		pLines[0].pPrev = NULL;
-		pLines[0].pLine = &pStartLines[0];
-		pLines[0].pNext = &pLines[1];
-		int iLastLine = pExistingPolyhedron->iLineCount - 1;
-		for( int i = 1; i != iLastLine; ++i )
-		{
-			pLines[i].pPrev = &pLines[i - 1];
-			pLines[i].pLine = &pStartLines[i];
-			pLines[i].pNext = &pLines[i + 1];
-		}
-		pLines[iLastLine].pPrev = &pLines[iLastLine - 1];
-		pLines[iLastLine].pLine = &pStartLines[iLastLine];
-		pLines[iLastLine].pNext = NULL;
-	}
-
-	//setup polygon collection
-	{
-		pPolygons[0].pPrev = NULL;
-		pPolygons[0].pPolygon = &pStartPolygons[0];
-		pPolygons[0].pNext = &pPolygons[1];
-		int iLastPolygon = pExistingPolyhedron->iPolygonCount - 1;
-		for( int i = 1; i != iLastPolygon; ++i )
-		{
-			pPolygons[i].pPrev = &pPolygons[i - 1];
-			pPolygons[i].pPolygon = &pStartPolygons[i];
-			pPolygons[i].pNext = &pPolygons[i + 1];
-		}
-		pPolygons[iLastPolygon].pPrev = &pPolygons[iLastPolygon - 1];
-		pPolygons[iLastPolygon].pPolygon = &pStartPolygons[iLastPolygon];
-		pPolygons[iLastPolygon].pNext = NULL;
-	}
-
-	return ClipLinkedGeometry( pPolygons, pLines, pPoints, pUsefulPlanes, iUsefulPlaneCount, fOnPlaneEpsilon, bUseTemporaryMemory );
-}
-
-
-
-Vector FindPointInPlanes( const float *pPlanes, int planeCount )
-{
-	Vector point = vec3_origin;
-
-	for ( int i = 0; i < planeCount; i++ )
-	{
-		float fD = DotProduct( *(Vector *)&pPlanes[i*4], point ) - pPlanes[i*4 + 3];
-		if ( fD < 0 )
-		{
-			point -= fD * (*(Vector *)&pPlanes[i*4]);
-		}
-	}
-	return point;
-}
-
-
-
-bool FindConvexShapeLooseAABB( const float *pInwardFacingPlanes, int iPlaneCount, Vector *pAABBMins, Vector *pAABBMaxs ) //bounding box of the convex shape (subject to floating point error)
-{
-	//returns false if the AABB hasn't been set
-	if( pAABBMins == NULL && pAABBMaxs == NULL ) //no use in actually finding out what it is
-		return false;
-
-	struct FindConvexShapeAABB_Polygon_t
-	{
-		float *verts;
-		int iVertCount;
-	};
-
-	float *pMovedPlanes = (float *)stackalloc( iPlaneCount * 4 * sizeof( float ) );
-	//Vector vPointInPlanes = FindPointInPlanes( pInwardFacingPlanes, iPlaneCount );
-
-	for( int i = 0; i != iPlaneCount; ++i )
-	{
-		pMovedPlanes[(i * 4) + 0] = pInwardFacingPlanes[(i * 4) + 0];
-		pMovedPlanes[(i * 4) + 1] = pInwardFacingPlanes[(i * 4) + 1];
-		pMovedPlanes[(i * 4) + 2] = pInwardFacingPlanes[(i * 4) + 2];
-		pMovedPlanes[(i * 4) + 3] = pInwardFacingPlanes[(i * 4) + 3] - 100.0f; //move planes out a lot to kill some imprecision problems
-	}
-	
-	
-
-	//vAABBMins = vAABBMaxs = FindPointInPlanes( pPlanes, iPlaneCount );
-	float *vertsIn = NULL; //we'll be allocating a new buffer for this with each new polygon, and moving it off to the polygon array
-	float *vertsOut = (float *)stackalloc( (iPlaneCount + 4) * (sizeof( float ) * 3) ); //each plane will initially have 4 points in its polygon representation, and each plane clip has the possibility to add 1 point to the polygon
-	float *vertsSwap;
-
-	FindConvexShapeAABB_Polygon_t *pPolygons = (FindConvexShapeAABB_Polygon_t *)stackalloc( iPlaneCount * sizeof( FindConvexShapeAABB_Polygon_t ) );
-	int iPolyCount = 0;
-
-	for ( int i = 0; i < iPlaneCount; i++ )
-	{
-		Vector *pPlaneNormal = (Vector *)&pInwardFacingPlanes[i*4];
-		float fPlaneDist = pInwardFacingPlanes[(i*4) + 3];
-
-		if( vertsIn == NULL )
-			vertsIn = (float *)stackalloc( (iPlaneCount + 4) * (sizeof( float ) * 3) );
-
-		// Build a big-ass poly in this plane
-		int vertCount = PolyFromPlane( (Vector *)vertsIn, *pPlaneNormal, fPlaneDist, 100000.0f );
-
-		//chop it by every other plane
-		for( int j = 0; j < iPlaneCount; j++ )
-		{
-			// don't clip planes with themselves
-			if ( i == j )
-				continue;
-
-			// Chop the polygon against this plane
-			vertCount = ClipPolyToPlane( (Vector *)vertsIn, vertCount, (Vector *)vertsOut, *(Vector *)&pMovedPlanes[j*4], pMovedPlanes[(j*4) + 3], 0.0f );
-
-			//swap the input and output arrays
-			vertsSwap = vertsIn; vertsIn = vertsOut; vertsOut = vertsSwap;
-
-			// Less than a poly left, something's wrong, don't bother with this polygon
-			if ( vertCount < 3 )
-				break;
-		}
-
-		if ( vertCount < 3 )
-			continue; //not enough to work with
-
-		pPolygons[iPolyCount].iVertCount = vertCount;
-		pPolygons[iPolyCount].verts = vertsIn;
-		vertsIn = NULL;
-		++iPolyCount;
-	}
-
-	if( iPolyCount == 0 )
-		return false;
-
-	//initialize the AABB to the first point available
-	Vector vAABBMins, vAABBMaxs;
-	vAABBMins = vAABBMaxs = ((Vector *)pPolygons[0].verts)[0];
-
-	if( pAABBMins && pAABBMaxs ) //they want the full box
-	{
-		for( int i = 0; i != iPolyCount; ++i )
-		{
-			Vector *PolyVerts = (Vector *)pPolygons[i].verts;
-			for( int j = 0; j != pPolygons[i].iVertCount; ++j )
-			{
-				if( PolyVerts[j].x < vAABBMins.x ) 
-					vAABBMins.x = PolyVerts[j].x;
-				if( PolyVerts[j].y < vAABBMins.y ) 
-					vAABBMins.y = PolyVerts[j].y;
-				if( PolyVerts[j].z < vAABBMins.z ) 
-					vAABBMins.z = PolyVerts[j].z;
-
-				if( PolyVerts[j].x > vAABBMaxs.x ) 
-					vAABBMaxs.x = PolyVerts[j].x;
-				if( PolyVerts[j].y > vAABBMaxs.y ) 
-					vAABBMaxs.y = PolyVerts[j].y;
-				if( PolyVerts[j].z > vAABBMaxs.z ) 
-					vAABBMaxs.z = PolyVerts[j].z;
-			}
-		}
-		*pAABBMins = vAABBMins;
-		*pAABBMaxs = vAABBMaxs;
-	}
-	else if( pAABBMins ) //they only want the min
-	{
-		for( int i = 0; i != iPolyCount; ++i )
-		{
-			Vector *PolyVerts = (Vector *)pPolygons[i].verts;
-			for( int j = 0; j != pPolygons[i].iVertCount; ++j )
-			{
-				if( PolyVerts[j].x < vAABBMins.x ) 
-					vAABBMins.x = PolyVerts[j].x;
-				if( PolyVerts[j].y < vAABBMins.y ) 
-					vAABBMins.y = PolyVerts[j].y;
-				if( PolyVerts[j].z < vAABBMins.z ) 
-					vAABBMins.z = PolyVerts[j].z;
-			}
-		}
-		*pAABBMins = vAABBMins;
-	}
-	else //they only want the max
-	{
-		for( int i = 0; i != iPolyCount; ++i )
-		{
-			Vector *PolyVerts = (Vector *)pPolygons[i].verts;
-			for( int j = 0; j != pPolygons[i].iVertCount; ++j )
-			{
-				if( PolyVerts[j].x > vAABBMaxs.x ) 
-					vAABBMaxs.x = PolyVerts[j].x;
-				if( PolyVerts[j].y > vAABBMaxs.y ) 
-					vAABBMaxs.y = PolyVerts[j].y;
-				if( PolyVerts[j].z > vAABBMaxs.z ) 
-					vAABBMaxs.z = PolyVerts[j].z;
-			}
-		}
-		*pAABBMaxs = vAABBMaxs;
-	}
-
-	return true;
-}
-
-
-
-
-
-
-
-CPolyhedron *ConvertLinkedGeometryToPolyhedron( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints, bool bUseTemporaryMemory )
-{
-	Assert( (pPolygons != NULL) && (pLines != NULL) && (pPoints != NULL) );
-	unsigned int iPolyCount = 0, iLineCount = 0, iPointCount = 0, iIndexCount = 0;
-
-	GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pActivePolygonWalk = pPolygons;	
-	do
-	{
-		++iPolyCount;
-		GeneratePolyhedronFromPlanes_LineLL *pLineWalk = pActivePolygonWalk->pPolygon->pLines;
-		GeneratePolyhedronFromPlanes_LineLL *pFirstLine = pLineWalk;
-		Assert( pLineWalk != NULL );
-		
-		do
-		{
-			++iIndexCount;
-			pLineWalk = pLineWalk->pNext;
-		} while( pLineWalk != pFirstLine );
-
-		pActivePolygonWalk = pActivePolygonWalk->pNext;
-	} while( pActivePolygonWalk );
-
-	GeneratePolyhedronFromPlanes_UnorderedLineLL *pActiveLineWalk = pLines;
-	do
-	{
-		++iLineCount;
-		pActiveLineWalk = pActiveLineWalk->pNext;
-	} while( pActiveLineWalk );
-
-	GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pPoints;
-	do
-	{
-		++iPointCount;
-		pActivePointWalk = pActivePointWalk->pNext;
-	} while( pActivePointWalk );	
-	
-	CPolyhedron *pReturn;
-	if( bUseTemporaryMemory )
-	{
-		pReturn = GetTempPolyhedron( iPointCount, iLineCount, iIndexCount, iPolyCount );
-	}
-	else
-	{
-		pReturn = CPolyhedron_AllocByNew::Allocate( iPointCount, iLineCount, iIndexCount, iPolyCount );
-	}
-
-	Vector *pVertexArray = pReturn->pVertices;
-	Polyhedron_IndexedLine_t *pLineArray = pReturn->pLines;
-	Polyhedron_IndexedLineReference_t *pIndexArray = pReturn->pIndices;
-	Polyhedron_IndexedPolygon_t *pPolyArray = pReturn->pPolygons;
-
-	//copy points
-	pActivePointWalk = pPoints;
-	for( unsigned int i = 0; i != iPointCount; ++i )
-	{
-		pVertexArray[i] = pActivePointWalk->pPoint->ptPosition;
-		pActivePointWalk->pPoint->iSaveIndices = i; //storing array indices
-		pActivePointWalk = pActivePointWalk->pNext;
-	}
-
-	//copy lines
-	pActiveLineWalk = pLines;
-	for( unsigned int i = 0; i != iLineCount; ++i )
-	{
-		pLineArray[i].iPointIndices[0] = (unsigned short)pActiveLineWalk->pLine->pPoints[0]->iSaveIndices;
-		pLineArray[i].iPointIndices[1] = (unsigned short)pActiveLineWalk->pLine->pPoints[1]->iSaveIndices;
-
-		pActiveLineWalk->pLine->iSaveIndices = i; //storing array indices
-
-		pActiveLineWalk = pActiveLineWalk->pNext;
-	}
-
-	//copy polygons and indices at the same time
-	pActivePolygonWalk = pPolygons;
-	iIndexCount = 0;
-	for( unsigned int i = 0; i != iPolyCount; ++i )
-	{
-		pPolyArray[i].polyNormal = pActivePolygonWalk->pPolygon->vSurfaceNormal;
-		pPolyArray[i].iFirstIndex = iIndexCount;		
-		
-		GeneratePolyhedronFromPlanes_LineLL *pLineWalk = pActivePolygonWalk->pPolygon->pLines;
-		GeneratePolyhedronFromPlanes_LineLL *pFirstLine = pLineWalk;
-		do
-		{
-			//pIndexArray[iIndexCount] = pLineWalk->pLine->pPoints[pLineWalk->iReferenceIndex]->iWorkData; //startpoint of each line, iWorkData is the index of the vertex
-			pIndexArray[iIndexCount].iLineIndex = pLineWalk->pLine->iSaveIndices;
-			pIndexArray[iIndexCount].iEndPointIndex = pLineWalk->iReferenceIndex;
-			
-			++iIndexCount;
-			pLineWalk = pLineWalk->pNext;
-		} while( pLineWalk != pFirstLine );
-
-		pPolyArray[i].iIndexCount = iIndexCount - pPolyArray[i].iFirstIndex;
-
-		pActivePolygonWalk = pActivePolygonWalk->pNext;	
-	}
-
-#if defined( _DEBUG ) && defined( ENABLE_DEBUG_POLYHEDRON_DUMPS ) && defined( DEBUG_DUMP_POLYHEDRONS_TO_NUMBERED_GLVIEWS )
-	char szCollisionFile[128];
-	CreateDumpDirectory( "PolyhedronDumps" );
-	Q_snprintf( szCollisionFile, 128, "PolyhedronDumps/NewStyle_PolyhedronDump%i.txt", g_iPolyhedronDumpCounter );
-	++g_iPolyhedronDumpCounter;
-
-	remove( szCollisionFile );
-	DumpPolyhedronToGLView( pReturn, szCollisionFile, &s_matIdentity );
-	DumpPolyhedronToGLView( pReturn, "PolyhedronDumps/NewStyle_PolyhedronDump_All-Appended.txt", &s_matIdentity );
-#endif
-
-	return pReturn;
-}
-
-
-
-#ifdef _DEBUG
-
-void DumpPointListToGLView( GeneratePolyhedronFromPlanes_UnorderedPointLL *pHead, PolyhedronPointPlanarity planarity, const Vector &vColor, const char *szDumpFile, const VMatrix *pTransform )
-{
-#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
-	if( pTransform == NULL )
-		pTransform = &s_matIdentity;
-	
-	FILE *pFile = fopen( szDumpFile, "ab" );
-	
-	while( pHead )
-	{
-		if( pHead->pPoint->planarity == planarity )
-		{
-			const Vector vPointExtents( 0.5f, 0.5f, 0.01f );
-			DumpAABBToGLView( (*pTransform) * pHead->pPoint->ptPosition, vPointExtents, vColor, pFile );
-		}
-		pHead = pHead->pNext;
-	}
-
-	fclose( pFile );
-#endif
-}
-
-const char * DumpPolyhedronCutHistory( const CUtlVector<CPolyhedron *> &DumpedHistory, const CUtlVector<const float *> &CutHistory, const VMatrix *pTransform )
-{
-#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
-	if( pTransform == NULL )
-		pTransform = &s_matIdentity;
-
-	static char szDumpFile[100] = "FailedPolyhedronCut_Error.txt"; //most recent filename returned for further dumping
-
-	for( int i = 0; i != DumpedHistory.Count(); ++i )
-	{
-		if( DumpedHistory[i] != NULL )
-		{
-			Q_snprintf( szDumpFile, 100, "FailedPolyhedronCut_%d.txt", i );
-			DumpPolyhedronToGLView( DumpedHistory[i], szDumpFile, pTransform );
-			DumpPlaneToGlView( CutHistory[i], 1.0f, szDumpFile, pTransform );
-		}
-	}
-
-	return szDumpFile;
-#else
-	return NULL;
-#endif
-}
-
-#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
-// Assertion helper that dumps state before asserting. When (condition) is false it:
-//	- builds a transform with 25.0f on the diagonal and a translation of -Center() * 25.0f of the last DebugCutHistory entry,
-//	- calls DumpPolyhedronCutHistory and then DumpPointListToGLView for the POINT_ALIVE, POINT_ONPLANE and POINT_DEAD lists,
-//	- if pStartPoint is non-NULL, appends AABBs at pStartPoint and pWorkPoint to the last dump file,
-//	- finally forwards condition and message to AssertMsg.
-// The expansion uses names that are not macro parameters (DebugCutHistory, PlaneCutHistory, pAllPoints, pDeadPointCollection,
-// pStartPoint, pWorkPoint), so it can only be used where all of those are in scope.
-// NOTE(review): expands to a bare if-block with no do/while(0) wrapper, and `condition` appears a second time in the AssertMsg call.
-// NOTE(review): the fopen result is used without a NULL check, and pWorkPoint is dereferenced under a check of pStartPoint only.
-#define AssertMsg_DumpPolyhedron(condition, message)\
-	if( (condition) == false )\
-	{\
-		VMatrix matTransform;\
-		matTransform.Identity();\
-		matTransform[0][0] = matTransform[1][1] = matTransform[2][2] = 25.0f;\
-		matTransform.SetTranslation( -DebugCutHistory.Tail()->Center() * 25.0f );\
-		const char *szLastDumpFile = DumpPolyhedronCutHistory( DebugCutHistory, PlaneCutHistory, &matTransform );\
-		DumpPointListToGLView( pAllPoints, POINT_ALIVE, Vector( 0.9f, 0.9f, 0.9f ), szLastDumpFile, &matTransform );\
-		DumpPointListToGLView( pAllPoints, POINT_ONPLANE, Vector( 0.5f, 0.5f, 0.5f ), szLastDumpFile, &matTransform );\
-		DumpPointListToGLView( pDeadPointCollection, POINT_DEAD, Vector( 0.1f, 0.1f, 0.1f ), szLastDumpFile, &matTransform );\
-		if( pStartPoint )\
-		{\
-			FILE *pFileDumpRepairProgress = fopen( szLastDumpFile, "ab" );\
-			DumpAABBToGLView( matTransform * pStartPoint->ptPosition, Vector( 2.0f, 0.05f, 0.05f ), Vector( 0.0f, 1.0f, 0.0f ), pFileDumpRepairProgress );\
-			DumpAABBToGLView( matTransform * pWorkPoint->ptPosition, Vector( 2.0f, 0.05f, 0.05f ), Vector( 1.0f, 0.0f, 0.0f ), pFileDumpRepairProgress );\
-			fclose( pFileDumpRepairProgress );\
-		}\
-		AssertMsg( condition, message );\
-	}
-#else
-// Without ENABLE_DEBUG_POLYHEDRON_DUMPS the macro forwards straight to AssertMsg and produces no dump files.
-#define AssertMsg_DumpPolyhedron(condition, message) AssertMsg( condition, message )
-#endif
-// Convenience form: uses the stringized condition text (#condition) as the assertion message.
-#define Assert_DumpPolyhedron(condition) AssertMsg_DumpPolyhedron( condition, #condition )
-
-#else
-
-// Fallback definitions for the #else branch of the enclosing preprocessor conditional: both macros ignore their arguments
-// and expand to `NULL;`, so the condition expression is never evaluated here.
-// NOTE(review): the expansion already ends in a semicolon, so `Assert_DumpPolyhedron( x );` leaves an extra empty statement,
-// which matters inside an unbraced if/else.
-#define AssertMsg_DumpPolyhedron(condition, message) NULL;
-#define Assert_DumpPolyhedron(condition) NULL;
-
-#endif
-
-CPolyhedron *ClipLinkedGeometry( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pAllPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pAllLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pAllPoints, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory )
-{
-	const float fNegativeOnPlaneEpsilon = -fOnPlaneEpsilon;
-
-#ifdef _DEBUG
-	CUtlVector<CPolyhedron *> DebugCutHistory;
-	CUtlVector<const float *> PlaneCutHistory;
-	GeneratePolyhedronFromPlanes_Point *pStartPoint = NULL;
-	GeneratePolyhedronFromPlanes_Point *pWorkPoint = NULL;
-
-	static int iPolyhedronClipCount = 0;
-	++iPolyhedronClipCount;
-	
-	DebugCutHistory.AddToTail( ConvertLinkedGeometryToPolyhedron( pAllPolygons, pAllLines, pAllPoints, false ) );
-#endif
-
-	//clear out polygon work variables
-	{
-		GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pActivePolygonWalk = pAllPolygons;
-		do
-		{
-			pActivePolygonWalk->pPolygon->bMissingASide = false;
-			pActivePolygonWalk = pActivePolygonWalk->pNext;
-		} while( pActivePolygonWalk );
-	}
-
-
-	//Collections of dead pointers for reallocation, shouldn't be touched until the current loop iteration is done.
-	GeneratePolyhedronFromPlanes_UnorderedPointLL	*pDeadPointCollection = NULL;
-	GeneratePolyhedronFromPlanes_UnorderedLineLL	*pDeadLineCollection = NULL;
-	GeneratePolyhedronFromPlanes_UnorderedPolygonLL	*pDeadPolygonCollection = NULL;
-	GeneratePolyhedronFromPlanes_LineLL				*pDeadLineLinkCollection = NULL;
-
-
-	for( int iCurrentPlane = 0; iCurrentPlane != iPlaneCount; ++iCurrentPlane )
-	{
-		//clear out line work variables
-		{
-			GeneratePolyhedronFromPlanes_UnorderedLineLL *pActiveLineWalk = pAllLines;
-			do
-			{
-				pActiveLineWalk->pLine->bAlive = false;
-				pActiveLineWalk->pLine->bCut = false;
-
-				pActiveLineWalk = pActiveLineWalk->pNext;
-			} while( pActiveLineWalk );
-		}
-		
-		//TODO: Move these pointers into a reallocation pool
-		pDeadPointCollection = NULL; 
-		pDeadLineCollection = NULL;
-		pDeadLineLinkCollection = NULL;
-		pDeadPolygonCollection = NULL;
-
-		Vector vNormal = *((Vector *)&pOutwardFacingPlanes[(iCurrentPlane * 4) + 0]);
-		/*double vNormalAsDouble[3];
-		vNormalAsDouble[0] = vNormal.x;
-		vNormalAsDouble[1] = vNormal.y;
-		vNormalAsDouble[2] = vNormal.z;*/
-		float fPlaneDist = pOutwardFacingPlanes[(iCurrentPlane * 4) + 3];
-
-		//===================================================================================================
-		// Step 1: Categorize each point as being either cut, split, or alive
-		//===================================================================================================
-		{
-			bool bAllPointsDead = true;
-			bool bAllPointsAlive = true;
-
-			//find point distances from the plane
-			GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
-			do
-			{
-				GeneratePolyhedronFromPlanes_Point *pPoint = pActivePointWalk->pPoint;
-				float fPointDist = vNormal.Dot( pPoint->ptPosition ) - fPlaneDist;
-				if( fPointDist > fOnPlaneEpsilon )
-				{
-					pPoint->planarity = POINT_DEAD; //point is dead, bang bang
-
-					//mark connected lines as cut
-					GeneratePolyhedronFromPlanes_LineLL *pLineWalk = pPoint->pConnectedLines;
-					GeneratePolyhedronFromPlanes_LineLL *pFirstLine = pLineWalk;
-					do
-					{
-						pLineWalk->pLine->bCut = true;
-						pLineWalk = pLineWalk->pNext;
-					} while( pLineWalk != pFirstLine );
-
-					bAllPointsAlive = false;
-				}
-				else if( fPointDist <= fNegativeOnPlaneEpsilon )
-				{
-					pPoint->planarity = POINT_ALIVE; //point is in behind plane, not voted off the island....yet
-					bAllPointsDead = false;
-
-					//mark connected lines as alive
-					GeneratePolyhedronFromPlanes_LineLL *pLineWalk = pPoint->pConnectedLines;
-					GeneratePolyhedronFromPlanes_LineLL *pFirstLine = pLineWalk;
-					do
-					{
-						pLineWalk->pLine->bAlive = true; //mark the line as alive
-						pLineWalk = pLineWalk->pNext;
-					} while( pLineWalk != pFirstLine );
-				}
-				else
-				{
-					pPoint->planarity = POINT_ONPLANE; //point is on the plane, he's everyone's buddy
-
-					//Project on-plane points leaning towards death closer to the plane. This battles floating point precision decay.
-					// Consider the case of a large on-plane epsilon leaving protrusions over time
-					/*if( fPointDist < 0.0f )
-					{
-						double distAsDouble = fPointDist;
-						double vPositionAsDouble[3];
-						vPositionAsDouble[0] = pPoint->ptPosition.x;
-						vPositionAsDouble[1] = pPoint->ptPosition.y;
-						vPositionAsDouble[2] = pPoint->ptPosition.z;
-
-						pPoint->ptPosition.x = vPositionAsDouble[0] - (distAsDouble * vNormalAsDouble[0]);
-						pPoint->ptPosition.y = vPositionAsDouble[1] - (distAsDouble * vNormalAsDouble[1]);
-						pPoint->ptPosition.z = vPositionAsDouble[2] - (distAsDouble * vNormalAsDouble[2]);
-
-#if ( 0 && defined( _DEBUG ) )
-						float fDebugDist = vNormal.Dot( pPoint->ptPosition ) - fPlaneDist; //just for looking at in watch windows
-						AssertMsg( fabs( fDebugDist ) < fabs(fPointDist), "Projected point is further from plane than unprojected." );
-#endif
-						fPointDist = vNormal.Dot( pPoint->ptPosition ) - fPlaneDist; //recompute dist (not guaranteed to be 0.0 like we want)
-					}*/				
-				}
-
-				pPoint->fPlaneDist = fPointDist;
-
-				pActivePointWalk = pActivePointWalk->pNext;
-			} while( pActivePointWalk );
-
-			if( bAllPointsDead ) //all the points either died or are on the plane, no polyhedron left at all
-			{
-#ifdef _DEBUG
-				for( int i = DebugCutHistory.Count(); --i >= 0; )
-				{
-					if( DebugCutHistory[i] )
-						DebugCutHistory[i]->Release();
-				}
-				DebugCutHistory.RemoveAll();
-#endif
-
-				return NULL; 
-			}
-
-			if( bAllPointsAlive )
-				continue; //no cuts made
-
-
-			//Scan for onplane points connected to only other onplane/dead points, these points get downgraded to dead status.
-			{
-				GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
-				do
-				{
-					if( pActivePointWalk->pPoint->planarity == POINT_ONPLANE )
-					{
-						GeneratePolyhedronFromPlanes_LineLL *pOnPlaneLineWalk = pActivePointWalk->pPoint->pConnectedLines;
-						GeneratePolyhedronFromPlanes_LineLL *pStartLineWalk = pOnPlaneLineWalk;
-						bool bDead = true; //assume it's dead and disprove
-						do
-						{
-							if ( pOnPlaneLineWalk->pLine->bAlive )
-							{
-								bDead = false;
-							}
-							else if ( pOnPlaneLineWalk->pLine->bCut )
-							{
-								//connected to a dead point.
-								if( pOnPlaneLineWalk->pNext->pLine->bCut || pOnPlaneLineWalk->pPrev->pLine->bCut )
-								{
-									//This on-plane point is surrounded by dead points on one polygon of the polyhedron.
-									//	We have to downgrade this point to dead to avoid situations where float imprecision 
-									//	turns the polyhedron into a *slightly* concave shape. Concave shapes might break this algorithm, even falsely concave shapes.
-									bDead = true;
-									break;
-								}
-							}
-
-							pOnPlaneLineWalk = pOnPlaneLineWalk->pNext;
-						} while( pOnPlaneLineWalk != pStartLineWalk );
-
-						if( bDead )
-						{
-							pActivePointWalk->pPoint->planarity = POINT_DEAD;
-
-							pOnPlaneLineWalk = pStartLineWalk;
-
-							//mark connected lines as cut
-							do
-							{
-								pOnPlaneLineWalk->pLine->bCut = true;
-								pOnPlaneLineWalk = pOnPlaneLineWalk->pNext;
-							} while( pOnPlaneLineWalk != pStartLineWalk );
-						}
-					}
-					pActivePointWalk = pActivePointWalk->pNext;
-				} while( pActivePointWalk );
-			}
-#ifdef _DEBUG
-			PlaneCutHistory.AddToTail( &pOutwardFacingPlanes[iCurrentPlane * 4] );
-#endif
-		}
-
-		
-
-
-#ifdef _DEBUG
-		//Run around the edges of all the polygons and ensure they don't have more than one point of lowered "alive" status (alive > onplane > dead) surrounded by higher status
-		//	It indicates a concave shape. It's impossible to have it occur in theoretical space. But floating point numbers introduce error.
-		{
-			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pDebugPolygonWalk = pAllPolygons;
-			do
-			{
-				int iSurroundedCount = 0;
-				GeneratePolyhedronFromPlanes_LineLL *pDebugLineWalk = pDebugPolygonWalk->pPolygon->pLines;
-				GeneratePolyhedronFromPlanes_LineLL *pFirstDebugLine = pDebugLineWalk;
-
-				do
-				{
-					PolyhedronPointPlanarity currentPlanarity = pDebugLineWalk->pLine->pPoints[pDebugLineWalk->iReferenceIndex]->planarity;
-					
-					GeneratePolyhedronFromPlanes_LineLL *pNext = pDebugLineWalk->pNext;
-					PolyhedronPointPlanarity nextPlanarity = pNext->pLine->pPoints[pNext->iReferenceIndex]->planarity;
-
-					if( currentPlanarity < nextPlanarity )
-					{
-						GeneratePolyhedronFromPlanes_LineLL *pPrev = pDebugLineWalk->pPrev;
-						PolyhedronPointPlanarity prevPlanarity = pPrev->pLine->pPoints[pPrev->iReferenceIndex]->planarity;
-
-						if( currentPlanarity < prevPlanarity )
-						{
-							++iSurroundedCount;
-						}
-					}
-
-					pDebugLineWalk = pDebugLineWalk->pNext;
-				} while( pDebugLineWalk != pFirstDebugLine );
-
-				AssertMsg_DumpPolyhedron( iSurroundedCount <= 1, "Concave polygon, cutting process might break. Consider adjusting the on-plane epsilon to better compensate for floating point precision." );
-				pDebugPolygonWalk = pDebugPolygonWalk->pNext;
-			} while( pDebugPolygonWalk );
-		}
-#endif
-
-		//===================================================================================================
-		// Step 2: Remove dead lines. A dead line is one with a dead point that isn't connected to a living point
-		//===================================================================================================
-		{
-			GeneratePolyhedronFromPlanes_UnorderedLineLL *pActiveLineWalk = pAllLines;
-			do
-			{
-				GeneratePolyhedronFromPlanes_Line *pLine = pActiveLineWalk->pLine;
-				if( (pLine->bAlive == false) && (pLine->bCut == true) ) //not connected to a live point, but connected to a dead one. Dead line
-				{
-					//remove line from connected polygons
-					for( int i = 0; i != 2; ++i )
-					{
-						GeneratePolyhedronFromPlanes_Polygon *pPolygon = pLine->pPolygons[i];
-						GeneratePolyhedronFromPlanes_LineLL *pLineLink = pLine->pPolygonLineLinks[i];
-                        
-						pPolygon->bMissingASide = true;
-
-						if( pLineLink->pNext == pLineLink )
-						{
-							//this was the last line of the polygon, it's dead
-							pPolygon->pLines = NULL;
-						}
-						else
-						{
-							//link around this line
-							pPolygon->pLines = pLineLink->pPrev; //Always have the polygon's head line be just before the gap in the polygon
-							pLineLink->pNext->pPrev = pLineLink->pPrev;
-							pLineLink->pPrev->pNext = pLineLink->pNext;
-						}
-
-						//move the line link to the dead list
-						pLineLink->pNext = pDeadLineLinkCollection;
-						pDeadLineLinkCollection = pLineLink;
-					}
-
-					//remove the line from connected points
-					for( int i = 0; i != 2; ++i )
-					{
-						GeneratePolyhedronFromPlanes_Point *pPoint = pLine->pPoints[i];
-						GeneratePolyhedronFromPlanes_LineLL *pLineLink = pLine->pPointLineLinks[i];
-						
-						if( pLineLink->pNext == pLineLink )
-						{					
-							//this is the last line
-							pPoint->pConnectedLines = NULL;
-							Assert( pPoint->planarity != POINT_ALIVE );
-							pPoint->planarity = POINT_DEAD; //in case it was merely POINT_ONPLANE before
-						}
-						else
-						{
-							//link around this line
-							pPoint->pConnectedLines = pLineLink->pNext; //in case pLineLink was the head line
-							pLineLink->pNext->pPrev = pLineLink->pPrev;
-							pLineLink->pPrev->pNext = pLineLink->pNext;
-						}
-
-						//move the line link to the dead list
-						pLineLink->pNext = pDeadLineLinkCollection;
-						pDeadLineLinkCollection = pLineLink;
-					}
-
-					//move the line to the dead list
-					{
-						//link past this node
-						if( pActiveLineWalk->pPrev )
-							pActiveLineWalk->pPrev->pNext = pActiveLineWalk->pNext;
-						else
-							pAllLines = pActiveLineWalk->pNext;
-
-						if( pActiveLineWalk->pNext )
-							pActiveLineWalk->pNext->pPrev = pActiveLineWalk->pPrev;
-
-						GeneratePolyhedronFromPlanes_UnorderedLineLL *pNextLineWalk = pActiveLineWalk->pNext;
-						
-						//add to the dead list
-						pActiveLineWalk->pNext = pDeadLineCollection;
-						pDeadLineCollection = pActiveLineWalk;
-						
-						//next
-						pActiveLineWalk = pNextLineWalk;
-					}
-				}
-				else
-				{
-					pActiveLineWalk = pActiveLineWalk->pNext;
-				}
-			} while( pActiveLineWalk );
-		}
-
-
-		//===================================================================================================
-		// Step 3: Remove dead polygons. A dead polygon has less than 2 lines.
-		//===================================================================================================
-		{
-			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pActivePolygonWalk = pAllPolygons;
-			do
-			{
-				GeneratePolyhedronFromPlanes_Polygon *pPolygon = pActivePolygonWalk->pPolygon;
-				GeneratePolyhedronFromPlanes_LineLL *pHeadLine = pPolygon->pLines;
-
-				bool bDead = (pHeadLine == NULL) || (pHeadLine->pNext == pHeadLine);
-				if( !bDead )
-				{
-					//there's a rare case where a polygon can be almost entirely coplanar with the cut, it comes purely out of the land of imprecision
-					bDead = true; //assume it's dead, and disprove
-
-					GeneratePolyhedronFromPlanes_LineLL *pTestLineWalk = pHeadLine;
-					do
-					{
-						if( pTestLineWalk->pLine->bAlive )
-						{
-							bDead = false;
-							break;
-						}
-							
-						pTestLineWalk = pTestLineWalk->pNext;
-					} while( pTestLineWalk != pHeadLine );
-				}
-
-				if( bDead )
-				{
-					//dead polygon, move it to the dead list
-
-					//link around this node
-					if( pActivePolygonWalk->pPrev )
-						pActivePolygonWalk->pPrev->pNext = pActivePolygonWalk->pNext;
-					else
-						pAllPolygons = pAllPolygons->pNext; //pActivePolygonWalk was the head node
-
-					if( pActivePolygonWalk->pNext )
-						pActivePolygonWalk->pNext->pPrev = pActivePolygonWalk->pPrev;
-
-					GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pNextPolygonWalk = pActivePolygonWalk->pNext;
-
-					//add to the dead list
-					pActivePolygonWalk->pNext = pDeadPolygonCollection;
-					pDeadPolygonCollection = pActivePolygonWalk;
-
-					//next
-					pActivePolygonWalk = pNextPolygonWalk;
-				}
-				else
-				{
-					AssertMsg_DumpPolyhedron( (pActivePolygonWalk->pPolygon->pLines != NULL) && 
-						(pActivePolygonWalk->pPolygon->pLines != pActivePolygonWalk->pPolygon->pLines->pNext), "Living polygon with less than 2 lines" );
-					
-					pActivePolygonWalk = pActivePolygonWalk->pNext;
-				}
-			} while( pActivePolygonWalk );
-		}
-
-		//===================================================================================================
-		// Step 4: Remove dead points.
-		//===================================================================================================
-		{
-			GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
-			do
-			{
-				if( pActivePointWalk->pPoint->planarity == POINT_DEAD )
-				{
-					GeneratePolyhedronFromPlanes_UnorderedPointLL *pNext = pActivePointWalk->pNext;
-
-					if( pActivePointWalk->pPrev )
-						pActivePointWalk->pPrev->pNext = pActivePointWalk->pNext;
-					else
-						pAllPoints = pAllPoints->pNext;
-
-					if( pActivePointWalk->pNext )
-						pActivePointWalk->pNext->pPrev = pActivePointWalk->pPrev;
-
-					pActivePointWalk->pNext = pDeadPointCollection;
-					pDeadPointCollection = pActivePointWalk;
-
-					pActivePointWalk = pNext;
-				}
-				else
-				{
-					pActivePointWalk = pActivePointWalk->pNext;
-				}				
-			} while( pActivePointWalk );
-		}
-
-
-		//===================================================================================================
-		// Step 5: Handle cut lines
-		//===================================================================================================
-		{
-			GeneratePolyhedronFromPlanes_UnorderedLineLL *pActiveLineWalk = pAllLines;
-			do
-			{
-				GeneratePolyhedronFromPlanes_Line *pWorkLine = pActiveLineWalk->pLine;
-				Assert_DumpPolyhedron( (pWorkLine->bAlive == true) || (pWorkLine->bCut == false) ); //all dead lines should have already been removed
-				
-				if( pWorkLine->bCut )
-				{
-					GeneratePolyhedronFromPlanes_Point **pLinePoints = pWorkLine->pPoints;
-
-					Assert_DumpPolyhedron( (pLinePoints[0]->planarity == POINT_DEAD) || (pLinePoints[1]->planarity == POINT_DEAD) ); //one of the two has to be a dead point
-
-					int iDeadIndex = (pLinePoints[0]->planarity == POINT_DEAD)?(0):(1);
-					int iLivingIndex = 1 - iDeadIndex;
-					GeneratePolyhedronFromPlanes_Point *pDeadPoint = pLinePoints[iDeadIndex];
-					GeneratePolyhedronFromPlanes_Point *pLivingPoint = pLinePoints[iLivingIndex];
-
-					Assert_DumpPolyhedron( pLivingPoint->planarity == POINT_ALIVE ); //if this point were on-plane or dead, the line should be dead
-
-					//We'll be de-linking from the old point and generating a new one. We do this so other lines can still access the dead point's untouched data.
-					
-					//Generate a new point
-					GeneratePolyhedronFromPlanes_Point *pNewPoint = (GeneratePolyhedronFromPlanes_Point *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_Point ) );
-					{
-						//add this point to the active list
-						pAllPoints->pPrev = (GeneratePolyhedronFromPlanes_UnorderedPointLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_UnorderedPointLL ) );
-						pAllPoints->pPrev->pNext = pAllPoints;
-						pAllPoints = pAllPoints->pPrev;
-						pAllPoints->pPrev = NULL;
-						pAllPoints->pPoint = pNewPoint;
-
-
-						float fInvTotalDist = 1.0f/(pDeadPoint->fPlaneDist - pLivingPoint->fPlaneDist); //subtraction because the living index is known to be negative
-						pNewPoint->ptPosition = (pLivingPoint->ptPosition * (pDeadPoint->fPlaneDist * fInvTotalDist)) - (pDeadPoint->ptPosition * (pLivingPoint->fPlaneDist * fInvTotalDist));
-
-#if ( 0 && defined( _DEBUG ) )
-						float fDebugDist = vNormal.Dot( pNewPoint->ptPosition ) - fPlaneDist; //just for looking at in watch windows
-						AssertMsg_DumpPolyhedron( fabs( fDebugDist ) < fOnPlaneEpsilon, "Generated split point is far from plane" );
-
-						//verify that the new point isn't sitting on top of another
-						{
-							GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
-							do
-							{
-								if( pActivePointWalk->pPoint != pNewPoint )
-								{
-									Vector vDiff = pActivePointWalk->pPoint->ptPosition - pNewPoint->ptPosition;
-
-									AssertMsg_DumpPolyhedron( vDiff.Length() > fOnPlaneEpsilon, "Generated a point on top of another" );
-								}
-								pActivePointWalk = pActivePointWalk->pNext;
-							} while( pActivePointWalk );
-						}
-#endif
-
-						pNewPoint->planarity = POINT_ONPLANE;
-						pNewPoint->fPlaneDist = 0.0f;
-					}
-					
-					GeneratePolyhedronFromPlanes_LineLL *pNewLineLink = pNewPoint->pConnectedLines = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
-					pNewLineLink->pLine = pWorkLine;
-					pNewLineLink->pNext = pNewLineLink;
-					pNewLineLink->pPrev = pNewLineLink;
-					pNewLineLink->iReferenceIndex = iLivingIndex;
-
-					pWorkLine->pPoints[iDeadIndex] = pNewPoint;
-					pWorkLine->pPointLineLinks[iDeadIndex] = pNewLineLink;
-					pNewPoint->pConnectedLines = pNewLineLink;
-
-					//A new line is needed on each polygon touching the dead point to connect the two new endpoints for split lines. 
-					// So mark connected polygons as missing a side.
-					for( int i = 0; i != 2; ++i )
-						pWorkLine->pPolygons[i]->bMissingASide = true;
-					
-
-					//Always have a cut polygon's head line be just before the gap in the polygon. 
-					// In this case, we know that one of the two polygons goes clockwise into the dead point, so have that polygon point at this line.
-					// We don't know enough about the other polygon to do anything here, but another cut line will handle that polygon. So it all works out in the end.
-					pWorkLine->pPolygons[iDeadIndex]->pLines = pWorkLine->pPolygonLineLinks[iDeadIndex];
-				}
-
-				pActiveLineWalk = pActiveLineWalk->pNext;
-			} while( pActiveLineWalk );
-		}
-
-
-		//===================================================================================================
-		// Step 6: Repair polygons that are missing a side. And generate the new coplanar polygon.
-		//===================================================================================================
-		{
-			//Find the first polygon missing a side.
-			// We'll then walk from polygon to polygon using line connections so that we can generate the new polygon in a clockwise manner.
-			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pActivePolygonWalk = pAllPolygons;
-
-			while( (pActivePolygonWalk != NULL) && (pActivePolygonWalk->pPolygon->bMissingASide == false) )
-			{
-				pActivePolygonWalk = pActivePolygonWalk->pNext;
-			}
-
-			//acquire iteration data
-#ifndef _DEBUG
-			GeneratePolyhedronFromPlanes_Point *pStartPoint;
-			GeneratePolyhedronFromPlanes_Point *pWorkPoint;
-#endif
-
-			GeneratePolyhedronFromPlanes_LineLL *pLastLineLink;
-			GeneratePolyhedronFromPlanes_Polygon *pWorkPolygon;			
-			GeneratePolyhedronFromPlanes_LineLL *pTestLine;
-
-#ifdef _DEBUG
-			GeneratePolyhedronFromPlanes_Polygon *pLastWorkPolygon = NULL;
-			GeneratePolyhedronFromPlanes_Point *pLastWorkPoint = NULL;
-#endif
-
-			if( pActivePolygonWalk )
-			{
-				//grab the polygon we'll be starting with
-				GeneratePolyhedronFromPlanes_Polygon *pBrokenPolygon = pActivePolygonWalk->pPolygon;
-				
-				{
-					GeneratePolyhedronFromPlanes_LineLL *pTemp = pBrokenPolygon->pLines->pNext;
-					pStartPoint = pTemp->pLine->pPoints[1 - pTemp->iReferenceIndex];
-					Assert_DumpPolyhedron( pStartPoint->planarity == POINT_ONPLANE ); //every working point should be coplanar
-					pLastLineLink = pTemp->pLine->pPointLineLinks[1 - pTemp->iReferenceIndex]->pNext;
-					pWorkPolygon = pBrokenPolygon;
-				}
-
-				pWorkPoint = pStartPoint;
-				pTestLine = pLastLineLink->pPrev; //rotate counterclockwise around the point
-			}
-			else
-			{
-				//apparently the plane was entirely through existing polygonal borders, extremely rare but it can happen with inefficient cutting planes
-                GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
-				while( (pActivePointWalk != NULL) && (pActivePointWalk->pPoint->planarity != POINT_ONPLANE) )
-				{
-					pActivePointWalk = pActivePointWalk->pNext;
-				}
-
-				Assert( pActivePointWalk != NULL );
-
-				pStartPoint = pWorkPoint = pActivePointWalk->pPoint;
-				GeneratePolyhedronFromPlanes_LineLL *pLines = pWorkPoint->pConnectedLines;
-				
-				while( !pLines->pLine->bAlive ) //seek clockwise until we find a line not on the plane
-					pLines = pLines->pNext;
-
-				while( pLines->pLine->bAlive ) //now seek counterclockwise until we find a line on the plane (in case we started on an alive line last seek)
-					pLines = pLines->pPrev;
-
-				//now pLines points at one side of the polygon, with pActivePointWalk
-				pLastLineLink = pLines;
-				pTestLine = pLines->pPrev;
-				pWorkPolygon = pTestLine->pLine->pPolygons[1 - pTestLine->iReferenceIndex];
-
-			}
-
-			//create the new polygon
-			GeneratePolyhedronFromPlanes_Polygon *pNewPolygon = (GeneratePolyhedronFromPlanes_Polygon *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_Polygon ) );
-			{
-				//before we forget, add this polygon to the active list
-				pAllPolygons->pPrev = (GeneratePolyhedronFromPlanes_UnorderedPolygonLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_UnorderedPolygonLL ) );
-				pAllPolygons->pPrev->pNext = pAllPolygons;
-				pAllPolygons = pAllPolygons->pPrev;
-				pAllPolygons->pPrev = NULL;
-				pAllPolygons->pPolygon = pNewPolygon;
-
-				pNewPolygon->bMissingASide = false; //technically missing all it's sides, but we're fixing it now
-				pNewPolygon->vSurfaceNormal = vNormal;
-				pNewPolygon->pLines = NULL;
-			}
-
-
-
-			//===================================================================================================================
-			// The general idea of the upcoming algorithm to put together a new polygon and patch broken polygons...
-			//	You have a point and a line the algorithm just jumped across.
-			//		1. Rotate through the point's line links one time counterclockwise (pPrev)
-			//		2. If the line is cut, then we make a new bridging line in the polygon between that line and the one counterclockwise to it. (pPrev)
-			//			If the line is on-plane. Skip the bridge line making, but set links to the new polygon as if we'd just created the bridge
-			//		3. Once we follow a line back to the point where we started, we should be all done.
-
-			do
-			{
-				if( pWorkPolygon->bMissingASide )
-				{
-					//during the cutting process we made sure that the head line link was going clockwise into the missing area
-					GeneratePolyhedronFromPlanes_LineLL *pGapLines[2];
-					pGapLines[1] = pTestLine->pLine->pPolygonLineLinks[pTestLine->iReferenceIndex]; //get the same line, but in the polygons linked list.
-					Assert_DumpPolyhedron( pGapLines[1]->pLine == pTestLine->pLine );
-					pGapLines[0] = pGapLines[1]->pPrev;
-
-					Assert_DumpPolyhedron( pWorkPolygon->bMissingASide );
-
-#ifdef _DEBUG
-					{
-						//ensure that the space between the gap lines is the only space where fixing is required
-						GeneratePolyhedronFromPlanes_LineLL *pDebugLineWalk = pGapLines[1]->pNext;
-						
-						while( pDebugLineWalk != pGapLines[0] )
-						{
-							Assert_DumpPolyhedron( pDebugLineWalk->pLine->bCut == false );
-							pDebugLineWalk = pDebugLineWalk->pNext;
-						}
-					}
-#endif
-
-					GeneratePolyhedronFromPlanes_Line *pJoinLine = (GeneratePolyhedronFromPlanes_Line *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_Line ) );
-					{
-						//before we forget, add this line to the active list
-						pAllLines->pPrev = (GeneratePolyhedronFromPlanes_UnorderedLineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_UnorderedLineLL ) );
-						pAllLines->pPrev->pNext = pAllLines;
-						pAllLines = pAllLines->pPrev;
-						pAllLines->pPrev = NULL;
-						pAllLines->pLine = pJoinLine;
-
-						pJoinLine->bAlive = false;
-						pJoinLine->bCut = false;
-					}
-
-
-					pJoinLine->pPoints[0] = pGapLines[0]->pLine->pPoints[pGapLines[0]->iReferenceIndex];
-					pJoinLine->pPoints[1] = pGapLines[1]->pLine->pPoints[1 - pGapLines[1]->iReferenceIndex];
-
-					pJoinLine->pPolygons[0] = pNewPolygon;
-					pJoinLine->pPolygons[1] = pWorkPolygon;
-
-					//now create all 4 links into the line
-					GeneratePolyhedronFromPlanes_LineLL *pPointLinks[2];
-					pPointLinks[0] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
-					pPointLinks[1] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
-
-					GeneratePolyhedronFromPlanes_LineLL *pPolygonLinks[2];
-					pPolygonLinks[0] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
-					pPolygonLinks[1] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
-
-					pPointLinks[0]->pLine = pPointLinks[1]->pLine = pPolygonLinks[0]->pLine = pPolygonLinks[1]->pLine = pJoinLine;
-
-					pJoinLine->pPointLineLinks[0] = pPointLinks[0];
-					pJoinLine->pPointLineLinks[1] = pPointLinks[1];
-					pJoinLine->pPolygonLineLinks[0] = pPolygonLinks[0];
-					pJoinLine->pPolygonLineLinks[1] = pPolygonLinks[1];
-
-
-
-					pPointLinks[0]->iReferenceIndex = 1;
-					pPointLinks[1]->iReferenceIndex = 0;
-
-					//Insert before the link from point 0 to gap line 0 (counterclockwise rotation)
-					{
-						GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pGapLines[0]->pLine->pPointLineLinks[pGapLines[0]->iReferenceIndex];
-						Assert_DumpPolyhedron( pWorkLink->pLine == pGapLines[0]->pLine );
-
-						pPointLinks[0]->pPrev = pWorkLink->pPrev;
-						pPointLinks[0]->pNext = pWorkLink;
-
-						pWorkLink->pPrev->pNext = pPointLinks[0];
-						pWorkLink->pPrev = pPointLinks[0];						
-					}
-
-					//Insert after the link from point 1 to gap line 1 (clockwise rotation)
-					{
-						GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pGapLines[1]->pLine->pPointLineLinks[1 - pGapLines[1]->iReferenceIndex];
-						Assert_DumpPolyhedron( pWorkLink->pLine == pGapLines[1]->pLine );
-
-						pPointLinks[1]->pNext = pWorkLink->pNext;
-						pPointLinks[1]->pPrev = pWorkLink;
-						
-						pWorkLink->pNext->pPrev = pPointLinks[1];
-						pWorkLink->pNext = pPointLinks[1];						
-					}
-
-
-
-
-					pPolygonLinks[0]->iReferenceIndex = 0;
-					pPolygonLinks[1]->iReferenceIndex = 1;
-
-					//Insert before the head line in the new polygon (at the end of the clockwise order)
-					{
-						if( pNewPolygon->pLines == NULL )
-						{
-							//this is the first line being added to the polygon
-							pNewPolygon->pLines = pPolygonLinks[0];
-							pPolygonLinks[0]->pNext = pPolygonLinks[0];
-							pPolygonLinks[0]->pPrev = pPolygonLinks[0];
-						}
-						else
-						{
-							GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pNewPolygon->pLines;
-
-							pPolygonLinks[0]->pNext = pWorkLink;
-							pPolygonLinks[0]->pPrev = pWorkLink->pPrev;
-
-							pWorkLink->pPrev->pNext = pPolygonLinks[0];
-							pWorkLink->pPrev = pPolygonLinks[0];
-						}
-					}
-
-					//Insert after the head line in the work polygon
-					{
-						GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pWorkPolygon->pLines;
-
-						pPolygonLinks[1]->pNext = pWorkLink->pNext;
-						pPolygonLinks[1]->pPrev = pWorkLink;
-
-						pWorkLink->pNext->pPrev = pPolygonLinks[1];
-						pWorkLink->pNext = pPolygonLinks[1];
-					}
-
-					pWorkPolygon->bMissingASide = false; //repairs are finished
-
-#ifdef _DEBUG
-					pLastWorkPolygon = pWorkPolygon;
-					pLastWorkPoint = pWorkPoint;
-#endif
-					//move to the next point
-					pWorkPoint = pJoinLine->pPoints[0];
-					pLastLineLink = pJoinLine->pPointLineLinks[0];
-					Assert_DumpPolyhedron( pWorkPoint->planarity == POINT_ONPLANE ); //every working point should be coplanar
-					
-					pTestLine = pLastLineLink->pPrev;
-					if( pTestLine->pLine->pPoints[pTestLine->iReferenceIndex]->planarity == POINT_ALIVE )
-						pWorkPolygon = pTestLine->pLine->pPolygons[pTestLine->iReferenceIndex];
-					else
-						pWorkPolygon = pTestLine->pLine->pPolygons[1 - pTestLine->iReferenceIndex];
-					
-					Assert_DumpPolyhedron( pWorkPolygon != pLastWorkPolygon );
-					Assert_DumpPolyhedron( (pWorkPoint == pStartPoint) ||
-											(pGapLines[0]->pLine->bCut == false) || 
-											(pWorkPolygon->bMissingASide == true) ); //if we're not done fixing, and if the shared line was cut, the next polygon must be missing a side
-				}
-				else
-				{
-					//line is on the plane, meaning the polygon isn't broken and doesn't need patching
-					Assert_DumpPolyhedron( pTestLine->pLine->bCut == false );
-					Assert_DumpPolyhedron( (pTestLine->pLine->pPoints[0]->planarity == POINT_ONPLANE) && (pTestLine->pLine->pPoints[1]->planarity == POINT_ONPLANE) );
-
-					
-					//link to this line from the new polygon
-					GeneratePolyhedronFromPlanes_LineLL *pNewLineLink;
-					pNewLineLink = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
-					
-					pNewLineLink->pLine = pTestLine->pLine;
-					pNewLineLink->iReferenceIndex = pTestLine->iReferenceIndex;
-
-					//Insert before the head line in the new polygon (at the end of the clockwise order)
-					{
-						if( pNewPolygon->pLines == NULL )
-						{
-							//this is the first line being added to the polygon
-							pNewPolygon->pLines = pNewLineLink;
-							pNewLineLink->pNext = pNewLineLink;
-							pNewLineLink->pPrev = pNewLineLink;
-						}
-						else
-						{
-							GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pNewPolygon->pLines;
-
-							pNewLineLink->pNext = pWorkLink;
-							pNewLineLink->pPrev = pWorkLink->pPrev;
-
-							pWorkLink->pPrev->pNext = pNewLineLink;
-							pWorkLink->pPrev = pNewLineLink;
-						}
-					}
-
-					//Since the entire line is on the plane, that means it used to point to something that used to reside where the new polygon is going
-					// Update the link to the new the polygon pointer and be on our way
-					pTestLine->pLine->pPolygons[pTestLine->iReferenceIndex] = pNewPolygon;
-					pTestLine->pLine->pPolygonLineLinks[pTestLine->iReferenceIndex] = pNewLineLink;
-
-#ifdef _DEBUG
-					pLastWorkPolygon = pWorkPolygon;
-					pLastWorkPoint = pWorkPoint;
-#endif
-
-					pWorkPoint = pTestLine->pLine->pPoints[pTestLine->iReferenceIndex];
-					pLastLineLink = pTestLine->pLine->pPointLineLinks[pTestLine->iReferenceIndex];
-					Assert_DumpPolyhedron( pWorkPoint->planarity == POINT_ONPLANE ); //every working point should be coplanar
-
-					pTestLine = pLastLineLink->pPrev;
-					if( pTestLine->pLine->pPoints[pTestLine->iReferenceIndex]->planarity == POINT_ALIVE )
-						pWorkPolygon = pTestLine->pLine->pPolygons[pTestLine->iReferenceIndex];
-					else
-						pWorkPolygon = pTestLine->pLine->pPolygons[1 - pTestLine->iReferenceIndex];
-
-					Assert_DumpPolyhedron( pWorkPolygon != pLastWorkPolygon );
-				}
-			} while( pWorkPoint != pStartPoint );
-		}
-
-#ifdef _DEBUG
-		//verify that repairs are complete
-		{
-			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pDebugPolygonWalk = pAllPolygons;
-			do
-			{
-				AssertMsg_DumpPolyhedron( pDebugPolygonWalk->pPolygon->bMissingASide == false, "Some polygons not repaired after cut" );
-				pDebugPolygonWalk = pDebugPolygonWalk->pNext;
-			} while( pDebugPolygonWalk );
-
-
-			GeneratePolyhedronFromPlanes_UnorderedPointLL *pDebugPointWalk = pAllPoints;
-			do
-			{
-				AssertMsg_DumpPolyhedron( pDebugPointWalk->pPoint->pConnectedLines, "Point connected to no lines after cut" );
-				pDebugPointWalk = pDebugPointWalk->pNext;
-			} while( pDebugPointWalk );
-
-			pStartPoint = NULL;
-		}
-
-		//maintain the cut history
-		DebugCutHistory.AddToTail( ConvertLinkedGeometryToPolyhedron( pAllPolygons, pAllLines, pAllPoints, false ) );
-#endif
-	}
-
-#ifdef _DEBUG
-	for( int i = DebugCutHistory.Count(); --i >= 0; )
-	{
-		if( DebugCutHistory[i] )
-			DebugCutHistory[i]->Release();
-	}
-	DebugCutHistory.RemoveAll();
-#endif
-
-	return ConvertLinkedGeometryToPolyhedron( pAllPolygons, pAllLines, pAllPoints, bUseTemporaryMemory );
-}
-
-
-
-#define STARTPOINTTOLINELINKS(iPointNum, lineindex1, iOtherPointIndex1, lineindex2, iOtherPointIndex2, lineindex3, iOtherPointIndex3 )\
-	StartingBoxPoints[iPointNum].pConnectedLines = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].pLine = &StartingBoxLines[lineindex1];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].iReferenceIndex = iOtherPointIndex1;\
-	StartingBoxLines[lineindex1].pPointLineLinks[1 - iOtherPointIndex1] = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].pPrev = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 2];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].pNext = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 1];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].pLine = &StartingBoxLines[lineindex2];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].iReferenceIndex = iOtherPointIndex2;\
-	StartingBoxLines[lineindex2].pPointLineLinks[1 - iOtherPointIndex2] = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 1];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].pPrev = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].pNext = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 2];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].pLine = &StartingBoxLines[lineindex3];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].iReferenceIndex = iOtherPointIndex3;\
-	StartingBoxLines[lineindex3].pPointLineLinks[1 - iOtherPointIndex3] = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 2];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].pPrev = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 1];\
-	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].pNext = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];
-
-#define STARTBOXCONNECTION( linenum, point1, point2, poly1, poly2 )\
-	StartingBoxLines[linenum].pPoints[0] = &StartingBoxPoints[point1];\
-	StartingBoxLines[linenum].pPoints[1] = &StartingBoxPoints[point2];\
-	StartingBoxLines[linenum].pPolygons[0] = &StartingBoxPolygons[poly1];\
-	StartingBoxLines[linenum].pPolygons[1] = &StartingBoxPolygons[poly2];
-
-#define STARTPOLYGONTOLINELINKS( polynum, lineindex1, iThisPolyIndex1, lineindex2, iThisPolyIndex2, lineindex3, iThisPolyIndex3, lineindex4, iThisPolyIndex4 )\
-	StartingBoxPolygons[polynum].pLines = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].pLine = &StartingBoxLines[lineindex1];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].iReferenceIndex = iThisPolyIndex1;\
-	StartingBoxLines[lineindex1].pPolygonLineLinks[iThisPolyIndex1] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 3];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 1];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].pLine = &StartingBoxLines[lineindex2];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].iReferenceIndex = iThisPolyIndex2;\
-	StartingBoxLines[lineindex2].pPolygonLineLinks[iThisPolyIndex2] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 1];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 2];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].pLine = &StartingBoxLines[lineindex3];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].iReferenceIndex = iThisPolyIndex3;\
-	StartingBoxLines[lineindex3].pPolygonLineLinks[iThisPolyIndex3] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 2];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 1];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 3];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].pLine = &StartingBoxLines[lineindex4];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].iReferenceIndex = iThisPolyIndex4;\
-	StartingBoxLines[lineindex4].pPolygonLineLinks[iThisPolyIndex4] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 3];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 2];\
-	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];
-
-
-CPolyhedron *GeneratePolyhedronFromPlanes( const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory )
-{
-	//this is version 2 of the polyhedron generator, version 1 made individual polygons and joined points together, some guesswork is involved and it therefore isn't a solid method
-	//this version will start with a cube and hack away at it (retaining point connection information) to produce a polyhedron with no guesswork involved, this method should be rock solid
-	
-	//the polygon clipping functions we're going to use want inward facing planes
-	float *pFlippedPlanes = (float *)stackalloc( (iPlaneCount * 4) * sizeof( float ) );
-	for( int i = 0; i != iPlaneCount * 4; ++i )
-	{
-		pFlippedPlanes[i] = -pOutwardFacingPlanes[i];
-	}
-
-	//our first goal is to find the size of a cube big enough to encapsulate all points that will be in the final polyhedron
-	Vector vAABBMins, vAABBMaxs;
-	if( FindConvexShapeLooseAABB( pFlippedPlanes, iPlaneCount, &vAABBMins, &vAABBMaxs ) == false )
-		return NULL; //no shape to work with apparently
-
-	
-	//grow the bounding box to a larger size since it's probably inaccurate a bit
-	{
-		Vector vGrow = (vAABBMaxs - vAABBMins) * 0.5f;
-		vGrow.x += 100.0f;
-		vGrow.y += 100.0f;
-		vGrow.z += 100.0f;
-
-		vAABBMaxs += vGrow;
-		vAABBMins -= vGrow;
-	}
-
-	//generate our starting cube using the 2x AABB so we can start hacking away at it
-	
-	
-
-	//create our starting box on the stack
-	GeneratePolyhedronFromPlanes_Point StartingBoxPoints[8];
-	GeneratePolyhedronFromPlanes_Line StartingBoxLines[12];
-	GeneratePolyhedronFromPlanes_Polygon StartingBoxPolygons[6];
-	GeneratePolyhedronFromPlanes_LineLL StartingPoints_To_Lines_Links[24]; //8 points, 3 lines per point
-	GeneratePolyhedronFromPlanes_LineLL StartingPolygon_To_Lines_Links[24]; //6 polygons, 4 lines per poly
-	
-	GeneratePolyhedronFromPlanes_UnorderedPolygonLL StartingPolygonList[6]; //6 polygons
-	GeneratePolyhedronFromPlanes_UnorderedLineLL StartingLineList[12]; //12 lines
-	GeneratePolyhedronFromPlanes_UnorderedPointLL StartingPointList[8]; //8 points
-
-
-	//I had to work all this out on a whiteboard if it seems completely unintuitive.
-	{
-		StartingBoxPoints[0].ptPosition.Init( vAABBMins.x, vAABBMins.y, vAABBMins.z );
-		STARTPOINTTOLINELINKS( 0, 0, 1, 4, 1, 3, 0 );
-
-		StartingBoxPoints[1].ptPosition.Init( vAABBMins.x, vAABBMaxs.y, vAABBMins.z );
-		STARTPOINTTOLINELINKS( 1, 0, 0, 1, 1, 5, 1 );
-
-		StartingBoxPoints[2].ptPosition.Init( vAABBMins.x, vAABBMins.y, vAABBMaxs.z );
-		STARTPOINTTOLINELINKS( 2, 4, 0, 8, 1, 11, 0 );
-
-		StartingBoxPoints[3].ptPosition.Init( vAABBMins.x, vAABBMaxs.y, vAABBMaxs.z );
-		STARTPOINTTOLINELINKS( 3, 5, 0, 9, 1, 8, 0 );
-
-		StartingBoxPoints[4].ptPosition.Init( vAABBMaxs.x, vAABBMins.y, vAABBMins.z );
-		STARTPOINTTOLINELINKS( 4, 2, 0, 3, 1, 7, 1 );
-
-		StartingBoxPoints[5].ptPosition.Init( vAABBMaxs.x, vAABBMaxs.y, vAABBMins.z );
-		STARTPOINTTOLINELINKS( 5, 1, 0, 2, 1, 6, 1 );
-
-		StartingBoxPoints[6].ptPosition.Init( vAABBMaxs.x, vAABBMins.y, vAABBMaxs.z );
-		STARTPOINTTOLINELINKS( 6, 7, 0, 11, 1, 10, 0 );
-
-		StartingBoxPoints[7].ptPosition.Init( vAABBMaxs.x, vAABBMaxs.y, vAABBMaxs.z );
-		STARTPOINTTOLINELINKS( 7, 6, 0, 10, 1, 9, 0 );
-
-		STARTBOXCONNECTION( 0, 0, 1, 0, 5 );
-		STARTBOXCONNECTION( 1, 1, 5, 1, 5 );
-		STARTBOXCONNECTION( 2, 5, 4, 2, 5 );
-		STARTBOXCONNECTION( 3, 4, 0, 3, 5 );
-		STARTBOXCONNECTION( 4, 0, 2, 3, 0 );
-		STARTBOXCONNECTION( 5, 1, 3, 0, 1 );
-		STARTBOXCONNECTION( 6, 5, 7, 1, 2 );
-		STARTBOXCONNECTION( 7, 4, 6, 2, 3 );
-		STARTBOXCONNECTION( 8, 2, 3, 4, 0 );
-		STARTBOXCONNECTION( 9, 3, 7, 4, 1 );
-		STARTBOXCONNECTION( 10, 7, 6, 4, 2 );
-		STARTBOXCONNECTION( 11, 6, 2, 4, 3 );
-
-
-		STARTBOXCONNECTION( 0, 0, 1, 5, 0 );
-		STARTBOXCONNECTION( 1, 1, 5, 5, 1 );
-		STARTBOXCONNECTION( 2, 5, 4, 5, 2 );
-		STARTBOXCONNECTION( 3, 4, 0, 5, 3 );
-		STARTBOXCONNECTION( 4, 0, 2, 0, 3 );
-		STARTBOXCONNECTION( 5, 1, 3, 1, 0 );
-		STARTBOXCONNECTION( 6, 5, 7, 2, 1 );
-		STARTBOXCONNECTION( 7, 4, 6, 3, 2 );
-		STARTBOXCONNECTION( 8, 2, 3, 0, 4 );
-		STARTBOXCONNECTION( 9, 3, 7, 1, 4 );
-		STARTBOXCONNECTION( 10, 7, 6, 2, 4 );
-		STARTBOXCONNECTION( 11, 6, 2, 3, 4 );
-
-		StartingBoxPolygons[0].vSurfaceNormal.Init( -1.0f, 0.0f, 0.0f );
-		StartingBoxPolygons[1].vSurfaceNormal.Init( 0.0f, 1.0f, 0.0f );
-		StartingBoxPolygons[2].vSurfaceNormal.Init( 1.0f, 0.0f, 0.0f );
-		StartingBoxPolygons[3].vSurfaceNormal.Init( 0.0f, -1.0f, 0.0f );
-		StartingBoxPolygons[4].vSurfaceNormal.Init( 0.0f, 0.0f, 1.0f );
-		StartingBoxPolygons[5].vSurfaceNormal.Init( 0.0f, 0.0f, -1.0f );
-
-
-		STARTPOLYGONTOLINELINKS( 0, 0, 1, 5, 1, 8, 0, 4, 0 );
-		STARTPOLYGONTOLINELINKS( 1, 1, 1, 6, 1, 9, 0, 5, 0 );
-		STARTPOLYGONTOLINELINKS( 2, 2, 1, 7, 1, 10, 0, 6, 0 );
-		STARTPOLYGONTOLINELINKS( 3, 3, 1, 4, 1, 11, 0, 7, 0 );
-		STARTPOLYGONTOLINELINKS( 4, 8, 1, 9, 1, 10, 1, 11, 1 );
-		STARTPOLYGONTOLINELINKS( 5, 0, 0, 3, 0, 2, 0, 1, 0 );
-
-
-		{
-			StartingPolygonList[0].pPolygon = &StartingBoxPolygons[0];
-			StartingPolygonList[0].pNext = &StartingPolygonList[1];
-			StartingPolygonList[0].pPrev = NULL;
-
-			StartingPolygonList[1].pPolygon = &StartingBoxPolygons[1];
-			StartingPolygonList[1].pNext = &StartingPolygonList[2];
-			StartingPolygonList[1].pPrev = &StartingPolygonList[0];
-
-			StartingPolygonList[2].pPolygon = &StartingBoxPolygons[2];
-			StartingPolygonList[2].pNext = &StartingPolygonList[3];
-			StartingPolygonList[2].pPrev = &StartingPolygonList[1];
-
-			StartingPolygonList[3].pPolygon = &StartingBoxPolygons[3];
-			StartingPolygonList[3].pNext = &StartingPolygonList[4];
-			StartingPolygonList[3].pPrev = &StartingPolygonList[2];
-
-			StartingPolygonList[4].pPolygon = &StartingBoxPolygons[4];
-			StartingPolygonList[4].pNext = &StartingPolygonList[5];
-			StartingPolygonList[4].pPrev = &StartingPolygonList[3];
-
-			StartingPolygonList[5].pPolygon = &StartingBoxPolygons[5];
-			StartingPolygonList[5].pNext = NULL;
-			StartingPolygonList[5].pPrev = &StartingPolygonList[4];
-		}
-
-
-
-		{
-			StartingLineList[0].pLine = &StartingBoxLines[0];
-			StartingLineList[0].pNext = &StartingLineList[1];
-			StartingLineList[0].pPrev = NULL;
-
-			StartingLineList[1].pLine = &StartingBoxLines[1];
-			StartingLineList[1].pNext = &StartingLineList[2];
-			StartingLineList[1].pPrev = &StartingLineList[0];
-
-			StartingLineList[2].pLine = &StartingBoxLines[2];
-			StartingLineList[2].pNext = &StartingLineList[3];
-			StartingLineList[2].pPrev = &StartingLineList[1];
-
-			StartingLineList[3].pLine = &StartingBoxLines[3];
-			StartingLineList[3].pNext = &StartingLineList[4];
-			StartingLineList[3].pPrev = &StartingLineList[2];
-
-			StartingLineList[4].pLine = &StartingBoxLines[4];
-			StartingLineList[4].pNext = &StartingLineList[5];
-			StartingLineList[4].pPrev = &StartingLineList[3];
-
-			StartingLineList[5].pLine = &StartingBoxLines[5];
-			StartingLineList[5].pNext = &StartingLineList[6];
-			StartingLineList[5].pPrev = &StartingLineList[4];
-
-			StartingLineList[6].pLine = &StartingBoxLines[6];
-			StartingLineList[6].pNext = &StartingLineList[7];
-			StartingLineList[6].pPrev = &StartingLineList[5];
-
-			StartingLineList[7].pLine = &StartingBoxLines[7];
-			StartingLineList[7].pNext = &StartingLineList[8];
-			StartingLineList[7].pPrev = &StartingLineList[6];
-
-			StartingLineList[8].pLine = &StartingBoxLines[8];
-			StartingLineList[8].pNext = &StartingLineList[9];
-			StartingLineList[8].pPrev = &StartingLineList[7];
-
-			StartingLineList[9].pLine = &StartingBoxLines[9];
-			StartingLineList[9].pNext = &StartingLineList[10];
-			StartingLineList[9].pPrev = &StartingLineList[8];
-
-			StartingLineList[10].pLine = &StartingBoxLines[10];
-			StartingLineList[10].pNext = &StartingLineList[11];
-			StartingLineList[10].pPrev = &StartingLineList[9];
-
-			StartingLineList[11].pLine = &StartingBoxLines[11];
-			StartingLineList[11].pNext = NULL;
-			StartingLineList[11].pPrev = &StartingLineList[10];
-		}
-
-		{
-			StartingPointList[0].pPoint = &StartingBoxPoints[0];
-			StartingPointList[0].pNext = &StartingPointList[1];
-			StartingPointList[0].pPrev = NULL;
-
-			StartingPointList[1].pPoint = &StartingBoxPoints[1];
-			StartingPointList[1].pNext = &StartingPointList[2];
-			StartingPointList[1].pPrev = &StartingPointList[0];
-
-			StartingPointList[2].pPoint = &StartingBoxPoints[2];
-			StartingPointList[2].pNext = &StartingPointList[3];
-			StartingPointList[2].pPrev = &StartingPointList[1];
-
-			StartingPointList[3].pPoint = &StartingBoxPoints[3];
-			StartingPointList[3].pNext = &StartingPointList[4];
-			StartingPointList[3].pPrev = &StartingPointList[2];
-
-			StartingPointList[4].pPoint = &StartingBoxPoints[4];
-			StartingPointList[4].pNext = &StartingPointList[5];
-			StartingPointList[4].pPrev = &StartingPointList[3];
-
-			StartingPointList[5].pPoint = &StartingBoxPoints[5];
-			StartingPointList[5].pNext = &StartingPointList[6];
-			StartingPointList[5].pPrev = &StartingPointList[4];
-
-			StartingPointList[6].pPoint = &StartingBoxPoints[6];
-			StartingPointList[6].pNext = &StartingPointList[7];
-			StartingPointList[6].pPrev = &StartingPointList[5];
-
-			StartingPointList[7].pPoint = &StartingBoxPoints[7];
-			StartingPointList[7].pNext = NULL;
-			StartingPointList[7].pPrev = &StartingPointList[6];
-		}
-	}
-
-	return ClipLinkedGeometry( StartingPolygonList, StartingLineList, StartingPointList, pOutwardFacingPlanes, iPlaneCount, fOnPlaneEpsilon, bUseTemporaryMemory );
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#ifdef _DEBUG
-void DumpAABBToGLView( const Vector &vCenter, const Vector &vExtents, const Vector &vColor, FILE *pFile )
-{
-#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
-	Vector vMins = vCenter - vExtents;
-	Vector vMaxs = vCenter + vExtents;
-
-	//x min side
-	fprintf( pFile, "4\n" );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-	fprintf( pFile, "4\n" );	
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-	//x max side
-	fprintf( pFile, "4\n" );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-	fprintf( pFile, "4\n" );	
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-
-	//y min side
-	fprintf( pFile, "4\n" );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-	fprintf( pFile, "4\n" );	
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-
-
-	//y max side
-	fprintf( pFile, "4\n" );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-	fprintf( pFile, "4\n" );	
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-
-
-	//z min side
-	fprintf( pFile, "4\n" );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-	fprintf( pFile, "4\n" );	
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
-
-
-	//z max side
-	fprintf( pFile, "4\n" );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-
-	fprintf( pFile, "4\n" );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
-#endif
-}
-
-void DumpLineToGLView( const Vector &vPoint1, const Vector &vColor1, const Vector &vPoint2, const Vector &vColor2, float fThickness, FILE *pFile )
-{
-#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
-	Vector vDirection = vPoint2 - vPoint1;
-	vDirection.NormalizeInPlace();
-
-	Vector vPseudoPerpandicular = vec3_origin;
-
-	if( vDirection.x != 0.0f )
-		vPseudoPerpandicular.z = 1.0f;
-	else
-		vPseudoPerpandicular.x = 1.0f;
-
-	Vector vWidth = vDirection.Cross( vPseudoPerpandicular );
-	vWidth.NormalizeInPlace();
-
-	Vector vHeight = vDirection.Cross( vWidth );
-	vHeight.NormalizeInPlace();
-
-	fThickness *= 0.5f; //we use half thickness in both directions
-	vDirection *= fThickness;
-	vWidth *= fThickness;
-	vHeight *= fThickness;
-
-	Vector vLinePoints[8];
-	vLinePoints[0] = vPoint1 - vDirection - vWidth - vHeight;
-	vLinePoints[1] = vPoint1 - vDirection - vWidth + vHeight;
-	vLinePoints[2] = vPoint1 - vDirection + vWidth - vHeight;
-	vLinePoints[3] = vPoint1 - vDirection + vWidth + vHeight;
-
-	vLinePoints[4] = vPoint2 + vDirection - vWidth - vHeight;
-	vLinePoints[5] = vPoint2 + vDirection - vWidth + vHeight;
-	vLinePoints[6] = vPoint2 + vDirection + vWidth - vHeight;
-	vLinePoints[7] = vPoint2 + vDirection + vWidth + vHeight;
-
-	const Vector *pLineColors[8] = { &vColor1, &vColor1, &vColor1, &vColor1, &vColor2, &vColor2, &vColor2, &vColor2 };
-
-
-#define DPTGLV_LINE_WRITEPOINT(index) fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vLinePoints[index].x, vLinePoints[index].y, vLinePoints[index].z, pLineColors[index]->x, pLineColors[index]->y, pLineColors[index]->z );
-#define DPTGLV_LINE_DOUBLESIDEDQUAD(index1,index2,index3,index4)\
-	fprintf( pFile, "4\n" );\
-	DPTGLV_LINE_WRITEPOINT(index1);\
-	DPTGLV_LINE_WRITEPOINT(index2);\
-	DPTGLV_LINE_WRITEPOINT(index3);\
-	DPTGLV_LINE_WRITEPOINT(index4);\
-	fprintf( pFile, "4\n" );\
-	DPTGLV_LINE_WRITEPOINT(index4);\
-	DPTGLV_LINE_WRITEPOINT(index3);\
-	DPTGLV_LINE_WRITEPOINT(index2);\
-	DPTGLV_LINE_WRITEPOINT(index1);
-
-
-	DPTGLV_LINE_DOUBLESIDEDQUAD(0,4,6,2);
-	DPTGLV_LINE_DOUBLESIDEDQUAD(3,7,5,1);
-	DPTGLV_LINE_DOUBLESIDEDQUAD(1,5,4,0);
-	DPTGLV_LINE_DOUBLESIDEDQUAD(2,6,7,3);
-	DPTGLV_LINE_DOUBLESIDEDQUAD(0,2,3,1);
-	DPTGLV_LINE_DOUBLESIDEDQUAD(5,7,6,4);
-#endif
-}
-
-void DumpPolyhedronToGLView( const CPolyhedron *pPolyhedron, const char *pFilename, const VMatrix *pTransform )
-{
-#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
-	if ( (pPolyhedron == NULL) || (pPolyhedron->iVertexCount == 0) )
-		return;
-
-	if( pTransform == NULL )
-		pTransform = &s_matIdentity;
-
-	printf("Writing %s...\n", pFilename );
-
-	FILE *pFile = fopen( pFilename, "ab" );
-
-	//randomizing an array of colors to help spot shared/unshared vertices
-	Vector *pColors = (Vector *)stackalloc( sizeof( Vector ) * pPolyhedron->iVertexCount );	
-	int counter;
-	for( counter = 0; counter != pPolyhedron->iVertexCount; ++counter )
-	{
-		pColors[counter].Init( rand()/32768.0f, rand()/32768.0f, rand()/32768.0f );
-	}
-
-	Vector *pTransformedPoints = (Vector *)stackalloc( pPolyhedron->iVertexCount * sizeof( Vector ) );
-	for ( counter = 0; counter != pPolyhedron->iVertexCount; ++counter )
-	{
-		pTransformedPoints[counter] = (*pTransform) * pPolyhedron->pVertices[counter];
-	}
-
-	for ( counter = 0; counter != pPolyhedron->iPolygonCount; ++counter )
-	{
-		fprintf( pFile, "%i\n", pPolyhedron->pPolygons[counter].iIndexCount );
-		int counter2;
-		for( counter2 = 0; counter2 != pPolyhedron->pPolygons[counter].iIndexCount; ++counter2 )
-		{
-			Polyhedron_IndexedLineReference_t *pLineReference = &pPolyhedron->pIndices[pPolyhedron->pPolygons[counter].iFirstIndex + counter2];
-
-			Vector *pVertex = &pTransformedPoints[pPolyhedron->pLines[pLineReference->iLineIndex].iPointIndices[pLineReference->iEndPointIndex]];
-			Vector *pColor = &pColors[pPolyhedron->pLines[pLineReference->iLineIndex].iPointIndices[pLineReference->iEndPointIndex]];
-			fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n",pVertex->x, pVertex->y, pVertex->z, pColor->x, pColor->y, pColor->z );
-		}
-	}
-
-	for( counter = 0; counter != pPolyhedron->iLineCount; ++counter )
-	{
-		const Vector vOne( 1.0f, 1.0f, 1.0f );
-		DumpLineToGLView( pTransformedPoints[pPolyhedron->pLines[counter].iPointIndices[0]], vOne - pColors[pPolyhedron->pLines[counter].iPointIndices[0]],
-							pTransformedPoints[pPolyhedron->pLines[counter].iPointIndices[1]], vOne - pColors[pPolyhedron->pLines[counter].iPointIndices[1]], 
-							0.1f, pFile );
-	}
-
-	for( counter = 0; counter != pPolyhedron->iVertexCount; ++counter )
-	{
-		const Vector vPointHalfSize(0.15f, 0.15f, 0.15f );
-		DumpAABBToGLView( pTransformedPoints[counter], vPointHalfSize, pColors[counter], pFile );
-	}
-
-	fclose( pFile );
-#endif
-}
-
-
-void DumpPlaneToGlView( const float *pPlane, float fGrayScale, const char *pszFileName, const VMatrix *pTransform )
-{
-#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
-	if( pTransform == NULL )
-		pTransform = &s_matIdentity;
-
-	FILE *pFile = fopen( pszFileName, "ab" );
-
-	//transform the plane
-	Vector vNormal = pTransform->ApplyRotation( *(Vector *)pPlane );
-	float fDist = pPlane[3] * vNormal.NormalizeInPlace(); //possible scaling going on
-	fDist += vNormal.Dot( pTransform->GetTranslation() );
-	
-	Vector vPlaneVerts[4];
-
-	PolyFromPlane( vPlaneVerts, vNormal, fDist, 100000.0f );
-
-	fprintf( pFile, "4\n" );
-
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[0].x, vPlaneVerts[0].y, vPlaneVerts[0].z, fGrayScale, fGrayScale, fGrayScale );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[1].x, vPlaneVerts[1].y, vPlaneVerts[1].z, fGrayScale, fGrayScale, fGrayScale );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[2].x, vPlaneVerts[2].y, vPlaneVerts[2].z, fGrayScale, fGrayScale, fGrayScale );
-	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[3].x, vPlaneVerts[3].y, vPlaneVerts[3].z, fGrayScale, fGrayScale, fGrayScale );
-
-	fclose( pFile );
-#endif
-}
-#endif
-
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#include "mathlib/polyhedron.h"
+#include "mathlib/vmatrix.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include "tier1/utlvector.h"
+
+
+
+struct GeneratePolyhedronFromPlanes_Point;
+struct GeneratePolyhedronFromPlanes_PointLL;
+struct GeneratePolyhedronFromPlanes_Line;
+struct GeneratePolyhedronFromPlanes_LineLL;
+struct GeneratePolyhedronFromPlanes_Polygon;
+struct GeneratePolyhedronFromPlanes_PolygonLL;
+
+struct GeneratePolyhedronFromPlanes_UnorderedPointLL;
+struct GeneratePolyhedronFromPlanes_UnorderedLineLL;
+struct GeneratePolyhedronFromPlanes_UnorderedPolygonLL;
+
+Vector FindPointInPlanes( const float *pPlanes, int planeCount );
+bool FindConvexShapeLooseAABB( const float *pInwardFacingPlanes, int iPlaneCount, Vector *pAABBMins, Vector *pAABBMaxs );
+CPolyhedron *ClipLinkedGeometry( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory );
+CPolyhedron *ConvertLinkedGeometryToPolyhedron( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints, bool bUseTemporaryMemory );
+
+//#define ENABLE_DEBUG_POLYHEDRON_DUMPS //Dumps debug information to disk for use with glview. Requires that tier2 also be in all projects using debug mathlib
+//#define DEBUG_DUMP_POLYHEDRONS_TO_NUMBERED_GLVIEWS //dumps successfully generated polyhedrons
+//both switches above are off by default; uncomment them to turn the dump code in this file on
+#ifdef _DEBUG //dump helpers are only declared in debug builds
+void DumpPolyhedronToGLView( const CPolyhedron *pPolyhedron, const char *pFilename, const VMatrix *pTransform );
+void DumpPlaneToGlView( const float *pPlane, float fGrayScale, const char *pszFileName, const VMatrix *pTransform );
+void DumpLineToGLView( const Vector &vPoint1, const Vector &vColor1, const Vector &vPoint2, const Vector &vColor2, float fThickness, FILE *pFile );
+void DumpAABBToGLView( const Vector &vCenter, const Vector &vExtents, const Vector &vColor, FILE *pFile );
+
+#if defined( ENABLE_DEBUG_POLYHEDRON_DUMPS ) && defined( WIN32 )
+#include "winlite.h" //presumably what provides CreateDirectory() for CreateDumpDirectory() - confirm
+#endif
+
+static VMatrix s_matIdentity( 1.0f, 0.0f, 0.0f, 0.0f, //stand-in transform for dump calls that are handed a NULL matrix
+							 0.0f, 1.0f, 0.0f, 0.0f, 
+							 0.0f, 0.0f, 1.0f, 0.0f, 
+							 0.0f, 0.0f, 0.0f, 1.0f );
+#endif
+
+#if defined( DEBUG_DUMP_POLYHEDRONS_TO_NUMBERED_GLVIEWS )
+static int g_iPolyhedronDumpCounter = 0; //number put into the next per-polyhedron dump filename, bumped after every dump
+#endif
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+#if defined( _DEBUG ) && defined( ENABLE_DEBUG_POLYHEDRON_DUMPS )
+//Creates the folder that the debug dump files get written into. Only implemented for WIN32, every other platform asserts.
+void CreateDumpDirectory( const char *szDirectoryName )
+{
+#if !defined( WIN32 )
+	Assert( false ); //TODO: create directories in linux
+#else
+	CreateDirectory( szDirectoryName, NULL );
+#endif
+}
+#endif
+
+
+
+void CPolyhedron_AllocByNew::Release( void ) //destroys a polyhedron handed out by Allocate(), the pointer must not be used afterwards
+{
+	delete this; //NOTE(review): Allocate() gets this memory from new unsigned char[], so the new[]/delete pairing is technically mismatched - confirm the allocator in use tolerates it
+}
+
+CPolyhedron_AllocByNew *CPolyhedron_AllocByNew::Allocate( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ) //builds the polyhedron and all of the arrays it points at inside a single allocation
+{
+	//layout of the block: [class][vertices][lines][indices][polygons]
+	const size_t nVertexBytes = iVertices * sizeof( Vector );
+	const size_t nLineBytes = iLines * sizeof( Polyhedron_IndexedLine_t );
+	const size_t nIndexBytes = iIndices * sizeof( Polyhedron_IndexedLineReference_t );
+	const size_t nPolygonBytes = iPolygons * sizeof( Polyhedron_IndexedPolygon_t );
+
+	void *pBlock = new unsigned char [ sizeof( CPolyhedron_AllocByNew ) + nVertexBytes + nLineBytes + nIndexBytes + nPolygonBytes ];
+
+#include "tier0/memdbgoff.h" //the following placement new doesn't compile with memory debugging
+	CPolyhedron_AllocByNew *pPolyhedron = new ( pBlock ) CPolyhedron_AllocByNew;
+#include "tier0/memdbgon.h"
+
+	//carve the arrays out of the memory that follows the class, each one starting where the previous one ends
+	Vector *pVertexArray = (Vector *)(pPolyhedron + 1);
+	Polyhedron_IndexedLine_t *pLineArray = (Polyhedron_IndexedLine_t *)(pVertexArray + iVertices);
+	Polyhedron_IndexedLineReference_t *pIndexArray = (Polyhedron_IndexedLineReference_t *)(pLineArray + iLines);
+	Polyhedron_IndexedPolygon_t *pPolygonArray = (Polyhedron_IndexedPolygon_t *)(pIndexArray + iIndices);
+
+	pPolyhedron->pVertices = pVertexArray;
+	pPolyhedron->iVertexCount = iVertices;
+
+	pPolyhedron->pLines = pLineArray;
+	pPolyhedron->iLineCount = iLines;
+
+	pPolyhedron->pIndices = pIndexArray;
+	pPolyhedron->iIndexCount = iIndices;
+
+	pPolyhedron->pPolygons = pPolygonArray;
+	pPolyhedron->iPolygonCount = iPolygons;
+
+	return pPolyhedron;
+}
+
+
+//Polyhedron whose arrays live in the shared temporary buffer. Release() frees nothing; when asserts are compiled in
+//it only hands the reference back so GetTempPolyhedron() can tell that the buffer is free to be reused.
+class CPolyhedron_TempMemory : public CPolyhedron
+{
+public:
+	CPolyhedron_TempMemory( void )
+#ifdef DBGFLAG_ASSERT
+		: iReferenceCount( 0 )
+#endif
+	{
+	}
+
+	virtual void Release( void )
+	{
+#ifdef DBGFLAG_ASSERT
+		iReferenceCount -= 1;
+#endif
+	}
+
+#ifdef DBGFLAG_ASSERT
+	int iReferenceCount; //outstanding users of the temp polyhedron, only tracked when asserts are compiled in
+#endif
+};
+
+
+static CUtlVector<unsigned char> s_TempMemoryPolyhedron_Buffer; //raw bytes backing the arrays of the temp polyhedron, resized by every GetTempPolyhedron() call
+static CPolyhedron_TempMemory s_TempMemoryPolyhedron; //the one and only temp polyhedron, its array pointers point into the buffer above
+
+CPolyhedron *GetTempPolyhedron( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ) //hands out the shared temporary polyhedron, resized to fit the requested counts. Avoids new/delete for quick work. Can only be in use by one chunk of code at a time
+{
+	AssertMsg( s_TempMemoryPolyhedron.iReferenceCount == 0, "Temporary polyhedron memory being rewritten before released" );
+#ifdef DBGFLAG_ASSERT
+	s_TempMemoryPolyhedron.iReferenceCount += 1;
+#endif
+
+	//the buffer holds the four arrays back to back: vertices, lines, indices, polygons
+	const size_t nVertexBytes = sizeof( Vector ) * iVertices;
+	const size_t nLineBytes = sizeof( Polyhedron_IndexedLine_t ) * iLines;
+	const size_t nIndexBytes = sizeof( Polyhedron_IndexedLineReference_t ) * iIndices;
+	const size_t nPolygonBytes = sizeof( Polyhedron_IndexedPolygon_t ) * iPolygons;
+
+	s_TempMemoryPolyhedron_Buffer.SetCount( nVertexBytes + nLineBytes + nIndexBytes + nPolygonBytes );
+
+	CPolyhedron_TempMemory &tempPoly = s_TempMemoryPolyhedron;
+	tempPoly.iVertexCount = iVertices;
+	tempPoly.iLineCount = iLines;
+	tempPoly.iIndexCount = iIndices;
+	tempPoly.iPolygonCount = iPolygons;
+
+	//walk a byte cursor through the buffer, each array begins where the previous one stops
+	unsigned char *pCursor = (unsigned char *)s_TempMemoryPolyhedron_Buffer.Base();
+	tempPoly.pVertices = (Vector *)pCursor;
+	pCursor += nVertexBytes;
+	tempPoly.pLines = (Polyhedron_IndexedLine_t *)pCursor;
+	pCursor += nLineBytes;
+	tempPoly.pIndices = (Polyhedron_IndexedLineReference_t *)pCursor;
+	pCursor += nIndexBytes;
+	tempPoly.pPolygons = (Polyhedron_IndexedPolygon_t *)pCursor;
+
+	return &s_TempMemoryPolyhedron;
+}
+
+
+Vector CPolyhedron::Center( void ) //midpoint of the axis aligned box around every vertex (not the average of the vertices). Gives the origin back when there are no vertices
+{
+	if( iVertexCount == 0 )
+		return vec3_origin;
+
+	//start both corners of the box on the first vertex, then grow the box over the rest
+	Vector vLow = pVertices[0];
+	Vector vHigh = pVertices[0];
+
+	for( int iVert = 1; iVert != iVertexCount; ++iVert )
+	{
+		const Vector &vCur = pVertices[iVert];
+
+		if( vCur.x < vLow.x )
+			vLow.x = vCur.x;
+		if( vCur.x > vHigh.x )
+			vHigh.x = vCur.x;
+
+		if( vCur.y < vLow.y )
+			vLow.y = vCur.y;
+		if( vCur.y > vHigh.y )
+			vHigh.y = vCur.y;
+
+		if( vCur.z < vLow.z )
+			vLow.z = vCur.z;
+		if( vCur.z > vHigh.z )
+			vHigh.z = vCur.z;
+	}
+
+	return ((vLow + vHigh) * 0.5f);
+}
+
+enum PolyhedronPointPlanarity //where a point sits relative to the outward facing plane currently being cut with
+{
+	POINT_DEAD, //more than the on-plane epsilon in front of the plane
+	POINT_ONPLANE, //within the on-plane epsilon of the plane
+	POINT_ALIVE	//at least the on-plane epsilon behind the plane
+};
+
+struct GeneratePolyhedronFromPlanes_Point //a vertex of the working (linked) geometry
+{
+	Vector ptPosition; //position of the vertex
+	GeneratePolyhedronFromPlanes_LineLL *pConnectedLines; //keep these in a clockwise order, circular linking
+	float fPlaneDist; //used in plane cutting
+	PolyhedronPointPlanarity planarity; //classification against the plane currently being cut with
+	int iSaveIndices; //index of this point in the output vertex array, written by ConvertLinkedGeometryToPolyhedron()
+};
+
+struct GeneratePolyhedronFromPlanes_Line //an edge of the working (linked) geometry, shared by 2 points and 2 polygons
+{
+	GeneratePolyhedronFromPlanes_Point *pPoints[2]; //the 2 connecting points in no particular order
+	GeneratePolyhedronFromPlanes_Polygon *pPolygons[2]; //viewing from the outside with the point connections going up, 0 is the left polygon, 1 is the right
+	int iSaveIndices; //index of this line in the output line array, written by ConvertLinkedGeometryToPolyhedron()
+	bool bAlive; //connected to at least one living point
+	bool bCut; //connected to at least one dead point
+
+	GeneratePolyhedronFromPlanes_LineLL *pPointLineLinks[2]; //rather than going into a point and searching for its link to this line, lets just cache it to eliminate searching
+	GeneratePolyhedronFromPlanes_LineLL *pPolygonLineLinks[2]; //rather than going into a polygon and searching for its link to this line, lets just cache it to eliminate searching
+#ifdef POLYHEDRON_EXTENSIVE_DEBUGGING
+	int iDebugFlags; //only present in extensive debugging builds
+#endif
+};
+
+struct GeneratePolyhedronFromPlanes_LineLL //one entry in the ring of lines around a point or around a polygon
+{
+	GeneratePolyhedronFromPlanes_Line *pLine; //the line this entry stands for
+	int iReferenceIndex; //whatever is referencing the line should know which side of the line it's on (points and polygons), for polygons, it's which point to follow to continue going clockwise, which makes polygon 0 the one on the left side of an upward facing line vector, for points, it's the OTHER point's index
+	GeneratePolyhedronFromPlanes_LineLL *pPrev; //previous entry in the ring
+	GeneratePolyhedronFromPlanes_LineLL *pNext; //next entry in the ring
+};
+
+struct GeneratePolyhedronFromPlanes_Polygon //a face of the working (linked) geometry
+{
+	Vector vSurfaceNormal; 
+	GeneratePolyhedronFromPlanes_LineLL *pLines; //keep these in a clockwise order, circular linking
+	
+	bool bMissingASide; //work flag, reset to false for every polygon when ClipLinkedGeometry() starts
+};
+
+struct GeneratePolyhedronFromPlanes_UnorderedPolygonLL //an unordered collection of polygons
+{
+	GeneratePolyhedronFromPlanes_Polygon *pPolygon; //the polygon held by this entry
+	GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pNext; //NULL on the last entry
+	GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPrev; //NULL on the first entry
+};
+
+struct GeneratePolyhedronFromPlanes_UnorderedLineLL //an unordered collection of lines
+{
+	GeneratePolyhedronFromPlanes_Line *pLine; //the line held by this entry
+	GeneratePolyhedronFromPlanes_UnorderedLineLL *pNext; //NULL on the last entry
+	GeneratePolyhedronFromPlanes_UnorderedLineLL *pPrev; //NULL on the first entry
+};
+
+struct GeneratePolyhedronFromPlanes_UnorderedPointLL //an unordered collection of points
+{
+	GeneratePolyhedronFromPlanes_Point *pPoint; //the point held by this entry
+	GeneratePolyhedronFromPlanes_UnorderedPointLL *pNext; //NULL on the last entry
+	GeneratePolyhedronFromPlanes_UnorderedPointLL *pPrev; //NULL on the first entry
+};
+
+
+
+
+//Clips a copy of an existing polyhedron with a set of outward facing planes (4 floats per plane: normal x, y, z then dist).
+//  pExistingPolyhedron  - source polyhedron, it is only read. NULL gives NULL back.
+//  pOutwardFacingPlanes - iPlaneCount planes. Vertices more than fOnPlaneEpsilon in front of a plane get cut away.
+//  fOnPlaneEpsilon      - vertices within this distance of a plane count as sitting on it.
+//  bUseTemporaryMemory  - true returns the shared GetTempPolyhedron() polyhedron, false allocates with CPolyhedron_AllocByNew.
+//Returns NULL when some plane leaves no vertex behind it, a plain clone when no plane cuts anything,
+//otherwise the result of ClipLinkedGeometry() run with only the planes that actually cut something.
+CPolyhedron *ClipPolyhedron( const CPolyhedron *pExistingPolyhedron, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory )
+{
+	if( pExistingPolyhedron == NULL )
+		return NULL;
+
+	AssertMsg( (pExistingPolyhedron->iVertexCount >= 3) && (pExistingPolyhedron->iPolygonCount >= 2), "Polyhedron doesn't meet absolute minimum spec" );
+
+	float *pUsefulPlanes = (float *)stackalloc( sizeof( float ) * 4 * iPlaneCount );
+	int iUsefulPlaneCount = 0;
+	Vector *pExistingVertices = pExistingPolyhedron->pVertices;
+
+	//A large part of clipping will either eliminate the polyhedron entirely, or clip nothing at all, so lets just check for those first and throw away useless planes
+	{
+		const float fNegativeOnPlaneEpsilon = -fOnPlaneEpsilon;
+
+		for( int i = 0; i != iPlaneCount; ++i )
+		{
+			Vector vNormal = *((Vector *)&pOutwardFacingPlanes[(i * 4) + 0]);
+			float fPlaneDist = pOutwardFacingPlanes[(i * 4) + 3];
+
+			//the tallies have to restart for every plane. If they carried over, only the first plane could ever
+			//reject the polyhedron and every plane after the first cutting one would be kept whether it cuts or not
+			int iLiveCount = 0;
+			int iDeadCount = 0;
+
+			for( int j = 0; j != pExistingPolyhedron->iVertexCount; ++j )
+			{
+				float fPointDist = vNormal.Dot( pExistingVertices[j] ) - fPlaneDist;
+				
+				if( fPointDist <= fNegativeOnPlaneEpsilon )
+					++iLiveCount;
+				else if( fPointDist > fOnPlaneEpsilon )
+					++iDeadCount;
+			}
+
+			if( iLiveCount == 0 )
+			{
+				//all points are dead or on the plane, so the polyhedron is dead
+				return NULL;
+			}
+
+			if( iDeadCount != 0 )
+			{
+				//at least one point died, this plane yields useful results
+				pUsefulPlanes[(iUsefulPlaneCount * 4) + 0] = vNormal.x;
+				pUsefulPlanes[(iUsefulPlaneCount * 4) + 1] = vNormal.y;
+				pUsefulPlanes[(iUsefulPlaneCount * 4) + 2] = vNormal.z;
+				pUsefulPlanes[(iUsefulPlaneCount * 4) + 3] = fPlaneDist;
+				++iUsefulPlaneCount;
+			}
+		}
+	}
+
+	if( iUsefulPlaneCount == 0 )
+	{
+		//testing shows that the polyhedron won't even be cut, clone the existing polyhedron and return that
+
+		CPolyhedron *pReturn;
+		if( bUseTemporaryMemory )
+		{
+			pReturn = GetTempPolyhedron( pExistingPolyhedron->iVertexCount, 
+											pExistingPolyhedron->iLineCount, 
+											pExistingPolyhedron->iIndexCount, 
+											pExistingPolyhedron->iPolygonCount );
+		}
+		else
+		{
+			pReturn = CPolyhedron_AllocByNew::Allocate( pExistingPolyhedron->iVertexCount, 
+														pExistingPolyhedron->iLineCount, 
+														pExistingPolyhedron->iIndexCount, 
+														pExistingPolyhedron->iPolygonCount );
+		}
+
+		memcpy( pReturn->pVertices, pExistingPolyhedron->pVertices, sizeof( Vector ) * pReturn->iVertexCount );
+		memcpy( pReturn->pLines, pExistingPolyhedron->pLines, sizeof( Polyhedron_IndexedLine_t ) * pReturn->iLineCount );
+		memcpy( pReturn->pIndices, pExistingPolyhedron->pIndices, sizeof( Polyhedron_IndexedLineReference_t ) * pReturn->iIndexCount );
+		memcpy( pReturn->pPolygons, pExistingPolyhedron->pPolygons, sizeof( Polyhedron_IndexedPolygon_t ) * pReturn->iPolygonCount );
+
+		return pReturn;
+	}
+
+
+
+	//convert the polyhedron to linked geometry, all of it lives on the stack for the duration of this call
+	GeneratePolyhedronFromPlanes_Point *pStartPoints = (GeneratePolyhedronFromPlanes_Point *)stackalloc( pExistingPolyhedron->iVertexCount * sizeof( GeneratePolyhedronFromPlanes_Point ) );
+	GeneratePolyhedronFromPlanes_Line *pStartLines = (GeneratePolyhedronFromPlanes_Line *)stackalloc( pExistingPolyhedron->iLineCount * sizeof( GeneratePolyhedronFromPlanes_Line ) );
+	GeneratePolyhedronFromPlanes_Polygon *pStartPolygons = (GeneratePolyhedronFromPlanes_Polygon *)stackalloc( pExistingPolyhedron->iPolygonCount * sizeof( GeneratePolyhedronFromPlanes_Polygon ) );
+
+	//every line needs 4 links: one for each of its 2 points and one for each of its 2 polygons
+	GeneratePolyhedronFromPlanes_LineLL *pStartLineLinks = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( pExistingPolyhedron->iLineCount * 4 * sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
+	
+	int iCurrentLineLinkIndex = 0;
+
+	//setup points
+	for( int i = 0; i != pExistingPolyhedron->iVertexCount; ++i )
+	{
+		pStartPoints[i].ptPosition = pExistingPolyhedron->pVertices[i];
+		pStartPoints[i].pConnectedLines = NULL; //we won't be circular linking until later
+	}
+
+	//setup lines and interlink to points (line links are not yet circularly linked, and are unordered)
+	for( int i = 0; i != pExistingPolyhedron->iLineCount; ++i )
+	{
+		for( int j = 0; j != 2; ++j )
+		{
+			pStartLines[i].pPoints[j] = &pStartPoints[pExistingPolyhedron->pLines[i].iPointIndices[j]];
+
+			GeneratePolyhedronFromPlanes_LineLL *pLineLink = &pStartLineLinks[iCurrentLineLinkIndex++];
+			pStartLines[i].pPointLineLinks[j] = pLineLink;
+			pLineLink->pLine = &pStartLines[i];
+			pLineLink->iReferenceIndex = 1 - j; //for points the link stores the OTHER point's index
+			//pLineLink->pPrev = NULL;
+			pLineLink->pNext = pStartLines[i].pPoints[j]->pConnectedLines; //push onto the front of the point's list, pPrev gets filled in later
+			pStartLines[i].pPoints[j]->pConnectedLines = pLineLink;
+		}
+	}
+
+
+
+	//setup polygons
+	for( int i = 0; i != pExistingPolyhedron->iPolygonCount; ++i )
+	{
+		pStartPolygons[i].vSurfaceNormal = pExistingPolyhedron->pPolygons[i].polyNormal;
+		Polyhedron_IndexedLineReference_t *pOffsetPolyhedronLines = &pExistingPolyhedron->pIndices[pExistingPolyhedron->pPolygons[i].iFirstIndex];
+
+		
+		GeneratePolyhedronFromPlanes_LineLL *pFirstLink = &pStartLineLinks[iCurrentLineLinkIndex];
+		pStartPolygons[i].pLines = pFirstLink; //technically going to link to itself on first pass, then get linked properly immediately afterward
+		for( int j = 0; j != pExistingPolyhedron->pPolygons[i].iIndexCount; ++j )
+		{
+			GeneratePolyhedronFromPlanes_LineLL *pLineLink = &pStartLineLinks[iCurrentLineLinkIndex++];
+			pLineLink->pLine = &pStartLines[pOffsetPolyhedronLines[j].iLineIndex];
+			pLineLink->iReferenceIndex = pOffsetPolyhedronLines[j].iEndPointIndex;
+			
+			pLineLink->pLine->pPolygons[pLineLink->iReferenceIndex] = &pStartPolygons[i];
+			pLineLink->pLine->pPolygonLineLinks[pLineLink->iReferenceIndex] = pLineLink;			
+
+			//append behind the most recently added link, pLines temporarily tracks the tail
+			pLineLink->pPrev = pStartPolygons[i].pLines;
+			pStartPolygons[i].pLines->pNext = pLineLink;
+			pStartPolygons[i].pLines = pLineLink;
+		}
+		
+		//close the ring
+		pFirstLink->pPrev = pStartPolygons[i].pLines;
+		pStartPolygons[i].pLines->pNext = pFirstLink;
+	}
+
+	Assert( iCurrentLineLinkIndex == (pExistingPolyhedron->iLineCount * 4) );
+
+	//go back to point line links so we can circularly link them as well as order them now that every point has all its line links
+	for( int i = 0; i != pExistingPolyhedron->iVertexCount; ++i )
+	{
+		//interlink the points
+		{
+			GeneratePolyhedronFromPlanes_LineLL *pLastVisitedLink = pStartPoints[i].pConnectedLines;
+			GeneratePolyhedronFromPlanes_LineLL *pCurrentLink = pLastVisitedLink;
+			
+			do
+			{
+				pCurrentLink->pPrev = pLastVisitedLink;
+				pLastVisitedLink = pCurrentLink;
+				pCurrentLink = pCurrentLink->pNext;
+			} while( pCurrentLink );
+
+			//circular link
+			pLastVisitedLink->pNext = pStartPoints[i].pConnectedLines;
+			pStartPoints[i].pConnectedLines->pPrev = pLastVisitedLink;
+		}
+
+
+		//fix ordering: for each link, find the link whose line shares the polygon on the left and move it directly in front
+		GeneratePolyhedronFromPlanes_LineLL *pFirstLink = pStartPoints[i].pConnectedLines;
+		GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pFirstLink;
+		GeneratePolyhedronFromPlanes_LineLL *pSearchLink;
+		GeneratePolyhedronFromPlanes_Polygon *pLookingForPolygon;
+		Assert( pFirstLink->pNext != pFirstLink );
+		do
+		{
+			pLookingForPolygon = pWorkLink->pLine->pPolygons[1 - pWorkLink->iReferenceIndex]; //grab pointer to left polygon
+			pSearchLink = pWorkLink->pPrev;
+
+			while( pSearchLink->pLine->pPolygons[pSearchLink->iReferenceIndex] != pLookingForPolygon )
+				pSearchLink = pSearchLink->pPrev;
+
+			Assert( pSearchLink->pLine->pPolygons[pSearchLink->iReferenceIndex] == pWorkLink->pLine->pPolygons[1 - pWorkLink->iReferenceIndex] );
+
+			//pluck the search link from wherever it is
+			pSearchLink->pPrev->pNext = pSearchLink->pNext;
+			pSearchLink->pNext->pPrev = pSearchLink->pPrev;
+
+			//insert the search link just before the work link			
+			pSearchLink->pPrev = pWorkLink->pPrev;
+			pSearchLink->pNext = pWorkLink;
+			
+			pSearchLink->pPrev->pNext = pSearchLink;
+			pWorkLink->pPrev = pSearchLink;
+
+			pWorkLink = pSearchLink;
+		} while( pWorkLink != pFirstLink );
+	}
+
+	//wrap the points, lines and polygons in the NULL terminated unordered lists that ClipLinkedGeometry() takes
+	GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints = (GeneratePolyhedronFromPlanes_UnorderedPointLL *)stackalloc( pExistingPolyhedron->iVertexCount * sizeof( GeneratePolyhedronFromPlanes_UnorderedPointLL ) );
+	GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines = (GeneratePolyhedronFromPlanes_UnorderedLineLL *)stackalloc( pExistingPolyhedron->iLineCount * sizeof( GeneratePolyhedronFromPlanes_UnorderedLineLL ) );
+	GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons = (GeneratePolyhedronFromPlanes_UnorderedPolygonLL *)stackalloc( pExistingPolyhedron->iPolygonCount * sizeof( GeneratePolyhedronFromPlanes_UnorderedPolygonLL ) );
+
+	//setup point collection (first and last entries are written outside the loop, so this relies on there being at least 2 entries)
+	{
+		pPoints[0].pPrev = NULL;
+		pPoints[0].pPoint = &pStartPoints[0];
+		pPoints[0].pNext = &pPoints[1];
+		int iLastPoint = pExistingPolyhedron->iVertexCount - 1;
+		for( int i = 1; i != iLastPoint; ++i )
+		{
+			pPoints[i].pPrev = &pPoints[i - 1];
+			pPoints[i].pPoint = &pStartPoints[i];
+			pPoints[i].pNext = &pPoints[i + 1];
+		}
+		pPoints[iLastPoint].pPrev = &pPoints[iLastPoint - 1];
+		pPoints[iLastPoint].pPoint = &pStartPoints[iLastPoint];
+		pPoints[iLastPoint].pNext = NULL;
+	}
+
+	//setup line collection
+	{
+		pLines[0].pPrev = NULL;
+		pLines[0].pLine = &pStartLines[0];
+		pLines[0].pNext = &pLines[1];
+		int iLastLine = pExistingPolyhedron->iLineCount - 1;
+		for( int i = 1; i != iLastLine; ++i )
+		{
+			pLines[i].pPrev = &pLines[i - 1];
+			pLines[i].pLine = &pStartLines[i];
+			pLines[i].pNext = &pLines[i + 1];
+		}
+		pLines[iLastLine].pPrev = &pLines[iLastLine - 1];
+		pLines[iLastLine].pLine = &pStartLines[iLastLine];
+		pLines[iLastLine].pNext = NULL;
+	}
+
+	//setup polygon collection
+	{
+		pPolygons[0].pPrev = NULL;
+		pPolygons[0].pPolygon = &pStartPolygons[0];
+		pPolygons[0].pNext = &pPolygons[1];
+		int iLastPolygon = pExistingPolyhedron->iPolygonCount - 1;
+		for( int i = 1; i != iLastPolygon; ++i )
+		{
+			pPolygons[i].pPrev = &pPolygons[i - 1];
+			pPolygons[i].pPolygon = &pStartPolygons[i];
+			pPolygons[i].pNext = &pPolygons[i + 1];
+		}
+		pPolygons[iLastPolygon].pPrev = &pPolygons[iLastPolygon - 1];
+		pPolygons[iLastPolygon].pPolygon = &pStartPolygons[iLastPolygon];
+		pPolygons[iLastPolygon].pNext = NULL;
+	}
+
+	return ClipLinkedGeometry( pPolygons, pLines, pPoints, pUsefulPlanes, iUsefulPlaneCount, fOnPlaneEpsilon, bUseTemporaryMemory );
+}
+
+
+
+Vector FindPointInPlanes( const float *pPlanes, int planeCount ) //starts at the origin and visits the planes in order (4 floats each: normal x, y, z then dist). Whenever the point tests negative against a plane it gets pushed along that plane's normal by the signed distance
+{
+	Vector vResult = vec3_origin;
+
+	const float *pPlane = pPlanes;
+	for ( int iPlane = 0; iPlane < planeCount; ++iPlane, pPlane += 4 )
+	{
+		const Vector &vNormal = *(Vector *)pPlane;
+		const float fSignedDist = DotProduct( vNormal, vResult ) - pPlane[3];
+
+		if ( fSignedDist < 0 )
+			vResult -= fSignedDist * vNormal;
+	}
+
+	return vResult;
+}
+
+
+
+bool FindConvexShapeLooseAABB( const float *pInwardFacingPlanes, int iPlaneCount, Vector *pAABBMins, Vector *pAABBMaxs ) //bounding box of the convex shape (subject to floating point error)
+{
+	//returns false if the AABB hasn't been set
+	if( pAABBMins == NULL && pAABBMaxs == NULL ) //no use in actually finding out what it is
+		return false;
+
+	struct FindConvexShapeAABB_Polygon_t
+	{
+		float *verts;
+		int iVertCount;
+	};
+
+	float *pMovedPlanes = (float *)stackalloc( iPlaneCount * 4 * sizeof( float ) );
+	//Vector vPointInPlanes = FindPointInPlanes( pInwardFacingPlanes, iPlaneCount );
+
+	for( int i = 0; i != iPlaneCount; ++i )
+	{
+		pMovedPlanes[(i * 4) + 0] = pInwardFacingPlanes[(i * 4) + 0];
+		pMovedPlanes[(i * 4) + 1] = pInwardFacingPlanes[(i * 4) + 1];
+		pMovedPlanes[(i * 4) + 2] = pInwardFacingPlanes[(i * 4) + 2];
+		pMovedPlanes[(i * 4) + 3] = pInwardFacingPlanes[(i * 4) + 3] - 100.0f; //move planes out a lot to kill some imprecision problems
+	}
+	
+	
+
+	//vAABBMins = vAABBMaxs = FindPointInPlanes( pPlanes, iPlaneCount );
+	float *vertsIn = NULL; //we'll be allocating a new buffer for this with each new polygon, and moving it off to the polygon array
+	float *vertsOut = (float *)stackalloc( (iPlaneCount + 4) * (sizeof( float ) * 3) ); //each plane will initially have 4 points in its polygon representation, and each plane clip has the possibility to add 1 point to the polygon
+	float *vertsSwap;
+
+	FindConvexShapeAABB_Polygon_t *pPolygons = (FindConvexShapeAABB_Polygon_t *)stackalloc( iPlaneCount * sizeof( FindConvexShapeAABB_Polygon_t ) );
+	int iPolyCount = 0;
+
+	for ( int i = 0; i < iPlaneCount; i++ )
+	{
+		Vector *pPlaneNormal = (Vector *)&pInwardFacingPlanes[i*4];
+		float fPlaneDist = pInwardFacingPlanes[(i*4) + 3];
+
+		if( vertsIn == NULL )
+			vertsIn = (float *)stackalloc( (iPlaneCount + 4) * (sizeof( float ) * 3) );
+
+		// Build a big-ass poly in this plane
+		int vertCount = PolyFromPlane( (Vector *)vertsIn, *pPlaneNormal, fPlaneDist, 100000.0f );
+
+		//chop it by every other plane
+		for( int j = 0; j < iPlaneCount; j++ )
+		{
+			// don't clip planes with themselves
+			if ( i == j )
+				continue;
+
+			// Chop the polygon against this plane
+			vertCount = ClipPolyToPlane( (Vector *)vertsIn, vertCount, (Vector *)vertsOut, *(Vector *)&pMovedPlanes[j*4], pMovedPlanes[(j*4) + 3], 0.0f );
+
+			//swap the input and output arrays
+			vertsSwap = vertsIn; vertsIn = vertsOut; vertsOut = vertsSwap;
+
+			// Less than a poly left, something's wrong, don't bother with this polygon
+			if ( vertCount < 3 )
+				break;
+		}
+
+		if ( vertCount < 3 )
+			continue; //not enough to work with
+
+		pPolygons[iPolyCount].iVertCount = vertCount;
+		pPolygons[iPolyCount].verts = vertsIn;
+		vertsIn = NULL;
+		++iPolyCount;
+	}
+
+	if( iPolyCount == 0 )
+		return false;
+
+	//initialize the AABB to the first point available
+	Vector vAABBMins, vAABBMaxs;
+	vAABBMins = vAABBMaxs = ((Vector *)pPolygons[0].verts)[0];
+
+	if( pAABBMins && pAABBMaxs ) //they want the full box
+	{
+		for( int i = 0; i != iPolyCount; ++i )
+		{
+			Vector *PolyVerts = (Vector *)pPolygons[i].verts;
+			for( int j = 0; j != pPolygons[i].iVertCount; ++j )
+			{
+				if( PolyVerts[j].x < vAABBMins.x ) 
+					vAABBMins.x = PolyVerts[j].x;
+				if( PolyVerts[j].y < vAABBMins.y ) 
+					vAABBMins.y = PolyVerts[j].y;
+				if( PolyVerts[j].z < vAABBMins.z ) 
+					vAABBMins.z = PolyVerts[j].z;
+
+				if( PolyVerts[j].x > vAABBMaxs.x ) 
+					vAABBMaxs.x = PolyVerts[j].x;
+				if( PolyVerts[j].y > vAABBMaxs.y ) 
+					vAABBMaxs.y = PolyVerts[j].y;
+				if( PolyVerts[j].z > vAABBMaxs.z ) 
+					vAABBMaxs.z = PolyVerts[j].z;
+			}
+		}
+		*pAABBMins = vAABBMins;
+		*pAABBMaxs = vAABBMaxs;
+	}
+	else if( pAABBMins ) //they only want the min
+	{
+		for( int i = 0; i != iPolyCount; ++i )
+		{
+			Vector *PolyVerts = (Vector *)pPolygons[i].verts;
+			for( int j = 0; j != pPolygons[i].iVertCount; ++j )
+			{
+				if( PolyVerts[j].x < vAABBMins.x ) 
+					vAABBMins.x = PolyVerts[j].x;
+				if( PolyVerts[j].y < vAABBMins.y ) 
+					vAABBMins.y = PolyVerts[j].y;
+				if( PolyVerts[j].z < vAABBMins.z ) 
+					vAABBMins.z = PolyVerts[j].z;
+			}
+		}
+		*pAABBMins = vAABBMins;
+	}
+	else //they only want the max
+	{
+		for( int i = 0; i != iPolyCount; ++i )
+		{
+			Vector *PolyVerts = (Vector *)pPolygons[i].verts;
+			for( int j = 0; j != pPolygons[i].iVertCount; ++j )
+			{
+				if( PolyVerts[j].x > vAABBMaxs.x ) 
+					vAABBMaxs.x = PolyVerts[j].x;
+				if( PolyVerts[j].y > vAABBMaxs.y ) 
+					vAABBMaxs.y = PolyVerts[j].y;
+				if( PolyVerts[j].z > vAABBMaxs.z ) 
+					vAABBMaxs.z = PolyVerts[j].z;
+			}
+		}
+		*pAABBMaxs = vAABBMaxs;
+	}
+
+	return true;
+}
+
+
+
+
+
+
+
+CPolyhedron *ConvertLinkedGeometryToPolyhedron( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pPoints, bool bUseTemporaryMemory ) //flattens linked geometry into the indexed arrays of a polyhedron. All three lists must be non-empty. bUseTemporaryMemory picks the shared temp polyhedron over a new allocation
+{
+	Assert( (pPolygons != NULL) && (pLines != NULL) && (pPoints != NULL) );
+
+	//first pass: count everything so the polyhedron can be sized in one go
+	unsigned int iPolyCount = 0, iIndexCount = 0;
+	{
+		GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolyNode = pPolygons;
+		do
+		{
+			++iPolyCount;
+
+			//one index per line in the polygon's ring
+			GeneratePolyhedronFromPlanes_LineLL *pRingStart = pPolyNode->pPolygon->pLines;
+			Assert( pRingStart != NULL );
+			GeneratePolyhedronFromPlanes_LineLL *pRingNode = pRingStart;
+			do
+			{
+				++iIndexCount;
+				pRingNode = pRingNode->pNext;
+			} while( pRingNode != pRingStart );
+
+			pPolyNode = pPolyNode->pNext;
+		} while( pPolyNode != NULL );
+	}
+
+	unsigned int iLineCount = 0;
+	{
+		GeneratePolyhedronFromPlanes_UnorderedLineLL *pLineNode = pLines;
+		do
+		{
+			++iLineCount;
+			pLineNode = pLineNode->pNext;
+		} while( pLineNode != NULL );
+	}
+
+	unsigned int iPointCount = 0;
+	{
+		GeneratePolyhedronFromPlanes_UnorderedPointLL *pPointNode = pPoints;
+		do
+		{
+			++iPointCount;
+			pPointNode = pPointNode->pNext;
+		} while( pPointNode != NULL );
+	}
+
+	CPolyhedron *pReturn;
+	if( !bUseTemporaryMemory )
+	{
+		pReturn = CPolyhedron_AllocByNew::Allocate( iPointCount, iLineCount, iIndexCount, iPolyCount );
+	}
+	else
+	{
+		pReturn = GetTempPolyhedron( iPointCount, iLineCount, iIndexCount, iPolyCount );
+	}
+
+	//copy points, remembering where each one landed so the lines can refer to them by index
+	{
+		Vector *pVertexArray = pReturn->pVertices;
+		GeneratePolyhedronFromPlanes_UnorderedPointLL *pPointNode = pPoints;
+		for( unsigned int iSlot = 0; iSlot != iPointCount; ++iSlot, pPointNode = pPointNode->pNext )
+		{
+			GeneratePolyhedronFromPlanes_Point *pPoint = pPointNode->pPoint;
+			pVertexArray[iSlot] = pPoint->ptPosition;
+			pPoint->iSaveIndices = iSlot;
+		}
+	}
+
+	//copy lines, remembering where each one landed so the polygons can refer to them by index
+	{
+		Polyhedron_IndexedLine_t *pLineArray = pReturn->pLines;
+		GeneratePolyhedronFromPlanes_UnorderedLineLL *pLineNode = pLines;
+		for( unsigned int iSlot = 0; iSlot != iLineCount; ++iSlot, pLineNode = pLineNode->pNext )
+		{
+			GeneratePolyhedronFromPlanes_Line *pLine = pLineNode->pLine;
+			pLineArray[iSlot].iPointIndices[0] = (unsigned short)pLine->pPoints[0]->iSaveIndices;
+			pLineArray[iSlot].iPointIndices[1] = (unsigned short)pLine->pPoints[1]->iSaveIndices;
+			pLine->iSaveIndices = iSlot;
+		}
+	}
+
+	//copy polygons and indices at the same time, every polygon owns a contiguous run of the index array
+	{
+		Polyhedron_IndexedLineReference_t *pIndexArray = pReturn->pIndices;
+		Polyhedron_IndexedPolygon_t *pPolyArray = pReturn->pPolygons;
+		GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pPolyNode = pPolygons;
+		unsigned int iNextIndex = 0;
+
+		for( unsigned int iSlot = 0; iSlot != iPolyCount; ++iSlot, pPolyNode = pPolyNode->pNext )
+		{
+			pPolyArray[iSlot].polyNormal = pPolyNode->pPolygon->vSurfaceNormal;
+			pPolyArray[iSlot].iFirstIndex = iNextIndex;
+
+			GeneratePolyhedronFromPlanes_LineLL *pRingStart = pPolyNode->pPolygon->pLines;
+			GeneratePolyhedronFromPlanes_LineLL *pRingNode = pRingStart;
+			do
+			{
+				pIndexArray[iNextIndex].iLineIndex = pRingNode->pLine->iSaveIndices;
+				pIndexArray[iNextIndex].iEndPointIndex = pRingNode->iReferenceIndex;
+
+				++iNextIndex;
+				pRingNode = pRingNode->pNext;
+			} while( pRingNode != pRingStart );
+
+			pPolyArray[iSlot].iIndexCount = iNextIndex - pPolyArray[iSlot].iFirstIndex;
+		}
+	}
+
+#if defined( _DEBUG ) && defined( ENABLE_DEBUG_POLYHEDRON_DUMPS ) && defined( DEBUG_DUMP_POLYHEDRONS_TO_NUMBERED_GLVIEWS )
+	//write the result to its own numbered file, and append it to the file collecting every dump
+	char szCollisionFile[128];
+	CreateDumpDirectory( "PolyhedronDumps" );
+	Q_snprintf( szCollisionFile, 128, "PolyhedronDumps/NewStyle_PolyhedronDump%i.txt", g_iPolyhedronDumpCounter );
+	++g_iPolyhedronDumpCounter;
+
+	remove( szCollisionFile );
+	DumpPolyhedronToGLView( pReturn, szCollisionFile, &s_matIdentity );
+	DumpPolyhedronToGLView( pReturn, "PolyhedronDumps/NewStyle_PolyhedronDump_All-Appended.txt", &s_matIdentity );
+#endif
+
+	return pReturn;
+}
+
+
+
+#ifdef _DEBUG
+
+//Appends a small box to szDumpFile for every point in the list whose planarity matches the one asked for.
+//  pHead      - first entry of the NULL terminated point list, may be NULL
+//  planarity  - only points classified like this get written
+//  vColor     - color passed along for the boxes
+//  szDumpFile - file to append to
+//  pTransform - applied to every position before it is written, NULL means identity
+//Does nothing unless ENABLE_DEBUG_POLYHEDRON_DUMPS is defined, and gives up quietly if the file can't be opened.
+void DumpPointListToGLView( GeneratePolyhedronFromPlanes_UnorderedPointLL *pHead, PolyhedronPointPlanarity planarity, const Vector &vColor, const char *szDumpFile, const VMatrix *pTransform )
+{
+#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
+	if( pTransform == NULL )
+		pTransform = &s_matIdentity;
+	
+	FILE *pFile = fopen( szDumpFile, "ab" );
+	if( pFile == NULL )
+		return; //can't open the dump file. Writing to or closing a NULL FILE would crash, and a debug dump isn't worth that
+	
+	while( pHead )
+	{
+		if( pHead->pPoint->planarity == planarity )
+		{
+			const Vector vPointExtents( 0.5f, 0.5f, 0.01f );
+			DumpAABBToGLView( (*pTransform) * pHead->pPoint->ptPosition, vPointExtents, vColor, pFile );
+		}
+		pHead = pHead->pNext;
+	}
+
+	fclose( pFile );
+#endif
+}
+
+//Writes every non-NULL polyhedron of a cut history, together with the plane stored at the same index, to its own numbered file.
+//Returns the most recently used filename (a static buffer) so the caller can keep appending to it,
+//or NULL when ENABLE_DEBUG_POLYHEDRON_DUMPS is not defined. A NULL pTransform means identity.
+const char * DumpPolyhedronCutHistory( const CUtlVector<CPolyhedron *> &DumpedHistory, const CUtlVector<const float *> &CutHistory, const VMatrix *pTransform )
+{
+#ifndef ENABLE_DEBUG_POLYHEDRON_DUMPS
+	return NULL;
+#else
+	const VMatrix *pDumpTransform = (pTransform != NULL) ? pTransform : &s_matIdentity;
+
+	static char szDumpFile[100] = "FailedPolyhedronCut_Error.txt"; //most recent filename returned for further dumping
+
+	for( int iCut = 0; iCut != DumpedHistory.Count(); ++iCut )
+	{
+		if( DumpedHistory[iCut] == NULL )
+			continue; //nothing was recorded for this cut
+
+		Q_snprintf( szDumpFile, 100, "FailedPolyhedronCut_%d.txt", iCut );
+		DumpPolyhedronToGLView( DumpedHistory[iCut], szDumpFile, pDumpTransform );
+		DumpPlaneToGlView( CutHistory[iCut], 1.0f, szDumpFile, pDumpTransform );
+	}
+
+	return szDumpFile;
+#endif
+}
+
+#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS //dumping variant: on failure writes the cut history and the point lists to disk, then asserts. Uses DebugCutHistory, PlaneCutHistory, pAllPoints, pDeadPointCollection, pStartPoint and pWorkPoint from the scope it is expanded in
+#define AssertMsg_DumpPolyhedron(condition, message)\
+	if( (condition) == false )\
+	{\
+		VMatrix matTransform;\
+		matTransform.Identity();\
+		matTransform[0][0] = matTransform[1][1] = matTransform[2][2] = 25.0f;\
+		matTransform.SetTranslation( -DebugCutHistory.Tail()->Center() * 25.0f );\
+		const char *szLastDumpFile = DumpPolyhedronCutHistory( DebugCutHistory, PlaneCutHistory, &matTransform );\
+		DumpPointListToGLView( pAllPoints, POINT_ALIVE, Vector( 0.9f, 0.9f, 0.9f ), szLastDumpFile, &matTransform );\
+		DumpPointListToGLView( pAllPoints, POINT_ONPLANE, Vector( 0.5f, 0.5f, 0.5f ), szLastDumpFile, &matTransform );\
+		DumpPointListToGLView( pDeadPointCollection, POINT_DEAD, Vector( 0.1f, 0.1f, 0.1f ), szLastDumpFile, &matTransform );\
+		if( pStartPoint )\
+		{\
+			FILE *pFileDumpRepairProgress = fopen( szLastDumpFile, "ab" );\
+			DumpAABBToGLView( matTransform * pStartPoint->ptPosition, Vector( 2.0f, 0.05f, 0.05f ), Vector( 0.0f, 1.0f, 0.0f ), pFileDumpRepairProgress );\
+			DumpAABBToGLView( matTransform * pWorkPoint->ptPosition, Vector( 2.0f, 0.05f, 0.05f ), Vector( 1.0f, 0.0f, 0.0f ), pFileDumpRepairProgress );\
+			fclose( pFileDumpRepairProgress );\
+		}\
+		AssertMsg( condition, message );\
+	}
+#else //debug build without dumps: a plain assert
+#define AssertMsg_DumpPolyhedron(condition, message) AssertMsg( condition, message )
+#endif
+#define Assert_DumpPolyhedron(condition) AssertMsg_DumpPolyhedron( condition, #condition ) //uses the text of the condition as the message
+//NOTE(review): the dumping variant expands to a bare if block, so it must not be used as the body of an unbraced if/else. Its fopen() result is also used unchecked
+#else //not _DEBUG
+//outside of debug builds both macros expand to a do-nothing statement and the condition is never evaluated
+#define AssertMsg_DumpPolyhedron(condition, message) NULL;
+#define Assert_DumpPolyhedron(condition) NULL;
+
+#endif
+
+CPolyhedron *ClipLinkedGeometry( GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pAllPolygons, GeneratePolyhedronFromPlanes_UnorderedLineLL *pAllLines, GeneratePolyhedronFromPlanes_UnorderedPointLL *pAllPoints, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory )
+{
+	const float fNegativeOnPlaneEpsilon = -fOnPlaneEpsilon; //points at or below this signed distance are classified as alive
+	//Overview: cuts the linked polygon/line/point geometry by each of the iPlaneCount planes in pOutwardFacingPlanes (4 floats per plane: normal x,y,z then distance), one plane per loop iteration. Returns NULL as soon as a plane leaves no point at least fOnPlaneEpsilon behind it, otherwise returns ConvertLinkedGeometryToPolyhedron() of what remains (bUseTemporaryMemory is only forwarded to that call).
+#ifdef _DEBUG
+	CUtlVector<CPolyhedron *> DebugCutHistory;
+	CUtlVector<const float *> PlaneCutHistory;
+	GeneratePolyhedronFromPlanes_Point *pStartPoint = NULL;
+	GeneratePolyhedronFromPlanes_Point *pWorkPoint = NULL;
+
+	static int iPolyhedronClipCount = 0; //debug-only count of calls to this function; never read here
+	++iPolyhedronClipCount;
+	
+	DebugCutHistory.AddToTail( ConvertLinkedGeometryToPolyhedron( pAllPolygons, pAllLines, pAllPoints, false ) );
+#endif
+
+	//clear out polygon work variables
+	{
+		GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pActivePolygonWalk = pAllPolygons;
+		do //NOTE: do/while dereferences before testing, so pAllPolygons must be non-NULL on entry (same pattern is used for pAllLines and pAllPoints below)
+		{
+			pActivePolygonWalk->pPolygon->bMissingASide = false;
+			pActivePolygonWalk = pActivePolygonWalk->pNext;
+		} while( pActivePolygonWalk );
+	}
+
+
+	//Collections of dead pointers for reallocation, shouldn't be touched until the current loop iteration is done.
+	GeneratePolyhedronFromPlanes_UnorderedPointLL	*pDeadPointCollection = NULL;
+	GeneratePolyhedronFromPlanes_UnorderedLineLL	*pDeadLineCollection = NULL;
+	GeneratePolyhedronFromPlanes_UnorderedPolygonLL	*pDeadPolygonCollection = NULL;
+	GeneratePolyhedronFromPlanes_LineLL				*pDeadLineLinkCollection = NULL;
+
+
+	for( int iCurrentPlane = 0; iCurrentPlane != iPlaneCount; ++iCurrentPlane )
+	{
+		//clear out line work variables
+		{
+			GeneratePolyhedronFromPlanes_UnorderedLineLL *pActiveLineWalk = pAllLines;
+			do
+			{
+				pActiveLineWalk->pLine->bAlive = false;
+				pActiveLineWalk->pLine->bCut = false;
+
+				pActiveLineWalk = pActiveLineWalk->pNext;
+			} while( pActiveLineWalk );
+		}
+		
+		//TODO: Move these pointers into a reallocation pool (the dead lists are only ever pushed to in this function, never read back)
+		pDeadPointCollection = NULL; 
+		pDeadLineCollection = NULL;
+		pDeadLineLinkCollection = NULL;
+		pDeadPolygonCollection = NULL;
+
+		Vector vNormal = *((Vector *)&pOutwardFacingPlanes[(iCurrentPlane * 4) + 0]); //floats [0..2] of each plane are the normal, float [3] is the distance
+		/*double vNormalAsDouble[3];
+		vNormalAsDouble[0] = vNormal.x;
+		vNormalAsDouble[1] = vNormal.y;
+		vNormalAsDouble[2] = vNormal.z;*/
+		float fPlaneDist = pOutwardFacingPlanes[(iCurrentPlane * 4) + 3];
+
+		//===================================================================================================
+		// Step 1: Categorize each point as being either dead, alive, or on-plane
+		//===================================================================================================
+		{
+			bool bAllPointsDead = true;
+			bool bAllPointsAlive = true;
+
+			//find point distances from the plane
+			GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
+			do
+			{
+				GeneratePolyhedronFromPlanes_Point *pPoint = pActivePointWalk->pPoint;
+				float fPointDist = vNormal.Dot( pPoint->ptPosition ) - fPlaneDist; //signed distance: positive is the outward (removed) side of the plane
+				if( fPointDist > fOnPlaneEpsilon )
+				{
+					pPoint->planarity = POINT_DEAD; //point is dead, bang bang
+
+					//mark connected lines as cut
+					GeneratePolyhedronFromPlanes_LineLL *pLineWalk = pPoint->pConnectedLines;
+					GeneratePolyhedronFromPlanes_LineLL *pFirstLine = pLineWalk;
+					do
+					{
+						pLineWalk->pLine->bCut = true;
+						pLineWalk = pLineWalk->pNext;
+					} while( pLineWalk != pFirstLine );
+
+					bAllPointsAlive = false;
+				}
+				else if( fPointDist <= fNegativeOnPlaneEpsilon )
+				{
+					pPoint->planarity = POINT_ALIVE; //point is behind the plane, not voted off the island....yet
+					bAllPointsDead = false;
+
+					//mark connected lines as alive
+					GeneratePolyhedronFromPlanes_LineLL *pLineWalk = pPoint->pConnectedLines;
+					GeneratePolyhedronFromPlanes_LineLL *pFirstLine = pLineWalk;
+					do
+					{
+						pLineWalk->pLine->bAlive = true; //mark the line as alive
+						pLineWalk = pLineWalk->pNext;
+					} while( pLineWalk != pFirstLine );
+				}
+				else
+				{
+					pPoint->planarity = POINT_ONPLANE; //point is on the plane, he's everyone's buddy
+
+					//Project on-plane points leaning towards death closer to the plane. This battles floating point precision decay.
+					// Consider the case of a large on-plane epsilon leaving protrusions over time
+					/*if( fPointDist < 0.0f )
+					{
+						double distAsDouble = fPointDist;
+						double vPositionAsDouble[3];
+						vPositionAsDouble[0] = pPoint->ptPosition.x;
+						vPositionAsDouble[1] = pPoint->ptPosition.y;
+						vPositionAsDouble[2] = pPoint->ptPosition.z;
+
+						pPoint->ptPosition.x = vPositionAsDouble[0] - (distAsDouble * vNormalAsDouble[0]);
+						pPoint->ptPosition.y = vPositionAsDouble[1] - (distAsDouble * vNormalAsDouble[1]);
+						pPoint->ptPosition.z = vPositionAsDouble[2] - (distAsDouble * vNormalAsDouble[2]);
+
+#if ( 0 && defined( _DEBUG ) )
+						float fDebugDist = vNormal.Dot( pPoint->ptPosition ) - fPlaneDist; //just for looking at in watch windows
+						AssertMsg( fabs( fDebugDist ) < fabs(fPointDist), "Projected point is further from plane than unprojected." );
+#endif
+						fPointDist = vNormal.Dot( pPoint->ptPosition ) - fPlaneDist; //recompute dist (not guaranteed to be 0.0 like we want)
+					}*/				
+				}
+
+				pPoint->fPlaneDist = fPointDist;
+
+				pActivePointWalk = pActivePointWalk->pNext;
+			} while( pActivePointWalk );
+
+			if( bAllPointsDead ) //all the points either died or are on the plane, no polyhedron left at all
+			{
+#ifdef _DEBUG
+				for( int i = DebugCutHistory.Count(); --i >= 0; )
+				{
+					if( DebugCutHistory[i] )
+						DebugCutHistory[i]->Release();
+				}
+				DebugCutHistory.RemoveAll();
+#endif
+
+				return NULL; 
+			}
+
+			if( bAllPointsAlive )
+				continue; //no cuts made
+
+
+			//Scan for onplane points connected to only other onplane/dead points, these points get downgraded to dead status.
+			{
+				GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
+				do
+				{
+					if( pActivePointWalk->pPoint->planarity == POINT_ONPLANE )
+					{
+						GeneratePolyhedronFromPlanes_LineLL *pOnPlaneLineWalk = pActivePointWalk->pPoint->pConnectedLines;
+						GeneratePolyhedronFromPlanes_LineLL *pStartLineWalk = pOnPlaneLineWalk;
+						bool bDead = true; //assume it's dead and disprove
+						do
+						{
+							if ( pOnPlaneLineWalk->pLine->bAlive )
+							{
+								bDead = false;
+							}
+							else if ( pOnPlaneLineWalk->pLine->bCut )
+							{
+								//connected to a dead point.
+								if( pOnPlaneLineWalk->pNext->pLine->bCut || pOnPlaneLineWalk->pPrev->pLine->bCut )
+								{
+									//This on-plane point is surrounded by dead points on one polygon of the polyhedron.
+									//	We have to downgrade this point to dead to avoid situations where float imprecision 
+									//	turns the polyhedron into a *slightly* concave shape. Concave shapes might break this algorithm, even falsely concave shapes.
+									bDead = true;
+									break;
+								}
+							}
+
+							pOnPlaneLineWalk = pOnPlaneLineWalk->pNext;
+						} while( pOnPlaneLineWalk != pStartLineWalk );
+
+						if( bDead )
+						{
+							pActivePointWalk->pPoint->planarity = POINT_DEAD;
+
+							pOnPlaneLineWalk = pStartLineWalk;
+
+							//mark connected lines as cut
+							do
+							{
+								pOnPlaneLineWalk->pLine->bCut = true;
+								pOnPlaneLineWalk = pOnPlaneLineWalk->pNext;
+							} while( pOnPlaneLineWalk != pStartLineWalk );
+						}
+					}
+					pActivePointWalk = pActivePointWalk->pNext;
+				} while( pActivePointWalk );
+			}
+#ifdef _DEBUG
+			PlaneCutHistory.AddToTail( &pOutwardFacingPlanes[iCurrentPlane * 4] );
+#endif
+		}
+
+		
+
+
+#ifdef _DEBUG
+		//Run around the edges of all the polygons and ensure they don't have more than one point of lowered "alive" status (alive > onplane > dead) surrounded by higher status
+		//	It indicates a concave shape. It's impossible to have it occur in theoretical space. But floating point numbers introduce error.
+		{
+			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pDebugPolygonWalk = pAllPolygons;
+			do
+			{
+				int iSurroundedCount = 0;
+				GeneratePolyhedronFromPlanes_LineLL *pDebugLineWalk = pDebugPolygonWalk->pPolygon->pLines;
+				GeneratePolyhedronFromPlanes_LineLL *pFirstDebugLine = pDebugLineWalk;
+
+				do
+				{
+					PolyhedronPointPlanarity currentPlanarity = pDebugLineWalk->pLine->pPoints[pDebugLineWalk->iReferenceIndex]->planarity;
+					
+					GeneratePolyhedronFromPlanes_LineLL *pNext = pDebugLineWalk->pNext;
+					PolyhedronPointPlanarity nextPlanarity = pNext->pLine->pPoints[pNext->iReferenceIndex]->planarity;
+
+					if( currentPlanarity < nextPlanarity )
+					{
+						GeneratePolyhedronFromPlanes_LineLL *pPrev = pDebugLineWalk->pPrev;
+						PolyhedronPointPlanarity prevPlanarity = pPrev->pLine->pPoints[pPrev->iReferenceIndex]->planarity;
+
+						if( currentPlanarity < prevPlanarity )
+						{
+							++iSurroundedCount;
+						}
+					}
+
+					pDebugLineWalk = pDebugLineWalk->pNext;
+				} while( pDebugLineWalk != pFirstDebugLine );
+
+				AssertMsg_DumpPolyhedron( iSurroundedCount <= 1, "Concave polygon, cutting process might break. Consider adjusting the on-plane epsilon to better compensate for floating point precision." );
+				pDebugPolygonWalk = pDebugPolygonWalk->pNext;
+			} while( pDebugPolygonWalk );
+		}
+#endif
+
+		//===================================================================================================
+		// Step 2: Remove dead lines. A dead line is one with a dead point that isn't connected to a living point
+		//===================================================================================================
+		{
+			GeneratePolyhedronFromPlanes_UnorderedLineLL *pActiveLineWalk = pAllLines;
+			do
+			{
+				GeneratePolyhedronFromPlanes_Line *pLine = pActiveLineWalk->pLine;
+				if( (pLine->bAlive == false) && (pLine->bCut == true) ) //not connected to a live point, but connected to a dead one. Dead line
+				{
+					//remove line from connected polygons
+					for( int i = 0; i != 2; ++i )
+					{
+						GeneratePolyhedronFromPlanes_Polygon *pPolygon = pLine->pPolygons[i];
+						GeneratePolyhedronFromPlanes_LineLL *pLineLink = pLine->pPolygonLineLinks[i];
+                        
+						pPolygon->bMissingASide = true;
+
+						if( pLineLink->pNext == pLineLink )
+						{
+							//this was the last line of the polygon, it's dead
+							pPolygon->pLines = NULL;
+						}
+						else
+						{
+							//link around this line
+							pPolygon->pLines = pLineLink->pPrev; //Always have the polygon's head line be just before the gap in the polygon
+							pLineLink->pNext->pPrev = pLineLink->pPrev;
+							pLineLink->pPrev->pNext = pLineLink->pNext;
+						}
+
+						//move the line link to the dead list
+						pLineLink->pNext = pDeadLineLinkCollection;
+						pDeadLineLinkCollection = pLineLink;
+					}
+
+					//remove the line from connected points
+					for( int i = 0; i != 2; ++i )
+					{
+						GeneratePolyhedronFromPlanes_Point *pPoint = pLine->pPoints[i];
+						GeneratePolyhedronFromPlanes_LineLL *pLineLink = pLine->pPointLineLinks[i];
+						
+						if( pLineLink->pNext == pLineLink )
+						{					
+							//this is the last line
+							pPoint->pConnectedLines = NULL;
+							Assert( pPoint->planarity != POINT_ALIVE );
+							pPoint->planarity = POINT_DEAD; //in case it was merely POINT_ONPLANE before
+						}
+						else
+						{
+							//link around this line
+							pPoint->pConnectedLines = pLineLink->pNext; //in case pLineLink was the head line
+							pLineLink->pNext->pPrev = pLineLink->pPrev;
+							pLineLink->pPrev->pNext = pLineLink->pNext;
+						}
+
+						//move the line link to the dead list
+						pLineLink->pNext = pDeadLineLinkCollection;
+						pDeadLineLinkCollection = pLineLink;
+					}
+
+					//move the line to the dead list
+					{
+						//link past this node
+						if( pActiveLineWalk->pPrev )
+							pActiveLineWalk->pPrev->pNext = pActiveLineWalk->pNext;
+						else
+							pAllLines = pActiveLineWalk->pNext;
+
+						if( pActiveLineWalk->pNext )
+							pActiveLineWalk->pNext->pPrev = pActiveLineWalk->pPrev;
+
+						GeneratePolyhedronFromPlanes_UnorderedLineLL *pNextLineWalk = pActiveLineWalk->pNext;
+						
+						//add to the dead list
+						pActiveLineWalk->pNext = pDeadLineCollection;
+						pDeadLineCollection = pActiveLineWalk;
+						
+						//next
+						pActiveLineWalk = pNextLineWalk;
+					}
+				}
+				else
+				{
+					pActiveLineWalk = pActiveLineWalk->pNext;
+				}
+			} while( pActiveLineWalk );
+		}
+
+
+		//===================================================================================================
+		// Step 3: Remove dead polygons. A dead polygon has less than 2 lines, or no remaining line marked alive.
+		//===================================================================================================
+		{
+			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pActivePolygonWalk = pAllPolygons;
+			do
+			{
+				GeneratePolyhedronFromPlanes_Polygon *pPolygon = pActivePolygonWalk->pPolygon;
+				GeneratePolyhedronFromPlanes_LineLL *pHeadLine = pPolygon->pLines;
+
+				bool bDead = (pHeadLine == NULL) || (pHeadLine->pNext == pHeadLine);
+				if( !bDead )
+				{
+					//there's a rare case where a polygon can be almost entirely coplanar with the cut, it comes purely out of the land of imprecision
+					bDead = true; //assume it's dead, and disprove
+
+					GeneratePolyhedronFromPlanes_LineLL *pTestLineWalk = pHeadLine;
+					do
+					{
+						if( pTestLineWalk->pLine->bAlive )
+						{
+							bDead = false;
+							break;
+						}
+							
+						pTestLineWalk = pTestLineWalk->pNext;
+					} while( pTestLineWalk != pHeadLine );
+				}
+
+				if( bDead )
+				{
+					//dead polygon, move it to the dead list
+
+					//link around this node
+					if( pActivePolygonWalk->pPrev )
+						pActivePolygonWalk->pPrev->pNext = pActivePolygonWalk->pNext;
+					else
+						pAllPolygons = pAllPolygons->pNext; //pActivePolygonWalk was the head node
+
+					if( pActivePolygonWalk->pNext )
+						pActivePolygonWalk->pNext->pPrev = pActivePolygonWalk->pPrev;
+
+					GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pNextPolygonWalk = pActivePolygonWalk->pNext;
+
+					//add to the dead list
+					pActivePolygonWalk->pNext = pDeadPolygonCollection;
+					pDeadPolygonCollection = pActivePolygonWalk;
+
+					//next
+					pActivePolygonWalk = pNextPolygonWalk;
+				}
+				else
+				{
+					AssertMsg_DumpPolyhedron( (pActivePolygonWalk->pPolygon->pLines != NULL) && 
+						(pActivePolygonWalk->pPolygon->pLines != pActivePolygonWalk->pPolygon->pLines->pNext), "Living polygon with less than 2 lines" );
+					
+					pActivePolygonWalk = pActivePolygonWalk->pNext;
+				}
+			} while( pActivePolygonWalk );
+		}
+
+		//===================================================================================================
+		// Step 4: Remove dead points.
+		//===================================================================================================
+		{
+			GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
+			do
+			{
+				if( pActivePointWalk->pPoint->planarity == POINT_DEAD )
+				{
+					GeneratePolyhedronFromPlanes_UnorderedPointLL *pNext = pActivePointWalk->pNext;
+
+					if( pActivePointWalk->pPrev )
+						pActivePointWalk->pPrev->pNext = pActivePointWalk->pNext;
+					else
+						pAllPoints = pAllPoints->pNext;
+
+					if( pActivePointWalk->pNext )
+						pActivePointWalk->pNext->pPrev = pActivePointWalk->pPrev;
+
+					pActivePointWalk->pNext = pDeadPointCollection;
+					pDeadPointCollection = pActivePointWalk;
+
+					pActivePointWalk = pNext;
+				}
+				else
+				{
+					pActivePointWalk = pActivePointWalk->pNext;
+				}				
+			} while( pActivePointWalk );
+		}
+
+
+		//===================================================================================================
+		// Step 5: Handle cut lines
+		//===================================================================================================
+		{
+			GeneratePolyhedronFromPlanes_UnorderedLineLL *pActiveLineWalk = pAllLines;
+			do
+			{
+				GeneratePolyhedronFromPlanes_Line *pWorkLine = pActiveLineWalk->pLine;
+				Assert_DumpPolyhedron( (pWorkLine->bAlive == true) || (pWorkLine->bCut == false) ); //all dead lines should have already been removed
+				
+				if( pWorkLine->bCut )
+				{
+					GeneratePolyhedronFromPlanes_Point **pLinePoints = pWorkLine->pPoints;
+
+					Assert_DumpPolyhedron( (pLinePoints[0]->planarity == POINT_DEAD) || (pLinePoints[1]->planarity == POINT_DEAD) ); //one of the two has to be a dead point
+
+					int iDeadIndex = (pLinePoints[0]->planarity == POINT_DEAD)?(0):(1);
+					int iLivingIndex = 1 - iDeadIndex;
+					GeneratePolyhedronFromPlanes_Point *pDeadPoint = pLinePoints[iDeadIndex];
+					GeneratePolyhedronFromPlanes_Point *pLivingPoint = pLinePoints[iLivingIndex];
+
+					Assert_DumpPolyhedron( pLivingPoint->planarity == POINT_ALIVE ); //if this point were on-plane or dead, the line should be dead
+
+					//We'll be de-linking from the old point and generating a new one. We do this so other lines can still access the dead point's untouched data.
+					
+					//Generate a new point
+					GeneratePolyhedronFromPlanes_Point *pNewPoint = (GeneratePolyhedronFromPlanes_Point *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_Point ) ); //NOTE(review): stackalloc is called inside the per-plane loop; presumably alloca-style, so stack use would grow with every cut - confirm
+					{
+						//add this point to the active list
+						pAllPoints->pPrev = (GeneratePolyhedronFromPlanes_UnorderedPointLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_UnorderedPointLL ) );
+						pAllPoints->pPrev->pNext = pAllPoints;
+						pAllPoints = pAllPoints->pPrev;
+						pAllPoints->pPrev = NULL;
+						pAllPoints->pPoint = pNewPoint;
+
+
+						float fInvTotalDist = 1.0f/(pDeadPoint->fPlaneDist - pLivingPoint->fPlaneDist); //subtraction because the living index is known to be negative
+						pNewPoint->ptPosition = (pLivingPoint->ptPosition * (pDeadPoint->fPlaneDist * fInvTotalDist)) - (pDeadPoint->ptPosition * (pLivingPoint->fPlaneDist * fInvTotalDist));
+
+#if ( 0 && defined( _DEBUG ) )
+						float fDebugDist = vNormal.Dot( pNewPoint->ptPosition ) - fPlaneDist; //just for looking at in watch windows
+						AssertMsg_DumpPolyhedron( fabs( fDebugDist ) < fOnPlaneEpsilon, "Generated split point is far from plane" );
+
+						//verify that the new point isn't sitting on top of another
+						{
+							GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
+							do
+							{
+								if( pActivePointWalk->pPoint != pNewPoint )
+								{
+									Vector vDiff = pActivePointWalk->pPoint->ptPosition - pNewPoint->ptPosition;
+
+									AssertMsg_DumpPolyhedron( vDiff.Length() > fOnPlaneEpsilon, "Generated a point on top of another" );
+								}
+								pActivePointWalk = pActivePointWalk->pNext;
+							} while( pActivePointWalk );
+						}
+#endif
+
+						pNewPoint->planarity = POINT_ONPLANE;
+						pNewPoint->fPlaneDist = 0.0f;
+					}
+					
+					GeneratePolyhedronFromPlanes_LineLL *pNewLineLink = pNewPoint->pConnectedLines = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
+					pNewLineLink->pLine = pWorkLine;
+					pNewLineLink->pNext = pNewLineLink;
+					pNewLineLink->pPrev = pNewLineLink;
+					pNewLineLink->iReferenceIndex = iLivingIndex; //index within pWorkLine->pPoints of the far (living) endpoint
+
+					pWorkLine->pPoints[iDeadIndex] = pNewPoint;
+					pWorkLine->pPointLineLinks[iDeadIndex] = pNewLineLink;
+					pNewPoint->pConnectedLines = pNewLineLink; //redundant: already assigned where pNewLineLink was allocated above
+
+					//A new line is needed on each polygon touching the dead point to connect the two new endpoints for split lines. 
+					// So mark connected polygons as missing a side.
+					for( int i = 0; i != 2; ++i )
+						pWorkLine->pPolygons[i]->bMissingASide = true;
+					
+
+					//Always have a cut polygon's head line be just before the gap in the polygon. 
+					// In this case, we know that one of the two polygons goes clockwise into the dead point, so have that polygon point at this line.
+					// We don't know enough about the other polygon to do anything here, but another cut line will handle that polygon. So it all works out in the end.
+					pWorkLine->pPolygons[iDeadIndex]->pLines = pWorkLine->pPolygonLineLinks[iDeadIndex];
+				}
+
+				pActiveLineWalk = pActiveLineWalk->pNext;
+			} while( pActiveLineWalk );
+		}
+
+
+		//===================================================================================================
+		// Step 6: Repair polygons that are missing a side. And generate the new coplanar polygon.
+		//===================================================================================================
+		{
+			//Find the first polygon missing a side.
+			// We'll then walk from polygon to polygon using line connections so that we can generate the new polygon in a clockwise manner.
+			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pActivePolygonWalk = pAllPolygons;
+
+			while( (pActivePolygonWalk != NULL) && (pActivePolygonWalk->pPolygon->bMissingASide == false) )
+			{
+				pActivePolygonWalk = pActivePolygonWalk->pNext;
+			}
+
+			//acquire iteration data
+#ifndef _DEBUG
+			GeneratePolyhedronFromPlanes_Point *pStartPoint;
+			GeneratePolyhedronFromPlanes_Point *pWorkPoint;
+#endif
+
+			GeneratePolyhedronFromPlanes_LineLL *pLastLineLink;
+			GeneratePolyhedronFromPlanes_Polygon *pWorkPolygon;			
+			GeneratePolyhedronFromPlanes_LineLL *pTestLine;
+
+#ifdef _DEBUG
+			GeneratePolyhedronFromPlanes_Polygon *pLastWorkPolygon = NULL;
+			GeneratePolyhedronFromPlanes_Point *pLastWorkPoint = NULL; //written below but never read in this function
+#endif
+
+			if( pActivePolygonWalk )
+			{
+				//grab the polygon we'll be starting with
+				GeneratePolyhedronFromPlanes_Polygon *pBrokenPolygon = pActivePolygonWalk->pPolygon;
+				
+				{
+					GeneratePolyhedronFromPlanes_LineLL *pTemp = pBrokenPolygon->pLines->pNext;
+					pStartPoint = pTemp->pLine->pPoints[1 - pTemp->iReferenceIndex];
+					Assert_DumpPolyhedron( pStartPoint->planarity == POINT_ONPLANE ); //every working point should be coplanar
+					pLastLineLink = pTemp->pLine->pPointLineLinks[1 - pTemp->iReferenceIndex]->pNext;
+					pWorkPolygon = pBrokenPolygon;
+				}
+
+				pWorkPoint = pStartPoint;
+				pTestLine = pLastLineLink->pPrev; //rotate counterclockwise around the point
+			}
+			else
+			{
+				//apparently the plane was entirely through existing polygonal borders, extremely rare but it can happen with inefficient cutting planes
+                GeneratePolyhedronFromPlanes_UnorderedPointLL *pActivePointWalk = pAllPoints;
+				while( (pActivePointWalk != NULL) && (pActivePointWalk->pPoint->planarity != POINT_ONPLANE) )
+				{
+					pActivePointWalk = pActivePointWalk->pNext;
+				}
+
+				Assert( pActivePointWalk != NULL ); //NOTE(review): pActivePointWalk is dereferenced on the next statement; if Assert compiles out, a NULL here is not otherwise guarded
+
+				pStartPoint = pWorkPoint = pActivePointWalk->pPoint;
+				GeneratePolyhedronFromPlanes_LineLL *pLines = pWorkPoint->pConnectedLines;
+				
+				while( !pLines->pLine->bAlive ) //seek clockwise until we find a line not on the plane
+					pLines = pLines->pNext;
+
+				while( pLines->pLine->bAlive ) //now seek counterclockwise until we find a line on the plane (in case we started on an alive line last seek)
+					pLines = pLines->pPrev;
+
+				//now pLines points at one side of the polygon, with pActivePointWalk
+				pLastLineLink = pLines;
+				pTestLine = pLines->pPrev;
+				pWorkPolygon = pTestLine->pLine->pPolygons[1 - pTestLine->iReferenceIndex];
+
+			}
+
+			//create the new polygon
+			GeneratePolyhedronFromPlanes_Polygon *pNewPolygon = (GeneratePolyhedronFromPlanes_Polygon *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_Polygon ) );
+			{
+				//before we forget, add this polygon to the active list
+				pAllPolygons->pPrev = (GeneratePolyhedronFromPlanes_UnorderedPolygonLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_UnorderedPolygonLL ) );
+				pAllPolygons->pPrev->pNext = pAllPolygons;
+				pAllPolygons = pAllPolygons->pPrev;
+				pAllPolygons->pPrev = NULL;
+				pAllPolygons->pPolygon = pNewPolygon;
+
+				pNewPolygon->bMissingASide = false; //technically missing all its sides, but we're fixing it now
+				pNewPolygon->vSurfaceNormal = vNormal;
+				pNewPolygon->pLines = NULL;
+			}
+
+
+
+			//===================================================================================================================
+			// The general idea of the upcoming algorithm to put together a new polygon and patch broken polygons...
+			//	You have a point and a line the algorithm just jumped across.
+			//		1. Rotate through the point's line links one time counterclockwise (pPrev)
+			//		2. If the line is cut, then we make a new bridging line in the polygon between that line and the one counterclockwise to it. (pPrev)
+			//			If the line is on-plane. Skip the bridge line making, but set links to the new polygon as if we'd just created the bridge
+			//		3. Once we follow a line back to the point where we started, we should be all done.
+
+			do
+			{
+				if( pWorkPolygon->bMissingASide )
+				{
+					//during the cutting process we made sure that the head line link was going clockwise into the missing area
+					GeneratePolyhedronFromPlanes_LineLL *pGapLines[2];
+					pGapLines[1] = pTestLine->pLine->pPolygonLineLinks[pTestLine->iReferenceIndex]; //get the same line, but in the polygons linked list.
+					Assert_DumpPolyhedron( pGapLines[1]->pLine == pTestLine->pLine );
+					pGapLines[0] = pGapLines[1]->pPrev;
+
+					Assert_DumpPolyhedron( pWorkPolygon->bMissingASide );
+
+#ifdef _DEBUG
+					{
+						//ensure that the space between the gap lines is the only space where fixing is required
+						GeneratePolyhedronFromPlanes_LineLL *pDebugLineWalk = pGapLines[1]->pNext;
+						
+						while( pDebugLineWalk != pGapLines[0] )
+						{
+							Assert_DumpPolyhedron( pDebugLineWalk->pLine->bCut == false );
+							pDebugLineWalk = pDebugLineWalk->pNext;
+						}
+					}
+#endif
+
+					GeneratePolyhedronFromPlanes_Line *pJoinLine = (GeneratePolyhedronFromPlanes_Line *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_Line ) );
+					{
+						//before we forget, add this line to the active list
+						pAllLines->pPrev = (GeneratePolyhedronFromPlanes_UnorderedLineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_UnorderedLineLL ) );
+						pAllLines->pPrev->pNext = pAllLines;
+						pAllLines = pAllLines->pPrev;
+						pAllLines->pPrev = NULL;
+						pAllLines->pLine = pJoinLine;
+
+						pJoinLine->bAlive = false;
+						pJoinLine->bCut = false;
+					}
+
+
+					pJoinLine->pPoints[0] = pGapLines[0]->pLine->pPoints[pGapLines[0]->iReferenceIndex];
+					pJoinLine->pPoints[1] = pGapLines[1]->pLine->pPoints[1 - pGapLines[1]->iReferenceIndex];
+
+					pJoinLine->pPolygons[0] = pNewPolygon;
+					pJoinLine->pPolygons[1] = pWorkPolygon;
+
+					//now create all 4 links into the line
+					GeneratePolyhedronFromPlanes_LineLL *pPointLinks[2];
+					pPointLinks[0] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
+					pPointLinks[1] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
+
+					GeneratePolyhedronFromPlanes_LineLL *pPolygonLinks[2];
+					pPolygonLinks[0] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
+					pPolygonLinks[1] = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
+
+					pPointLinks[0]->pLine = pPointLinks[1]->pLine = pPolygonLinks[0]->pLine = pPolygonLinks[1]->pLine = pJoinLine;
+
+					pJoinLine->pPointLineLinks[0] = pPointLinks[0];
+					pJoinLine->pPointLineLinks[1] = pPointLinks[1];
+					pJoinLine->pPolygonLineLinks[0] = pPolygonLinks[0];
+					pJoinLine->pPolygonLineLinks[1] = pPolygonLinks[1];
+
+
+
+					pPointLinks[0]->iReferenceIndex = 1;
+					pPointLinks[1]->iReferenceIndex = 0;
+
+					//Insert before the link from point 0 to gap line 0 (counterclockwise rotation)
+					{
+						GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pGapLines[0]->pLine->pPointLineLinks[pGapLines[0]->iReferenceIndex];
+						Assert_DumpPolyhedron( pWorkLink->pLine == pGapLines[0]->pLine );
+
+						pPointLinks[0]->pPrev = pWorkLink->pPrev;
+						pPointLinks[0]->pNext = pWorkLink;
+
+						pWorkLink->pPrev->pNext = pPointLinks[0];
+						pWorkLink->pPrev = pPointLinks[0];						
+					}
+
+					//Insert after the link from point 1 to gap line 1 (clockwise rotation)
+					{
+						GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pGapLines[1]->pLine->pPointLineLinks[1 - pGapLines[1]->iReferenceIndex];
+						Assert_DumpPolyhedron( pWorkLink->pLine == pGapLines[1]->pLine );
+
+						pPointLinks[1]->pNext = pWorkLink->pNext;
+						pPointLinks[1]->pPrev = pWorkLink;
+						
+						pWorkLink->pNext->pPrev = pPointLinks[1];
+						pWorkLink->pNext = pPointLinks[1];						
+					}
+
+
+
+
+					pPolygonLinks[0]->iReferenceIndex = 0;
+					pPolygonLinks[1]->iReferenceIndex = 1;
+
+					//Insert before the head line in the new polygon (at the end of the clockwise order)
+					{
+						if( pNewPolygon->pLines == NULL )
+						{
+							//this is the first line being added to the polygon
+							pNewPolygon->pLines = pPolygonLinks[0];
+							pPolygonLinks[0]->pNext = pPolygonLinks[0];
+							pPolygonLinks[0]->pPrev = pPolygonLinks[0];
+						}
+						else
+						{
+							GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pNewPolygon->pLines;
+
+							pPolygonLinks[0]->pNext = pWorkLink;
+							pPolygonLinks[0]->pPrev = pWorkLink->pPrev;
+
+							pWorkLink->pPrev->pNext = pPolygonLinks[0];
+							pWorkLink->pPrev = pPolygonLinks[0];
+						}
+					}
+
+					//Insert after the head line in the work polygon
+					{
+						GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pWorkPolygon->pLines;
+
+						pPolygonLinks[1]->pNext = pWorkLink->pNext;
+						pPolygonLinks[1]->pPrev = pWorkLink;
+
+						pWorkLink->pNext->pPrev = pPolygonLinks[1];
+						pWorkLink->pNext = pPolygonLinks[1];
+					}
+
+					pWorkPolygon->bMissingASide = false; //repairs are finished
+
+#ifdef _DEBUG
+					pLastWorkPolygon = pWorkPolygon;
+					pLastWorkPoint = pWorkPoint;
+#endif
+					//move to the next point
+					pWorkPoint = pJoinLine->pPoints[0];
+					pLastLineLink = pJoinLine->pPointLineLinks[0];
+					Assert_DumpPolyhedron( pWorkPoint->planarity == POINT_ONPLANE ); //every working point should be coplanar
+					
+					pTestLine = pLastLineLink->pPrev;
+					if( pTestLine->pLine->pPoints[pTestLine->iReferenceIndex]->planarity == POINT_ALIVE )
+						pWorkPolygon = pTestLine->pLine->pPolygons[pTestLine->iReferenceIndex];
+					else
+						pWorkPolygon = pTestLine->pLine->pPolygons[1 - pTestLine->iReferenceIndex];
+					
+					Assert_DumpPolyhedron( pWorkPolygon != pLastWorkPolygon );
+					Assert_DumpPolyhedron( (pWorkPoint == pStartPoint) ||
+											(pGapLines[0]->pLine->bCut == false) || 
+											(pWorkPolygon->bMissingASide == true) ); //if we're not done fixing, and if the shared line was cut, the next polygon must be missing a side
+				}
+				else
+				{
+					//line is on the plane, meaning the polygon isn't broken and doesn't need patching
+					Assert_DumpPolyhedron( pTestLine->pLine->bCut == false );
+					Assert_DumpPolyhedron( (pTestLine->pLine->pPoints[0]->planarity == POINT_ONPLANE) && (pTestLine->pLine->pPoints[1]->planarity == POINT_ONPLANE) );
+
+					
+					//link to this line from the new polygon
+					GeneratePolyhedronFromPlanes_LineLL *pNewLineLink;
+					pNewLineLink = (GeneratePolyhedronFromPlanes_LineLL *)stackalloc( sizeof( GeneratePolyhedronFromPlanes_LineLL ) );
+					
+					pNewLineLink->pLine = pTestLine->pLine;
+					pNewLineLink->iReferenceIndex = pTestLine->iReferenceIndex;
+
+					//Insert before the head line in the new polygon (at the end of the clockwise order)
+					{
+						if( pNewPolygon->pLines == NULL )
+						{
+							//this is the first line being added to the polygon
+							pNewPolygon->pLines = pNewLineLink;
+							pNewLineLink->pNext = pNewLineLink;
+							pNewLineLink->pPrev = pNewLineLink;
+						}
+						else
+						{
+							GeneratePolyhedronFromPlanes_LineLL *pWorkLink = pNewPolygon->pLines;
+
+							pNewLineLink->pNext = pWorkLink;
+							pNewLineLink->pPrev = pWorkLink->pPrev;
+
+							pWorkLink->pPrev->pNext = pNewLineLink;
+							pWorkLink->pPrev = pNewLineLink;
+						}
+					}
+
+					//Since the entire line is on the plane, that means it used to point to something that used to reside where the new polygon is going
+					// Update the link to the new polygon pointer and be on our way
+					pTestLine->pLine->pPolygons[pTestLine->iReferenceIndex] = pNewPolygon;
+					pTestLine->pLine->pPolygonLineLinks[pTestLine->iReferenceIndex] = pNewLineLink;
+
+#ifdef _DEBUG
+					pLastWorkPolygon = pWorkPolygon;
+					pLastWorkPoint = pWorkPoint;
+#endif
+
+					pWorkPoint = pTestLine->pLine->pPoints[pTestLine->iReferenceIndex];
+					pLastLineLink = pTestLine->pLine->pPointLineLinks[pTestLine->iReferenceIndex];
+					Assert_DumpPolyhedron( pWorkPoint->planarity == POINT_ONPLANE ); //every working point should be coplanar
+
+					pTestLine = pLastLineLink->pPrev;
+					if( pTestLine->pLine->pPoints[pTestLine->iReferenceIndex]->planarity == POINT_ALIVE )
+						pWorkPolygon = pTestLine->pLine->pPolygons[pTestLine->iReferenceIndex];
+					else
+						pWorkPolygon = pTestLine->pLine->pPolygons[1 - pTestLine->iReferenceIndex];
+
+					Assert_DumpPolyhedron( pWorkPolygon != pLastWorkPolygon );
+				}
+			} while( pWorkPoint != pStartPoint );
+		}
+
+#ifdef _DEBUG
+		//verify that repairs are complete
+		{
+			GeneratePolyhedronFromPlanes_UnorderedPolygonLL *pDebugPolygonWalk = pAllPolygons;
+			do
+			{
+				AssertMsg_DumpPolyhedron( pDebugPolygonWalk->pPolygon->bMissingASide == false, "Some polygons not repaired after cut" );
+				pDebugPolygonWalk = pDebugPolygonWalk->pNext;
+			} while( pDebugPolygonWalk );
+
+
+			GeneratePolyhedronFromPlanes_UnorderedPointLL *pDebugPointWalk = pAllPoints;
+			do
+			{
+				AssertMsg_DumpPolyhedron( pDebugPointWalk->pPoint->pConnectedLines, "Point connected to no lines after cut" );
+				pDebugPointWalk = pDebugPointWalk->pNext;
+			} while( pDebugPointWalk );
+
+			pStartPoint = NULL;
+		}
+
+		//maintain the cut history
+		DebugCutHistory.AddToTail( ConvertLinkedGeometryToPolyhedron( pAllPolygons, pAllLines, pAllPoints, false ) );
+#endif
+	}
+
+#ifdef _DEBUG
+	for( int i = DebugCutHistory.Count(); --i >= 0; )
+	{
+		if( DebugCutHistory[i] )
+			DebugCutHistory[i]->Release();
+	}
+	DebugCutHistory.RemoveAll();
+#endif
+
+	return ConvertLinkedGeometryToPolyhedron( pAllPolygons, pAllLines, pAllPoints, bUseTemporaryMemory );
+}
+
+//Wires up the 3 line links of starting-box point iPointNum as a circular list (0 -> 1 -> 2 -> 0) stored in StartingPoints_To_Lines_Links[(iPointNum * 3) + 0..2].
+//Each (lineindexN, iOtherPointIndexN) pair gives the link's line and iReferenceIndex; that line's pPointLineLinks[1 - iOtherPointIndexN] is pointed back at the link.
+#define STARTPOINTTOLINELINKS(iPointNum, lineindex1, iOtherPointIndex1, lineindex2, iOtherPointIndex2, lineindex3, iOtherPointIndex3 )\
+	StartingBoxPoints[iPointNum].pConnectedLines = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].pLine = &StartingBoxLines[lineindex1];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].iReferenceIndex = iOtherPointIndex1;\
+	StartingBoxLines[lineindex1].pPointLineLinks[1 - iOtherPointIndex1] = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].pPrev = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 2];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 0].pNext = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 1];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].pLine = &StartingBoxLines[lineindex2];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].iReferenceIndex = iOtherPointIndex2;\
+	StartingBoxLines[lineindex2].pPointLineLinks[1 - iOtherPointIndex2] = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 1];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].pPrev = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 1].pNext = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 2];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].pLine = &StartingBoxLines[lineindex3];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].iReferenceIndex = iOtherPointIndex3;\
+	StartingBoxLines[lineindex3].pPointLineLinks[1 - iOtherPointIndex3] = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 2];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].pPrev = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 1];\
+	StartingPoints_To_Lines_Links[(iPointNum * 3) + 2].pNext = &StartingPoints_To_Lines_Links[(iPointNum * 3) + 0];
+//Sets both end points (point1, point2) and both bordering polygons (poly1, poly2) of starting-box line linenum.
+#define STARTBOXCONNECTION( linenum, point1, point2, poly1, poly2 )\
+	StartingBoxLines[linenum].pPoints[0] = &StartingBoxPoints[point1];\
+	StartingBoxLines[linenum].pPoints[1] = &StartingBoxPoints[point2];\
+	StartingBoxLines[linenum].pPolygons[0] = &StartingBoxPolygons[poly1];\
+	StartingBoxLines[linenum].pPolygons[1] = &StartingBoxPolygons[poly2];
+//Wires up the 4 line links of starting-box polygon polynum as a circular list in StartingPolygon_To_Lines_Links[(polynum * 4) + 0..3]; each line's pPolygonLineLinks[iThisPolyIndexN] is pointed back at its link.
+#define STARTPOLYGONTOLINELINKS( polynum, lineindex1, iThisPolyIndex1, lineindex2, iThisPolyIndex2, lineindex3, iThisPolyIndex3, lineindex4, iThisPolyIndex4 )\
+	StartingBoxPolygons[polynum].pLines = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].pLine = &StartingBoxLines[lineindex1];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].iReferenceIndex = iThisPolyIndex1;\
+	StartingBoxLines[lineindex1].pPolygonLineLinks[iThisPolyIndex1] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 3];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 0].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 1];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].pLine = &StartingBoxLines[lineindex2];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].iReferenceIndex = iThisPolyIndex2;\
+	StartingBoxLines[lineindex2].pPolygonLineLinks[iThisPolyIndex2] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 1];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 1].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 2];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].pLine = &StartingBoxLines[lineindex3];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].iReferenceIndex = iThisPolyIndex3;\
+	StartingBoxLines[lineindex3].pPolygonLineLinks[iThisPolyIndex3] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 2];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 1];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 2].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 3];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].pLine = &StartingBoxLines[lineindex4];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].iReferenceIndex = iThisPolyIndex4;\
+	StartingBoxLines[lineindex4].pPolygonLineLinks[iThisPolyIndex4] = &StartingPolygon_To_Lines_Links[(polynum * 4) + 3];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].pPrev = &StartingPolygon_To_Lines_Links[(polynum * 4) + 2];\
+	StartingPolygon_To_Lines_Links[(polynum * 4) + 3].pNext = &StartingPolygon_To_Lines_Links[(polynum * 4) + 0];
+//Builds a polyhedron from iPlaneCount planes (4 floats each, read from pOutwardFacingPlanes) by clipping an oversized axis-aligned starting box against them.
+//Returns NULL when FindConvexShapeLooseAABB() finds no shape; otherwise returns the result of ClipLinkedGeometry(), which is also handed fOnPlaneEpsilon and bUseTemporaryMemory.
+CPolyhedron *GeneratePolyhedronFromPlanes( const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory )
+{
+	//this is version 2 of the polyhedron generator, version 1 made individual polygons and joined points together, some guesswork is involved and it therefore isn't a solid method
+	//this version will start with a cube and hack away at it (retaining point connection information) to produce a polyhedron with no guesswork involved, this method should be rock solid
+	
+	//the bounding box search below is given inward facing planes, so build a sign-flipped copy (ClipLinkedGeometry at the bottom still receives the original outward facing set)
+	float *pFlippedPlanes = (float *)stackalloc( (iPlaneCount * 4) * sizeof( float ) );
+	for( int i = 0; i != iPlaneCount * 4; ++i )
+	{
+		pFlippedPlanes[i] = -pOutwardFacingPlanes[i];
+	}
+
+	//our first goal is to find the size of a cube big enough to encapsulate all points that will be in the final polyhedron
+	Vector vAABBMins, vAABBMaxs;
+	if( FindConvexShapeLooseAABB( pFlippedPlanes, iPlaneCount, &vAABBMins, &vAABBMaxs ) == false )
+		return NULL; //no shape to work with apparently
+
+	
+	//grow the bounding box on every side by half its size plus 100 units, since the loose AABB is probably a bit inaccurate
+	{
+		Vector vGrow = (vAABBMaxs - vAABBMins) * 0.5f;
+		vGrow.x += 100.0f;
+		vGrow.y += 100.0f;
+		vGrow.z += 100.0f;
+
+		vAABBMaxs += vGrow;
+		vAABBMins -= vGrow;
+	}
+
+	//generate our starting cube using the 2x AABB so we can start hacking away at it
+	
+	
+
+	//create our starting box on the stack
+	GeneratePolyhedronFromPlanes_Point StartingBoxPoints[8];
+	GeneratePolyhedronFromPlanes_Line StartingBoxLines[12];
+	GeneratePolyhedronFromPlanes_Polygon StartingBoxPolygons[6];
+	GeneratePolyhedronFromPlanes_LineLL StartingPoints_To_Lines_Links[24]; //8 points, 3 lines per point
+	GeneratePolyhedronFromPlanes_LineLL StartingPolygon_To_Lines_Links[24]; //6 polygons, 4 lines per poly
+	
+	GeneratePolyhedronFromPlanes_UnorderedPolygonLL StartingPolygonList[6]; //6 polygons
+	GeneratePolyhedronFromPlanes_UnorderedLineLL StartingLineList[12]; //12 lines
+	GeneratePolyhedronFromPlanes_UnorderedPointLL StartingPointList[8]; //8 points
+
+
+	//I had to work all this out on a whiteboard if it seems completely unintuitive.
+	{
+		StartingBoxPoints[0].ptPosition.Init( vAABBMins.x, vAABBMins.y, vAABBMins.z );
+		STARTPOINTTOLINELINKS( 0, 0, 1, 4, 1, 3, 0 );
+
+		StartingBoxPoints[1].ptPosition.Init( vAABBMins.x, vAABBMaxs.y, vAABBMins.z );
+		STARTPOINTTOLINELINKS( 1, 0, 0, 1, 1, 5, 1 );
+
+		StartingBoxPoints[2].ptPosition.Init( vAABBMins.x, vAABBMins.y, vAABBMaxs.z );
+		STARTPOINTTOLINELINKS( 2, 4, 0, 8, 1, 11, 0 );
+
+		StartingBoxPoints[3].ptPosition.Init( vAABBMins.x, vAABBMaxs.y, vAABBMaxs.z );
+		STARTPOINTTOLINELINKS( 3, 5, 0, 9, 1, 8, 0 );
+
+		StartingBoxPoints[4].ptPosition.Init( vAABBMaxs.x, vAABBMins.y, vAABBMins.z );
+		STARTPOINTTOLINELINKS( 4, 2, 0, 3, 1, 7, 1 );
+
+		StartingBoxPoints[5].ptPosition.Init( vAABBMaxs.x, vAABBMaxs.y, vAABBMins.z );
+		STARTPOINTTOLINELINKS( 5, 1, 0, 2, 1, 6, 1 );
+
+		StartingBoxPoints[6].ptPosition.Init( vAABBMaxs.x, vAABBMins.y, vAABBMaxs.z );
+		STARTPOINTTOLINELINKS( 6, 7, 0, 11, 1, 10, 0 );
+
+		StartingBoxPoints[7].ptPosition.Init( vAABBMaxs.x, vAABBMaxs.y, vAABBMaxs.z );
+		STARTPOINTTOLINELINKS( 7, 6, 0, 10, 1, 9, 0 );
+
+		STARTBOXCONNECTION( 0, 0, 1, 0, 5 );
+		STARTBOXCONNECTION( 1, 1, 5, 1, 5 );
+		STARTBOXCONNECTION( 2, 5, 4, 2, 5 );
+		STARTBOXCONNECTION( 3, 4, 0, 3, 5 );
+		STARTBOXCONNECTION( 4, 0, 2, 3, 0 );
+		STARTBOXCONNECTION( 5, 1, 3, 0, 1 );
+		STARTBOXCONNECTION( 6, 5, 7, 1, 2 );
+		STARTBOXCONNECTION( 7, 4, 6, 2, 3 );
+		STARTBOXCONNECTION( 8, 2, 3, 4, 0 );
+		STARTBOXCONNECTION( 9, 3, 7, 4, 1 );
+		STARTBOXCONNECTION( 10, 7, 6, 4, 2 );
+		STARTBOXCONNECTION( 11, 6, 2, 4, 3 );
+
+		//NOTE(review): the set below reassigns all four fields written by the set above (same points, polygon order swapped), so only the assignments below take effect
+		STARTBOXCONNECTION( 0, 0, 1, 5, 0 );
+		STARTBOXCONNECTION( 1, 1, 5, 5, 1 );
+		STARTBOXCONNECTION( 2, 5, 4, 5, 2 );
+		STARTBOXCONNECTION( 3, 4, 0, 5, 3 );
+		STARTBOXCONNECTION( 4, 0, 2, 0, 3 );
+		STARTBOXCONNECTION( 5, 1, 3, 1, 0 );
+		STARTBOXCONNECTION( 6, 5, 7, 2, 1 );
+		STARTBOXCONNECTION( 7, 4, 6, 3, 2 );
+		STARTBOXCONNECTION( 8, 2, 3, 0, 4 );
+		STARTBOXCONNECTION( 9, 3, 7, 1, 4 );
+		STARTBOXCONNECTION( 10, 7, 6, 2, 4 );
+		STARTBOXCONNECTION( 11, 6, 2, 3, 4 );
+
+		StartingBoxPolygons[0].vSurfaceNormal.Init( -1.0f, 0.0f, 0.0f );
+		StartingBoxPolygons[1].vSurfaceNormal.Init( 0.0f, 1.0f, 0.0f );
+		StartingBoxPolygons[2].vSurfaceNormal.Init( 1.0f, 0.0f, 0.0f );
+		StartingBoxPolygons[3].vSurfaceNormal.Init( 0.0f, -1.0f, 0.0f );
+		StartingBoxPolygons[4].vSurfaceNormal.Init( 0.0f, 0.0f, 1.0f );
+		StartingBoxPolygons[5].vSurfaceNormal.Init( 0.0f, 0.0f, -1.0f );
+
+
+		STARTPOLYGONTOLINELINKS( 0, 0, 1, 5, 1, 8, 0, 4, 0 );
+		STARTPOLYGONTOLINELINKS( 1, 1, 1, 6, 1, 9, 0, 5, 0 );
+		STARTPOLYGONTOLINELINKS( 2, 2, 1, 7, 1, 10, 0, 6, 0 );
+		STARTPOLYGONTOLINELINKS( 3, 3, 1, 4, 1, 11, 0, 7, 0 );
+		STARTPOLYGONTOLINELINKS( 4, 8, 1, 9, 1, 10, 1, 11, 1 );
+		STARTPOLYGONTOLINELINKS( 5, 0, 0, 3, 0, 2, 0, 1, 0 );
+
+		//chain the 6 polygons into a NULL terminated doubly linked list
+		{
+			StartingPolygonList[0].pPolygon = &StartingBoxPolygons[0];
+			StartingPolygonList[0].pNext = &StartingPolygonList[1];
+			StartingPolygonList[0].pPrev = NULL;
+
+			StartingPolygonList[1].pPolygon = &StartingBoxPolygons[1];
+			StartingPolygonList[1].pNext = &StartingPolygonList[2];
+			StartingPolygonList[1].pPrev = &StartingPolygonList[0];
+
+			StartingPolygonList[2].pPolygon = &StartingBoxPolygons[2];
+			StartingPolygonList[2].pNext = &StartingPolygonList[3];
+			StartingPolygonList[2].pPrev = &StartingPolygonList[1];
+
+			StartingPolygonList[3].pPolygon = &StartingBoxPolygons[3];
+			StartingPolygonList[3].pNext = &StartingPolygonList[4];
+			StartingPolygonList[3].pPrev = &StartingPolygonList[2];
+
+			StartingPolygonList[4].pPolygon = &StartingBoxPolygons[4];
+			StartingPolygonList[4].pNext = &StartingPolygonList[5];
+			StartingPolygonList[4].pPrev = &StartingPolygonList[3];
+
+			StartingPolygonList[5].pPolygon = &StartingBoxPolygons[5];
+			StartingPolygonList[5].pNext = NULL;
+			StartingPolygonList[5].pPrev = &StartingPolygonList[4];
+		}
+
+
+		//chain the 12 lines into a NULL terminated doubly linked list
+		{
+			StartingLineList[0].pLine = &StartingBoxLines[0];
+			StartingLineList[0].pNext = &StartingLineList[1];
+			StartingLineList[0].pPrev = NULL;
+
+			StartingLineList[1].pLine = &StartingBoxLines[1];
+			StartingLineList[1].pNext = &StartingLineList[2];
+			StartingLineList[1].pPrev = &StartingLineList[0];
+
+			StartingLineList[2].pLine = &StartingBoxLines[2];
+			StartingLineList[2].pNext = &StartingLineList[3];
+			StartingLineList[2].pPrev = &StartingLineList[1];
+
+			StartingLineList[3].pLine = &StartingBoxLines[3];
+			StartingLineList[3].pNext = &StartingLineList[4];
+			StartingLineList[3].pPrev = &StartingLineList[2];
+
+			StartingLineList[4].pLine = &StartingBoxLines[4];
+			StartingLineList[4].pNext = &StartingLineList[5];
+			StartingLineList[4].pPrev = &StartingLineList[3];
+
+			StartingLineList[5].pLine = &StartingBoxLines[5];
+			StartingLineList[5].pNext = &StartingLineList[6];
+			StartingLineList[5].pPrev = &StartingLineList[4];
+
+			StartingLineList[6].pLine = &StartingBoxLines[6];
+			StartingLineList[6].pNext = &StartingLineList[7];
+			StartingLineList[6].pPrev = &StartingLineList[5];
+
+			StartingLineList[7].pLine = &StartingBoxLines[7];
+			StartingLineList[7].pNext = &StartingLineList[8];
+			StartingLineList[7].pPrev = &StartingLineList[6];
+
+			StartingLineList[8].pLine = &StartingBoxLines[8];
+			StartingLineList[8].pNext = &StartingLineList[9];
+			StartingLineList[8].pPrev = &StartingLineList[7];
+
+			StartingLineList[9].pLine = &StartingBoxLines[9];
+			StartingLineList[9].pNext = &StartingLineList[10];
+			StartingLineList[9].pPrev = &StartingLineList[8];
+
+			StartingLineList[10].pLine = &StartingBoxLines[10];
+			StartingLineList[10].pNext = &StartingLineList[11];
+			StartingLineList[10].pPrev = &StartingLineList[9];
+
+			StartingLineList[11].pLine = &StartingBoxLines[11];
+			StartingLineList[11].pNext = NULL;
+			StartingLineList[11].pPrev = &StartingLineList[10];
+		}
+		//chain the 8 points into a NULL terminated doubly linked list
+		{
+			StartingPointList[0].pPoint = &StartingBoxPoints[0];
+			StartingPointList[0].pNext = &StartingPointList[1];
+			StartingPointList[0].pPrev = NULL;
+
+			StartingPointList[1].pPoint = &StartingBoxPoints[1];
+			StartingPointList[1].pNext = &StartingPointList[2];
+			StartingPointList[1].pPrev = &StartingPointList[0];
+
+			StartingPointList[2].pPoint = &StartingBoxPoints[2];
+			StartingPointList[2].pNext = &StartingPointList[3];
+			StartingPointList[2].pPrev = &StartingPointList[1];
+
+			StartingPointList[3].pPoint = &StartingBoxPoints[3];
+			StartingPointList[3].pNext = &StartingPointList[4];
+			StartingPointList[3].pPrev = &StartingPointList[2];
+
+			StartingPointList[4].pPoint = &StartingBoxPoints[4];
+			StartingPointList[4].pNext = &StartingPointList[5];
+			StartingPointList[4].pPrev = &StartingPointList[3];
+
+			StartingPointList[5].pPoint = &StartingBoxPoints[5];
+			StartingPointList[5].pNext = &StartingPointList[6];
+			StartingPointList[5].pPrev = &StartingPointList[4];
+
+			StartingPointList[6].pPoint = &StartingBoxPoints[6];
+			StartingPointList[6].pNext = &StartingPointList[7];
+			StartingPointList[6].pPrev = &StartingPointList[5];
+
+			StartingPointList[7].pPoint = &StartingBoxPoints[7];
+			StartingPointList[7].pNext = NULL;
+			StartingPointList[7].pPrev = &StartingPointList[6];
+		}
+	}
+
+	return ClipLinkedGeometry( StartingPolygonList, StartingLineList, StartingPointList, pOutwardFacingPlanes, iPlaneCount, fOnPlaneEpsilon, bUseTemporaryMemory );
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#ifdef _DEBUG
+void DumpAABBToGLView( const Vector &vCenter, const Vector &vExtents, const Vector &vColor, FILE *pFile )
+{ //writes the box vCenter +/- vExtents to pFile as 12 quads (each of the 6 faces in both winding orders), one "x y z r g b" line per vertex using vColor; empty unless ENABLE_DEBUG_POLYHEDRON_DUMPS is defined
+#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
+	Vector vMins = vCenter - vExtents;
+	Vector vMaxs = vCenter + vExtents;
+
+	//x min side
+	fprintf( pFile, "4\n" );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	//same quad again with the vertices in reverse order (this pattern repeats for every side below)
+	fprintf( pFile, "4\n" );	
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+	//x max side
+	fprintf( pFile, "4\n" );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+	fprintf( pFile, "4\n" );	
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+
+	//y min side
+	fprintf( pFile, "4\n" );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+	fprintf( pFile, "4\n" );	
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+
+
+	//y max side
+	fprintf( pFile, "4\n" );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+	fprintf( pFile, "4\n" );	
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+
+
+	//z min side
+	fprintf( pFile, "4\n" );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+	fprintf( pFile, "4\n" );	
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMins.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMins.z, vColor.x, vColor.y, vColor.z );
+
+
+	//z max side
+	fprintf( pFile, "4\n" );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+
+	fprintf( pFile, "4\n" );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMaxs.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMaxs.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vMins.x, vMins.y, vMaxs.z, vColor.x, vColor.y, vColor.z );
+#endif
+}
+//Writes the segment vPoint1 -> vPoint2 to pFile as a box of 6 double sided quads, fThickness across; the 4 corners at the vPoint1 end use vColor1, the 4 at the vPoint2 end use vColor2. Empty unless ENABLE_DEBUG_POLYHEDRON_DUMPS is defined.
+void DumpLineToGLView( const Vector &vPoint1, const Vector &vColor1, const Vector &vPoint2, const Vector &vColor2, float fThickness, FILE *pFile )
+{
+#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
+	Vector vDirection = vPoint2 - vPoint1;
+	vDirection.NormalizeInPlace();
+
+	Vector vPseudoPerpandicular = vec3_origin;
+	//pick an axis that cannot be parallel to vDirection: Z when the direction has any X component, otherwise X
+	if( vDirection.x != 0.0f )
+		vPseudoPerpandicular.z = 1.0f;
+	else
+		vPseudoPerpandicular.x = 1.0f;
+
+	Vector vWidth = vDirection.Cross( vPseudoPerpandicular );
+	vWidth.NormalizeInPlace();
+
+	Vector vHeight = vDirection.Cross( vWidth );
+	vHeight.NormalizeInPlace();
+
+	fThickness *= 0.5f; //we use half thickness in both directions
+	vDirection *= fThickness;
+	vWidth *= fThickness;
+	vHeight *= fThickness;
+
+	Vector vLinePoints[8]; //corners 0-3 surround vPoint1, corners 4-7 surround vPoint2; both ends are pushed outward by the scaled vDirection
+	vLinePoints[0] = vPoint1 - vDirection - vWidth - vHeight;
+	vLinePoints[1] = vPoint1 - vDirection - vWidth + vHeight;
+	vLinePoints[2] = vPoint1 - vDirection + vWidth - vHeight;
+	vLinePoints[3] = vPoint1 - vDirection + vWidth + vHeight;
+
+	vLinePoints[4] = vPoint2 + vDirection - vWidth - vHeight;
+	vLinePoints[5] = vPoint2 + vDirection - vWidth + vHeight;
+	vLinePoints[6] = vPoint2 + vDirection + vWidth - vHeight;
+	vLinePoints[7] = vPoint2 + vDirection + vWidth + vHeight;
+
+	const Vector *pLineColors[8] = { &vColor1, &vColor1, &vColor1, &vColor1, &vColor2, &vColor2, &vColor2, &vColor2 };
+
+	//NOTE(review): the two helper macros below are never #undef'd in this function, so they stay defined after it
+#define DPTGLV_LINE_WRITEPOINT(index) fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vLinePoints[index].x, vLinePoints[index].y, vLinePoints[index].z, pLineColors[index]->x, pLineColors[index]->y, pLineColors[index]->z );
+#define DPTGLV_LINE_DOUBLESIDEDQUAD(index1,index2,index3,index4)\
+	fprintf( pFile, "4\n" );\
+	DPTGLV_LINE_WRITEPOINT(index1);\
+	DPTGLV_LINE_WRITEPOINT(index2);\
+	DPTGLV_LINE_WRITEPOINT(index3);\
+	DPTGLV_LINE_WRITEPOINT(index4);\
+	fprintf( pFile, "4\n" );\
+	DPTGLV_LINE_WRITEPOINT(index4);\
+	DPTGLV_LINE_WRITEPOINT(index3);\
+	DPTGLV_LINE_WRITEPOINT(index2);\
+	DPTGLV_LINE_WRITEPOINT(index1);
+
+
+	DPTGLV_LINE_DOUBLESIDEDQUAD(0,4,6,2);
+	DPTGLV_LINE_DOUBLESIDEDQUAD(3,7,5,1);
+	DPTGLV_LINE_DOUBLESIDEDQUAD(1,5,4,0);
+	DPTGLV_LINE_DOUBLESIDEDQUAD(2,6,7,3);
+	DPTGLV_LINE_DOUBLESIDEDQUAD(0,2,3,1);
+	DPTGLV_LINE_DOUBLESIDEDQUAD(5,7,6,4);
+#endif
+}
+//Appends pPolyhedron to the file pFilename: every polygon, then every line (as a thin box), then every vertex (as a small box). pTransform is applied to the vertices first (identity when NULL). Empty unless ENABLE_DEBUG_POLYHEDRON_DUMPS is defined.
+void DumpPolyhedronToGLView( const CPolyhedron *pPolyhedron, const char *pFilename, const VMatrix *pTransform )
+{
+#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
+	if ( (pPolyhedron == NULL) || (pPolyhedron->iVertexCount == 0) )
+		return;
+
+	if( pTransform == NULL )
+		pTransform = &s_matIdentity;
+
+	printf("Writing %s...\n", pFilename );
+
+	FILE *pFile = fopen( pFilename, "ab" );
+	if( pFile == NULL ) return; //fopen failed, there is nothing to write to (and fprintf/fclose must not be handed NULL)
+	//randomizing an array of colors to help spot shared/unshared vertices
+	Vector *pColors = (Vector *)stackalloc( sizeof( Vector ) * pPolyhedron->iVertexCount );	
+	int counter;
+	for( counter = 0; counter != pPolyhedron->iVertexCount; ++counter )
+	{
+		pColors[counter].Init( rand()/(RAND_MAX + 1.0f), rand()/(RAND_MAX + 1.0f), rand()/(RAND_MAX + 1.0f) ); //scale by RAND_MAX instead of assuming it is 32767, keeps every channel within 0..1
+	}
+
+	Vector *pTransformedPoints = (Vector *)stackalloc( pPolyhedron->iVertexCount * sizeof( Vector ) );
+	for ( counter = 0; counter != pPolyhedron->iVertexCount; ++counter )
+	{
+		pTransformedPoints[counter] = (*pTransform) * pPolyhedron->pVertices[counter];
+	}
+
+	for ( counter = 0; counter != pPolyhedron->iPolygonCount; ++counter )
+	{
+		fprintf( pFile, "%i\n", pPolyhedron->pPolygons[counter].iIndexCount );
+		int counter2;
+		for( counter2 = 0; counter2 != pPolyhedron->pPolygons[counter].iIndexCount; ++counter2 )
+		{
+			Polyhedron_IndexedLineReference_t *pLineReference = &pPolyhedron->pIndices[pPolyhedron->pPolygons[counter].iFirstIndex + counter2];
+
+			Vector *pVertex = &pTransformedPoints[pPolyhedron->pLines[pLineReference->iLineIndex].iPointIndices[pLineReference->iEndPointIndex]];
+			Vector *pColor = &pColors[pPolyhedron->pLines[pLineReference->iLineIndex].iPointIndices[pLineReference->iEndPointIndex]];
+			fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n",pVertex->x, pVertex->y, pVertex->z, pColor->x, pColor->y, pColor->z );
+		}
+	}
+
+	for( counter = 0; counter != pPolyhedron->iLineCount; ++counter )
+	{
+		const Vector vOne( 1.0f, 1.0f, 1.0f ); //each line end is drawn in the inverse of its vertex color
+		DumpLineToGLView( pTransformedPoints[pPolyhedron->pLines[counter].iPointIndices[0]], vOne - pColors[pPolyhedron->pLines[counter].iPointIndices[0]],
+							pTransformedPoints[pPolyhedron->pLines[counter].iPointIndices[1]], vOne - pColors[pPolyhedron->pLines[counter].iPointIndices[1]], 
+							0.1f, pFile );
+	}
+
+	for( counter = 0; counter != pPolyhedron->iVertexCount; ++counter )
+	{
+		const Vector vPointHalfSize(0.15f, 0.15f, 0.15f );
+		DumpAABBToGLView( pTransformedPoints[counter], vPointHalfSize, pColors[counter], pFile );
+	}
+
+	fclose( pFile );
+#endif
+}
+
+//Appends one quad for pPlane (normal in pPlane[0..2], distance in pPlane[3]) to the file pszFileName, shaded fGrayScale on all three color channels; the plane is transformed by pTransform first (identity when NULL). Empty unless ENABLE_DEBUG_POLYHEDRON_DUMPS is defined.
+void DumpPlaneToGlView( const float *pPlane, float fGrayScale, const char *pszFileName, const VMatrix *pTransform )
+{
+#ifdef ENABLE_DEBUG_POLYHEDRON_DUMPS
+	if( pTransform == NULL )
+		pTransform = &s_matIdentity;
+
+	FILE *pFile = fopen( pszFileName, "ab" );
+	if( pFile == NULL ) return; //fopen failed, there is nothing to write to (and fprintf/fclose must not be handed NULL)
+	//transform the plane
+	Vector vNormal = pTransform->ApplyRotation( *(Vector *)pPlane );
+	float fDist = pPlane[3] * vNormal.NormalizeInPlace(); //possible scaling going on
+	fDist += vNormal.Dot( pTransform->GetTranslation() );
+	
+	Vector vPlaneVerts[4];
+
+	PolyFromPlane( vPlaneVerts, vNormal, fDist, 100000.0f );
+
+	fprintf( pFile, "4\n" );
+
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[0].x, vPlaneVerts[0].y, vPlaneVerts[0].z, fGrayScale, fGrayScale, fGrayScale );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[1].x, vPlaneVerts[1].y, vPlaneVerts[1].z, fGrayScale, fGrayScale, fGrayScale );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[2].x, vPlaneVerts[2].y, vPlaneVerts[2].z, fGrayScale, fGrayScale, fGrayScale );
+	fprintf( pFile, "%6.3f %6.3f %6.3f %.2f %.2f %.2f\n", vPlaneVerts[3].x, vPlaneVerts[3].y, vPlaneVerts[3].z, fGrayScale, fGrayScale, fGrayScale );
+
+	fclose( pFile );
+#endif
+}
+#endif
+
+
diff --git a/mp/src/mathlib/powsse.cpp b/mp/src/mathlib/powsse.cpp
index 111f8d24..b026c642 100644
--- a/mp/src/mathlib/powsse.cpp
+++ b/mp/src/mathlib/powsse.cpp
@@ -1,96 +1,96 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-//=====================================================================================//
-
-#include "mathlib/ssemath.h"
-
-// NOTE: This has to be the last file included!
-#include "tier0/memdbgon.h"
-
-
-fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent)
-{
-	fltx4 rslt=Four_Ones;									// x^0=1.0
-	int xp=abs(exponent);
-	if (xp & 3)												// fraction present?
-	{
-		fltx4 sq_rt=SqrtEstSIMD(x);
-		if (xp & 1)											// .25?
-			rslt=SqrtEstSIMD(sq_rt);						// x^.25
-		if (xp & 2)
-			rslt=MulSIMD(rslt,sq_rt);
-	}
-	xp>>=2;													// strip fraction
-	fltx4 curpower=x;										// curpower iterates through  x,x^2,x^4,x^8,x^16...
-
-	while(1)
-	{
-		if (xp & 1)
-			rslt=MulSIMD(rslt,curpower);
-		xp>>=1;
-		if (xp)
-			curpower=MulSIMD(curpower,curpower);
-		else
-			break;
-	}
-	if (exponent<0)
-		return ReciprocalEstSaturateSIMD(rslt);				// pow(x,-b)=1/pow(x,b)
-	else
-		return rslt;
-}
-
-
-
-
-/*
- * (c) Ian Stephenson
- *
- * [email protected]
- *
- * Fast pow() reference implementation
- */
-
-
-static float shift23=(1<<23);
-static float OOshift23=1.0/(1<<23);
-
-float FastLog2(float i)
-{
-	float LogBodge=0.346607f;
-	float x;
-	float y;
-	x=*(int *)&i;
-	x*= OOshift23; //1/pow(2,23);
-	x=x-127;
-
-	y=x-floorf(x);
-	y=(y-y*y)*LogBodge;
-	return x+y;
-}
-float FastPow2(float i)
-{
-	float PowBodge=0.33971f;
-	float x;
-	float y=i-floorf(i);
-	y=(y-y*y)*PowBodge;
-
-	x=i+127-y;
-	x*= shift23; //pow(2,23);
-	*(int*)&x=(int)x;
-	return x;
-}
-float FastPow(float a, float b)
-{
-	if (a <= OOshift23)
-	{
-		return 0.0f;
-	}
-	return FastPow2(b*FastLog2(a));
-}
-float FastPow10( float i )
-{
-	return FastPow2( i * 3.321928f );
-}
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#include "mathlib/ssemath.h"
+
+// NOTE: This has to be the last file included!
+#include "tier0/memdbgon.h"
+// Raises x to the power (exponent / 4): the low two bits of |exponent| select the quarter powers (built from estimated square roots)
+// and the remaining bits are applied by repeated squaring. A negative exponent returns ReciprocalEstSaturateSIMD() of that result.
+fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent)
+{
+	fltx4 rslt=Four_Ones;									// x^0=1.0
+	int xp=abs(exponent);
+	if (xp & 3)												// fraction present?
+	{
+		fltx4 sq_rt=SqrtEstSIMD(x);
+		if (xp & 1)											// .25?
+			rslt=SqrtEstSIMD(sq_rt);						// x^.25
+		if (xp & 2)											// .5?
+			rslt=MulSIMD(rslt,sq_rt);						// multiply in x^.5
+	}
+	xp>>=2;													// strip fraction
+	fltx4 curpower=x;										// curpower iterates through  x,x^2,x^4,x^8,x^16...
+
+	while(1)
+	{
+		if (xp & 1)											// this power of two is part of the exponent
+			rslt=MulSIMD(rslt,curpower);
+		xp>>=1;
+		if (xp)
+			curpower=MulSIMD(curpower,curpower);
+		else
+			break;											// no bits left
+	}
+	if (exponent<0)
+		return ReciprocalEstSaturateSIMD(rslt);				// pow(x,-b)=1/pow(x,b)
+	else
+		return rslt;
+}
+
+
+
+
+/*
+ * (c) Ian Stephenson
+ *
+ * [email protected]
+ *
+ * Fast pow() reference implementation
+ */
+
+// 2^23 and 2^-23, the scale factors FastPow2 / FastLog2 use when converting between a float value and a raw integer bit pattern
+static float shift23=(1<<23);
+static float OOshift23=1.0/(1<<23);
+// Approximates log2(i): reads i's bit pattern as an integer, scales it by 2^-23 and subtracts 127, then adds the correction (y-y*y)*LogBodge where y is the fractional part.
+float FastLog2(float i)
+{
+	float LogBodge=0.346607f;
+	float x;
+	float y;
+	x=*(int *)&i; // NOTE(review): pointer-cast type pun; presumably assumes a 32-bit int, an IEEE-754 float and i > 0 - confirm
+	x*= OOshift23; //1/pow(2,23);
+	x=x-127;
+
+	y=x-floorf(x); // fractional part of the estimate
+	y=(y-y*y)*LogBodge;
+	return x+y;
+}
+float FastPow2(float i) // approximates 2^i by computing the result's bit pattern as a number and storing it into a float; mirrors the steps of FastLog2 in reverse
+{
+	float PowBodge=0.33971f;
+	float x;
+	float y=i-floorf(i); // fractional part of the exponent
+	y=(y-y*y)*PowBodge;
+
+	x=i+127-y;
+	x*= shift23; //pow(2,23);
+	*(int*)&x=(int)x; // NOTE(review): pointer-cast type pun, and the float-to-int conversion presumably requires the scaled value to fit in an int - confirm the valid range of i
+	return x;
+}
+float FastPow(float a, float b) // approximates a^b as FastPow2(b*FastLog2(a))
+{
+	if (a <= OOshift23) // bases at or below 2^-23 (including zero and negatives) are reported as 0 without touching the log approximation
+	{
+		return 0.0f;
+	}
+	return FastPow2(b*FastLog2(a));
+}
+float FastPow10( float i ) // approximates 10^i as 2^(i*log2(10))
+{
+	return FastPow2( i * 3.321928f ); // 3.321928f is log2(10)
+}
+
diff --git a/mp/src/mathlib/quantize.cpp b/mp/src/mathlib/quantize.cpp
index e1fd88dc..e829d2d2 100644
--- a/mp/src/mathlib/quantize.cpp
+++ b/mp/src/mathlib/quantize.cpp
@@ -1,679 +1,679 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-// $NoKeywords: $
-//
-//=============================================================================//
-#ifndef STDIO_H
-#include <stdio.h>
-#endif
-
-#ifndef STRING_H
-#include <string.h>
-#endif
-
-#ifndef QUANTIZE_H
-#include <quantize.h>
-#endif
-
-#include <stdlib.h>
-#include <minmax.h>
-
-#include <math.h>
-
-static int current_ndims;
-static struct QuantizedValue *current_root;
-static int current_ssize;
-
-static uint8 *current_weights;
-
-double SquaredError;
-
-#define SPLIT_THEN_SORT 1
-
-#define SQ(x) ((x)*(x))
-
-static struct QuantizedValue *AllocQValue(void)
-{
-	struct QuantizedValue *ret=new QuantizedValue;
-	ret->Samples=0;
-	ret->Children[0]=ret->Children[1]=0;
-	ret->NSamples=0;
-  
-	ret->ErrorMeasure=new double[current_ndims];
-	ret->Mean=new uint8[current_ndims];
-	ret->Mins=new uint8[current_ndims];
-	ret->Maxs=new uint8[current_ndims];
-	ret->Sums=new int [current_ndims];
-	memset(ret->Sums,0,sizeof(int)*current_ndims);
-	ret->NQuant=0;
-	ret->sortdim=-1;
-	return ret;
-}
-
-void FreeQuantization(struct QuantizedValue *t)
-{
-	if (t)
-	{
-		delete[] t->ErrorMeasure;
-		delete[] t->Mean;
-		delete[] t->Mins;
-		delete[] t->Maxs;
-		FreeQuantization(t->Children[0]);
-		FreeQuantization(t->Children[1]);
-		delete[] t->Sums;
-		delete[] t;
-	}
-}
-
-static int QNumSort(void const *a, void const *b)
-{
-	int32 as=((struct Sample *) a)->QNum;
-	int32 bs=((struct Sample *) b)->QNum;
-	if (as==bs) return 0;
-	return (as>bs)?1:-1;
-}
-
-#if SPLIT_THEN_SORT
-#else
-static int current_sort_dim;
-
-static int samplesort(void const *a, void const *b)
-{
-	uint8 as=((struct Sample *) a)->Value[current_sort_dim];
-	uint8 bs=((struct Sample *) b)->Value[current_sort_dim];
-	if (as==bs) return 0;
-	return (as>bs)?1:-1;
-}
-#endif
-
-static int sortlong(void const *a, void const *b)
-{
-	// treat the entire vector of values as a long integer for duplicate removal.
-	return memcmp(((struct Sample *) a)->Value,
-				  ((struct Sample *) b)->Value,current_ndims);
-}
-
-
-  
-#define NEXTSAMPLE(s) ( (struct Sample *) (((uint8 *) s)+current_ssize))
-#define SAMPLE(s,i) NthSample(s,i,current_ndims)
-
-static void SetNDims(int n)
-{
-	current_ssize=sizeof(struct Sample)+(n-1);
-	current_ndims=n;
-}
-
-int CompressSamples(struct Sample *s, int nsamples, int ndims)
-{
-	SetNDims(ndims);
-	qsort(s,nsamples,current_ssize,sortlong);
-	// now, they are all sorted by treating all dimensions as a large number.
-	// we may now remove duplicates.
-	struct Sample *src=s;
-	struct Sample *dst=s;
-	struct Sample *lastdst=dst;
-	dst=NEXTSAMPLE(dst);		// copy first sample to get the ball rolling
-	src=NEXTSAMPLE(src);
-	int noutput=1;
-	while(--nsamples)		// while some remain
-	{
-		if (memcmp(src->Value,lastdst->Value,current_ndims))
-		{
-			// yikes, a difference has been found!
-			memcpy(dst,src,current_ssize);
-			lastdst=dst;
-			dst=NEXTSAMPLE(dst);
-			noutput++;
-		}
-		else
-			lastdst->Count++;
-		src=NEXTSAMPLE(src);
-	}
-	return noutput;
-}
-
-void PrintSamples(struct Sample const *s, int nsamples, int ndims)
-{
-	SetNDims(ndims);
-	int cnt=0;
-	while(nsamples--)
-	{
-		printf("sample #%d, count=%d, values=\n { ",cnt++,s->Count);
-		for(int d=0;d<ndims;d++)
-			printf("%02x,",s->Value[d]);
-		printf("}\n");
-		s=NEXTSAMPLE(s);
-	}
-}
-
-void PrintQTree(struct QuantizedValue const *p,int idlevel)
-{
-	int i;
-
-	if (p)
-	{
-		for(i=0;i<idlevel;i++)
-			printf(" ");
-		printf("node=%p NSamples=%d value=%d Mean={",p,p->NSamples,p->value);
-		for(i=0;i<current_ndims;i++)
-			printf("%x,",p->Mean[i]);
-		printf("}\n");
-		for(i=0;i<idlevel;i++)
-			printf(" ");
-		printf("Errors={");
-		for(i=0;i<current_ndims;i++)
-			printf("%f,",p->ErrorMeasure[i]);
-		printf("}\n");
-		for(i=0;i<idlevel;i++)
-			printf(" ");
-		printf("Mins={");
-		for(i=0;i<current_ndims;i++)
-			printf("%d,",p->Mins[i]);
-		printf("} Maxs={");
-		for(i=0;i<current_ndims;i++)
-			printf("%d,",p->Maxs[i]);
-		printf("}\n");
-		PrintQTree(p->Children[0],idlevel+2);
-		PrintQTree(p->Children[1],idlevel+2);
-	}
-}
-
-static void UpdateStats(struct QuantizedValue *v)
-{
-	// first, find mean
-	int32 Means[MAXDIMS];
-	double Errors[MAXDIMS];
-	double WorstError[MAXDIMS];
-	int i,j;
-  
-	memset(Means,0,sizeof(Means));
-	int N=0;
-	for(i=0;i<v->NSamples;i++)
-	{
-		struct Sample *s=SAMPLE(v->Samples,i);
-		N+=s->Count;
-		for(j=0;j<current_ndims;j++)
-		{
-			uint8 v=s->Value[j];
-			Means[j]+=v*s->Count;
-		}
-	}
-	for(j=0;j<current_ndims;j++)
-	{
-		if (N) v->Mean[j]=(uint8) (Means[j]/N);
-		Errors[j]=WorstError[j]=0.;
-	}
-	for(i=0;i<v->NSamples;i++)
-	{
-		struct Sample *s=SAMPLE(v->Samples,i);
-		double c=s->Count;
-		for(j=0;j<current_ndims;j++)
-		{
-			double diff=SQ(s->Value[j]-v->Mean[j]);
-			Errors[j]+=c*diff; // charles uses abs not sq()
-			if (diff>WorstError[j])
-				WorstError[j]=diff;
-		}
-	}
-	v->TotalError=0.;
-	double ErrorScale=1.; // /sqrt((double) (N));
-	for(j=0;j<current_ndims;j++)
-	{
-		v->ErrorMeasure[j]=(ErrorScale*Errors[j]*current_weights[j]);
-		v->TotalError+=v->ErrorMeasure[j];
-#if SPLIT_THEN_SORT
-		v->ErrorMeasure[j]*=WorstError[j];
-#endif
-	}
-	v->TotSamples=N;
-}
-
-static int ErrorDim;
-static double ErrorVal;
-static struct QuantizedValue *ErrorNode;
-
-static void UpdateWorst(struct QuantizedValue *q)
-{
-	if (q->Children[0])
-	{
-		// not a leaf node
-		UpdateWorst(q->Children[0]);
-		UpdateWorst(q->Children[1]);
-	}
-	else
-	{
-		if (q->TotalError>ErrorVal)
-		{
-			ErrorVal=q->TotalError;
-			ErrorNode=q;
-			ErrorDim=0;
-			for(int d=0;d<current_ndims;d++)
-				if (q->ErrorMeasure[d]>q->ErrorMeasure[ErrorDim])
-					ErrorDim=d;
-		}
-	}
-}
-
-static int FindWorst(void)
-{
-	ErrorVal=-1.;
-	UpdateWorst(current_root);
-	return (ErrorVal>0);
-}
-
-
-
-static void SubdivideNode(struct QuantizedValue *n, int whichdim)
-{
-	int NAdded=0;
-	int i;
-
-#if SPLIT_THEN_SORT
-	// we will try the "split then sort" method. This works by finding the
-	// means for all samples above and below the mean along the given axis.
-	// samples are then split into two groups, with the selection based upon
-	// which of the n-dimensional means the sample is closest to.
-	double LocalMean[MAXDIMS][2];
-	int totsamps[2];
-	for(i=0;i<current_ndims;i++)
-		LocalMean[i][0]=LocalMean[i][1]=0.;
-	totsamps[0]=totsamps[1]=0;
-	uint8 minv=255;
-	uint8 maxv=0;
-	struct Sample *minS=0,*maxS=0;
-	for(i=0;i<n->NSamples;i++)
-	{
-		uint8 v;
-		int whichside=1;
-		struct Sample *sl;
-		sl=SAMPLE(n->Samples,i);
-		v=sl->Value[whichdim];
-		if (v<minv) { minv=v; minS=sl; }
-		if (v>maxv) { maxv=v; maxS=sl; }
-		if (v<n->Mean[whichdim])
-			whichside=0;
-		totsamps[whichside]+=sl->Count;
-		for(int d=0;d<current_ndims;d++)
-			LocalMean[d][whichside]+=
-				sl->Count*sl->Value[d];
-	}
-
-	if (totsamps[0] && totsamps[1])
-		for(i=0;i<current_ndims;i++)
-		{
-			LocalMean[i][0]/=totsamps[0];
-			LocalMean[i][1]/=totsamps[1];
-		}
-	else
-	{
-		// it is possible that the clustering failed to split the samples.
-		// this can happen with a heavily biased sample (i.e. all black
-		// with a few stars). If this happens, we will cluster around the
-		// extrema instead. LocalMean[i][0] will be the point with the lowest
-		// value on the dimension and LocalMean[i][1] the one with the lowest
-		// value.
-		for(int i=0;i<current_ndims;i++)
-		{
-			LocalMean[i][0]=minS->Value[i];
-			LocalMean[i][1]=maxS->Value[i];
-		}
-	}
-
-	// now, we have 2 n-dimensional means. We will label each sample
-	// for which one it is nearer to by using the QNum field.
-	for(i=0;i<n->NSamples;i++)
-	{
-		double dist[2];
-		dist[0]=dist[1]=0.;
-		struct Sample *s=SAMPLE(n->Samples,i);
-		for(int d=0;d<current_ndims;d++)
-			for(int w=0;w<2;w++)
-				dist[w]+=current_weights[d]*SQ(LocalMean[d][w]-s->Value[d]);
-		s->QNum=(dist[0]<dist[1]);
-    }
-
-
-	// hey ho! we have now labelled each one with a candidate bin. Let's
-	// sort the array by moving the 0-labelled ones to the head of the array.
-	n->sortdim=-1;
-	qsort(n->Samples,n->NSamples,current_ssize,QNumSort);
-	for(i=0;i<n->NSamples;i++,NAdded++)
-		if (SAMPLE(n->Samples,i)->QNum)
-			break;
-  
-#else
-	if (whichdim != n->sortdim)
-	{
-		current_sort_dim=whichdim;
-		qsort(n->Samples,n->NSamples,current_ssize,samplesort);
-		n->sortdim=whichdim;
-	}
-	// now, the samples are sorted along the proper dimension.  we need
-	// to find the place to cut in order to split the node.  this is
-	// complicated by the fact that each sample entry can represent many
-	// samples. What we will do is start at the beginning of the array,
-	// adding samples to the first node, until either the number added
-	// is >=TotSamples/2, or there is only one left.
-	int TotAdded=0;
-	for(;;)
-	{
-		if (NAdded==n->NSamples-1)
-			break;
-		if (TotAdded>=n->TotSamples/2)
-			break;
-		TotAdded+=SAMPLE(n->Samples,NAdded)->Count;
-		NAdded++;
-	}
-#endif
-	struct QuantizedValue *a=AllocQValue();
-	a->sortdim=n->sortdim;
-	a->Samples=n->Samples;
-	a->NSamples=NAdded;
-	n->Children[0]=a;
-	UpdateStats(a);
-	a=AllocQValue();
-	a->Samples=SAMPLE(n->Samples,NAdded);
-	a->NSamples=n->NSamples-NAdded;
-	a->sortdim=n->sortdim;
-	n->Children[1]=a;
-	UpdateStats(a);
-}
-
-static int colorid=0;
-
-static void Label(struct QuantizedValue *q, int updatecolor)
-{
-	// fill in max/min values for tree, etc.
-	if (q)
-	{
-		Label(q->Children[0],updatecolor);
-		Label(q->Children[1],updatecolor);
-		if (! q->Children[0])	// leaf node?
-		{
-			if (updatecolor)
-			{
-				q->value=colorid++;
-				for(int j=0;j<q->NSamples;j++)
-				{
-					SAMPLE(q->Samples,j)->QNum=q->value;
-					SAMPLE(q->Samples,j)->qptr=q;
-				}
-			}
-			for(int i=0;i<current_ndims;i++)
-			{
-				q->Mins[i]=q->Mean[i];
-				q->Maxs[i]=q->Mean[i];
-			}
-		}
-		else
-			for(int i=0;i<current_ndims;i++)
-			{
-				q->Mins[i]=min(q->Children[0]->Mins[i],q->Children[1]->Mins[i]);
-				q->Maxs[i]=max(q->Children[0]->Maxs[i],q->Children[1]->Maxs[i]);
-			}
-	}
-}    
-
-struct QuantizedValue *FindQNode(struct QuantizedValue const *q, int32 code)
-{
-	if (! (q->Children[0]))
-		if (code==q->value) return (struct QuantizedValue *) q;
-		else return 0;
-	else
-	{
-		struct QuantizedValue *found=FindQNode(q->Children[0],code);
-		if (! found) found=FindQNode(q->Children[1],code);
-		return found;
-	}
-}
-
-
-void CheckInRange(struct QuantizedValue *q, uint8 *max, uint8 *min)
-{
-	if (q)
-	{
-		if (q->Children[0])
-		{
-			// non-leaf node
-			CheckInRange(q->Children[0],q->Maxs, q->Mins);
-			CheckInRange(q->Children[1],q->Maxs, q->Mins);
-			CheckInRange(q->Children[0],max, min);
-			CheckInRange(q->Children[1],max, min);
-		}
-		for (int i=0;i<current_ndims;i++)
-		{
-			if (q->Maxs[i]>max[i]) printf("error1\n");
-			if (q->Mins[i]<min[i]) printf("error2\n");
-		}
-	}
-}
-
-struct QuantizedValue *Quantize(struct Sample *s, int nsamples, int ndims,
-								int nvalues, uint8 *weights, int firstvalue)
-{
-	SetNDims(ndims);
-	current_weights=weights;
-	current_root=AllocQValue();
-	current_root->Samples=s;
-	current_root->NSamples=nsamples;
-	UpdateStats(current_root);
-	while(--nvalues)
-	{
-		if (! FindWorst())
-			break;                          // if <n unique ones, stop now
-		SubdivideNode(ErrorNode,ErrorDim);
-	}
-	colorid=firstvalue;
-	Label(current_root,1);
-	return current_root;
-}
-
-double MinimumError(struct QuantizedValue const *q, uint8 const *sample,
-					int ndims, uint8 const *weights)
-{
-	double err=0;
-	for(int i=0;i<ndims;i++)
-	{
-		int val1;
-		int val2=sample[i];
-		if ((q->Mins[i]<=val2) && (q->Maxs[i]>=val2)) val1=val2;
-		else
-		{
-			val1=(val2<=q->Mins[i])?q->Mins[i]:q->Maxs[i];
-		}
-		err+=weights[i]*SQ(val1-val2);
-	}
-	return err;
-}
-
-double MaximumError(struct QuantizedValue const *q, uint8 const *sample,
-					int ndims, uint8 const *weights)
-{
-	double err=0;
-	for(int i=0;i<ndims;i++)
-	{
-		int val2=sample[i];
-		int val1=(abs(val2-q->Mins[i])>abs(val2-q->Maxs[i]))?
-			q->Mins[i]:
-			q->Maxs[i];
-		err+=weights[i]*SQ(val2-val1);
-	}
-	return err;
-}
-
-				     
-
-// heap (priority queue) routines used for nearest-neghbor searches
-struct FHeap {
-	int heap_n;
-	double *heap[MAXQUANT];
-};
-
-void InitHeap(struct FHeap *h)
-{
-  h->heap_n=0;
-}
-
-
-void UpHeap(int k, struct FHeap *h)
-{
-  double *tmpk=h->heap[k];
-  double tmpkn=*tmpk;
-  while((k>1) && (tmpkn <= *(h->heap[k/2])))
-    {
-      h->heap[k]=h->heap[k/2];
-      k/=2;
-    }
-  h->heap[k]=tmpk;
-}
-
-void HeapInsert(struct FHeap *h,double *elem)
-{
-  h->heap_n++;
-  h->heap[h->heap_n]=elem;
-  UpHeap(h->heap_n,h);
-}
-
-void DownHeap(int k, struct FHeap *h)
-{
-  double *v=h->heap[k];
-  while(k<=h->heap_n/2)
-    {
-      int j=2*k;
-      if (j<h->heap_n)
-	if (*(h->heap[j]) >= *(h->heap[j+1]))
-	  j++;
-      if (*v < *(h->heap[j]))
-	{
-	  h->heap[k]=v;
-	  return;
-	}
-      h->heap[k]=h->heap[j]; k=j;
-    }
-  h->heap[k]=v;
-}
-
-void *RemoveHeapItem(struct FHeap *h)
-{
-  void *ret=0;
-  if (h->heap_n!=0)
-    {
-      ret=h->heap[1];
-      h->heap[1]=h->heap[h->heap_n];
-      h->heap_n--;
-      DownHeap(1,h);
-    }
-  return ret;
-}
-
-// now, nearest neighbor finder. Use a heap to traverse the tree, stopping
-// when there are no nodes with a minimum error < the current error.
-
-struct FHeap TheQueue;
-
-#define PUSHNODE(a) { \
-  (a)->MinError=MinimumError(a,sample,ndims,weights); \
-  if ((a)->MinError < besterror) HeapInsert(&TheQueue,&(a)->MinError); \
- }
-
-struct QuantizedValue *FindMatch(uint8 const *sample, int ndims,
-								 uint8 *weights, struct QuantizedValue *q)
-{
-	InitHeap(&TheQueue);
-	struct QuantizedValue *bestmatch=0;
-	double besterror=1.0e63;
-	PUSHNODE(q);
-	for(;;)
-	{
-		struct QuantizedValue *test=(struct QuantizedValue *)
-			RemoveHeapItem(&TheQueue);
-		if (! test) break;		// heap empty
-//    printf("got pop node =%p minerror=%f\n",test,test->MinError);
-    
-		if (test->MinError>besterror) break;
-		if (test->Children[0])
-		{
-			// it's a parent node. put the children on the queue
-			struct QuantizedValue *c1=test->Children[0];
-			struct QuantizedValue *c2=test->Children[1];
-			c1->MinError=MinimumError(c1,sample,ndims,weights);
-			if (c1->MinError < besterror)
-				HeapInsert(&TheQueue,&(c1->MinError));
-			c2->MinError=MinimumError(c2,sample,ndims,weights);
-			if (c2->MinError < besterror)
-				HeapInsert(&TheQueue,&(c2->MinError));
-		}
-		else
-		{
-			// it's a leaf node. This must be a new minimum or the MinError
-			// test would have failed.
-			if (test->MinError < besterror)
-			{
-				bestmatch=test;
-				besterror=test->MinError;
-			}
-		}
-	}
-	if (bestmatch)
-	{
-		SquaredError+=besterror;
-		bestmatch->NQuant++;
-		for(int i=0;i<ndims;i++)
-			bestmatch->Sums[i]+=sample[i];
-	}
-	return bestmatch;
-}
-
-static void RecalcMeans(struct QuantizedValue *q)
-{
-	if (q)
-	{
-		if (q->Children[0])
-		{
-			// not a leaf, invoke recursively.
-			RecalcMeans(q->Children[0]);
-			RecalcMeans(q->Children[0]);
-		}
-		else
-		{
-			// it's a leaf. Set the means
-			if (q->NQuant)
-			{
-				for(int i=0;i<current_ndims;i++)
-				{
-					q->Mean[i]=(uint8) (q->Sums[i]/q->NQuant);
-					q->Sums[i]=0;
-				}
-				q->NQuant=0;
-			}
-		}
-	}
-}
-		      
-void OptimizeQuantizer(struct QuantizedValue *q, int ndims)
-{
-	SetNDims(ndims);
-	RecalcMeans(q);		// reset q values
-	Label(q,0);			// update max/mins
-}
-
-
-static void RecalcStats(struct QuantizedValue *q)
-{
-	if (q)
-	{
-		UpdateStats(q);
-		RecalcStats(q->Children[0]);
-		RecalcStats(q->Children[1]);
-	}
-}
-
-void RecalculateValues(struct QuantizedValue *q, int ndims)
-{
-	SetNDims(ndims);
-	RecalcStats(q);
-	Label(q,0);
-}
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef STDIO_H
+#include <stdio.h>
+#endif
+
+#ifndef STRING_H
+#include <string.h>
+#endif
+
+#ifndef QUANTIZE_H
+#include <quantize.h>
+#endif
+
+#include <stdlib.h>
+#include <minmax.h>
+
+#include <math.h>
+
+static int current_ndims;
+static struct QuantizedValue *current_root;
+static int current_ssize;
+
+static uint8 *current_weights;
+
+double SquaredError;
+
+#define SPLIT_THEN_SORT 1
+
+#define SQ(x) ((x)*(x))
+
+static struct QuantizedValue *AllocQValue(void)
+{
+	struct QuantizedValue *ret=new QuantizedValue;
+	ret->Samples=0;
+	ret->Children[0]=ret->Children[1]=0;
+	ret->NSamples=0;
+  
+	ret->ErrorMeasure=new double[current_ndims];
+	ret->Mean=new uint8[current_ndims];
+	ret->Mins=new uint8[current_ndims];
+	ret->Maxs=new uint8[current_ndims];
+	ret->Sums=new int [current_ndims];
+	memset(ret->Sums,0,sizeof(int)*current_ndims);
+	ret->NQuant=0;
+	ret->sortdim=-1;
+	return ret;
+}
+
+void FreeQuantization(struct QuantizedValue *t)	// recursively frees the tree rooted at t; a null t is a no-op
+{
+	if (t)
+	{
+		delete[] t->ErrorMeasure;			// per-dimension arrays come from new[] in AllocQValue
+		delete[] t->Mean;
+		delete[] t->Mins;
+		delete[] t->Maxs;
+		FreeQuantization(t->Children[0]);	// children are released before the node itself
+		FreeQuantization(t->Children[1]);
+		delete[] t->Sums;
+		delete t;							// node comes from scalar new in AllocQValue, so scalar delete (not delete[])
+	}
+}
+
+static int QNumSort(void const *a, void const *b)
+{
+	int32 as=((struct Sample *) a)->QNum;
+	int32 bs=((struct Sample *) b)->QNum;
+	if (as==bs) return 0;
+	return (as>bs)?1:-1;
+}
+
+#if SPLIT_THEN_SORT
+#else
+static int current_sort_dim;
+
+static int samplesort(void const *a, void const *b)
+{
+	uint8 as=((struct Sample *) a)->Value[current_sort_dim];
+	uint8 bs=((struct Sample *) b)->Value[current_sort_dim];
+	if (as==bs) return 0;
+	return (as>bs)?1:-1;
+}
+#endif
+
+static int sortlong(void const *a, void const *b)
+{
+	// treat the entire vector of values as a long integer for duplicate removal.
+	return memcmp(((struct Sample *) a)->Value,
+				  ((struct Sample *) b)->Value,current_ndims);
+}
+
+
+  
+#define NEXTSAMPLE(s) ( (struct Sample *) (((uint8 *) s)+current_ssize))
+#define SAMPLE(s,i) NthSample(s,i,current_ndims)
+
+static void SetNDims(int n)
+{
+	current_ssize=sizeof(struct Sample)+(n-1);
+	current_ndims=n;
+}
+
+int CompressSamples(struct Sample *s, int nsamples, int ndims)	// sorts s, merges duplicates in place, returns the unique count
+{
+	SetNDims(ndims);
+	qsort(s,nsamples,current_ssize,sortlong);
+	// now, they are all sorted by treating all dimensions as a large number.
+	// we may now remove duplicates.
+	struct Sample *src=s;
+	struct Sample *dst=s;
+	struct Sample *lastdst=dst;
+	dst=NEXTSAMPLE(dst);		// the first sample is already in place; start writing after it
+	src=NEXTSAMPLE(src);
+	int noutput=(nsamples>0)?1:0;	// an empty input produces no output samples
+	while(--nsamples>0)		// while some remain (never entered for an empty input)
+	{
+		if (memcmp(src->Value,lastdst->Value,current_ndims))
+		{
+			// values differ from the last kept sample: keep this one too
+			memcpy(dst,src,current_ssize);
+			lastdst=dst;
+			dst=NEXTSAMPLE(dst);
+			noutput++;
+		}
+		else
+			lastdst->Count++;	// duplicate: fold it into the kept sample's count
+		src=NEXTSAMPLE(src);
+	}
+	return noutput;
+}
+
+void PrintSamples(struct Sample const *s, int nsamples, int ndims)
+{
+	SetNDims(ndims);
+	int cnt=0;
+	while(nsamples--)
+	{
+		printf("sample #%d, count=%d, values=\n { ",cnt++,s->Count);
+		for(int d=0;d<ndims;d++)
+			printf("%02x,",s->Value[d]);
+		printf("}\n");
+		s=NEXTSAMPLE(s);
+	}
+}
+
+void PrintQTree(struct QuantizedValue const *p,int idlevel)
+{
+	int i;
+
+	if (p)
+	{
+		for(i=0;i<idlevel;i++)
+			printf(" ");
+		printf("node=%p NSamples=%d value=%d Mean={",p,p->NSamples,p->value);
+		for(i=0;i<current_ndims;i++)
+			printf("%x,",p->Mean[i]);
+		printf("}\n");
+		for(i=0;i<idlevel;i++)
+			printf(" ");
+		printf("Errors={");
+		for(i=0;i<current_ndims;i++)
+			printf("%f,",p->ErrorMeasure[i]);
+		printf("}\n");
+		for(i=0;i<idlevel;i++)
+			printf(" ");
+		printf("Mins={");
+		for(i=0;i<current_ndims;i++)
+			printf("%d,",p->Mins[i]);
+		printf("} Maxs={");
+		for(i=0;i<current_ndims;i++)
+			printf("%d,",p->Maxs[i]);
+		printf("}\n");
+		PrintQTree(p->Children[0],idlevel+2);
+		PrintQTree(p->Children[1],idlevel+2);
+	}
+}
+
+static void UpdateStats(struct QuantizedValue *v)
+{
+	// first, find mean
+	int32 Means[MAXDIMS];
+	double Errors[MAXDIMS];
+	double WorstError[MAXDIMS];
+	int i,j;
+  
+	memset(Means,0,sizeof(Means));
+	int N=0;
+	for(i=0;i<v->NSamples;i++)
+	{
+		struct Sample *s=SAMPLE(v->Samples,i);
+		N+=s->Count;
+		for(j=0;j<current_ndims;j++)
+		{
+			uint8 v=s->Value[j];
+			Means[j]+=v*s->Count;
+		}
+	}
+	for(j=0;j<current_ndims;j++)
+	{
+		if (N) v->Mean[j]=(uint8) (Means[j]/N);
+		Errors[j]=WorstError[j]=0.;
+	}
+	for(i=0;i<v->NSamples;i++)
+	{
+		struct Sample *s=SAMPLE(v->Samples,i);
+		double c=s->Count;
+		for(j=0;j<current_ndims;j++)
+		{
+			double diff=SQ(s->Value[j]-v->Mean[j]);
+			Errors[j]+=c*diff; // charles uses abs not sq()
+			if (diff>WorstError[j])
+				WorstError[j]=diff;
+		}
+	}
+	v->TotalError=0.;
+	double ErrorScale=1.; // /sqrt((double) (N));
+	for(j=0;j<current_ndims;j++)
+	{
+		v->ErrorMeasure[j]=(ErrorScale*Errors[j]*current_weights[j]);
+		v->TotalError+=v->ErrorMeasure[j];
+#if SPLIT_THEN_SORT
+		v->ErrorMeasure[j]*=WorstError[j];
+#endif
+	}
+	v->TotSamples=N;
+}
+
+static int ErrorDim;
+static double ErrorVal;
+static struct QuantizedValue *ErrorNode;
+
+static void UpdateWorst(struct QuantizedValue *q)
+{
+	if (q->Children[0])
+	{
+		// not a leaf node
+		UpdateWorst(q->Children[0]);
+		UpdateWorst(q->Children[1]);
+	}
+	else
+	{
+		if (q->TotalError>ErrorVal)
+		{
+			ErrorVal=q->TotalError;
+			ErrorNode=q;
+			ErrorDim=0;
+			for(int d=0;d<current_ndims;d++)
+				if (q->ErrorMeasure[d]>q->ErrorMeasure[ErrorDim])
+					ErrorDim=d;
+		}
+	}
+}
+
+static int FindWorst(void)
+{
+	ErrorVal=-1.;
+	UpdateWorst(current_root);
+	return (ErrorVal>0);
+}
+
+
+
+static void SubdivideNode(struct QuantizedValue *n, int whichdim)
+{
+	int NAdded=0;
+	int i;
+
+#if SPLIT_THEN_SORT
+	// we will try the "split then sort" method. This works by finding the
+	// means for all samples above and below the mean along the given axis.
+	// samples are then split into two groups, with the selection based upon
+	// which of the n-dimensional means the sample is closest to.
+	double LocalMean[MAXDIMS][2];
+	int totsamps[2];
+	for(i=0;i<current_ndims;i++)
+		LocalMean[i][0]=LocalMean[i][1]=0.;
+	totsamps[0]=totsamps[1]=0;
+	uint8 minv=255;
+	uint8 maxv=0;
+	struct Sample *minS=0,*maxS=0;
+	for(i=0;i<n->NSamples;i++)
+	{
+		uint8 v;
+		int whichside=1;
+		struct Sample *sl;
+		sl=SAMPLE(n->Samples,i);
+		v=sl->Value[whichdim];
+		if (v<minv) { minv=v; minS=sl; }
+		if (v>maxv) { maxv=v; maxS=sl; }
+		if (v<n->Mean[whichdim])
+			whichside=0;
+		totsamps[whichside]+=sl->Count;
+		for(int d=0;d<current_ndims;d++)
+			LocalMean[d][whichside]+=
+				sl->Count*sl->Value[d];
+	}
+
+	if (totsamps[0] && totsamps[1])
+		for(i=0;i<current_ndims;i++)
+		{
+			LocalMean[i][0]/=totsamps[0];
+			LocalMean[i][1]/=totsamps[1];
+		}
+	else
+	{
+		// it is possible that the clustering failed to split the samples.
+		// this can happen with a heavily biased sample (i.e. all black
+		// with a few stars). If this happens, we will cluster around the
+		// extrema instead. LocalMean[i][0] will be the point with the lowest
+		// value on the dimension and LocalMean[i][1] the one with the highest
+		// value.
+		for(int i=0;i<current_ndims;i++)
+		{
+			LocalMean[i][0]=minS->Value[i];
+			LocalMean[i][1]=maxS->Value[i];
+		}
+	}
+
+	// now, we have 2 n-dimensional means. We will label each sample
+	// for which one it is nearer to by using the QNum field.
+	for(i=0;i<n->NSamples;i++)
+	{
+		double dist[2];
+		dist[0]=dist[1]=0.;
+		struct Sample *s=SAMPLE(n->Samples,i);
+		for(int d=0;d<current_ndims;d++)
+			for(int w=0;w<2;w++)
+				dist[w]+=current_weights[d]*SQ(LocalMean[d][w]-s->Value[d]);
+		s->QNum=(dist[0]<dist[1]);
+    }
+
+
+	// hey ho! we have now labelled each one with a candidate bin. Let's
+	// sort the array by moving the 0-labelled ones to the head of the array.
+	n->sortdim=-1;
+	qsort(n->Samples,n->NSamples,current_ssize,QNumSort);
+	for(i=0;i<n->NSamples;i++,NAdded++)
+		if (SAMPLE(n->Samples,i)->QNum)
+			break;
+  
+#else
+	if (whichdim != n->sortdim)
+	{
+		current_sort_dim=whichdim;
+		qsort(n->Samples,n->NSamples,current_ssize,samplesort);
+		n->sortdim=whichdim;
+	}
+	// now, the samples are sorted along the proper dimension.  we need
+	// to find the place to cut in order to split the node.  this is
+	// complicated by the fact that each sample entry can represent many
+	// samples. What we will do is start at the beginning of the array,
+	// adding samples to the first node, until either the number added
+	// is >=TotSamples/2, or there is only one left.
+	int TotAdded=0;
+	for(;;)
+	{
+		if (NAdded==n->NSamples-1)
+			break;
+		if (TotAdded>=n->TotSamples/2)
+			break;
+		TotAdded+=SAMPLE(n->Samples,NAdded)->Count;
+		NAdded++;
+	}
+#endif
+	struct QuantizedValue *a=AllocQValue();
+	a->sortdim=n->sortdim;
+	a->Samples=n->Samples;
+	a->NSamples=NAdded;
+	n->Children[0]=a;
+	UpdateStats(a);
+	a=AllocQValue();
+	a->Samples=SAMPLE(n->Samples,NAdded);
+	a->NSamples=n->NSamples-NAdded;
+	a->sortdim=n->sortdim;
+	n->Children[1]=a;
+	UpdateStats(a);
+}
+
+static int colorid=0;
+
+static void Label(struct QuantizedValue *q, int updatecolor)
+{
+	// fill in max/min values for tree, etc.
+	if (q)
+	{
+		Label(q->Children[0],updatecolor);
+		Label(q->Children[1],updatecolor);
+		if (! q->Children[0])	// leaf node?
+		{
+			if (updatecolor)
+			{
+				q->value=colorid++;
+				for(int j=0;j<q->NSamples;j++)
+				{
+					SAMPLE(q->Samples,j)->QNum=q->value;
+					SAMPLE(q->Samples,j)->qptr=q;
+				}
+			}
+			for(int i=0;i<current_ndims;i++)
+			{
+				q->Mins[i]=q->Mean[i];
+				q->Maxs[i]=q->Mean[i];
+			}
+		}
+		else
+			for(int i=0;i<current_ndims;i++)
+			{
+				q->Mins[i]=min(q->Children[0]->Mins[i],q->Children[1]->Mins[i]);
+				q->Maxs[i]=max(q->Children[0]->Maxs[i],q->Children[1]->Maxs[i]);
+			}
+	}
+}    
+
+struct QuantizedValue *FindQNode(struct QuantizedValue const *q, int32 code)
+{
+	if (! (q->Children[0]))
+		if (code==q->value) return (struct QuantizedValue *) q;
+		else return 0;
+	else
+	{
+		struct QuantizedValue *found=FindQNode(q->Children[0],code);
+		if (! found) found=FindQNode(q->Children[1],code);
+		return found;
+	}
+}
+
+
+void CheckInRange(struct QuantizedValue *q, uint8 *max, uint8 *min)
+{
+	if (q)
+	{
+		if (q->Children[0])
+		{
+			// non-leaf node
+			CheckInRange(q->Children[0],q->Maxs, q->Mins);
+			CheckInRange(q->Children[1],q->Maxs, q->Mins);
+			CheckInRange(q->Children[0],max, min);
+			CheckInRange(q->Children[1],max, min);
+		}
+		for (int i=0;i<current_ndims;i++)
+		{
+			if (q->Maxs[i]>max[i]) printf("error1\n");
+			if (q->Mins[i]<min[i]) printf("error2\n");
+		}
+	}
+}
+
+struct QuantizedValue *Quantize(struct Sample *s, int nsamples, int ndims,
+								int nvalues, uint8 *weights, int firstvalue)
+{
+	SetNDims(ndims);
+	current_weights=weights;
+	current_root=AllocQValue();
+	current_root->Samples=s;
+	current_root->NSamples=nsamples;
+	UpdateStats(current_root);
+	while(--nvalues)
+	{
+		if (! FindWorst())
+			break;                          // if <n unique ones, stop now
+		SubdivideNode(ErrorNode,ErrorDim);
+	}
+	colorid=firstvalue;
+	Label(current_root,1);
+	return current_root;
+}
+
+double MinimumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights)
+{
+	double err=0;
+	for(int i=0;i<ndims;i++)
+	{
+		int val1;
+		int val2=sample[i];
+		if ((q->Mins[i]<=val2) && (q->Maxs[i]>=val2)) val1=val2;
+		else
+		{
+			val1=(val2<=q->Mins[i])?q->Mins[i]:q->Maxs[i];
+		}
+		err+=weights[i]*SQ(val1-val2);
+	}
+	return err;
+}
+
+double MaximumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights)
+{
+	double err=0;
+	for(int i=0;i<ndims;i++)
+	{
+		int val2=sample[i];
+		int val1=(abs(val2-q->Mins[i])>abs(val2-q->Maxs[i]))?
+			q->Mins[i]:
+			q->Maxs[i];
+		err+=weights[i]*SQ(val2-val1);
+	}
+	return err;
+}
+
+				     
+
+// heap (priority queue) routines used for nearest-neighbor searches
+struct FHeap {
+	int heap_n;
+	double *heap[MAXQUANT];
+};
+
+void InitHeap(struct FHeap *h)
+{
+  h->heap_n=0;
+}
+
+
+void UpHeap(int k, struct FHeap *h)
+{
+  double *tmpk=h->heap[k];
+  double tmpkn=*tmpk;
+  while((k>1) && (tmpkn <= *(h->heap[k/2])))
+    {
+      h->heap[k]=h->heap[k/2];
+      k/=2;
+    }
+  h->heap[k]=tmpk;
+}
+
+void HeapInsert(struct FHeap *h,double *elem)
+{
+  h->heap_n++;
+  h->heap[h->heap_n]=elem;
+  UpHeap(h->heap_n,h);
+}
+
+void DownHeap(int k, struct FHeap *h)
+{
+  double *v=h->heap[k];
+  while(k<=h->heap_n/2)
+    {
+      int j=2*k;
+      if (j<h->heap_n)
+	if (*(h->heap[j]) >= *(h->heap[j+1]))
+	  j++;
+      if (*v < *(h->heap[j]))
+	{
+	  h->heap[k]=v;
+	  return;
+	}
+      h->heap[k]=h->heap[j]; k=j;
+    }
+  h->heap[k]=v;
+}
+
+void *RemoveHeapItem(struct FHeap *h)
+{
+  void *ret=0;
+  if (h->heap_n!=0)
+    {
+      ret=h->heap[1];
+      h->heap[1]=h->heap[h->heap_n];
+      h->heap_n--;
+      DownHeap(1,h);
+    }
+  return ret;
+}
+
+// now, nearest neighbor finder. Use a heap to traverse the tree, stopping
+// when there are no nodes with a minimum error < the current error.
+
+struct FHeap TheQueue;
+
+#define PUSHNODE(a) { \
+  (a)->MinError=MinimumError(a,sample,ndims,weights); \
+  if ((a)->MinError < besterror) HeapInsert(&TheQueue,&(a)->MinError); \
+ }
+
+struct QuantizedValue *FindMatch(uint8 const *sample, int ndims,
+								 uint8 *weights, struct QuantizedValue *q)
+{
+	InitHeap(&TheQueue);
+	struct QuantizedValue *bestmatch=0;
+	double besterror=1.0e63;
+	PUSHNODE(q);
+	for(;;)
+	{
+		struct QuantizedValue *test=(struct QuantizedValue *)
+			RemoveHeapItem(&TheQueue);
+		if (! test) break;		// heap empty
+//    printf("got pop node =%p minerror=%f\n",test,test->MinError);
+    
+		if (test->MinError>besterror) break;
+		if (test->Children[0])
+		{
+			// it's a parent node. put the children on the queue
+			struct QuantizedValue *c1=test->Children[0];
+			struct QuantizedValue *c2=test->Children[1];
+			c1->MinError=MinimumError(c1,sample,ndims,weights);
+			if (c1->MinError < besterror)
+				HeapInsert(&TheQueue,&(c1->MinError));
+			c2->MinError=MinimumError(c2,sample,ndims,weights);
+			if (c2->MinError < besterror)
+				HeapInsert(&TheQueue,&(c2->MinError));
+		}
+		else
+		{
+			// it's a leaf node. This must be a new minimum or the MinError
+			// test would have failed.
+			if (test->MinError < besterror)
+			{
+				bestmatch=test;
+				besterror=test->MinError;
+			}
+		}
+	}
+	if (bestmatch)
+	{
+		SquaredError+=besterror;
+		bestmatch->NQuant++;
+		for(int i=0;i<ndims;i++)
+			bestmatch->Sums[i]+=sample[i];
+	}
+	return bestmatch;
+}
+
+static void RecalcMeans(struct QuantizedValue *q)	// rebuilds leaf means from the Sums/NQuant accumulated by FindMatch
+{
+	if (q)
+	{
+		if (q->Children[0])
+		{
+			// not a leaf, invoke recursively on both subtrees.
+			RecalcMeans(q->Children[0]);
+			RecalcMeans(q->Children[1]);	// was Children[0] twice, which left the right subtree stale
+		}
+		else
+		{
+			// it's a leaf. Set the means (left unchanged if nothing was matched to it)
+			if (q->NQuant)
+			{
+				for(int i=0;i<current_ndims;i++)
+				{
+					q->Mean[i]=(uint8) (q->Sums[i]/q->NQuant);
+					q->Sums[i]=0;	// reset the accumulator for the next pass
+				}
+				q->NQuant=0;
+			}
+		}
+	}
+}
+		      
+void OptimizeQuantizer(struct QuantizedValue *q, int ndims)
+{
+	SetNDims(ndims);
+	RecalcMeans(q);		// reset q values
+	Label(q,0);			// update max/mins
+}
+
+
+static void RecalcStats(struct QuantizedValue *q)
+{
+	if (q)
+	{
+		UpdateStats(q);
+		RecalcStats(q->Children[0]);
+		RecalcStats(q->Children[1]);
+	}
+}
+
+void RecalculateValues(struct QuantizedValue *q, int ndims)
+{
+	SetNDims(ndims);
+	RecalcStats(q);
+	Label(q,0);
+}
diff --git a/mp/src/mathlib/randsse.cpp b/mp/src/mathlib/randsse.cpp
index d6bd666e..b718d399 100644
--- a/mp/src/mathlib/randsse.cpp
+++ b/mp/src/mathlib/randsse.cpp
@@ -1,109 +1,109 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: generates 4 randum numbers in the range 0..1 quickly, using SIMD
-//
-//=====================================================================================//
-
-#include <math.h>
-#include <float.h>	// Needed for FLT_EPSILON
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-#include "mathlib/ssemath.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-// see knuth volume 3 for insight.
-
-class SIMDRandStreamContext
-{
-	fltx4 m_RandY[55];
-
-	fltx4 *m_pRand_J, *m_pRand_K;
-
-
-public:
-	void Seed( uint32 seed )
-	{
-		m_pRand_J=m_RandY+23; m_pRand_K=m_RandY+54;
-		for(int i=0;i<55;i++)
-		{
-			for(int j=0;j<4;j++)
-			{
-				SubFloat( m_RandY[i], j) = (seed>>16)/65536.0;
-				seed=(seed+1)*3141592621u;
-			}
-		}
-	}
-
-	inline fltx4 RandSIMD( void )
-	{
-		// ret= rand[k]+rand[j]
-		fltx4 retval=AddSIMD( *m_pRand_K, *m_pRand_J );
-		
-		// if ( ret>=1.0) ret-=1.0
-		fltx4 overflow_mask=CmpGeSIMD( retval, Four_Ones );
-		retval=SubSIMD( retval, AndSIMD( Four_Ones, overflow_mask ) );
-		
-		*m_pRand_K = retval;
-		
-		// update pointers w/ wrap-around
-		if ( --m_pRand_J < m_RandY )
-			m_pRand_J=m_RandY+54;
-		if ( --m_pRand_K < m_RandY )
-			m_pRand_K=m_RandY+54;
-		
-		return retval;
-	}
-};
-
-#define MAX_SIMULTANEOUS_RANDOM_STREAMS 32
-
-static SIMDRandStreamContext s_SIMDRandContexts[MAX_SIMULTANEOUS_RANDOM_STREAMS];
-
-static volatile int s_nRandContextsInUse[MAX_SIMULTANEOUS_RANDOM_STREAMS];
-
-void SeedRandSIMD(uint32 seed)
-{
-	for( int i = 0; i<MAX_SIMULTANEOUS_RANDOM_STREAMS; i++)
-		s_SIMDRandContexts[i].Seed( seed+i );
-}
-
-fltx4 RandSIMD( int nContextIndex )
-{
-	return s_SIMDRandContexts[nContextIndex].RandSIMD();
-}
-
-int GetSIMDRandContext( void )
-{
-	for(;;)
-	{
-		for(int i=0; i < NELEMS( s_SIMDRandContexts ); i++)
-		{
-			if ( ! s_nRandContextsInUse[i] )				// available?
-			{
-				// try to take it!
-				if ( ThreadInterlockedAssignIf( &( s_nRandContextsInUse[i]), 1, 0 ) )
-				{
-					return i;								// done!
-				}
-			}
-		}
-		Assert(0);											// why don't we have enough buffers?
-		ThreadSleep();
-	}
-}
-
-void ReleaseSIMDRandContext( int nContext )
-{
-	s_nRandContextsInUse[ nContext ] = 0;
-}
-
-
-fltx4 RandSIMD( void )
-{
-	return s_SIMDRandContexts[0].RandSIMD();
-}
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: generates 4 random numbers in the range 0..1 quickly, using SIMD
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/ssemath.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+// see knuth volume 3 for insight.
+
+class SIMDRandStreamContext	// one random stream, four lanes wide; each output is the sum (wrapped below 1.0) of two earlier table entries
+{
+	fltx4 m_RandY[55];	// circular table of previous values, filled by Seed()
+
+	fltx4 *m_pRand_J, *m_pRand_K;	// the two table entries added together for the next output
+
+
+public:
+	void Seed( uint32 seed )	// fills the whole table from one integer seed and resets both pointers
+	{
+		m_pRand_J=m_RandY+23; m_pRand_K=m_RandY+54;
+		for(int i=0;i<55;i++)
+		{
+			for(int j=0;j<4;j++)
+			{
+				SubFloat( m_RandY[i], j) = (seed>>16)/65536.0;	// top 16 bits of seed scaled into [0,1)
+				seed=(seed+1)*3141592621u;	// advance the integer sequence used only for seeding
+			}
+		}
+	}
+
+	inline fltx4 RandSIMD( void )	// returns the next four values and advances the stream
+	{
+		// ret= rand[k]+rand[j]
+		fltx4 retval=AddSIMD( *m_pRand_K, *m_pRand_J );
+		
+		// if ( ret>=1.0) ret-=1.0
+		fltx4 overflow_mask=CmpGeSIMD( retval, Four_Ones );
+		retval=SubSIMD( retval, AndSIMD( Four_Ones, overflow_mask ) );
+		
+		*m_pRand_K = retval;	// the slot at K was written 55 calls ago, so the oldest entry is replaced
+		
+		// update pointers w/ wrap-around
+		if ( --m_pRand_J < m_RandY )
+			m_pRand_J=m_RandY+54;
+		if ( --m_pRand_K < m_RandY )
+			m_pRand_K=m_RandY+54;
+		
+		return retval;
+	}
+};
+
+#define MAX_SIMULTANEOUS_RANDOM_STREAMS 32
+
+static SIMDRandStreamContext s_SIMDRandContexts[MAX_SIMULTANEOUS_RANDOM_STREAMS];	// pool of independent streams
+
+static volatile int s_nRandContextsInUse[MAX_SIMULTANEOUS_RANDOM_STREAMS];	// per slot: 1 once claimed by GetSIMDRandContext, 0 after ReleaseSIMDRandContext
+
+void SeedRandSIMD(uint32 seed)	// reseeds every stream in the pool
+{
+	for( int i = 0; i<MAX_SIMULTANEOUS_RANDOM_STREAMS; i++)
+		s_SIMDRandContexts[i].Seed( seed+i );	// offset by the slot index so the streams differ
+}
+
+fltx4 RandSIMD( int nContextIndex )	// next four values from the given stream; nContextIndex is not range-checked here
+{
+	return s_SIMDRandContexts[nContextIndex].RandSIMD();
+}
+
+int GetSIMDRandContext( void )	// claims a free stream slot and returns its index; loops until one becomes available
+{
+	for(;;)
+	{
+		for(int i=0; i < NELEMS( s_SIMDRandContexts ); i++)
+		{
+			if ( ! s_nRandContextsInUse[i] )				// available?
+			{
+				// try to take it!
+				if ( ThreadInterlockedAssignIf( &( s_nRandContextsInUse[i]), 1, 0 ) )	// presumably an atomic "set to 1 if currently 0" - confirm in tier0
+				{
+					return i;								// done!
+				}
+			}
+		}
+		Assert(0);											// why don't we have enough buffers?
+		ThreadSleep();	// every slot was busy on this scan: sleep, then scan again
+	}
+}
+
+void ReleaseSIMDRandContext( int nContext )	// marks a slot obtained from GetSIMDRandContext as free again
+{
+	s_nRandContextsInUse[ nContext ] = 0;	// plain store; nContext is not range-checked
+}
+
+
+fltx4 RandSIMD( void )	// convenience overload: always draws from stream 0
+{
+	return s_SIMDRandContexts[0].RandSIMD();	// does not check whether slot 0 is currently claimed by another user
+}
diff --git a/mp/src/mathlib/simdvectormatrix.cpp b/mp/src/mathlib/simdvectormatrix.cpp
index 7b200c2e..9cac37c4 100644
--- a/mp/src/mathlib/simdvectormatrix.cpp
+++ b/mp/src/mathlib/simdvectormatrix.cpp
@@ -1,112 +1,112 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: Provide a class (SSE/SIMD only) holding a 2d matrix of class FourVectors,
-// for high speed processing in tools.
-//
-// $NoKeywords: $
-//
-//=============================================================================//
-
-
-
-#include "basetypes.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/simdvectormatrix.h"
-#include "mathlib/ssemath.h"
-#include "tier0/dbg.h"
-
-void CSIMDVectorMatrix::CreateFromRGBA_FloatImageData(int srcwidth, int srcheight,
-													  float const *srcdata )
-{
-	Assert( srcwidth && srcheight && srcdata );
-	SetSize( srcwidth, srcheight );
-
-	FourVectors *p_write_ptr=m_pData;
-	int n_vectors_per_source_line=(srcwidth >> 2);
-	int ntrailing_pixels_per_source_line=(srcwidth & 3);
-	for(int y=0;y<srcheight;y++)
-	{
-		float const *data_in=srcdata;
-		float *data_out=reinterpret_cast<float *>( p_write_ptr );
-		// copy full input blocks
-		for(int x=0;x<n_vectors_per_source_line;x++)
-		{
-			for(int c=0;c<3;c++)
-			{
-				data_out[0]=data_in[c];					// x0
-				data_out[1]=data_in[4+c];				// x1
-				data_out[2]=data_in[8+c];				// x2
-				data_out[3]=data_in[12+c];				// x3
-				data_out+=4;
-			}
-			data_in += 16;
-		}
-		// now, copy trailing data and pad with copies
-		if (ntrailing_pixels_per_source_line )
-		{
-			for(int c=0;c<3;c++)
-			{
-				for(int cp=0;cp<4; cp++)
-				{
-					int real_cp=min( cp, ntrailing_pixels_per_source_line-1 );
-					data_out[4*c+cp]= data_in[c+4*real_cp];
-				}
-			}
-		}
-		// advance ptrs to next line
-		p_write_ptr += m_nPaddedWidth;
-		srcdata += 4 * srcwidth;
-	}
-}
-
-void CSIMDVectorMatrix::RaiseToPower( float power )
-{
-	int nv=NVectors();
-	if ( nv )
-	{
-		int fixed_point_exp=(int) ( 4.0*power );
-		FourVectors *src=m_pData;
-		do
-		{
-			src->x=Pow_FixedPoint_Exponent_SIMD( src->x, fixed_point_exp );
-			src->y=Pow_FixedPoint_Exponent_SIMD( src->y, fixed_point_exp );
-			src->z=Pow_FixedPoint_Exponent_SIMD( src->z, fixed_point_exp );
-			src++;
-		} while (--nv);
-	}
-}
-
-CSIMDVectorMatrix & CSIMDVectorMatrix::operator+=( CSIMDVectorMatrix const &src )
-{
-	Assert( m_nWidth == src.m_nWidth );
-	Assert( m_nHeight == src.m_nHeight );
-	int nv=NVectors();
-	if ( nv )
-	{
-		FourVectors *srcv=src.m_pData;
-		FourVectors *destv=m_pData;
-		do													// !! speed !! inline more iters
-		{
-			*( destv++ ) += *( srcv++ );
-		} while ( --nv );
-	}
-	return *this;
-}
-
-CSIMDVectorMatrix & CSIMDVectorMatrix::operator*=( Vector const &src )
-{
-	int nv=NVectors();
-	if ( nv )
-	{
-		FourVectors scalevalue;
-		scalevalue.DuplicateVector( src );
-		FourVectors *destv=m_pData;
-		do													// !! speed !! inline more iters
-		{
-			destv->VProduct( scalevalue );
-			destv++;
-		} while ( --nv );
-	}
-	return *this;
-}
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Provide a class (SSE/SIMD only) holding a 2d matrix of class FourVectors,
+// for high speed processing in tools.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+
+
+#include "basetypes.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/simdvectormatrix.h"
+#include "mathlib/ssemath.h"
+#include "tier0/dbg.h"
+
+void CSIMDVectorMatrix::CreateFromRGBA_FloatImageData(int srcwidth, int srcheight,
+													  float const *srcdata )	// srcdata is read as srcwidth*srcheight pixels of 4 floats each
+{
+	Assert( srcwidth && srcheight && srcdata );
+	SetSize( srcwidth, srcheight );
+
+	FourVectors *p_write_ptr=m_pData;
+	int n_vectors_per_source_line=(srcwidth >> 2);	// complete groups of 4 pixels per row
+	int ntrailing_pixels_per_source_line=(srcwidth & 3);	// 0..3 leftover pixels per row
+	for(int y=0;y<srcheight;y++)
+	{
+		float const *data_in=srcdata;
+		float *data_out=reinterpret_cast<float *>( p_write_ptr );
+		// copy full input blocks
+		for(int x=0;x<n_vectors_per_source_line;x++)
+		{
+			for(int c=0;c<3;c++)	// only the first three floats of each pixel are used; the fourth is ignored
+			{
+				data_out[0]=data_in[c];					// x0
+				data_out[1]=data_in[4+c];				// x1
+				data_out[2]=data_in[8+c];				// x2
+				data_out[3]=data_in[12+c];				// x3
+				data_out+=4;
+			}
+			data_in += 16;	// 4 pixels * 4 floats
+		}
+		// now, copy trailing data and pad with copies
+		if (ntrailing_pixels_per_source_line )
+		{
+			for(int c=0;c<3;c++)
+			{
+				for(int cp=0;cp<4; cp++)
+				{
+					int real_cp=min( cp, ntrailing_pixels_per_source_line-1 );	// lanes past the end repeat the last real pixel
+					data_out[4*c+cp]= data_in[c+4*real_cp];
+				}
+			}
+		}
+		// advance ptrs to next line
+		p_write_ptr += m_nPaddedWidth;
+		srcdata += 4 * srcwidth;
+	}
+}
+
+void CSIMDVectorMatrix::RaiseToPower( float power )	// raises the x, y and z lanes of every stored FourVectors to 'power' in place
+{
+	int nv=NVectors();
+	if ( nv )	// guard: the do/while below would underflow on an empty matrix
+	{
+		int fixed_point_exp=(int) ( 4.0*power );	// exponent scaled by 4 and truncated, so fractions finer than 1/4 are dropped
+		FourVectors *src=m_pData;
+		do
+		{
+			src->x=Pow_FixedPoint_Exponent_SIMD( src->x, fixed_point_exp );
+			src->y=Pow_FixedPoint_Exponent_SIMD( src->y, fixed_point_exp );
+			src->z=Pow_FixedPoint_Exponent_SIMD( src->z, fixed_point_exp );
+			src++;
+		} while (--nv);
+	}
+}
+
+CSIMDVectorMatrix & CSIMDVectorMatrix::operator+=( CSIMDVectorMatrix const &src )	// adds src to this matrix, one FourVectors at a time
+{
+	Assert( m_nWidth == src.m_nWidth );	// sizes are only asserted; a mismatch is not handled otherwise
+	Assert( m_nHeight == src.m_nHeight );
+	int nv=NVectors();
+	if ( nv )
+	{
+		FourVectors *srcv=src.m_pData;
+		FourVectors *destv=m_pData;
+		do													// !! speed !! inline more iters
+		{
+			*( destv++ ) += *( srcv++ );
+		} while ( --nv );
+	}
+	return *this;
+}
+
+CSIMDVectorMatrix & CSIMDVectorMatrix::operator*=( Vector const &src )	// applies VProduct with src to every stored FourVectors
+{
+	int nv=NVectors();
+	if ( nv )
+	{
+		FourVectors scalevalue;
+		scalevalue.DuplicateVector( src );	// presumably copies src into all four lanes - confirm in FourVectors
+		FourVectors *destv=m_pData;
+		do													// !! speed !! inline more iters
+		{
+			destv->VProduct( scalevalue );	// presumably a per-component multiply - confirm in FourVectors
+			destv++;
+		} while ( --nv );
+	}
+	return *this;
+}
+
diff --git a/mp/src/mathlib/sparse_convolution_noise.cpp b/mp/src/mathlib/sparse_convolution_noise.cpp
index 447c5292..dbf19b60 100644
--- a/mp/src/mathlib/sparse_convolution_noise.cpp
+++ b/mp/src/mathlib/sparse_convolution_noise.cpp
@@ -1,218 +1,218 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: noise() primitives.
-//
-//=====================================================================================//
-
-#include <math.h>
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-#include "mathlib/noise.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-// generate high quality noise based upon "sparse convolution". HIgher quality than perlin noise,
-// and no direcitonal artifacts.
-
-#include "noisedata.h"
-
-#define N_IMPULSES_PER_CELL 5
-#define NORMALIZING_FACTOR 1.0
-
-//(0.5/N_IMPULSES_PER_CELL)
-
-static inline int LatticeCoord(float x)
-{
-	return ((int) floor(x)) & 0xff;
-}
-
-static inline int Hash4D(int ix, int iy, int iz, int idx)
-{
-	int ret=perm_a[ix];
-	ret=perm_b[(ret+iy) & 0xff];
-	ret=perm_c[(ret+iz) & 0xff];
-	ret=perm_d[(ret+idx) & 0xff];
-	return ret;
-}
-
-#define SQ(x) ((x)*(x))
-
-static float CellNoise( int ix, int iy, int iz, float xfrac, float yfrac, float zfrac,
-						float (*pNoiseShapeFunction)(float) )
-{
-	float ret=0;
-	for(int idx=0;idx<N_IMPULSES_PER_CELL;idx++)
-	{
-		int coord_idx=Hash4D( ix, iy, iz, idx );
-		float dsq=SQ(impulse_xcoords[coord_idx]-xfrac)+
-			SQ(impulse_ycoords[coord_idx]-yfrac)+
-			SQ(impulse_zcoords[coord_idx]-zfrac);
-		dsq = sqrt( dsq );
-		if (dsq < 1.0 )
-		{
-			ret += (*pNoiseShapeFunction)( 1-dsq );
-		}
-	}
-	return ret;
-}
-
-
-float SparseConvolutionNoise( Vector const &pnt )
-{
-	return SparseConvolutionNoise( pnt, QuinticInterpolatingPolynomial );
-}
-
-float FractalNoise( Vector const &pnt, int n_octaves)
-{
-	float scale=1.0;
-	float iscale=1.0;
-	float ret=0;
-	float sumscale=0;
-	for(int o=0;o<n_octaves;o++)
-	{
-		Vector p1=pnt;
-		p1 *= scale;
-		ret+=iscale * SparseConvolutionNoise( p1 );
-		sumscale += iscale;
-		scale *= 2.0;
-		iscale *= 0.5;
-	}
-	return ret * ( 1.0/sumscale );
-}
-
-float Turbulence( Vector const &pnt, int n_octaves)
-{
-	float scale=1.0;
-	float iscale=1.0;
-	float ret=0;
-	float sumscale=0;
-	for(int o=0;o<n_octaves;o++)
-	{
-		Vector p1=pnt;
-		p1 *= scale;
-		ret+=iscale * fabs ( 2.0*( SparseConvolutionNoise( p1 )-.5 ) );
-		sumscale += iscale;
-		scale *= 2.0;
-		iscale *= 0.5;
-	}
-	return ret * ( 1.0/sumscale );
-}
-
-#ifdef MEASURE_RANGE
-float fmin1=10000000.0;
-float fmax1=-1000000.0;
-#endif
-
-float SparseConvolutionNoise(Vector const &pnt, float (*pNoiseShapeFunction)(float) )
-{
-	// computer integer lattice point
-	int ix=LatticeCoord(pnt.x);
-	int iy=LatticeCoord(pnt.y);
-	int iz=LatticeCoord(pnt.z);
-
-	// compute offsets within unit cube
-	float xfrac=pnt.x-floor(pnt.x);
-	float yfrac=pnt.y-floor(pnt.y);
-	float zfrac=pnt.z-floor(pnt.z);
-
-	float sum_out=0.;
-
-	for(int ox=-1; ox<=1; ox++)
-		for(int oy=-1; oy<=1; oy++)
-			for(int oz=-1; oz<=1; oz++)
-			{
-				sum_out += CellNoise( ix+ox, iy+oy, iz+oz,
-									  xfrac-ox, yfrac-oy, zfrac-oz,
-									  pNoiseShapeFunction );
-			}
-#ifdef MEASURE_RANGE
-	fmin1=min(sum_out,fmin1);
-	fmax1=max(sum_out,fmax1);
-#endif
-	return RemapValClamped( sum_out, .544487, 9.219176, 0.0, 1.0 );
-}
-
-
-// Improved Perlin Noise
-// The following code is the c-ification of Ken Perlin's new noise algorithm
-// "JAVA REFERENCE IMPLEMENTATION OF IMPROVED NOISE - COPYRIGHT 2002 KEN PERLIN"
-// as available here: http://mrl.nyu.edu/~perlin/noise/
-
-float NoiseGradient(int hash, float x, float y, float z)
-{
-	int h = hash & 15;                      // CONVERT LO 4 BITS OF HASH CODE
-	float u = h<8 ? x : y;                  // INTO 12 GRADIENT DIRECTIONS.
-	float v = h<4 ? y : (h==12||h==14 ? x : z);
-	return ((h&1) == 0 ? u : -u) + ((h&2) == 0 ? v : -v);
-}
-
-int NoiseHashIndex( int i )
-{
-	static int s_permutation[] = 
-	{
-		151,160,137,91,90,15,
-			131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
-			190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
-			88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
-			77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
-			102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
-			135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
-			5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
-			223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
-			129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
-			251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
-			49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
-			138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
-	};
-
-	return s_permutation[ i & 0xff ];
-}
-
-float ImprovedPerlinNoise( Vector const &pnt )
-{
-	float fx = floor(pnt.x);
-	float fy = floor(pnt.y);
-	float fz = floor(pnt.z);
-
-	int X = (int)fx & 255;								// FIND UNIT CUBE THAT
-	int Y = (int)fy & 255;								// CONTAINS POINT.
-	int Z = (int)fz & 255;
-
-	float x = pnt.x - fx;								// FIND RELATIVE X,Y,Z
-	float y = pnt.y - fy;								// OF POINT IN CUBE.
-	float z = pnt.z - fz;
-
-	float u = QuinticInterpolatingPolynomial(x);		// COMPUTE FADE CURVES
-	float v = QuinticInterpolatingPolynomial(y);		// FOR EACH OF X,Y,Z.
-	float w = QuinticInterpolatingPolynomial(z);
-
-	int A  = NoiseHashIndex( X ) + Y;					// HASH COORDINATES OF
-	int AA = NoiseHashIndex( A ) + Z;					// THE 8 CUBE CORNERS,
-	int AB = NoiseHashIndex( A + 1 ) + Z;
-	int B  = NoiseHashIndex( X + 1 ) + Y;
-	int BA = NoiseHashIndex( B ) + Z;
-	int BB = NoiseHashIndex( B + 1 ) + Z;
-
-	float g0 = NoiseGradient(NoiseHashIndex(AA  ), x  , y  , z   );
-	float g1 = NoiseGradient(NoiseHashIndex(BA  ), x-1, y  , z   );
-	float g2 = NoiseGradient(NoiseHashIndex(AB  ), x  , y-1, z   );
-	float g3 = NoiseGradient(NoiseHashIndex(BB  ), x-1, y-1, z   );
-	float g4 = NoiseGradient(NoiseHashIndex(AA+1), x  , y  , z-1 );
-	float g5 = NoiseGradient(NoiseHashIndex(BA+1), x-1, y  , z-1 );
-	float g6 = NoiseGradient(NoiseHashIndex(AB+1), x  , y-1, z-1 );
-	float g7 = NoiseGradient(NoiseHashIndex(BB+1), x-1, y-1, z-1 );
-
-	// AND ADD BLENDED RESULTS FROM 8 CORNERS OF CUBE
-	float g01 = Lerp( u, g0, g1 );
-	float g23 = Lerp( u, g2, g3 );
-	float g45 = Lerp( u, g4, g5 );
-	float g67 = Lerp( u, g6, g7 );
-	float g0123 = Lerp( v, g01, g23 );
-	float g4567 = Lerp( v, g45, g67 );
-
-	return Lerp( w, g0123,g4567 );
-}
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: noise() primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/noise.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+// generate high quality noise based upon "sparse convolution". Higher quality than perlin noise,
+// and no directional artifacts.
+
+#include "noisedata.h"
+
+#define N_IMPULSES_PER_CELL 5
+#define NORMALIZING_FACTOR 1.0
+
+//(0.5/N_IMPULSES_PER_CELL)
+
+static inline int LatticeCoord(float x)	// integer lattice cell containing x, wrapped into 0..255
+{
+	return ((int) floor(x)) & 0xff;
+}
+
+static inline int Hash4D(int ix, int iy, int iz, int idx)	// hashes a lattice cell plus an impulse number through the four permutation tables
+{
+	int ret=perm_a[ix & 0xff];	// fix: mask like the lookups below; SparseConvolutionNoise passes ix-1 and ix+1, i.e. -1 or 256
+	ret=perm_b[(ret+iy) & 0xff];
+	ret=perm_c[(ret+iz) & 0xff];
+	ret=perm_d[(ret+idx) & 0xff];
+	return ret;
+}
+
+#define SQ(x) ((x)*(x))
+
+static float CellNoise( int ix, int iy, int iz, float xfrac, float yfrac, float zfrac,
+						float (*pNoiseShapeFunction)(float) )	// sums the contribution of one cell's impulses at an offset relative to that cell
+{
+	float ret=0;
+	for(int idx=0;idx<N_IMPULSES_PER_CELL;idx++)
+	{
+		int coord_idx=Hash4D( ix, iy, iz, idx );	// selects which table entry gives this impulse's position
+		float dsq=SQ(impulse_xcoords[coord_idx]-xfrac)+
+			SQ(impulse_ycoords[coord_idx]-yfrac)+
+			SQ(impulse_zcoords[coord_idx]-zfrac);
+		dsq = sqrt( dsq );	// despite the name, dsq holds the plain distance from here on
+		if (dsq < 1.0 )	// impulses at distance 1 or more contribute nothing
+		{
+			ret += (*pNoiseShapeFunction)( 1-dsq );
+		}
+	}
+	return ret;
+}
+
+
+float SparseConvolutionNoise( Vector const &pnt )	// convenience overload using the quintic polynomial as the shape function
+{
+	return SparseConvolutionNoise( pnt, QuinticInterpolatingPolynomial );
+}
+
+float FractalNoise( Vector const &pnt, int n_octaves)	// weighted average of n_octaves noise samples at doubling frequency and halving weight
+{
+	float scale=1.0;	// frequency multiplier for the current octave
+	float iscale=1.0;	// weight of the current octave
+	float ret=0;
+	float sumscale=0;	// total weight, used to normalise the result
+	for(int o=0;o<n_octaves;o++)
+	{
+		Vector p1=pnt;
+		p1 *= scale;
+		ret+=iscale * SparseConvolutionNoise( p1 );
+		sumscale += iscale;
+		scale *= 2.0;
+		iscale *= 0.5;
+	}
+	return ret * ( 1.0/sumscale );	// note: n_octaves <= 0 leaves sumscale at 0, so this divides by zero
+}
+
+float Turbulence( Vector const &pnt, int n_octaves)	// like FractalNoise, but each octave is folded around its midpoint before summing
+{
+	float scale=1.0;	// frequency multiplier for the current octave
+	float iscale=1.0;	// weight of the current octave
+	float ret=0;
+	float sumscale=0;	// total weight, used to normalise the result
+	for(int o=0;o<n_octaves;o++)
+	{
+		Vector p1=pnt;
+		p1 *= scale;
+		ret+=iscale * fabs ( 2.0*( SparseConvolutionNoise( p1 )-.5 ) );	// remap 0..1 to -1..1, then take the magnitude
+		sumscale += iscale;
+		scale *= 2.0;
+		iscale *= 0.5;
+	}
+	return ret * ( 1.0/sumscale );	// note: n_octaves <= 0 leaves sumscale at 0, so this divides by zero
+}
+
+#ifdef MEASURE_RANGE
+float fmin1=10000000.0;
+float fmax1=-1000000.0;
+#endif
+
+float SparseConvolutionNoise(Vector const &pnt, float (*pNoiseShapeFunction)(float) )	// noise at pnt, remapped and clamped to 0..1
+{
+	// compute integer lattice point
+	int ix=LatticeCoord(pnt.x);
+	int iy=LatticeCoord(pnt.y);
+	int iz=LatticeCoord(pnt.z);
+
+	// compute offsets within unit cube
+	float xfrac=pnt.x-floor(pnt.x);
+	float yfrac=pnt.y-floor(pnt.y);
+	float zfrac=pnt.z-floor(pnt.z);
+
+	float sum_out=0.;
+
+	for(int ox=-1; ox<=1; ox++)	// visit the containing cell and its 26 neighbours
+		for(int oy=-1; oy<=1; oy++)
+			for(int oz=-1; oz<=1; oz++)
+			{
+				sum_out += CellNoise( ix+ox, iy+oy, iz+oz,
+									  xfrac-ox, yfrac-oy, zfrac-oz,
+									  pNoiseShapeFunction );
+			}
+#ifdef MEASURE_RANGE
+	fmin1=min(sum_out,fmin1);
+	fmax1=max(sum_out,fmax1);
+#endif
+	return RemapValClamped( sum_out, .544487, 9.219176, 0.0, 1.0 );	// input bounds are presumably a range measured via MEASURE_RANGE - confirm
+}
+
+
+// Improved Perlin Noise
+// The following code is the c-ification of Ken Perlin's new noise algorithm
+// "JAVA REFERENCE IMPLEMENTATION OF IMPROVED NOISE - COPYRIGHT 2002 KEN PERLIN"
+// as available here: http://mrl.nyu.edu/~perlin/noise/
+
+float NoiseGradient(int hash, float x, float y, float z)	// signed sum of two of x,y,z, with the pair and signs chosen by the low 4 bits of hash
+{
+	int h = hash & 15;                      // CONVERT LO 4 BITS OF HASH CODE
+	float u = h<8 ? x : y;                  // INTO 12 GRADIENT DIRECTIONS.
+	float v = h<4 ? y : (h==12||h==14 ? x : z);
+	return ((h&1) == 0 ? u : -u) + ((h&2) == 0 ? v : -v);	// bit 0 negates u, bit 1 negates v
+}
+
+int NoiseHashIndex( int i )	// table lookup on the low 8 bits of i
+{
+	static int s_permutation[] = 	// permutation table used by ImprovedPerlinNoise (expected to hold 256 entries)
+	{
+		151,160,137,91,90,15,
+			131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
+			190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
+			88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
+			77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
+			102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
+			135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
+			5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
+			223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
+			129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
+			251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
+			49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
+			138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
+	};
+
+	return s_permutation[ i & 0xff ];	// the mask lets callers pass sums larger than 255
+}
+
+float ImprovedPerlinNoise( Vector const &pnt )	// Perlin's improved noise at pnt; cell coordinates are masked to 8 bits
+{
+	float fx = floor(pnt.x);
+	float fy = floor(pnt.y);
+	float fz = floor(pnt.z);
+
+	int X = (int)fx & 255;								// FIND UNIT CUBE THAT
+	int Y = (int)fy & 255;								// CONTAINS POINT.
+	int Z = (int)fz & 255;
+
+	float x = pnt.x - fx;								// FIND RELATIVE X,Y,Z
+	float y = pnt.y - fy;								// OF POINT IN CUBE.
+	float z = pnt.z - fz;
+
+	float u = QuinticInterpolatingPolynomial(x);		// COMPUTE FADE CURVES
+	float v = QuinticInterpolatingPolynomial(y);		// FOR EACH OF X,Y,Z.
+	float w = QuinticInterpolatingPolynomial(z);
+
+	int A  = NoiseHashIndex( X ) + Y;					// HASH COORDINATES OF
+	int AA = NoiseHashIndex( A ) + Z;					// THE 8 CUBE CORNERS,
+	int AB = NoiseHashIndex( A + 1 ) + Z;
+	int B  = NoiseHashIndex( X + 1 ) + Y;
+	int BA = NoiseHashIndex( B ) + Z;
+	int BB = NoiseHashIndex( B + 1 ) + Z;
+
+	float g0 = NoiseGradient(NoiseHashIndex(AA  ), x  , y  , z   );
+	float g1 = NoiseGradient(NoiseHashIndex(BA  ), x-1, y  , z   );
+	float g2 = NoiseGradient(NoiseHashIndex(AB  ), x  , y-1, z   );
+	float g3 = NoiseGradient(NoiseHashIndex(BB  ), x-1, y-1, z   );
+	float g4 = NoiseGradient(NoiseHashIndex(AA+1), x  , y  , z-1 );
+	float g5 = NoiseGradient(NoiseHashIndex(BA+1), x-1, y  , z-1 );
+	float g6 = NoiseGradient(NoiseHashIndex(AB+1), x  , y-1, z-1 );
+	float g7 = NoiseGradient(NoiseHashIndex(BB+1), x-1, y-1, z-1 );
+
+	// AND ADD BLENDED RESULTS FROM 8 CORNERS OF CUBE
+	float g01 = Lerp( u, g0, g1 );
+	float g23 = Lerp( u, g2, g3 );
+	float g45 = Lerp( u, g4, g5 );
+	float g67 = Lerp( u, g6, g7 );
+	float g0123 = Lerp( v, g01, g23 );
+	float g4567 = Lerp( v, g45, g67 );
+
+	return Lerp( w, g0123,g4567 );	// blend along x first, then y, then z
+}
diff --git a/mp/src/mathlib/spherical.cpp b/mp/src/mathlib/spherical.cpp
index d3aa9679..b4cb8258 100644
--- a/mp/src/mathlib/spherical.cpp
+++ b/mp/src/mathlib/spherical.cpp
@@ -1,124 +1,124 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: spherical math routines
-//
-//=====================================================================================//
-
-#include <math.h>
-#include <float.h>	// Needed for FLT_EPSILON
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-#include "mathlib/spherical_geometry.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-float s_flFactorials[]={
-	1.,
-	1.,
-	2.,
-	6.,
-	24.,
-	120.,
-	720.,
-	5040.,
-	40320.,
-	362880.,
-	3628800.,
-	39916800.,
-	479001600.,
-	6227020800.,
-	87178291200.,
-	1307674368000.,
-	20922789888000.,
-	355687428096000.,
-	6402373705728000.,
-	121645100408832000.,
-	2432902008176640000.,
-	51090942171709440000.,
-	1124000727777607680000.,
-	25852016738884976640000.,
-	620448401733239439360000.,
-	15511210043330985984000000.,
-	403291461126605635584000000.,
-	10888869450418352160768000000.,
-	304888344611713860501504000000.,
-	8841761993739701954543616000000.,
-	265252859812191058636308480000000.,
-	8222838654177922817725562880000000.,
-	263130836933693530167218012160000000.,
-	8683317618811886495518194401280000000.
-};
-
-float AssociatedLegendrePolynomial( int nL, int nM, float flX )
-{
-	// evaluate associated legendre polynomial at flX, using recurrence relation
-	float flPmm = 1.;
-	if ( nM > 0 )
-	{
-		float flSomX2 = sqrt( ( 1 - flX ) * ( 1 + flX ) );
-		float flFact = 1.;
-		for( int i = 0 ; i < nM; i++ )
-		{
-			flPmm *= -flFact * flSomX2;
-			flFact += 2.0;
-		}
-	}
-	if ( nL == nM )
-		return flPmm;
-	float flPmmp1 = flX * ( 2.0 * nM + 1.0 ) * flPmm;
-	if ( nL == nM + 1 ) 
-		return flPmmp1;
-	float flPll = 0.;
-	for( int nLL = nM + 2 ; nLL <= nL; nLL++ )
-	{
-		flPll = ( ( 2.0 * nLL - 1.0 ) * flX * flPmmp1 - ( nLL + nM - 1.0 ) * flPmm ) * ( 1.0 / ( nLL - nM ) );
-		flPmm = flPmmp1;
-		flPmmp1 = flPll;
-	}
-	return flPll;
-}
-
-static float SHNormalizationFactor( int nL, int nM )
-{
-	double flTemp = ( ( 2. * nL + 1.0 ) * s_flFactorials[ nL - nM ] )/ ( 4. * M_PI * s_flFactorials[ nL + nM ] );
-	return sqrt( flTemp );
-}
-
-#define SQRT_2 1.414213562373095 
-
-FORCEINLINE float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi, float flCosTheta )
-{
-	if ( nM == 0 )
-		return SHNormalizationFactor( nL, 0 ) * AssociatedLegendrePolynomial( nL, nM, flCosTheta );
-
-	if ( nM > 0 )
-		return SQRT_2 * SHNormalizationFactor( nL, nM ) * cos ( nM * flPhi ) *
-			AssociatedLegendrePolynomial( nL, nM, flCosTheta );
-
-	return 
-		SQRT_2 * SHNormalizationFactor( nL, -nM ) * sin( -nM * flPhi ) * AssociatedLegendrePolynomial( nL, -nM, flCosTheta );
-
-}
-
-float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi )
-{
-	return SphericalHarmonic( nL, nM, flTheta, flPhi, cos( flTheta ) );
-}
-
-float SphericalHarmonic( int nL, int nM, Vector const &vecDirection )
-{
-	Assert( fabs( VectorLength( vecDirection ) - 1.0 ) < 0.0001 );
-	float flPhi = acos( vecDirection.z );
-	float flTheta = 0;
-	float S = Square( vecDirection.x ) + Square( vecDirection.y );
-	if ( S > 0 )
-	{
-		flTheta = atan2( vecDirection.y, vecDirection.x );
-	}
-	return SphericalHarmonic( nL, nM, flTheta, flPhi, cos( flTheta ) );
-}
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: spherical math routines
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/spherical_geometry.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+float s_flFactorials[]={	// n! for n = 0..33, stored as float (large entries are rounded); indexed by SHNormalizationFactor
+	1.,
+	1.,
+	2.,
+	6.,
+	24.,
+	120.,
+	720.,
+	5040.,
+	40320.,
+	362880.,
+	3628800.,
+	39916800.,
+	479001600.,
+	6227020800.,
+	87178291200.,
+	1307674368000.,
+	20922789888000.,
+	355687428096000.,
+	6402373705728000.,
+	121645100408832000.,
+	2432902008176640000.,
+	51090942171709440000.,
+	1124000727777607680000.,
+	25852016738884976640000.,
+	620448401733239439360000.,
+	15511210043330985984000000.,
+	403291461126605635584000000.,
+	10888869450418352160768000000.,
+	304888344611713860501504000000.,
+	8841761993739701954543616000000.,
+	265252859812191058636308480000000.,
+	8222838654177922817725562880000000.,
+	263130836933693530167218012160000000.,
+	8683317618811886495518194401280000000.
+};
+
+float AssociatedLegendrePolynomial( int nL, int nM, float flX )	// callers in this file pass nM >= 0 and flX = cos(theta)
+{
+	// evaluate associated legendre polynomial at flX, using recurrence relation
+	float flPmm = 1.;	// starts as the (nM,nM) term, built by the loop below
+	if ( nM > 0 )
+	{
+		float flSomX2 = sqrt( ( 1 - flX ) * ( 1 + flX ) );	// sqrt(1 - x^2); NaN if |flX| > 1
+		float flFact = 1.;
+		for( int i = 0 ; i < nM; i++ )
+		{
+			flPmm *= -flFact * flSomX2;	// multiplies in -(1), -(3), -(5), ... times sqrt(1 - x^2)
+			flFact += 2.0;
+		}
+	}
+	if ( nL == nM )
+		return flPmm;
+	float flPmmp1 = flX * ( 2.0 * nM + 1.0 ) * flPmm;	// the (nM+1, nM) term
+	if ( nL == nM + 1 ) 
+		return flPmmp1;
+	float flPll = 0.;	// also the value returned when nL < nM, since the loop below never runs
+	for( int nLL = nM + 2 ; nLL <= nL; nLL++ )
+	{
+		flPll = ( ( 2.0 * nLL - 1.0 ) * flX * flPmmp1 - ( nLL + nM - 1.0 ) * flPmm ) * ( 1.0 / ( nLL - nM ) );
+		flPmm = flPmmp1;	// slide the two-term window up one degree
+		flPmmp1 = flPll;
+	}
+	return flPll;
+}
+
+static float SHNormalizationFactor( int nL, int nM )	// sqrt( (2L+1)(L-M)! / (4 pi (L+M)!) ); callers pass nM >= 0
+{
+	double flTemp = ( ( 2. * nL + 1.0 ) * s_flFactorials[ nL - nM ] )/ ( 4. * M_PI * s_flFactorials[ nL + nM ] );	// table has 34 entries, so nL+nM must be <= 33 (not checked)
+	return sqrt( flTemp );
+}
+
+#define SQRT_2 1.414213562373095 
+
+FORCEINLINE float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi, float flCosTheta )	// flTheta itself is unused here; only flCosTheta and flPhi are read
+{
+	if ( nM == 0 )
+		return SHNormalizationFactor( nL, 0 ) * AssociatedLegendrePolynomial( nL, nM, flCosTheta );
+
+	if ( nM > 0 )
+		return SQRT_2 * SHNormalizationFactor( nL, nM ) * cos ( nM * flPhi ) *
+			AssociatedLegendrePolynomial( nL, nM, flCosTheta );
+
+	return 	// nM < 0: sine term, with -nM passed to the helpers
+		SQRT_2 * SHNormalizationFactor( nL, -nM ) * sin( -nM * flPhi ) * AssociatedLegendrePolynomial( nL, -nM, flCosTheta );
+
+}
+
+float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi )	// flTheta feeds the Legendre term via its cosine; flPhi feeds the cos/sin term
+{
+	return SphericalHarmonic( nL, nM, flTheta, flPhi, cos( flTheta ) );
+}
+
+float SphericalHarmonic( int nL, int nM, Vector const &vecDirection )	// evaluates basis function (nL,nM) along a unit-length direction
+{
+	Assert( fabs( VectorLength( vecDirection ) - 1.0 ) < 0.0001 );
+	float flTheta = acos( vecDirection.z );	// fix: angle from +z is theta (it was stored in flPhi, swapping the two angles)
+	float flPhi = 0;	// angle around z; stays 0 when the direction lies on the z axis
+	float S = Square( vecDirection.x ) + Square( vecDirection.y );
+	if ( S > 0 )
+	{
+		flPhi = atan2( vecDirection.y, vecDirection.x );
+	}
+	return SphericalHarmonic( nL, nM, flTheta, flPhi, vecDirection.z );	// cos(theta) is z for a unit vector, matching the (theta, phi) overload
+}
+
diff --git a/mp/src/mathlib/sse.cpp b/mp/src/mathlib/sse.cpp
index 6e41683f..a2445c79 100644
--- a/mp/src/mathlib/sse.cpp
+++ b/mp/src/mathlib/sse.cpp
@@ -1,1095 +1,1095 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: SSE Math primitives.
-//
-//=====================================================================================//
-
-#include <math.h>
-#include <float.h>	// Needed for FLT_EPSILON
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-#include "sse.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-#ifndef COMPILER_MSVC64
-// Implement for 64-bit Windows if needed.
-
-static const uint32 _sincos_masks[]	  = { (uint32)0x0,  (uint32)~0x0 };
-static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
-
-//-----------------------------------------------------------------------------
-// Macros and constants required by some of the SSE assembly:
-//-----------------------------------------------------------------------------
-
-#ifdef _WIN32
-	#define _PS_EXTERN_CONST(Name, Val) \
-		const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
-
-	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
-		const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
-
-	#define _EPI32_CONST(Name, Val) \
-		static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
-
-	#define _PS_CONST(Name, Val) \
-		static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
-#elif POSIX
-	#define _PS_EXTERN_CONST(Name, Val) \
-		const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
-
-	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
-		const Type _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }; \
-
-	#define _EPI32_CONST(Name, Val) \
-		static const int32 _epi32_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }
-
-	#define _PS_CONST(Name, Val) \
-		static const float _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }
-#endif
-
-_PS_EXTERN_CONST(am_0, 0.0f);
-_PS_EXTERN_CONST(am_1, 1.0f);
-_PS_EXTERN_CONST(am_m1, -1.0f);
-_PS_EXTERN_CONST(am_0p5, 0.5f);
-_PS_EXTERN_CONST(am_1p5, 1.5f);
-_PS_EXTERN_CONST(am_pi, (float)M_PI);
-_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
-_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
-_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
-_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
-_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);
-_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
-_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
-_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
-_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
-
-_EPI32_CONST(1, 1);
-_EPI32_CONST(2, 2);
-
-_PS_CONST(sincos_p0, 0.15707963267948963959e1f);
-_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
-_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
-_PS_CONST(sincos_p3, -0.468175413106023168e-2f);
-
-#ifdef PFN_VECTORMA
-void  __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
-#endif
-
-//-----------------------------------------------------------------------------
-// SSE implementations of optimized routines:
-//-----------------------------------------------------------------------------
-float _SSE_Sqrt(float x)
-{
-	Assert( s_bMathlibInitialized );
-	float	root = 0.f;
-#ifdef _WIN32
-	_asm
-	{
-		sqrtss		xmm0, x
-		movss		root, xmm0
-	}
-#elif POSIX
-	_mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
-#endif
-	return root;
-}
-
-// Single iteration NewtonRaphson reciprocal square root:
-// 0.5 * rsqrtps * (3 - x * rsqrtps(x) * rsqrtps(x)) 	
-// Very low error, and fine to use in place of 1.f / sqrtf(x).	
-#if 0
-float _SSE_RSqrtAccurate(float x)
-{
-	Assert( s_bMathlibInitialized );
-
-	float rroot;
-	_asm
-	{
-		rsqrtss	xmm0, x
-		movss	rroot, xmm0
-	}
-
-	return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
-}
-#else
-
-#ifdef POSIX
-const __m128  f3  = _mm_set_ss(3.0f);  // 3 as SSE value
-const __m128  f05 = _mm_set_ss(0.5f);  // 0.5 as SSE value
-#endif
-
-// Intel / Kipps SSE RSqrt.  Significantly faster than above.
-float _SSE_RSqrtAccurate(float a)
-{
-
-#ifdef _WIN32
-	float x;
-	float half = 0.5f;
-	float three = 3.f;
-
-	__asm
-	{
-		movss   xmm3, a;
-		movss   xmm1, half;
-		movss   xmm2, three;
-		rsqrtss xmm0, xmm3;
-
-		mulss   xmm3, xmm0;
-		mulss   xmm1, xmm0;
-		mulss   xmm3, xmm0;
-		subss   xmm2, xmm3;
-		mulss   xmm1, xmm2;
-
-		movss   x,    xmm1;
-	}
-
-	return x;
-#elif POSIX	
-	__m128  xx = _mm_load_ss( &a );
-    __m128  xr = _mm_rsqrt_ss( xx );
-    __m128  xt;
-	
-    xt = _mm_mul_ss( xr, xr );
-    xt = _mm_mul_ss( xt, xx );
-    xt = _mm_sub_ss( f3, xt );
-    xt = _mm_mul_ss( xt, f05 );
-    xr = _mm_mul_ss( xr, xt );
-	
-    _mm_store_ss( &a, xr );
-    return a;
-#else
-	#error "Not Implemented"
-#endif
-
-}
-#endif
-
-// Simple SSE rsqrt.  Usually accurate to around 6 (relative) decimal places 
-// or so, so ok for closed transforms.  (ie, computing lighting normals)
-float _SSE_RSqrtFast(float x)
-{
-	Assert( s_bMathlibInitialized );
-
-	float rroot;
-#ifdef _WIN32
-	_asm
-	{
-		rsqrtss	xmm0, x
-		movss	rroot, xmm0
-	}
-#elif POSIX
-	__asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) );
-#else
-#error
-#endif
-
-	return rroot;
-}
-
-float FASTCALL _SSE_VectorNormalize (Vector& vec)
-{
-	Assert( s_bMathlibInitialized );
-
-	// NOTE: This is necessary to prevent an memory overwrite...
-	// sice vec only has 3 floats, we can't "movaps" directly into it.
-#ifdef _WIN32
-	__declspec(align(16)) float result[4];
-#elif POSIX
-	 float result[4] __attribute__((aligned(16)));
-#endif
-
-	float *v = &vec[0];
-	float *r = &result[0];
-
-	float	radius = 0.f;
-	// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't 
-	// be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
-	if ( v[0] || v[1] || v[2] )
-	{
-#ifdef _WIN32
-	_asm
-		{
-			mov			eax, v
-			mov			edx, r
-#ifdef ALIGNED_VECTOR
-			movaps		xmm4, [eax]			// r4 = vx, vy, vz, X
-			movaps		xmm1, xmm4			// r1 = r4
-#else
-			movups		xmm4, [eax]			// r4 = vx, vy, vz, X
-			movaps		xmm1, xmm4			// r1 = r4
-#endif
-			mulps		xmm1, xmm4			// r1 = vx * vx, vy * vy, vz * vz, X
-			movhlps		xmm3, xmm1			// r3 = vz * vz, X, X, X
-			movaps		xmm2, xmm1			// r2 = r1
-			shufps		xmm2, xmm2, 1		// r2 = vy * vy, X, X, X
-			addss		xmm1, xmm2			// r1 = (vx * vx) + (vy * vy), X, X, X
-			addss		xmm1, xmm3			// r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
-			sqrtss		xmm1, xmm1			// r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
-			movss		radius, xmm1		// radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
-			rcpss		xmm1, xmm1			// r1 = 1/radius, X, X, X
-			shufps		xmm1, xmm1, 0		// r1 = 1/radius, 1/radius, 1/radius, X
-			mulps		xmm4, xmm1			// r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
-			movaps		[edx], xmm4			// v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
-		}
-#elif POSIX
-		__asm__ __volatile__(
-#ifdef ALIGNED_VECTOR
-            "movaps          %2, %%xmm4 \n\t"
-            "movaps          %%xmm4, %%xmm1 \n\t"
-#else
-            "movups          %2, %%xmm4 \n\t"
-            "movaps          %%xmm4, %%xmm1 \n\t"
-#endif
-            "mulps           %%xmm4, %%xmm1 \n\t"
-            "movhlps         %%xmm1, %%xmm3 \n\t"
-            "movaps          %%xmm1, %%xmm2 \n\t"
-            "shufps          $1, %%xmm2, %%xmm2 \n\t"
-            "addss           %%xmm2, %%xmm1 \n\t"
-            "addss           %%xmm3, %%xmm1 \n\t"
-            "sqrtss          %%xmm1, %%xmm1 \n\t"
-            "movss           %%xmm1, %0 \n\t"
-            "rcpss           %%xmm1, %%xmm1 \n\t"
-            "shufps          $0, %%xmm1, %%xmm1 \n\t"
-            "mulps           %%xmm1, %%xmm4 \n\t"
-            "movaps          %%xmm4, %1 \n\t"
-            : "=m" (radius), "=m" (result)
-            : "m" (*v)
- 		);
-#else
-	#error "Not Implemented"
-#endif
-		vec.x = result[0];
-		vec.y = result[1];
-		vec.z = result[2];
-
-	}
-
-	return radius;
-}
-
-void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
-{
-	float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
-
-	vec.x *= ool;
-	vec.y *= ool;
-	vec.z *= ool;
-}
-
-float _SSE_InvRSquared(const float* v)
-{
-	float	inv_r2 = 1.f;
-#ifdef _WIN32
-	_asm { // Intel SSE only routine
-		mov			eax, v
-		movss		xmm5, inv_r2		// x5 = 1.0, 0, 0, 0
-#ifdef ALIGNED_VECTOR
-		movaps		xmm4, [eax]			// x4 = vx, vy, vz, X
-#else
-		movups		xmm4, [eax]			// x4 = vx, vy, vz, X
-#endif
-		movaps		xmm1, xmm4			// x1 = x4
-		mulps		xmm1, xmm4			// x1 = vx * vx, vy * vy, vz * vz, X
-		movhlps		xmm3, xmm1			// x3 = vz * vz, X, X, X
-		movaps		xmm2, xmm1			// x2 = x1
-		shufps		xmm2, xmm2, 1		// x2 = vy * vy, X, X, X
-		addss		xmm1, xmm2			// x1 = (vx * vx) + (vy * vy), X, X, X
-		addss		xmm1, xmm3			// x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
-		maxss		xmm1, xmm5			// x1 = max( 1.0, x1 )
-		rcpss		xmm0, xmm1			// x0 = 1 / max( 1.0, x1 )
-		movss		inv_r2, xmm0		// inv_r2 = x0
-	}
-#elif POSIX
-		__asm__ __volatile__(
-		"movss			 %0, %%xmm5 \n\t"
-#ifdef ALIGNED_VECTOR
-		"movaps          %1, %%xmm4 \n\t"
-#else
-		"movups          %1, %%xmm4 \n\t"
-#endif
-        "movaps          %%xmm4, %%xmm1 \n\t"
-        "mulps           %%xmm4, %%xmm1 \n\t"
-		"movhlps         %%xmm1, %%xmm3 \n\t"
-		"movaps          %%xmm1, %%xmm2 \n\t"
-        "shufps          $1, %%xmm2, %%xmm2 \n\t"
-        "addss           %%xmm2, %%xmm1 \n\t"
-        "addss           %%xmm3, %%xmm1 \n\t"
-		"maxss           %%xmm5, %%xmm1 \n\t"
-        "rcpss           %%xmm1, %%xmm0 \n\t"
-		"movss           %%xmm0, %0 \n\t" 
-        : "=m" (inv_r2)
-        : "m" (*v), "0" (inv_r2)
- 		);
-#else
-	#error "Not Implemented"
-#endif
-
-	return inv_r2;
-}
-
-
-#ifdef POSIX
-// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
-#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
-
-_PS_CONST_TYPE(sign_mask, int, 0x80000000);
-_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
-
-
-#define _PI32_CONST(Name, Val)  static const ALIGN16 int _pi32_##Name[4]  ALIGN16_POST = { Val, Val, Val, Val }
-
-_PI32_CONST(1, 1);
-_PI32_CONST(inv1, ~1);
-_PI32_CONST(2, 2);
-_PI32_CONST(4, 4);
-_PI32_CONST(0x7f, 0x7f);
-_PS_CONST(1  , 1.0f);
-_PS_CONST(0p5, 0.5f);
-
-_PS_CONST(minus_cephes_DP1, -0.78515625);
-_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
-_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
-_PS_CONST(sincof_p0, -1.9515295891E-4);
-_PS_CONST(sincof_p1,  8.3321608736E-3);
-_PS_CONST(sincof_p2, -1.6666654611E-1);
-_PS_CONST(coscof_p0,  2.443315711809948E-005);
-_PS_CONST(coscof_p1, -1.388731625493765E-003);
-_PS_CONST(coscof_p2,  4.166664568298827E-002);
-_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
-typedef union xmm_mm_union {
-	__m128 xmm;
-	__m64 mm[2];
-} xmm_mm_union;
-
-#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
-
-typedef __m128 v4sf;  // vector of 4 float (sse1)
-typedef __m64 v2si;   // vector of 2 int (mmx)
-
-#endif
-
-void _SSE_SinCos(float x, float* s, float* c)
-{
-#ifdef _WIN32
-	float t4, t8, t12;
-
-	__asm
-	{
-		movss	xmm0, x
-		movss	t12, xmm0
-		movss	xmm1, _ps_am_inv_sign_mask
-		mov		eax, t12
-		mulss	xmm0, _ps_am_2_o_pi
-		andps	xmm0, xmm1
-		and		eax, 0x80000000
-
-		cvttss2si	edx, xmm0
-		mov		ecx, edx
-		mov		t12, esi
-		mov		esi, edx
-		add		edx, 0x1	
-		shl		ecx, (31 - 1)
-		shl		edx, (31 - 1)
-
-		movss	xmm4, _ps_am_1
-		cvtsi2ss	xmm3, esi
-		mov		t8, eax
-		and		esi, 0x1
-
-		subss	xmm0, xmm3
-		movss	xmm3, _sincos_inv_masks[esi * 4]
-		minss	xmm0, xmm4
-
-		subss	xmm4, xmm0
-
-		movss	xmm6, xmm4
-		andps	xmm4, xmm3
-		and		ecx, 0x80000000
-		movss	xmm2, xmm3
-		andnps	xmm3, xmm0
-		and		edx, 0x80000000
-		movss	xmm7, t8
-		andps	xmm0, xmm2
-		mov		t8, ecx
-		mov		t4, edx
-		orps	xmm4, xmm3
-
-		mov		eax, s     //mov eax, [esp + 4 + 16]
-		mov		edx, c //mov edx, [esp + 4 + 16 + 4]
-
-		andnps	xmm2, xmm6
-		orps	xmm0, xmm2
-
-		movss	xmm2, t8
-		movss	xmm1, xmm0
-		movss	xmm5, xmm4
-		xorps	xmm7, xmm2
-		movss	xmm3, _ps_sincos_p3
-		mulss	xmm0, xmm0
-		mulss	xmm4, xmm4
-		movss	xmm2, xmm0
-		movss	xmm6, xmm4
-		orps	xmm1, xmm7
-		movss	xmm7, _ps_sincos_p2
-		mulss	xmm0, xmm3
-		mulss	xmm4, xmm3
-		movss	xmm3, _ps_sincos_p1
-		addss	xmm0, xmm7
-		addss	xmm4, xmm7
-		movss	xmm7, _ps_sincos_p0
-		mulss	xmm0, xmm2
-		mulss	xmm4, xmm6
-		addss	xmm0, xmm3
-		addss	xmm4, xmm3
-		movss	xmm3, t4
-		mulss	xmm0, xmm2
-		mulss	xmm4, xmm6
-		orps	xmm5, xmm3
-		mov		esi, t12
-		addss	xmm0, xmm7
-		addss	xmm4, xmm7
-		mulss	xmm0, xmm1
-		mulss	xmm4, xmm5
-
-		// use full stores since caller might reload with full loads
-		movss	[eax], xmm0
-		movss	[edx], xmm4
-	}
-#elif POSIX
-	
-	Assert( "Needs testing, verify impl!\n" );
-	
-	v4sf  xx = _mm_load_ss( &x );
-	
-	v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
-	v2si mm0, mm1, mm2, mm3, mm4, mm5;
-	sign_bit_sin = xx;
-	/* take the absolute value */
-	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
-	/* extract the sign bit (upper one) */
-	sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
-	
-	/* scale by 4/Pi */
-	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
-	
-	/* store the integer part of y in mm2:mm3 */
-	xmm3 = _mm_movehl_ps(xmm3, y);
-	mm2 = _mm_cvttps_pi32(y);
-	mm3 = _mm_cvttps_pi32(xmm3);
-	
-	/* j=(j+1) & (~1) (see the cephes sources) */
-	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
-	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
-	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
-	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
-	
-	y = _mm_cvtpi32x2_ps(mm2, mm3);
-	
-	mm4 = mm2;
-	mm5 = mm3;
-	
-	/* get the swap sign flag for the sine */
-	mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
-	mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
-	mm0 = _mm_slli_pi32(mm0, 29);
-	mm1 = _mm_slli_pi32(mm1, 29);
-	v4sf swap_sign_bit_sin;
-	COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
-	
-	/* get the polynom selection mask for the sine */
-	
-	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
-	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
-	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
-	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-	v4sf poly_mask;
-	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-	
-	/* The magic pass: "Extended precision modular arithmetic" 
-	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
-	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
-	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
-	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
-	xmm1 = _mm_mul_ps(y, xmm1);
-	xmm2 = _mm_mul_ps(y, xmm2);
-	xmm3 = _mm_mul_ps(y, xmm3);
-	xx = _mm_add_ps(xx, xmm1);
-	xx = _mm_add_ps(xx, xmm2);
-	xx = _mm_add_ps(xx, xmm3);
-	
-	/* get the sign flag for the cosine */
-	mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
-	mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
-	mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
-	mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
-	mm4 = _mm_slli_pi32(mm4, 29);
-	mm5 = _mm_slli_pi32(mm5, 29);
-	v4sf sign_bit_cos;
-	COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
-	_mm_empty(); /* good-bye mmx */
-	
-	sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-	
-	
-	/* Evaluate the first polynom  (0 <= x <= Pi/4) */
-	v4sf z = _mm_mul_ps(xx,xx);
-	y = *(v4sf*)_ps_coscof_p0;
-	
-	y = _mm_mul_ps(y, z);
-	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
-	y = _mm_mul_ps(y, z);
-	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
-	y = _mm_mul_ps(y, z);
-	y = _mm_mul_ps(y, z);
-	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
-	y = _mm_sub_ps(y, tmp);
-	y = _mm_add_ps(y, *(v4sf*)_ps_1);
-	
-	/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-	
-	v4sf y2 = *(v4sf*)_ps_sincof_p0;
-	y2 = _mm_mul_ps(y2, z);
-	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
-	y2 = _mm_mul_ps(y2, z);
-	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
-	y2 = _mm_mul_ps(y2, z);
-	y2 = _mm_mul_ps(y2, xx);
-	y2 = _mm_add_ps(y2, xx);
-	
-	/* select the correct result from the two polynoms */  
-	xmm3 = poly_mask;
-	v4sf ysin2 = _mm_and_ps(xmm3, y2);
-	v4sf ysin1 = _mm_andnot_ps(xmm3, y);
-	y2 = _mm_sub_ps(y2,ysin2);
-	y = _mm_sub_ps(y, ysin1);
-	
-	xmm1 = _mm_add_ps(ysin1,ysin2);
-	xmm2 = _mm_add_ps(y,y2);
-	
-	/* update the sign */
-	_mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
-	_mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
-
-#else
-	#error "Not Implemented"
-#endif
-}
-
-float _SSE_cos( float x )
-{
-#ifdef _WIN32
-	float temp;
-	__asm
-	{
-		movss	xmm0, x
-		movss	xmm1, _ps_am_inv_sign_mask
-		andps	xmm0, xmm1
-		addss	xmm0, _ps_am_pi_o_2
-		mulss	xmm0, _ps_am_2_o_pi
-
-		cvttss2si	ecx, xmm0
-		movss	xmm5, _ps_am_1
-		mov		edx, ecx
-		shl		edx, (31 - 1)
-		cvtsi2ss	xmm1, ecx
-		and		edx, 0x80000000
-		and		ecx, 0x1
-
-		subss	xmm0, xmm1
-		movss	xmm6, _sincos_masks[ecx * 4]
-		minss	xmm0, xmm5
-
-		movss	xmm1, _ps_sincos_p3
-		subss	xmm5, xmm0
-
-		andps	xmm5, xmm6
-		movss	xmm7, _ps_sincos_p2
-		andnps	xmm6, xmm0
-		mov		temp, edx
-		orps	xmm5, xmm6
-		movss	xmm0, xmm5
-
-		mulss	xmm5, xmm5
-		movss	xmm4, _ps_sincos_p1
-		movss	xmm2, xmm5
-		mulss	xmm5, xmm1
-		movss	xmm1, _ps_sincos_p0
-		addss	xmm5, xmm7
-		mulss	xmm5, xmm2
-		movss	xmm3, temp
-		addss	xmm5, xmm4
-		mulss	xmm5, xmm2
-		orps	xmm0, xmm3
-		addss	xmm5, xmm1
-		mulss	xmm0, xmm5
-		
-		movss   x,    xmm0
-
-	}
-#elif POSIX
-
-	Assert( "Needs testing, verify impl!\n" );
-
-	v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
-	v2si mm0, mm1, mm2, mm3;
-	/* take the absolute value */
-	v4sf  xx = _mm_load_ss( &x );
-
-	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
-		
-	/* scale by 4/Pi */
-	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
-	
-	/* store the integer part of y in mm0:mm1 */
-	xmm2 = _mm_movehl_ps(xmm2, y);
-	mm2 = _mm_cvttps_pi32(y);
-	mm3 = _mm_cvttps_pi32(xmm2);
-	
-	/* j=(j+1) & (~1) (see the cephes sources) */
-	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
-	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
-	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
-	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
-	
-	y = _mm_cvtpi32x2_ps(mm2, mm3);
-	
-	
-	mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
-	mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
-	
-	/* get the swap sign flag in mm0:mm1 and the 
-	 polynom selection mask in mm2:mm3 */
-	
-	mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
-	mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
-	mm0 = _mm_slli_pi32(mm0, 29);
-	mm1 = _mm_slli_pi32(mm1, 29);
-	
-	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
-	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
-	
-	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
-	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
-	
-	v4sf sign_bit, poly_mask;
-	COPY_MM_TO_XMM(mm0, mm1, sign_bit);
-	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
-	_mm_empty(); /* good-bye mmx */
-
-	/* The magic pass: "Extended precision modular arithmetic" 
-	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
-	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
-	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
-	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
-	xmm1 = _mm_mul_ps(y, xmm1);
-	xmm2 = _mm_mul_ps(y, xmm2);
-	xmm3 = _mm_mul_ps(y, xmm3);
-	xx = _mm_add_ps(xx, xmm1);
-	xx = _mm_add_ps(xx, xmm2);
-	xx = _mm_add_ps(xx, xmm3);
-	
-	/* Evaluate the first polynom  (0 <= x <= Pi/4) */
-	y = *(v4sf*)_ps_coscof_p0;
-	v4sf z = _mm_mul_ps(xx,xx);
-	
-	y = _mm_mul_ps(y, z);
-	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
-	y = _mm_mul_ps(y, z);
-	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
-	y = _mm_mul_ps(y, z);
-	y = _mm_mul_ps(y, z);
-	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
-	y = _mm_sub_ps(y, tmp);
-	y = _mm_add_ps(y, *(v4sf*)_ps_1);
-	
-	/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-	
-	v4sf y2 = *(v4sf*)_ps_sincof_p0;
-	y2 = _mm_mul_ps(y2, z);
-	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
-	y2 = _mm_mul_ps(y2, z);
-	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
-	y2 = _mm_mul_ps(y2, z);
-	y2 = _mm_mul_ps(y2, xx);
-	y2 = _mm_add_ps(y2, xx);
-	
-	/* select the correct result from the two polynoms */  
-	xmm3 = poly_mask;
-	y2 = _mm_and_ps(xmm3, y2); //, xmm3);
-	y = _mm_andnot_ps(xmm3, y);
-	y = _mm_add_ps(y,y2);
-	/* update the sign */
-
-	_mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
-
-#else
-	#error "Not Implemented"
-#endif
-
-	return x;
-}
-
-//-----------------------------------------------------------------------------
-// SSE2 implementations of optimized routines:
-//-----------------------------------------------------------------------------
-void _SSE2_SinCos(float x, float* s, float* c)  // any x
-{
-#ifdef _WIN32
-	__asm
-	{
-		movss	xmm0, x
-		movaps	xmm7, xmm0
-		movss	xmm1, _ps_am_inv_sign_mask
-		movss	xmm2, _ps_am_sign_mask
-		movss	xmm3, _ps_am_2_o_pi
-		andps	xmm0, xmm1
-		andps	xmm7, xmm2
-		mulss	xmm0, xmm3
-
-		pxor	xmm3, xmm3
-		movd	xmm5, _epi32_1
-		movss	xmm4, _ps_am_1
-
-		cvttps2dq	xmm2, xmm0
-		pand	xmm5, xmm2
-		movd	xmm1, _epi32_2
-		pcmpeqd	xmm5, xmm3
-		movd	xmm3, _epi32_1
-		cvtdq2ps	xmm6, xmm2
-		paddd	xmm3, xmm2
-		pand	xmm2, xmm1
-		pand	xmm3, xmm1
-		subss	xmm0, xmm6
-		pslld	xmm2, (31 - 1)
-		minss	xmm0, xmm4
-
-		mov		eax, s     // mov eax, [esp + 4 + 16]
-		mov		edx, c	   // mov edx, [esp + 4 + 16 + 4]
-
-		subss	xmm4, xmm0
-		pslld	xmm3, (31 - 1)
-
-		movaps	xmm6, xmm4
-		xorps	xmm2, xmm7
-		movaps	xmm7, xmm5
-		andps	xmm6, xmm7
-		andnps	xmm7, xmm0
-		andps	xmm0, xmm5
-		andnps	xmm5, xmm4
-		movss	xmm4, _ps_sincos_p3
-		orps	xmm6, xmm7
-		orps	xmm0, xmm5
-		movss	xmm5, _ps_sincos_p2
-
-		movaps	xmm1, xmm0
-		movaps	xmm7, xmm6
-		mulss	xmm0, xmm0
-		mulss	xmm6, xmm6
-		orps	xmm1, xmm2
-		orps	xmm7, xmm3
-		movaps	xmm2, xmm0
-		movaps	xmm3, xmm6
-		mulss	xmm0, xmm4
-		mulss	xmm6, xmm4
-		movss	xmm4, _ps_sincos_p1
-		addss	xmm0, xmm5
-		addss	xmm6, xmm5
-		movss	xmm5, _ps_sincos_p0
-		mulss	xmm0, xmm2
-		mulss	xmm6, xmm3
-		addss	xmm0, xmm4
-		addss	xmm6, xmm4
-		mulss	xmm0, xmm2
-		mulss	xmm6, xmm3
-		addss	xmm0, xmm5
-		addss	xmm6, xmm5
-		mulss	xmm0, xmm1
-		mulss	xmm6, xmm7
-
-		// use full stores since caller might reload with full loads
-		movss	[eax], xmm0
-		movss	[edx], xmm6
-	}
-#elif POSIX
-	#warning "_SSE2_SinCos NOT implemented!"
-	Assert( 0 );
-#else
-	#error "Not Implemented"
-#endif
-}
-
-float _SSE2_cos(float x)  
-{
-#ifdef _WIN32
-	__asm
-	{
-		movss	xmm0, x
-		movss	xmm1, _ps_am_inv_sign_mask
-		movss	xmm2, _ps_am_pi_o_2
-		movss	xmm3, _ps_am_2_o_pi
-		andps	xmm0, xmm1
-		addss	xmm0, xmm2
-		mulss	xmm0, xmm3
-
-		pxor	xmm3, xmm3
-		movd	xmm5, _epi32_1
-		movss	xmm4, _ps_am_1
-		cvttps2dq	xmm2, xmm0
-		pand	xmm5, xmm2
-		movd	xmm1, _epi32_2
-		pcmpeqd	xmm5, xmm3
-		cvtdq2ps	xmm6, xmm2
-		pand	xmm2, xmm1
-		pslld	xmm2, (31 - 1)
-
-		subss	xmm0, xmm6
-		movss	xmm3, _ps_sincos_p3
-		minss	xmm0, xmm4
-		subss	xmm4, xmm0
-		andps	xmm0, xmm5
-		andnps	xmm5, xmm4
-		orps	xmm0, xmm5
-
-		movaps	xmm1, xmm0
-		movss	xmm4, _ps_sincos_p2
-		mulss	xmm0, xmm0
-		movss	xmm5, _ps_sincos_p1
-		orps	xmm1, xmm2
-		movaps	xmm7, xmm0
-		mulss	xmm0, xmm3
-		movss	xmm6, _ps_sincos_p0
-		addss	xmm0, xmm4
-		mulss	xmm0, xmm7
-		addss	xmm0, xmm5
-		mulss	xmm0, xmm7
-		addss	xmm0, xmm6
-		mulss	xmm0, xmm1
-		movss   x,    xmm0
-	}
-#elif POSIX
-	#warning "_SSE2_cos NOT implemented!"
-	Assert( 0 );
-#else
-	#error "Not Implemented"
-#endif
-
-	return x;
-}
-
-// SSE Version of VectorTransform
-void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
-{
-	Assert( s_bMathlibInitialized );
-	Assert( in1 != out1 );
-
-#ifdef _WIN32
-	__asm
-	{
-		mov eax, in1;
-		mov ecx, in2;
-		mov edx, out1;
-
-		movss xmm0, [eax];
-		mulss xmm0, [ecx];
-		movss xmm1, [eax+4];
-		mulss xmm1, [ecx+4];
-		movss xmm2, [eax+8];
-		mulss xmm2, [ecx+8];
-		addss xmm0, xmm1;
-		addss xmm0, xmm2;
-		addss xmm0, [ecx+12]
- 		movss [edx], xmm0;
-		add ecx, 16;
-
-		movss xmm0, [eax];
-		mulss xmm0, [ecx];
-		movss xmm1, [eax+4];
-		mulss xmm1, [ecx+4];
-		movss xmm2, [eax+8];
-		mulss xmm2, [ecx+8];
-		addss xmm0, xmm1;
-		addss xmm0, xmm2;
-		addss xmm0, [ecx+12]
-		movss [edx+4], xmm0;
-		add ecx, 16;
-
-		movss xmm0, [eax];
-		mulss xmm0, [ecx];
-		movss xmm1, [eax+4];
-		mulss xmm1, [ecx+4];
-		movss xmm2, [eax+8];
-		mulss xmm2, [ecx+8];
-		addss xmm0, xmm1;
-		addss xmm0, xmm2;
-		addss xmm0, [ecx+12]
-		movss [edx+8], xmm0;
-	}
-#elif POSIX
-	#warning "VectorTransformSSE C implementation only"
-		out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
-		out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
-		out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
-#else
-	#error "Not Implemented"
-#endif
-}
-
-void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
-{
-	Assert( s_bMathlibInitialized );
-	Assert( in1 != out1 );
-
-#ifdef _WIN32
-	__asm
-	{
-		mov eax, in1;
-		mov ecx, in2;
-		mov edx, out1;
-
-		movss xmm0, [eax];
-		mulss xmm0, [ecx];
-		movss xmm1, [eax+4];
-		mulss xmm1, [ecx+4];
-		movss xmm2, [eax+8];
-		mulss xmm2, [ecx+8];
-		addss xmm0, xmm1;
-		addss xmm0, xmm2;
- 		movss [edx], xmm0;
-		add ecx, 16;
-
-		movss xmm0, [eax];
-		mulss xmm0, [ecx];
-		movss xmm1, [eax+4];
-		mulss xmm1, [ecx+4];
-		movss xmm2, [eax+8];
-		mulss xmm2, [ecx+8];
-		addss xmm0, xmm1;
-		addss xmm0, xmm2;
-		movss [edx+4], xmm0;
-		add ecx, 16;
-
-		movss xmm0, [eax];
-		mulss xmm0, [ecx];
-		movss xmm1, [eax+4];
-		mulss xmm1, [ecx+4];
-		movss xmm2, [eax+8];
-		mulss xmm2, [ecx+8];
-		addss xmm0, xmm1;
-		addss xmm0, xmm2;
-		movss [edx+8], xmm0;
-	}
-#elif POSIX
-	#warning "VectorRotateSSE C implementation only"
-		out1[0] = DotProduct( in1, in2[0] );
-		out1[1] = DotProduct( in1, in2[1] );
-		out1[2] = DotProduct( in1, in2[2] );
-#else
-	#error "Not Implemented"
-#endif
-}
-
-#ifdef _WIN32
-void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
-{
-	// FIXME: This don't work!! It will overwrite memory in the write to dest
-	Assert(0);
-
-	Assert( s_bMathlibInitialized );
-	_asm {  // Intel SSE only routine
-		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
-		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
-		mov edx, DWORD PTR [esp+0x10]	; *dest
-		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
-#ifdef ALIGNED_VECTOR
-		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
-		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
-		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
-		mulps	xmm3, xmm2				; x3 *= x2
-		addps	xmm3, xmm1				; x3 += x1
-		movaps	[edx], xmm3				; *dest = x3
-#else
-		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
-		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
-		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
-		mulps	xmm3, xmm2				; x3 *= x2
-		addps	xmm3, xmm1				; x3 += x1
-		movups	[edx], xmm3				; *dest = x3
-#endif
-	}
-}
-#endif
-
-#ifdef _WIN32
-#ifdef PFN_VECTORMA
-void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
-{
-	// FIXME: This don't work!! It will overwrite memory in the write to dest
-	Assert(0);
-
-	Assert( s_bMathlibInitialized );
-	_asm 
-	{  
-		// Intel SSE only routine
-		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
-		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
-		mov edx, DWORD PTR [esp+0x10]	; *dest
-		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
-#ifdef ALIGNED_VECTOR
-		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
-		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
-		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
-		mulps	xmm3, xmm2				; x3 *= x2
-		addps	xmm3, xmm1				; x3 += x1
-		movaps	[edx], xmm3				; *dest = x3
-#else
-		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
-		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
-		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
-		mulps	xmm3, xmm2				; x3 *= x2
-		addps	xmm3, xmm1				; x3 += x1
-		movups	[edx], xmm3				; *dest = x3
-#endif
-	}
-}
-float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
-#endif
-#endif
-
-
-// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
-//   Should be validated too!  :)
-//   NJS: (Nov 1 2002) -NOT- faster.  may time a couple cycles faster in a single function like 
-//   this, but when inlined, and instruction scheduled, the C version is faster.  
-//   Verified this via VTune
-/*
-vec_t DotProduct (const vec_t *a, const vec_t *c)
-{
-	vec_t temp;
-
-	__asm
-	{
-		mov eax, a;
-		mov ecx, c;
-		mov edx, DWORD PTR [temp]
-		movss xmm0, [eax];
-		mulss xmm0, [ecx];
-		movss xmm1, [eax+4];
-		mulss xmm1, [ecx+4];
-		movss xmm2, [eax+8];
-		mulss xmm2, [ecx+8];
-		addss xmm0, xmm1;
-		addss xmm0, xmm2;
-		movss [edx], xmm0;
-		fld DWORD PTR [edx];
-		ret
-	}
-}
-*/
-
-#endif // COMPILER_MSVC64 
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: SSE Math primitives.
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "sse.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+#ifndef COMPILER_MSVC64
+// Implement for 64-bit Windows if needed.
+
+static const uint32 _sincos_masks[]	  = { (uint32)0x0,  (uint32)~0x0 };	// [0] = all bits clear, [1] = all bits set
+static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };	// complement of _sincos_masks: [0] = all bits set, [1] = all bits clear
+
+//-----------------------------------------------------------------------------
+// Macros and constants required by some of the SSE assembly:
+//-----------------------------------------------------------------------------
+// Each macro defines a 16-byte-aligned array holding four copies of Val; the use site supplies the trailing ';'.
+#ifdef _WIN32
+	#define _PS_EXTERN_CONST(Name, Val) \
+		const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+	// Val is cast to Type: a mask such as 0x80000000 is an unsigned literal, and brace-initializing a signed Type from it is a narrowing conversion.
+	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+		const __declspec(align(16)) Type _ps_##Name[4] = { (Type)(Val), (Type)(Val), (Type)(Val), (Type)(Val) }
+
+	#define _EPI32_CONST(Name, Val) \
+		static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+
+	#define _PS_CONST(Name, Val) \
+		static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
+#elif POSIX
+	#define _PS_EXTERN_CONST(Name, Val) \
+		const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
+	// Same cast as the _WIN32 variant above, for the same narrowing reason.
+	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
+		const Type _ps_##Name[4]  __attribute__((aligned(16))) = { (Type)(Val), (Type)(Val), (Type)(Val), (Type)(Val) }
+
+	#define _EPI32_CONST(Name, Val) \
+		static const int32 _epi32_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }
+
+	#define _PS_CONST(Name, Val) \
+		static const float _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }
+#endif
+
+_PS_EXTERN_CONST(am_0, 0.0f);	// each line below defines an array _ps_<Name>[4] with the value repeated in all four elements
+_PS_EXTERN_CONST(am_1, 1.0f);
+_PS_EXTERN_CONST(am_m1, -1.0f);
+_PS_EXTERN_CONST(am_0p5, 0.5f);
+_PS_EXTERN_CONST(am_1p5, 1.5f);
+_PS_EXTERN_CONST(am_pi, (float)M_PI);
+_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));	// pi / 2
+_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));	// 2 / pi
+_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));	// pi / 4
+_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));	// 4 / pi
+_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);	// bit 31 only
+_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);	// every bit except bit 31
+_PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);	// bit 23 only
+_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);	// bits 23-30; NOTE(review): in an IEEE-754 single that is the exponent field despite the "mant" name -- confirm intent
+_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);	// complement of am_mant_mask
+
+_EPI32_CONST(1, 1);	// defines _epi32_1[4]
+_EPI32_CONST(2, 2);	// defines _epi32_2[4]
+
+_PS_CONST(sincos_p0, 0.15707963267948963959e1f);	// approximately pi / 2; NOTE(review): p0..p3 are presumably polynomial coefficients for the sin/cos routines -- confirm
+_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
+_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
+_PS_CONST(sincos_p3, -0.468175413106023168e-2f);
+
+#ifdef PFN_VECTORMA
+void  __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );	// forward declaration, compiled only when PFN_VECTORMA is defined
+#endif
+
+//-----------------------------------------------------------------------------
+// SSE implementations of optimized routines:
+//-----------------------------------------------------------------------------
+// Scalar square root computed with the SSE sqrtss instruction.
+// The result starts out as 0 and is only overwritten by the _WIN32 / POSIX branch.
+float _SSE_Sqrt(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	float	flSqrt = 0.f;
+
+#ifdef _WIN32
+	_asm
+	{
+		sqrtss		xmm0, x
+		movss		flSqrt, xmm0
+	}
+#elif POSIX
+	// Load the scalar into lane 0, take the low-lane square root, store it back out.
+	const __m128 vInput = _mm_load_ss( &x );
+	const __m128 vSqrt = _mm_sqrt_ss( vInput );
+	_mm_store_ss( &flSqrt, vSqrt );
+#endif
+
+	return flSqrt;
+}
+
+// Reciprocal square root refined with a single Newton-Raphson iteration:
+//     r = rsqrtss(x);   result = 0.5 * r * (3 - x * r * r)
+// Very low error, and fine to use in place of 1.f / sqrtf(x).
+//
+// Two implementations follow. The first one (compiled out with "#if 0") does the
+// refinement step in C; the active one keeps the whole computation in SSE registers.
+#if 0
+float _SSE_RSqrtAccurate(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	float rroot;
+	_asm
+	{
+		rsqrtss	xmm0, x
+		movss	rroot, xmm0
+	}
+
+	return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
+}
+#else
+
+#ifdef POSIX
+// Constants for the intrinsics path below; only the low lane is used because all
+// arithmetic there is done with the scalar (_ss) intrinsics.
+const __m128  f3  = _mm_set_ss(3.0f);  // 3 as SSE value
+const __m128  f05 = _mm_set_ss(0.5f);  // 0.5 as SSE value
+#endif
+
+// Intel / Kipps SSE RSqrt.  Significantly faster than above.
+// Returns an approximation of 1 / sqrt(a).
+// Note: unlike the other routines in this file, this one does not assert
+// s_bMathlibInitialized.
+float _SSE_RSqrtAccurate(float a)
+{
+
+#ifdef _WIN32
+	float x;
+	float half = 0.5f;
+	float three = 3.f;
+
+	__asm
+	{
+		movss   xmm3, a;			// xmm3 = a
+		movss   xmm1, half;			// xmm1 = 0.5
+		movss   xmm2, three;		// xmm2 = 3
+		rsqrtss xmm0, xmm3;			// xmm0 = r, the hardware estimate of 1/sqrt(a)
+
+		mulss   xmm3, xmm0;			// xmm3 = a * r
+		mulss   xmm1, xmm0;			// xmm1 = 0.5 * r
+		mulss   xmm3, xmm0;			// xmm3 = a * r * r
+		subss   xmm2, xmm3;			// xmm2 = 3 - a * r * r
+		mulss   xmm1, xmm2;			// xmm1 = 0.5 * r * (3 - a * r * r)
+
+		movss   x,    xmm1;
+	}
+
+	return x;
+#elif POSIX	
+	__m128  xx = _mm_load_ss( &a );
+    __m128  xr = _mm_rsqrt_ss( xx );	// r, the hardware estimate of 1/sqrt(a)
+    __m128  xt;
+	
+    xt = _mm_mul_ss( xr, xr );			// r * r
+    xt = _mm_mul_ss( xt, xx );			// a * r * r
+    xt = _mm_sub_ss( f3, xt );			// 3 - a * r * r
+    xt = _mm_mul_ss( xt, f05 );			// 0.5 * (3 - a * r * r)
+    xr = _mm_mul_ss( xr, xt );			// r * 0.5 * (3 - a * r * r)
+	
+    // The parameter is reused as the output slot for the refined value.
+    _mm_store_ss( &a, xr );
+    return a;
+#else
+	#error "Not Implemented"
+#endif
+
+}
+#endif
+
+// Simple SSE rsqrt: returns the raw hardware estimate of 1 / sqrt(x) with no
+// refinement step. Intel documents the relative error of RSQRTSS as at most
+// 1.5 * 2^-12, so this is ok for closed transforms (ie, computing lighting
+// normals) but noticeably less precise than _SSE_RSqrtAccurate.
+float _SSE_RSqrtFast(float x)
+{
+	Assert( s_bMathlibInitialized );
+
+	float rroot;
+#ifdef _WIN32
+	_asm
+	{
+		rsqrtss	xmm0, x
+		movss	rroot, xmm0
+	}
+#elif POSIX
+	// The previous inline asm ("rsqrtss %0, %1") had its AT&T operands reversed:
+	// it read the uninitialised output register and wrote into the input one, so
+	// it only produced the right answer when the compiler happened to allocate
+	// the same register for both. The intrinsic expresses the same instruction
+	// without that hazard and matches what _SSE_Sqrt does.
+	_mm_store_ss( &rroot, _mm_rsqrt_ss( _mm_load_ss( &x ) ) );
+#else
+#error
+#endif
+
+	return rroot;
+}
+
+// Normalizes vec in place and returns its length before normalization.
+//
+// The length comes from sqrtss, but the scale factor is the rcpss estimate of
+// 1/length, so the resulting vector is only approximately unit length.
+// If all three components are zero, vec is left untouched and 0 is returned.
+//
+// Both code paths load 16 bytes starting at &vec[0] although a Vector holds three
+// floats; the fourth lane is ignored, but the memory must be readable.
+float FASTCALL _SSE_VectorNormalize (Vector& vec)
+{
+	Assert( s_bMathlibInitialized );
+
+	// NOTE: This is necessary to prevent a memory overwrite...
+	// since vec only has 3 floats, we can't "movaps" directly into it.
+#ifdef _WIN32
+	__declspec(align(16)) float result[4];
+#elif POSIX
+	 float result[4] __attribute__((aligned(16)));
+#endif
+
+	float *v = &vec[0];
+	float *r = &result[0];
+
+	float	radius = 0.f;
+	// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't 
+	// be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
+	if ( v[0] || v[1] || v[2] )
+	{
+#ifdef _WIN32
+	_asm
+		{
+			mov			eax, v
+			mov			edx, r
+#ifdef ALIGNED_VECTOR
+			movaps		xmm4, [eax]			// r4 = vx, vy, vz, X
+			movaps		xmm1, xmm4			// r1 = r4
+#else
+			movups		xmm4, [eax]			// r4 = vx, vy, vz, X
+			movaps		xmm1, xmm4			// r1 = r4
+#endif
+			mulps		xmm1, xmm4			// r1 = vx * vx, vy * vy, vz * vz, X
+			movhlps		xmm3, xmm1			// r3 = vz * vz, X, X, X
+			movaps		xmm2, xmm1			// r2 = r1
+			shufps		xmm2, xmm2, 1		// r2 = vy * vy, X, X, X
+			addss		xmm1, xmm2			// r1 = (vx * vx) + (vy * vy), X, X, X
+			addss		xmm1, xmm3			// r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+			sqrtss		xmm1, xmm1			// r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
+			movss		radius, xmm1		// radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
+			rcpss		xmm1, xmm1			// r1 = 1/radius, X, X, X
+			shufps		xmm1, xmm1, 0		// r1 = 1/radius, 1/radius, 1/radius, X
+			mulps		xmm4, xmm1			// r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+			movaps		[edx], xmm4			// v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
+		}
+#elif POSIX
+		// Same instruction sequence as the Win32 block above, in AT&T syntax.
+		// The clobber list tells the compiler which XMM registers are overwritten;
+		// "memory" is listed because the 16-byte load reads vec.y / vec.z (and one
+		// float beyond) even though the input operand only names *v.
+		__asm__ __volatile__(
+#ifdef ALIGNED_VECTOR
+            "movaps          %2, %%xmm4 \n\t"
+            "movaps          %%xmm4, %%xmm1 \n\t"
+#else
+            "movups          %2, %%xmm4 \n\t"
+            "movaps          %%xmm4, %%xmm1 \n\t"
+#endif
+            "mulps           %%xmm4, %%xmm1 \n\t"
+            "movhlps         %%xmm1, %%xmm3 \n\t"
+            "movaps          %%xmm1, %%xmm2 \n\t"
+            "shufps          $1, %%xmm2, %%xmm2 \n\t"
+            "addss           %%xmm2, %%xmm1 \n\t"
+            "addss           %%xmm3, %%xmm1 \n\t"
+            "sqrtss          %%xmm1, %%xmm1 \n\t"
+            "movss           %%xmm1, %0 \n\t"
+            "rcpss           %%xmm1, %%xmm1 \n\t"
+            "shufps          $0, %%xmm1, %%xmm1 \n\t"
+            "mulps           %%xmm1, %%xmm4 \n\t"
+            "movaps          %%xmm4, %1 \n\t"
+            : "=m" (radius), "=m" (result)
+            : "m" (*v)
+            : "xmm1", "xmm2", "xmm3", "xmm4", "memory"
+ 		);
+#else
+	#error "Not Implemented"
+#endif
+		// Copy back only the three real components; result[3] is scratch.
+		vec.x = result[0];
+		vec.y = result[1];
+		vec.z = result[2];
+
+	}
+
+	return radius;
+}
+
+// Scales vec in place by the reciprocal length obtained from _SSE_RSqrtAccurate.
+// FLT_EPSILON is added to the squared length before the reciprocal root is taken.
+void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
+{
+	const float flLengthSqr = FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z;
+	const float flInvLength = _SSE_RSqrtAccurate( flLengthSqr );
+
+	vec.x *= flInvLength;
+	vec.y *= flInvLength;
+	vec.z *= flInvLength;
+}
+
+// Returns the rcpss estimate of 1 / max( 1.0, vx*vx + vy*vy + vz*vz ).
+//
+// v points at three floats, but both code paths load 16 bytes from it; the fourth
+// lane never contributes to the sum, yet the memory must be readable.
+float _SSE_InvRSquared(const float* v)
+{
+	// Doubles as the 1.0 lower bound on the way in and as the result on the way out.
+	float	inv_r2 = 1.f;
+#ifdef _WIN32
+	_asm { // Intel SSE only routine
+		mov			eax, v
+		movss		xmm5, inv_r2		// x5 = 1.0, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+		movaps		xmm4, [eax]			// x4 = vx, vy, vz, X
+#else
+		movups		xmm4, [eax]			// x4 = vx, vy, vz, X
+#endif
+		movaps		xmm1, xmm4			// x1 = x4
+		mulps		xmm1, xmm4			// x1 = vx * vx, vy * vy, vz * vz, X
+		movhlps		xmm3, xmm1			// x3 = vz * vz, X, X, X
+		movaps		xmm2, xmm1			// x2 = x1
+		shufps		xmm2, xmm2, 1		// x2 = vy * vy, X, X, X
+		addss		xmm1, xmm2			// x1 = (vx * vx) + (vy * vy), X, X, X
+		addss		xmm1, xmm3			// x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
+		maxss		xmm1, xmm5			// x1 = max( 1.0, x1 )
+		rcpss		xmm0, xmm1			// x0 = 1 / max( 1.0, x1 )
+		movss		inv_r2, xmm0		// inv_r2 = x0
+	}
+#elif POSIX
+		// Same instruction sequence as the Win32 block above, in AT&T syntax.
+		// inv_r2 is a read-write memory operand ("+m"): it supplies the 1.0 bound
+		// and receives the result. The clobber list names every XMM register the
+		// sequence overwrites; "memory" covers the 16-byte read through v, which
+		// touches more than the single float named by the input operand.
+		__asm__ __volatile__(
+		"movss			 %0, %%xmm5 \n\t"
+#ifdef ALIGNED_VECTOR
+		"movaps          %1, %%xmm4 \n\t"
+#else
+		"movups          %1, %%xmm4 \n\t"
+#endif
+        "movaps          %%xmm4, %%xmm1 \n\t"
+        "mulps           %%xmm4, %%xmm1 \n\t"
+		"movhlps         %%xmm1, %%xmm3 \n\t"
+		"movaps          %%xmm1, %%xmm2 \n\t"
+        "shufps          $1, %%xmm2, %%xmm2 \n\t"
+        "addss           %%xmm2, %%xmm1 \n\t"
+        "addss           %%xmm3, %%xmm1 \n\t"
+		"maxss           %%xmm5, %%xmm1 \n\t"
+        "rcpss           %%xmm1, %%xmm0 \n\t"
+		"movss           %%xmm0, %0 \n\t" 
+        : "+m" (inv_r2)
+        : "m" (*v)
+        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory"
+ 		);
+#else
+	#error "Not Implemented"
+#endif
+
+	return inv_r2;
+}
+
+
+#ifdef POSIX
+// Constants and helper types used only by the POSIX (intrinsics) versions of
+// _SSE_SinCos and _SSE_cos below.
+
+// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+
+// 0x80000000 is an unsigned literal that does not fit in int; the explicit cast
+// keeps the same bit pattern while avoiding a narrowing conversion inside the
+// braced initializer (which C++11 and later reject).
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+
+#define _PI32_CONST(Name, Val)  static const ALIGN16 int _pi32_##Name[4]  ALIGN16_POST = { Val, Val, Val, Val }
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+
+// DP1 + DP2 + DP3 add up to Pi/4, split into three pieces for the
+// "extended precision modular arithmetic" step of the range reduction.
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+// Coefficients of the sine and cosine polynomials evaluated after the reduction.
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+// Lets two 64-bit MMX values be reinterpreted as one 128-bit SSE value.
+typedef union xmm_mm_union {
+	__m128 xmm;
+	__m64 mm[2];
+} xmm_mm_union;
+
+// Packs mm0_ (low half) and mm1_ (high half) into xmm_ through the union above.
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
+
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+typedef __m64 v2si;   // vector of 2 int (mmx)
+
+#endif
+
+// Computes sine and cosine of x in one pass, storing them to *s and *c.
+//
+// Win32:  hand-scheduled scalar SSE assembly. |x| is scaled by 2/Pi, truncated to
+//         an integer, and the remaining fraction is fed through the
+//         _ps_sincos_p0.._p3 polynomial twice (once per output). The low bits of
+//         the integer select, via _sincos_inv_masks (defined elsewhere), which
+//         argument belongs to which output and which sign each result gets.
+// POSIX:  intrinsics implementation following the cephes-style scheme described
+//         in the inline comments (scale by 4/Pi, three-step range reduction, two
+//         polynomials, mask select). It mixes MMX and SSE intrinsics and is
+//         flagged by its author as needing verification.
+void _SSE_SinCos(float x, float* s, float* c)
+{
+#ifdef _WIN32
+	// Scratch slots used to move 32-bit values between integer and XMM registers.
+	// t12 additionally holds the caller's esi while esi is used as a temporary.
+	float t4, t8, t12;
+
+	__asm
+	{
+		movss	xmm0, x
+		movss	t12, xmm0
+		movss	xmm1, _ps_am_inv_sign_mask
+		mov		eax, t12
+		mulss	xmm0, _ps_am_2_o_pi
+		andps	xmm0, xmm1
+		and		eax, 0x80000000
+
+		cvttss2si	edx, xmm0
+		mov		ecx, edx
+		mov		t12, esi
+		mov		esi, edx
+		add		edx, 0x1	
+		shl		ecx, (31 - 1)
+		shl		edx, (31 - 1)
+
+		movss	xmm4, _ps_am_1
+		cvtsi2ss	xmm3, esi
+		mov		t8, eax
+		and		esi, 0x1
+
+		subss	xmm0, xmm3
+		movss	xmm3, _sincos_inv_masks[esi * 4]
+		minss	xmm0, xmm4
+
+		subss	xmm4, xmm0
+
+		movss	xmm6, xmm4
+		andps	xmm4, xmm3
+		and		ecx, 0x80000000
+		movss	xmm2, xmm3
+		andnps	xmm3, xmm0
+		and		edx, 0x80000000
+		movss	xmm7, t8
+		andps	xmm0, xmm2
+		mov		t8, ecx
+		mov		t4, edx
+		orps	xmm4, xmm3
+
+		mov		eax, s     //mov eax, [esp + 4 + 16]
+		mov		edx, c //mov edx, [esp + 4 + 16 + 4]
+
+		andnps	xmm2, xmm6
+		orps	xmm0, xmm2
+
+		movss	xmm2, t8
+		movss	xmm1, xmm0
+		movss	xmm5, xmm4
+		xorps	xmm7, xmm2
+		movss	xmm3, _ps_sincos_p3
+		mulss	xmm0, xmm0
+		mulss	xmm4, xmm4
+		movss	xmm2, xmm0
+		movss	xmm6, xmm4
+		orps	xmm1, xmm7
+		movss	xmm7, _ps_sincos_p2
+		mulss	xmm0, xmm3
+		mulss	xmm4, xmm3
+		movss	xmm3, _ps_sincos_p1
+		addss	xmm0, xmm7
+		addss	xmm4, xmm7
+		movss	xmm7, _ps_sincos_p0
+		mulss	xmm0, xmm2
+		mulss	xmm4, xmm6
+		addss	xmm0, xmm3
+		addss	xmm4, xmm3
+		movss	xmm3, t4
+		mulss	xmm0, xmm2
+		mulss	xmm4, xmm6
+		orps	xmm5, xmm3
+		mov		esi, t12
+		addss	xmm0, xmm7
+		addss	xmm4, xmm7
+		mulss	xmm0, xmm1
+		mulss	xmm4, xmm5
+
+		// use full stores since caller might reload with full loads
+		// NOTE(review): the two stores below are scalar (movss) stores, which
+		// does not match the wording above -- confirm which one is intended.
+		movss	[eax], xmm0
+		movss	[edx], xmm4
+	}
+#elif POSIX
+	
+	// NOTE(review): the argument is a string literal, i.e. a non-null pointer, so
+	// (assuming Assert fires on a false expression) this never triggers; it only
+	// serves as a reminder in the source.
+	Assert( "Needs testing, verify impl!\n" );
+	
+	// Only lane 0 carries the input; the results are also taken from lane 0 only.
+	v4sf  xx = _mm_load_ss( &x );
+	
+	v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+	v2si mm0, mm1, mm2, mm3, mm4, mm5;
+	sign_bit_sin = xx;
+	/* take the absolute value */
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+	/* extract the sign bit (upper one) */
+	sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+	
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm2:mm3 */
+	xmm3 = _mm_movehl_ps(xmm3, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm3);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	
+	// Keep a copy of j for the cosine sign computation further down.
+	mm4 = mm2;
+	mm5 = mm3;
+	
+	/* get the swap sign flag for the sine */
+	mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	v4sf swap_sign_bit_sin;
+	COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+	
+	/* get the polynomial selection mask for the sine */
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	v4sf poly_mask;
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* get the sign flag for the cosine */
+	mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+	mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+	mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+	mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+	mm4 = _mm_slli_pi32(mm4, 29);
+	mm5 = _mm_slli_pi32(mm5, 29);
+	v4sf sign_bit_cos;
+	COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+	_mm_empty(); /* good-bye mmx */
+	
+	sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+	
+	
+	/* Evaluate the first polynomial (coscof_* coefficients): */
+	/* y = ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1, with z = x*x */
+	v4sf z = _mm_mul_ps(xx,xx);
+	y = *(v4sf*)_ps_coscof_p0;
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynomial (sincof_* coefficients): */
+	/* y2 = ((p0*z + p1)*z + p2)*z*x + x */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynomials: where the mask is */
+	/* set the sine takes y2 and the cosine takes y, otherwise the reverse */
+	xmm3 = poly_mask;
+	v4sf ysin2 = _mm_and_ps(xmm3, y2);
+	v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+	y2 = _mm_sub_ps(y2,ysin2);
+	y = _mm_sub_ps(y, ysin1);
+	
+	xmm1 = _mm_add_ps(ysin1,ysin2);
+	xmm2 = _mm_add_ps(y,y2);
+	
+	/* update the sign */
+	_mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
+	_mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
+
+#else
+	#error "Not Implemented"
+#endif
+}
+
+// Returns an SSE approximation of cos(x).
+//
+// Win32:  scalar SSE assembly. (|x| + Pi/2) is scaled by 2/Pi and truncated to an
+//         integer; the fraction is fed through the _ps_sincos_p0.._p3 polynomial
+//         and the low bits of the integer pick the argument form (through
+//         _sincos_masks, defined elsewhere) and the sign of the result.
+// POSIX:  intrinsics implementation following the cephes-style scheme described
+//         in the inline comments; flagged by its author as needing verification.
+//
+// In both paths the parameter x is reused as the storage for the return value.
+float _SSE_cos( float x )
+{
+#ifdef _WIN32
+	// Scratch slot used to move the sign bit from an integer register to XMM.
+	float temp;
+	__asm
+	{
+		movss	xmm0, x
+		movss	xmm1, _ps_am_inv_sign_mask
+		andps	xmm0, xmm1
+		addss	xmm0, _ps_am_pi_o_2
+		mulss	xmm0, _ps_am_2_o_pi
+
+		cvttss2si	ecx, xmm0
+		movss	xmm5, _ps_am_1
+		mov		edx, ecx
+		shl		edx, (31 - 1)
+		cvtsi2ss	xmm1, ecx
+		and		edx, 0x80000000
+		and		ecx, 0x1
+
+		subss	xmm0, xmm1
+		movss	xmm6, _sincos_masks[ecx * 4]
+		minss	xmm0, xmm5
+
+		movss	xmm1, _ps_sincos_p3
+		subss	xmm5, xmm0
+
+		andps	xmm5, xmm6
+		movss	xmm7, _ps_sincos_p2
+		andnps	xmm6, xmm0
+		mov		temp, edx
+		orps	xmm5, xmm6
+		movss	xmm0, xmm5
+
+		mulss	xmm5, xmm5
+		movss	xmm4, _ps_sincos_p1
+		movss	xmm2, xmm5
+		mulss	xmm5, xmm1
+		movss	xmm1, _ps_sincos_p0
+		addss	xmm5, xmm7
+		mulss	xmm5, xmm2
+		movss	xmm3, temp
+		addss	xmm5, xmm4
+		mulss	xmm5, xmm2
+		orps	xmm0, xmm3
+		addss	xmm5, xmm1
+		mulss	xmm0, xmm5
+		
+		movss   x,    xmm0
+
+	}
+#elif POSIX
+
+	// NOTE(review): the argument is a string literal, i.e. a non-null pointer, so
+	// (assuming Assert fires on a false expression) this never triggers; it only
+	// serves as a reminder in the source.
+	Assert( "Needs testing, verify impl!\n" );
+
+	v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+	v2si mm0, mm1, mm2, mm3;
+	/* take the absolute value */
+	v4sf  xx = _mm_load_ss( &x );
+
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+		
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm2:mm3 */
+	xmm2 = _mm_movehl_ps(xmm2, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm2);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	
+	
+	mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+	
+	/* get the swap sign flag in mm0:mm1 and the 
+	 polynomial selection mask in mm2:mm3 */
+	
+	mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	
+	v4sf sign_bit, poly_mask;
+	COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	_mm_empty(); /* good-bye mmx */
+
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* Evaluate the first polynomial (coscof_* coefficients): */
+	/* y = ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1, with z = x*x */
+	y = *(v4sf*)_ps_coscof_p0;
+	v4sf z = _mm_mul_ps(xx,xx);
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynomial (sincof_* coefficients): */
+	/* y2 = ((p0*z + p1)*z + p2)*z*x + x */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynomials: y2 where the mask */
+	/* is set, y elsewhere */
+	xmm3 = poly_mask;
+	y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+	y = _mm_andnot_ps(xmm3, y);
+	y = _mm_add_ps(y,y2);
+	/* update the sign */
+
+	_mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
+
+#else
+	#error "Not Implemented"
+#endif
+
+	return x;
+}
+
+//-----------------------------------------------------------------------------
+// SSE2 implementations of optimized routines:
+//-----------------------------------------------------------------------------
+// SSE2 variant of _SSE_SinCos: computes sine and cosine of x and stores them to
+// *s and *c.
+//
+// Win32:  same scheme as the SSE version (scale |x| by 2/Pi, truncate, evaluate
+//         the _ps_sincos_p0.._p3 polynomial for both outputs), but the integer
+//         bookkeeping is done in XMM registers with SSE2 integer instructions
+//         (cvttps2dq, pand, pcmpeqd, paddd, pslld) instead of general registers.
+// POSIX:  NOT implemented. The branch only emits a compile-time warning and
+//         Assert( 0 ); nothing is written to *s or *c.
+void _SSE2_SinCos(float x, float* s, float* c)  // any x
+{
+#ifdef _WIN32
+	__asm
+	{
+		movss	xmm0, x
+		movaps	xmm7, xmm0
+		movss	xmm1, _ps_am_inv_sign_mask
+		movss	xmm2, _ps_am_sign_mask
+		movss	xmm3, _ps_am_2_o_pi
+		andps	xmm0, xmm1
+		andps	xmm7, xmm2
+		mulss	xmm0, xmm3
+
+		pxor	xmm3, xmm3
+		movd	xmm5, _epi32_1
+		movss	xmm4, _ps_am_1
+
+		cvttps2dq	xmm2, xmm0
+		pand	xmm5, xmm2
+		movd	xmm1, _epi32_2
+		pcmpeqd	xmm5, xmm3
+		movd	xmm3, _epi32_1
+		cvtdq2ps	xmm6, xmm2
+		paddd	xmm3, xmm2
+		pand	xmm2, xmm1
+		pand	xmm3, xmm1
+		subss	xmm0, xmm6
+		pslld	xmm2, (31 - 1)
+		minss	xmm0, xmm4
+
+		mov		eax, s     // mov eax, [esp + 4 + 16]
+		mov		edx, c	   // mov edx, [esp + 4 + 16 + 4]
+
+		subss	xmm4, xmm0
+		pslld	xmm3, (31 - 1)
+
+		movaps	xmm6, xmm4
+		xorps	xmm2, xmm7
+		movaps	xmm7, xmm5
+		andps	xmm6, xmm7
+		andnps	xmm7, xmm0
+		andps	xmm0, xmm5
+		andnps	xmm5, xmm4
+		movss	xmm4, _ps_sincos_p3
+		orps	xmm6, xmm7
+		orps	xmm0, xmm5
+		movss	xmm5, _ps_sincos_p2
+
+		movaps	xmm1, xmm0
+		movaps	xmm7, xmm6
+		mulss	xmm0, xmm0
+		mulss	xmm6, xmm6
+		orps	xmm1, xmm2
+		orps	xmm7, xmm3
+		movaps	xmm2, xmm0
+		movaps	xmm3, xmm6
+		mulss	xmm0, xmm4
+		mulss	xmm6, xmm4
+		movss	xmm4, _ps_sincos_p1
+		addss	xmm0, xmm5
+		addss	xmm6, xmm5
+		movss	xmm5, _ps_sincos_p0
+		mulss	xmm0, xmm2
+		mulss	xmm6, xmm3
+		addss	xmm0, xmm4
+		addss	xmm6, xmm4
+		mulss	xmm0, xmm2
+		mulss	xmm6, xmm3
+		addss	xmm0, xmm5
+		addss	xmm6, xmm5
+		mulss	xmm0, xmm1
+		mulss	xmm6, xmm7
+
+		// use full stores since caller might reload with full loads
+		// NOTE(review): the two stores below are scalar (movss) stores, which
+		// does not match the wording above -- confirm which one is intended.
+		movss	[eax], xmm0
+		movss	[edx], xmm6
+	}
+#elif POSIX
+	// No POSIX implementation: *s and *c are left untouched. Callers must not
+	// rely on this routine on POSIX builds.
+	#warning "_SSE2_SinCos NOT implemented!"
+	Assert( 0 );
+#else
+	#error "Not Implemented"
+#endif
+}
+
+// SSE2 variant of _SSE_cos: returns an approximation of cos(x).
+//
+// Win32:  (|x| + Pi/2) is scaled by 2/Pi and truncated with cvttps2dq; the
+//         fraction is fed through the _ps_sincos_p0.._p3 polynomial and the low
+//         bits of the integer pick the argument form and the result's sign. The
+//         result is written back into the parameter x, which is then returned.
+// POSIX:  NOT implemented. The branch only emits a compile-time warning and
+//         Assert( 0 ); x is returned unmodified.
+float _SSE2_cos(float x)  
+{
+#ifdef _WIN32
+	__asm
+	{
+		movss	xmm0, x
+		movss	xmm1, _ps_am_inv_sign_mask
+		movss	xmm2, _ps_am_pi_o_2
+		movss	xmm3, _ps_am_2_o_pi
+		andps	xmm0, xmm1
+		addss	xmm0, xmm2
+		mulss	xmm0, xmm3
+
+		pxor	xmm3, xmm3
+		movd	xmm5, _epi32_1
+		movss	xmm4, _ps_am_1
+		cvttps2dq	xmm2, xmm0
+		pand	xmm5, xmm2
+		movd	xmm1, _epi32_2
+		pcmpeqd	xmm5, xmm3
+		cvtdq2ps	xmm6, xmm2
+		pand	xmm2, xmm1
+		pslld	xmm2, (31 - 1)
+
+		subss	xmm0, xmm6
+		movss	xmm3, _ps_sincos_p3
+		minss	xmm0, xmm4
+		subss	xmm4, xmm0
+		andps	xmm0, xmm5
+		andnps	xmm5, xmm4
+		orps	xmm0, xmm5
+
+		movaps	xmm1, xmm0
+		movss	xmm4, _ps_sincos_p2
+		mulss	xmm0, xmm0
+		movss	xmm5, _ps_sincos_p1
+		orps	xmm1, xmm2
+		movaps	xmm7, xmm0
+		mulss	xmm0, xmm3
+		movss	xmm6, _ps_sincos_p0
+		addss	xmm0, xmm4
+		mulss	xmm0, xmm7
+		addss	xmm0, xmm5
+		mulss	xmm0, xmm7
+		addss	xmm0, xmm6
+		mulss	xmm0, xmm1
+		movss   x,    xmm0
+	}
+#elif POSIX
+	// No POSIX implementation: the input value falls through to the return below.
+	#warning "_SSE2_cos NOT implemented!"
+	Assert( 0 );
+#else
+	#error "Not Implemented"
+#endif
+
+	return x;
+}
+
+// SSE Version of VectorTransform
+// Transforms the 3-component vector in1 by the 3x4 matrix in2:
+//     out1[i] = in1[0]*in2[i][0] + in1[1]*in2[i][1] + in1[2]*in2[i][2] + in2[i][3]
+// for i = 0..2. in1 and out1 must not alias (asserted), because out1 is written
+// row by row while in1 is still being read.
+//
+// Win32 uses scalar SSE assembly; POSIX falls back to plain C via DotProduct.
+void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out1 );
+
+#ifdef _WIN32
+	__asm
+	{
+		mov eax, in1;
+		mov ecx, in2;
+		mov edx, out1;
+
+		// Row 0: dot( in1, row ) + row[3]
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		addss xmm0, [ecx+12]
+ 		movss [edx], xmm0;
+		add ecx, 16;			// advance to the next matrix row (4 floats)
+
+		// Row 1
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		addss xmm0, [ecx+12]
+		movss [edx+4], xmm0;
+		add ecx, 16;
+
+		// Row 2
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		addss xmm0, [ecx+12]
+		movss [edx+8], xmm0;
+	}
+#elif POSIX
+	// No SSE path on POSIX yet; the warning is a build-time reminder.
+	#warning "VectorTransformSSE C implementation only"
+		out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
+		out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
+		out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
+#else
+	#error "Not Implemented"
+#endif
+}
+
+// Rotates the 3-component vector in1 by the upper-left 3x3 part of in2:
+//     out1[i] = in1[0]*in2[i][0] + in1[1]*in2[i][1] + in1[2]*in2[i][2]
+// for i = 0..2. Unlike VectorTransformSSE, the fourth column of in2 is not added.
+// in1 and out1 must not alias (asserted).
+//
+// Win32 uses scalar SSE assembly; POSIX falls back to plain C via DotProduct.
+void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
+{
+	Assert( s_bMathlibInitialized );
+	Assert( in1 != out1 );
+
+#ifdef _WIN32
+	__asm
+	{
+		mov eax, in1;
+		mov ecx, in2;
+		mov edx, out1;
+
+		// Row 0: dot( in1, row )
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+ 		movss [edx], xmm0;
+		add ecx, 16;			// advance to the next matrix row (4 floats)
+
+		// Row 1
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		movss [edx+4], xmm0;
+		add ecx, 16;
+
+		// Row 2
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		movss [edx+8], xmm0;
+	}
+#elif POSIX
+	// No SSE path on POSIX yet; the warning is a build-time reminder.
+	#warning "VectorRotateSSE C implementation only"
+		out1[0] = DotProduct( in1, in2[0] );
+		out1[1] = DotProduct( in1, in2[1] );
+		out1[2] = DotProduct( in1, in2[2] );
+#else
+	#error "Not Implemented"
+#endif
+}
+
+#ifdef _WIN32
+// Intended to compute dest = start + scale * direction with packed SSE math.
+// Known broken and guarded by Assert(0): the final store writes four floats to
+// dest, one more than a three-float vector holds (see the FIXME below).
+//
+// Declared naked, so the arguments are fetched by hand from the stack
+// ([esp+4] start, [esp+8] scale, [esp+0xc] direction, [esp+0x10] dest).
+// NOTE(review): there is no "ret" in the asm block although the function is
+// naked -- confirm before ever enabling this routine.
+// NOTE(review): pshufd is an SSE2 instruction, despite the "SSE only" remark.
+void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
+{
+	// FIXME: This doesn't work!! It will overwrite memory in the write to dest
+	Assert(0);
+
+	Assert( s_bMathlibInitialized );
+	_asm {  // Intel SSE only routine
+		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
+		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
+		mov edx, DWORD PTR [esp+0x10]	; *dest
+		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movaps	[edx], xmm3				; *dest = x3
+#else
+		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movups	[edx], xmm3				; *dest = x3
+#endif
+	}
+}
+#endif
+
+#ifdef _WIN32
+#ifdef PFN_VECTORMA
+// Vector-reference overload of _SSE_VectorMA, compiled only when PFN_VECTORMA is
+// defined. Same instruction sequence and the same known defect as the float*
+// version: the final store writes four floats to dest (see the FIXME below), and
+// the routine is guarded by Assert(0).
+//
+// Declared naked, so the arguments are fetched by hand from the stack
+// ([esp+4] start, [esp+8] scale, [esp+0xc] direction, [esp+0x10] dest).
+// NOTE(review): there is no "ret" in the asm block although the function is
+// naked -- confirm before ever enabling this routine.
+void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
+{
+	// FIXME: This doesn't work!! It will overwrite memory in the write to dest
+	Assert(0);
+
+	Assert( s_bMathlibInitialized );
+	_asm 
+	{  
+		// Intel SSE only routine
+		mov	eax, DWORD PTR [esp+0x04]	; *start, s0..s2
+		mov ecx, DWORD PTR [esp+0x0c]	; *direction, d0..d2
+		mov edx, DWORD PTR [esp+0x10]	; *dest
+		movss	xmm2, [esp+0x08]		; x2 = scale, 0, 0, 0
+#ifdef ALIGNED_VECTOR
+		movaps	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movaps	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movaps	[edx], xmm3				; *dest = x3
+#else
+		movups	xmm3, [ecx]				; x3 = dir0,dir1,dir2,X
+		pshufd	xmm2, xmm2, 0			; x2 = scale, scale, scale, scale
+		movups	xmm1, [eax]				; x1 = start1, start2, start3, X
+		mulps	xmm3, xmm2				; x3 *= x2
+		addps	xmm3, xmm1				; x3 += x1
+		movups	[edx], xmm3				; *dest = x3
+#endif
+	}
+}
+// NOTE(review): the pointer type (float(Vector&)) does not match either
+// _SSE_VectorMA signature, and _VectorMA is not defined in the visible part of
+// this file -- presumably PFN_VECTORMA is never defined; confirm.
+float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
+#endif
+#endif
+
+
+// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
+//   Should be validated too!  :)
+//   NJS: (Nov 1 2002) -NOT- faster.  may time a couple cycles faster in a single function like 
+//   this, but when inlined, and instruction scheduled, the C version is faster.  
+//   Verified this via VTune
+/*
+vec_t DotProduct (const vec_t *a, const vec_t *c)
+{
+	vec_t temp;
+
+	__asm
+	{
+		mov eax, a;
+		mov ecx, c;
+		mov edx, DWORD PTR [temp]
+		movss xmm0, [eax];
+		mulss xmm0, [ecx];
+		movss xmm1, [eax+4];
+		mulss xmm1, [ecx+4];
+		movss xmm2, [eax+8];
+		mulss xmm2, [ecx+8];
+		addss xmm0, xmm1;
+		addss xmm0, xmm2;
+		movss [edx], xmm0;
+		fld DWORD PTR [edx];
+		ret
+	}
+}
+*/
+
+#endif // COMPILER_MSVC64 
diff --git a/mp/src/mathlib/sse.h b/mp/src/mathlib/sse.h
index f0da3b06..72de1d3b 100644
--- a/mp/src/mathlib/sse.h
+++ b/mp/src/mathlib/sse.h
@@ -1,23 +1,23 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-//=====================================================================================//
-
-#ifndef _SSE_H
-#define _SSE_H
-
-float _SSE_Sqrt(float x);
-float _SSE_RSqrtAccurate(float a);
-float _SSE_RSqrtFast(float x);
-float FASTCALL _SSE_VectorNormalize(Vector& vec);
-void FASTCALL _SSE_VectorNormalizeFast(Vector& vec);
-float _SSE_InvRSquared(const float* v);
-void _SSE_SinCos(float x, float* s, float* c);
-float _SSE_cos( float x);
-void _SSE2_SinCos(float x, float* s, float* c);
-float _SSE2_cos(float x); 
-void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1);
-void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 );
-
-#endif // _SSE_H
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#ifndef _SSE_H
+#define _SSE_H
+
+float _SSE_Sqrt(float x);
+float _SSE_RSqrtAccurate(float a);
+float _SSE_RSqrtFast(float x);
+float FASTCALL _SSE_VectorNormalize(Vector& vec);
+void FASTCALL _SSE_VectorNormalizeFast(Vector& vec);
+float _SSE_InvRSquared(const float* v);
+void _SSE_SinCos(float x, float* s, float* c);
+float _SSE_cos( float x);
+void _SSE2_SinCos(float x, float* s, float* c);
+float _SSE2_cos(float x); 
+void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1);
+void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 );
+
+#endif // _SSE_H
diff --git a/mp/src/mathlib/sseconst.cpp b/mp/src/mathlib/sseconst.cpp
index d9ba06b2..2f923193 100644
--- a/mp/src/mathlib/sseconst.cpp
+++ b/mp/src/mathlib/sseconst.cpp
@@ -1,1164 +1,1164 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-//===========================================================================//
-
-#include "mathlib/ssemath.h"
-#include "mathlib/ssequaternion.h"
-
-const fltx4 Four_PointFives={0.5,0.5,0.5,0.5};
-#ifndef _X360
-const fltx4 Four_Zeros={0.0,0.0,0.0,0.0};
-const fltx4 Four_Ones={1.0,1.0,1.0,1.0};
-#endif
-const fltx4 Four_Twos={2.0,2.0,2.0,2.0};
-const fltx4 Four_Threes={3.0,3.0,3.0,3.0};
-const fltx4 Four_Fours={4.0,4.0,4.0,4.0};
-const fltx4 Four_Origin={0,0,0,1};
-const fltx4 Four_NegativeOnes={-1,-1,-1,-1};
-
-const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) };
-const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) };
-const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) };
-const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) };
-
-const fltx4 Four_Point225s={ .225, .225, .225, .225 };
-const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON};
-
-const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
-const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX};
-const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. };
-
-const fltx4 g_QuatMultRowSign[4] =
-{
-	{  1.0f,  1.0f, -1.0f, 1.0f },
-	{ -1.0f,  1.0f,  1.0f, 1.0f },
-	{  1.0f, -1.0f,  1.0f, 1.0f },
-	{ -1.0f, -1.0f, -1.0f, 1.0f }
-};
-
-const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
-const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
-const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe };
-const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
-const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0
-const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4
-
-const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST =
-{
-	{ 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF }
-};
-
-const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST =
-{
-	{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
-	{ 0xffffffff, 0x00000000, 0x00000000, 0x00000000 },
-	{ 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
-	{ 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 },
-};
-
-
-	// FUNCTIONS
-	// NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE
-// Generally speaking, you want to make sure SIMD math functions
-// are inlined, because that gives the compiler much more latitude
-// in instruction scheduling. It's not that the overhead of calling
-// the function is particularly great; rather, many of the SIMD 
-// opcodes have long latencies, and if you have a sequence of 
-// several dependent ones inside a function call, the latencies 
-// stack up to create a big penalty. If the function is inlined,
-// the compiler can interleave its operations with ones from the
-// caller to better hide those latencies. Finally, on the 360,
-// putting parameters or return values on the stack, and then 
-// reading them back within the next forty cycles, is a very 
-// severe penalty. So, as much as possible, you want to leave your
-// data on the registers.
-
-// That said, there are certain occasions where it is appropriate
-// to call into functions -- particularly for very large blocks
-// of code that will spill most of the registers anyway. Unless your
-// function is more than one screen long, yours is probably not one
-// of those occasions.
-
-
-
-/// You can use this to rotate a long array of FourVectors all by the same
-/// matrix. The first parameter is the head of the array. The second is the
-/// number of vectors to rotate. The third is the matrix.
-void FourVectors::RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
-{
-	Assert(numVectors > 0);
-	if ( numVectors == 0 )
-		return;
-
-	// Splat out each of the entries in the matrix to a fltx4. Do this
-	// in the order that we will need them, to hide latency. I'm
-	// avoiding making an array of them, so that they'll remain in 
-	// registers.
-	fltx4 matSplat00, matSplat01, matSplat02,
-		matSplat10, matSplat11, matSplat12,
-		matSplat20, matSplat21, matSplat22;
-
-	{
-		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
-		// often unaligned. The w components will be the tranpose row of
-		// the matrix, but we don't really care about that.
-		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
-		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
-		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
-
-		matSplat00 = SplatXSIMD(matCol0);
-		matSplat01 = SplatYSIMD(matCol0);
-		matSplat02 = SplatZSIMD(matCol0);
-
-		matSplat10 = SplatXSIMD(matCol1);
-		matSplat11 = SplatYSIMD(matCol1);
-		matSplat12 = SplatZSIMD(matCol1);
-
-		matSplat20 = SplatXSIMD(matCol2);
-		matSplat21 = SplatYSIMD(matCol2);
-		matSplat22 = SplatZSIMD(matCol2);
-	}
-
-#ifdef _X360
-	// Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
-	// and simplify prefetching. Named variables are deliberately used instead of arrays to
-	// ensure that the variables live on the registers instead of the stack (stack load/store
-	// is a serious penalty on 360).  Nb: for prefetching to be most efficient here, the
-	// loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is 
-	// 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line. 
-	// That way you can fetch the next 3 cache lines while you work on these three. 
-	// If you do go this route, be sure to dissassemble and make sure it doesn't spill 
-	// registers to stack as you do this; the cost of that will be excessive. Unroll the loop
-	// a little and just live with the fact that you'll be doing a couple of redundant dbcts
-	// (they don't cost you anything). Be aware that all three cores share L2 and it can only
-	// have eight cache lines fetching at a time.
-	fltx4 outX0, outY0, outZ0; // bank one of outputs
-	fltx4 outX1, outY1, outZ1; // bank two of outputs
-
-
-	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
-	// rather than madds. (Empirically determined by timing.)
-	const FourVectors * stop = pVectors + numVectors;
-	FourVectors * RESTRICT pVectNext;
-	// prime the pump.
-	if (numVectors & 0x01)
-	{
-		// odd number of vectors to process
-		// prime the 1 group of registers
-		pVectNext = pVectors++;
-		outX1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ), MulSIMD( pVectNext->z, matSplat02 ) );
-		outY1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ), MulSIMD( pVectNext->z, matSplat12 ) );
-		outZ1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ), MulSIMD( pVectNext->z, matSplat22 ) );
-	}
-	else
-	{
-		// even number of total vectors to process; 
-		// prime the zero group and jump into the middle of the loop
-		outX0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ), MulSIMD( pVectors->z, matSplat02 ) );
-		outY0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ), MulSIMD( pVectors->z, matSplat12 ) );
-		outZ0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ), MulSIMD( pVectors->z, matSplat22 ) );
-		goto EVEN_CASE;
-	}
-
-	// perform an even number of iterations through this loop.
-	while (pVectors < stop)
-	{
-		outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
-		outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
-		outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
-
-		pVectNext->x = outX1;
-		pVectNext->y = outY1;
-		pVectNext->z = outZ1;
-
-EVEN_CASE:
-		pVectNext = pVectors+1;
-
-		outX1 = MaddSIMD( pVectNext->z, matSplat02, AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ) );
-		outY1 = MaddSIMD( pVectNext->z, matSplat12, AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ) );
-		outZ1 = MaddSIMD( pVectNext->z, matSplat22, AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ) );
-
-		pVectors->x = outX0;
-		pVectors->y = outY0;
-		pVectors->z = outZ0;
-
-		pVectors += 2;
-	}
-
-	// flush the last round of output
-	pVectNext->x = outX1;
-	pVectNext->y = outY1;
-	pVectNext->z = outZ1;
-#else
-	// PC does not benefit from the unroll/scheduling above
-	fltx4 outX0, outY0, outZ0; // bank one of outputs
-
-
-	// Because of instruction latencies and scheduling, it's actually faster to use adds and muls
-	// rather than madds. (Empirically determined by timing.)
-	const FourVectors * stop = pVectors + numVectors;
-
-	// perform an even number of iterations through this loop.
-	while (pVectors < stop)
-	{
-		outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
-		outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
-		outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
-
-		pVectors->x = outX0;
-		pVectors->y = outY0;
-		pVectors->z = outZ0;
-		pVectors++;
-	}
-#endif
-}
-
-#ifdef _X360
-// Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
-void FourVectors_TransformManyGroupsOfEightBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
-{
-	Assert(numVectors > 0);
-	if ( numVectors == 0 )
-		return;
-
-	AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
-			   (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
-
-	// Splat out each of the entries in the matrix to a fltx4. Do this
-	// in the order that we will need them, to hide latency. I'm
-	// avoiding making an array of them, so that they'll remain in 
-	// registers.
-	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
-		  matSplat10, matSplat11, matSplat12, matSplat13,
-		  matSplat20, matSplat21, matSplat22, matSplat23;
-
-	{
-		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
-		// often unaligned. The w components will be the tranpose row of
-		// the matrix.
-		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
-		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
-		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
-
-		matSplat00 = SplatXSIMD(matCol0);
-		matSplat01 = SplatYSIMD(matCol0);
-		matSplat02 = SplatZSIMD(matCol0);
-		matSplat03 = SplatWSIMD(matCol0);
-
-		matSplat10 = SplatXSIMD(matCol1);
-		matSplat11 = SplatYSIMD(matCol1);
-		matSplat12 = SplatZSIMD(matCol1);
-		matSplat13 = SplatWSIMD(matCol1);
-
-		matSplat20 = SplatXSIMD(matCol2);
-		matSplat21 = SplatYSIMD(matCol2);
-		matSplat22 = SplatZSIMD(matCol2);
-		matSplat23 = SplatWSIMD(matCol2);
-	}
-
-	// this macro defines how to compute a specific row from an input and certain splat columns
-#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
-#define WRITE(term, reg, toptr) toptr->term = reg
-
-	// define result groups (we're going to have an eight-way unroll)
-	
-	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
-	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
-	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
-	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
-	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
-	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
-	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
-	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
-	
-
-// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
-#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
-#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
-
-	/*
-	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
-	res0X = MulSIMD( (invec)->y, matSplat01 );
-	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
-	// stage 2 -- 3 clocks for xyz
-	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
-	// stage 3 -- 3 clocks for xyz
-	res0X = AddSIMD(res0X, res0Temp);
-	*/
-#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
-#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
-#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar
-
-#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-										COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-										COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-										COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-										COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-										COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-										COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-	FourVectors * RESTRICT inData = pVectors;
-	FourVectors * RESTRICT outData = pOut;
-	const FourVectors * const RESTRICT STOP = pVectors + numVectors;
-
-	// Use techniques of loop scheduling to eliminate data hazards; process
-	// eight groups simultaneously so that we never have any operations stalling
-	// waiting for data.
-	// Note: this loop, while pretty fast, could be faster still -- you'll notice
-	// that it does all of its loads, then all computation, then writes everything
-	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
-	// stage 3, and write, then throughput could be higher (probably by about 50%). 
-	while (inData < STOP)
-	{
-		// start prefetching the three cache lines
-		// we'll hit two iterations from now
-		__dcbt( sizeof(FourVectors) * 16,       inData );
-		__dcbt( sizeof(FourVectors) * 16 + 128, inData );
-		__dcbt( sizeof(FourVectors) * 16 + 256, inData );
-
-		// synchro
-		COMPUTE_STAGE1_GROUP(res0, inData + 0);
-		COMPUTE_STAGE1_GROUP(res1, inData + 1);
-		COMPUTE_STAGE1_GROUP(res2, inData + 2);
-		COMPUTE_STAGE1_GROUP(res3, inData + 3);
-
-			COMPUTE_STAGE2_GROUP(res0, inData + 0);
-		COMPUTE_STAGE1_GROUP(res4, inData + 4);
-			COMPUTE_STAGE2_GROUP(res1, inData + 1);
-		COMPUTE_STAGE1_GROUP(res5, inData + 5);
-			COMPUTE_STAGE2_GROUP(res2, inData + 2);
-		COMPUTE_STAGE1_GROUP(res6, inData + 6);
-			COMPUTE_STAGE2_GROUP(res3, inData + 3);
-		COMPUTE_STAGE1_GROUP(res7, inData + 7);
-
-				COMPUTE_STAGE3_GROUP(res0, inData + 0);
-			COMPUTE_STAGE2_GROUP(res4, inData + 4);
-				COMPUTE_STAGE3_GROUP(res1, inData + 1);
-			COMPUTE_STAGE2_GROUP(res5, inData + 5);
-				COMPUTE_STAGE3_GROUP(res2, inData + 2);
-			COMPUTE_STAGE2_GROUP(res6, inData + 6);
-				COMPUTE_STAGE3_GROUP(res3, inData + 3);
-			COMPUTE_STAGE2_GROUP(res7, inData + 7);
-
-				COMPUTE_STAGE3_GROUP(res4, inData + 4);
-					WRITE_GROUP( outData + 0, res0 );
-				COMPUTE_STAGE3_GROUP(res5, inData + 5);
-					WRITE_GROUP( outData + 1, res1 );
-				COMPUTE_STAGE3_GROUP(res6, inData + 6);
-					WRITE_GROUP( outData + 2, res2 );
-				COMPUTE_STAGE3_GROUP(res7, inData + 7);
-					WRITE_GROUP( outData + 3, res3 );
-		
-
-					WRITE_GROUP( outData + 4, res4 );
-					WRITE_GROUP( outData + 5, res5 );
-					WRITE_GROUP( outData + 6, res6 );
-					WRITE_GROUP( outData + 7, res7 );
-		
-		inData += 8;
-		outData += 8;
-	}
-
-
-#undef COMPUTE
-#undef WRITE
-#undef COMPUTE_STAGE1_ROW
-#undef COMPUTE_STAGE2_ROW
-#undef COMPUTE_STAGE3_ROW
-#undef COMPUTE_STAGE1_GROUP
-#undef COMPUTE_STAGE2_GROUP
-#undef COMPUTE_STAGE3_GROUP
-#undef COMPUTE_GROUP
-#undef WRITE_GROUP
-}
-
-#ifdef _X360
-// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. This is the version
-// to call when starting on a 128-byte-aligned address.
-void FourVectors_TransformManyGroupsOfEightBy_128byteAligned(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
-{
-	/* If this has changed, you will need to change all the prefetches, *
-	 * and groups of eight are no longer the ideal unit for iterating   *
-	 * on many vectors.													*/
-	COMPILE_TIME_ASSERT( sizeof(FourVectors) == 48 ) ;
-
-	Assert(numVectors > 0);
-	if ( numVectors == 0 )
-		return;
-
-	AssertMsg((numVectors & 0x07) == 0, "FourVectors_TransformManyGroupsOfEight called with numVectors % 8 != 0!");
-
-	// Assert alignment
-	AssertMsg( ( ( reinterpret_cast<uint32>( pVectors )  & 127 ) == 0) && 
-			   ( ( reinterpret_cast<uint32>(pOut) & 127 ) == 0),
-			   "FourVectors_Transform..aligned called with non-128-byte-aligned buffers." );
-
-	// Assert non overlap
-	AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
-		(pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
-
-		// Here's the plan. 8 four-vecs = 3 cache lines exactly. It takes about 400 cycles to process a group
-		// of eight, and cache latency is 600 cycles, so we try to prefetch two iterations ahead (eg fetch
-		// iteration 3 while working on iteration 1). In the case of the output, we can simply zero-flush 
-		// the cache lines since we are sure to write into them. Because we're reading and fetching two ahead,
-		// we want to stop two away from the last iteration.
-
-		// No matter what, we will need to prefetch the first two groups of eight of input (that's the 
-		// first six cache lines)
-	__dcbt( 0, pVectors );
-	__dcbt( 128, pVectors );
-	__dcbt( 256, pVectors );
-	__dcbt( 384, pVectors );
-	__dcbt( 512, pVectors );
-	__dcbt( 640, pVectors );
-
-
-	// Splat out each of the entries in the matrix to a fltx4. Do this
-	// in the order that we will need them, to hide latency. I'm
-	// avoiding making an array of them, so that they'll remain in 
-	// registers.
-	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
-		matSplat10, matSplat11, matSplat12, matSplat13,
-		matSplat20, matSplat21, matSplat22, matSplat23;
-
-	{
-		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
-		// often unaligned. The w components will be the tranpose row of
-		// the matrix.
-		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
-		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
-		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
-
-		matSplat00 = SplatXSIMD(matCol0);
-		matSplat01 = SplatYSIMD(matCol0);
-		matSplat02 = SplatZSIMD(matCol0);
-		matSplat03 = SplatWSIMD(matCol0);
-
-		matSplat10 = SplatXSIMD(matCol1);
-		matSplat11 = SplatYSIMD(matCol1);
-		matSplat12 = SplatZSIMD(matCol1);
-		matSplat13 = SplatWSIMD(matCol1);
-
-		matSplat20 = SplatXSIMD(matCol2);
-		matSplat21 = SplatYSIMD(matCol2);
-		matSplat22 = SplatZSIMD(matCol2);
-		matSplat23 = SplatWSIMD(matCol2);
-	}
-
-	// this macro defines how to compute a specific row from an input and certain splat columns
-#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
-#define WRITE(term, reg, toptr) toptr->term = reg
-
-	// define result groups (we're going to have an eight-way unroll)
-
-	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
-	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
-	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
-	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
-	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
-	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
-	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
-	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
-
-
-	// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
-#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
-#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
-
-	/*
-	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
-	res0X = MulSIMD( (invec)->y, matSplat01 );
-	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
-	// stage 2 -- 3 clocks for xyz
-	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
-	// stage 3 -- 3 clocks for xyz
-	res0X = AddSIMD(res0X, res0Temp);
-	*/
-#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
-#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
-#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar
-
-#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-	COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-	COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-	COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-	COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-	COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-	COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-
-		// Okay. First do all but the last two turns of the crank; we don't want to overshoot with the flush-to-zero.
-	FourVectors * RESTRICT inData = pVectors;
-	FourVectors * RESTRICT outData = pOut;
-	const FourVectors * RESTRICT STOP;
-	if (numVectors > 16)
-	{
-		STOP = pVectors + numVectors - 16;
-		// flush the first two blocks we'll write into 
-		__dcbz128( 0, outData );
-		__dcbz128( 128, outData );
-		__dcbz128( 256, outData );
-
-		while (inData < STOP)
-		{
-			// start prefetching the three cache lines
-			// we'll hit two iterations from now
-			__dcbt( sizeof(FourVectors) * 16,       inData );
-			__dcbt( sizeof(FourVectors) * 16 + 128, inData );
-			__dcbt( sizeof(FourVectors) * 16 + 256, inData );
-
-			// synchro
-			COMPUTE_STAGE1_GROUP(res0, inData + 0);
-			COMPUTE_STAGE1_GROUP(res1, inData + 1);
-			COMPUTE_STAGE1_GROUP(res2, inData + 2);
-			COMPUTE_STAGE1_GROUP(res3, inData + 3);
-
-			// pre-zero the three cache lines we'll overwrite
-			// in the next iteration
-			__dcbz128( 384, outData );
-			__dcbz128( 512, outData );
-			__dcbz128( 640, outData );
-
-
-			COMPUTE_STAGE2_GROUP(res0, inData + 0);
-			COMPUTE_STAGE1_GROUP(res4, inData + 4);
-			COMPUTE_STAGE2_GROUP(res1, inData + 1);
-			COMPUTE_STAGE1_GROUP(res5, inData + 5);
-			COMPUTE_STAGE2_GROUP(res2, inData + 2);
-			COMPUTE_STAGE1_GROUP(res6, inData + 6);
-			COMPUTE_STAGE2_GROUP(res3, inData + 3);
-			COMPUTE_STAGE1_GROUP(res7, inData + 7);
-
-			COMPUTE_STAGE3_GROUP(res0, inData + 0);
-			COMPUTE_STAGE2_GROUP(res4, inData + 4);
-			COMPUTE_STAGE3_GROUP(res1, inData + 1);
-			COMPUTE_STAGE2_GROUP(res5, inData + 5);
-			COMPUTE_STAGE3_GROUP(res2, inData + 2);
-			COMPUTE_STAGE2_GROUP(res6, inData + 6);
-			COMPUTE_STAGE3_GROUP(res3, inData + 3);
-			COMPUTE_STAGE2_GROUP(res7, inData + 7);
-
-			COMPUTE_STAGE3_GROUP(res4, inData + 4);
-			WRITE_GROUP( outData + 0, res0 );
-			COMPUTE_STAGE3_GROUP(res5, inData + 5);
-			WRITE_GROUP( outData + 1, res1 );
-			COMPUTE_STAGE3_GROUP(res6, inData + 6);
-			WRITE_GROUP( outData + 2, res2 );
-			COMPUTE_STAGE3_GROUP(res7, inData + 7);
-			WRITE_GROUP( outData + 3, res3 );
-
-
-			WRITE_GROUP( outData + 4, res4 );
-			WRITE_GROUP( outData + 5, res5 );
-			WRITE_GROUP( outData + 6, res6 );
-			WRITE_GROUP( outData + 7, res7 );
-
-			inData += 8;
-			outData += 8;
-		}
-	}
-	else if (numVectors == 16)
-	{
-		// zero out the exactly six cache lines we will write into
-		__dcbz128( 0, outData );
-		__dcbz128( 128, outData );
-		__dcbz128( 256, outData );
-		__dcbz128( 384, outData );
-		__dcbz128( 512, outData );
-		__dcbz128( 640, outData );
-	}
-	else if (numVectors == 8)
-	{
-		// zero out the exactly three cache lines we will write into
-		__dcbz128( 0, outData );
-		__dcbz128( 128, outData );
-		__dcbz128( 256, outData );
-	}
-	else
-	{
-		AssertMsg(false, "Can't happen!");
-	}
-	
-	// deal with the ultimate two groups (or, if we were fed
-	// less than 16 groups, the whole shebang)
-	STOP = pVectors + numVectors - 16;
-	
-
-	// Use techniques of loop scheduling to eliminate data hazards; process
-	// eight groups simultaneously so that we never have any operations stalling
-	// waiting for data.
-	// Note: this loop, while pretty fast, could be faster still -- you'll notice
-	// that it does all of its loads, then all computation, then writes everything
-	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
-	// stage 3, and write, then throughput could be higher (probably by about 50%). 
-	while (inData < STOP)
-	{
-		// synchro
-		COMPUTE_STAGE1_GROUP(res0, inData + 0);
-		COMPUTE_STAGE1_GROUP(res1, inData + 1);
-		COMPUTE_STAGE1_GROUP(res2, inData + 2);
-		COMPUTE_STAGE1_GROUP(res3, inData + 3);
-
-		COMPUTE_STAGE2_GROUP(res0, inData + 0);
-		COMPUTE_STAGE1_GROUP(res4, inData + 4);
-		COMPUTE_STAGE2_GROUP(res1, inData + 1);
-		COMPUTE_STAGE1_GROUP(res5, inData + 5);
-		COMPUTE_STAGE2_GROUP(res2, inData + 2);
-		COMPUTE_STAGE1_GROUP(res6, inData + 6);
-		COMPUTE_STAGE2_GROUP(res3, inData + 3);
-		COMPUTE_STAGE1_GROUP(res7, inData + 7);
-
-		COMPUTE_STAGE3_GROUP(res0, inData + 0);
-		COMPUTE_STAGE2_GROUP(res4, inData + 4);
-		COMPUTE_STAGE3_GROUP(res1, inData + 1);
-		COMPUTE_STAGE2_GROUP(res5, inData + 5);
-		COMPUTE_STAGE3_GROUP(res2, inData + 2);
-		COMPUTE_STAGE2_GROUP(res6, inData + 6);
-		COMPUTE_STAGE3_GROUP(res3, inData + 3);
-		COMPUTE_STAGE2_GROUP(res7, inData + 7);
-
-		COMPUTE_STAGE3_GROUP(res4, inData + 4);
-		WRITE_GROUP( outData + 0, res0 );
-		COMPUTE_STAGE3_GROUP(res5, inData + 5);
-		WRITE_GROUP( outData + 1, res1 );
-		COMPUTE_STAGE3_GROUP(res6, inData + 6);
-		WRITE_GROUP( outData + 2, res2 );
-		COMPUTE_STAGE3_GROUP(res7, inData + 7);
-		WRITE_GROUP( outData + 3, res3 );
-
-
-		WRITE_GROUP( outData + 4, res4 );
-		WRITE_GROUP( outData + 5, res5 );
-		WRITE_GROUP( outData + 6, res6 );
-		WRITE_GROUP( outData + 7, res7 );
-
-		inData += 8;
-		outData += 8;
-	}
-
-
-#undef COMPUTE
-#undef WRITE
-#undef COMPUTE_STAGE1_ROW
-#undef COMPUTE_STAGE2_ROW
-#undef COMPUTE_STAGE3_ROW
-#undef COMPUTE_STAGE1_GROUP
-#undef COMPUTE_STAGE2_GROUP
-#undef COMPUTE_STAGE3_GROUP
-#undef COMPUTE_GROUP
-#undef WRITE_GROUP
-}
-#endif
-
-// Transform a long array of FourVectors by a given matrix. 
-void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
-{
-	Assert(numVectors > 0);
-
-	AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
-		(pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
-
-#ifdef _X360
-	// The really fast version of this function likes to operate on blocks of eight. So, chug through
-	// groups of eight, then deal with any leftovers.
-	int numVectorsRoundedToNearestEight = numVectors & (~0x07);
-	if (numVectors >= 8)
-	{
-		// aligned?
-		if ((reinterpret_cast<unsigned int>(pVectors) & 127) == 0 && (reinterpret_cast<unsigned int>(pOut) & 127) == 0)
-		{
-			FourVectors_TransformManyGroupsOfEightBy_128byteAligned(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
-		}
-		else
-		{
-			FourVectors_TransformManyGroupsOfEightBy(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
-		}
-		numVectors -= numVectorsRoundedToNearestEight;
-		pVectors += numVectorsRoundedToNearestEight;
-		pOut += numVectorsRoundedToNearestEight;
-	}
-#endif
-
-	// any left over?
-	if (numVectors > 0)
-	{
-
-		// Splat out each of the entries in the matrix to a fltx4. Do this
-		// in the order that we will need them, to hide latency. I'm
-		// avoiding making an array of them, so that they'll remain in 
-		// registers.
-		fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
-			matSplat10, matSplat11, matSplat12, matSplat13,
-			matSplat20, matSplat21, matSplat22, matSplat23;
-
-		{
-			// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
-			// often unaligned. The w components will be the transpose row of
-			// the matrix.
-			fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
-			fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
-			fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
-
-			matSplat00 = SplatXSIMD(matCol0);
-			matSplat01 = SplatYSIMD(matCol0);
-			matSplat02 = SplatZSIMD(matCol0);
-			matSplat03 = SplatWSIMD(matCol0);
-
-			matSplat10 = SplatXSIMD(matCol1);
-			matSplat11 = SplatYSIMD(matCol1);
-			matSplat12 = SplatZSIMD(matCol1);
-			matSplat13 = SplatWSIMD(matCol1);
-
-			matSplat20 = SplatXSIMD(matCol2);
-			matSplat21 = SplatYSIMD(matCol2);
-			matSplat22 = SplatZSIMD(matCol2);
-			matSplat23 = SplatWSIMD(matCol2);
-		}
-
-		do 
-		{
-			// Trust in the compiler to schedule these operations correctly:
-			pOut->x = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
-			pOut->y = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat00, matSplat13)));
-			pOut->z = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat00, matSplat23)));
-
-			++pOut;
-			++pVectors;
-			--numVectors;
-		} while(numVectors > 0);
-	}
-}
-
-#ifdef _X360
-// Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
-static void FourVectors_TransformManyGroupsOfEightBy_InPlace(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
-{
-	Assert(numVectors > 0);
-	if ( numVectors == 0 )
-		return;
-
-	// Prefetch line 1 and 2
-	__dcbt(0,pVectors);
-	__dcbt(128,pVectors);
-
-	// Splat out each of the entries in the matrix to a fltx4. Do this
-	// in the order that we will need them, to hide latency. I'm
-	// avoiding making an array of them, so that they'll remain in 
-	// registers.
-	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
-		matSplat10, matSplat11, matSplat12, matSplat13,
-		matSplat20, matSplat21, matSplat22, matSplat23;
-
-	{
-		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
-		// often unaligned. The w components will be the tranpose row of
-		// the matrix.
-		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
-		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
-		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
-
-		matSplat00 = SplatXSIMD(matCol0);
-		matSplat01 = SplatYSIMD(matCol0);
-		matSplat02 = SplatZSIMD(matCol0);
-		matSplat03 = SplatWSIMD(matCol0);
-
-		matSplat10 = SplatXSIMD(matCol1);
-		matSplat11 = SplatYSIMD(matCol1);
-		matSplat12 = SplatZSIMD(matCol1);
-		matSplat13 = SplatWSIMD(matCol1);
-
-		matSplat20 = SplatXSIMD(matCol2);
-		matSplat21 = SplatYSIMD(matCol2);
-		matSplat22 = SplatZSIMD(matCol2);
-		matSplat23 = SplatWSIMD(matCol2);
-	}
-
-	// this macro defines how to compute a specific row from an input and certain splat columns
-#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
-#define WRITE(term, reg, toptr) toptr->term = reg
-
-	// define result groups (we're going to have an eight-way unroll)
-
-	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
-	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
-	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
-	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
-	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
-	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
-	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
-	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
-
-
-	// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
-#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
-#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
-
-	/*
-	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
-	res0X = MulSIMD( (invec)->y, matSplat01 );
-	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
-	// stage 2 -- 3 clocks for xyz
-	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
-	// stage 3 -- 3 clocks for xyz
-	res0X = AddSIMD(res0X, res0Temp);
-	*/
-#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
-#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
-#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar
-
-#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-	COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-	COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-	COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-	COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
-	COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
-	COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
-
-	const FourVectors * const RESTRICT STOP = pVectors + numVectors;
-
-	// Use techniques of loop scheduling to eliminate data hazards; process
-	// eight groups simultaneously so that we never have any operations stalling
-	// waiting for data.
-	// Note: this loop, while pretty fast, could be faster still -- you'll notice
-	// that it does all of its loads, then all computation, then writes everything
-	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
-	// stage 3, and write, then throughput could be higher (probably by about 50%). 
-	while (pVectors < STOP)
-	{
-		// start prefetching the three cache lines
-		// we'll hit two iterations from now
-		__dcbt( sizeof(FourVectors) * 16,       pVectors );
-		__dcbt( sizeof(FourVectors) * 16 + 128, pVectors );
-		__dcbt( sizeof(FourVectors) * 16 + 256, pVectors );
-
-		// synchro
-		COMPUTE_STAGE1_GROUP(res0, pVectors + 0);
-		COMPUTE_STAGE1_GROUP(res1, pVectors + 1);
-		COMPUTE_STAGE1_GROUP(res2, pVectors + 2);
-		COMPUTE_STAGE1_GROUP(res3, pVectors + 3);
-
-		COMPUTE_STAGE2_GROUP(res0, pVectors + 0);
-		COMPUTE_STAGE1_GROUP(res4, pVectors + 4);
-		COMPUTE_STAGE2_GROUP(res1, pVectors + 1);
-		COMPUTE_STAGE1_GROUP(res5, pVectors + 5);
-		COMPUTE_STAGE2_GROUP(res2, pVectors + 2);
-		COMPUTE_STAGE1_GROUP(res6, pVectors + 6);
-		COMPUTE_STAGE2_GROUP(res3, pVectors + 3);
-		COMPUTE_STAGE1_GROUP(res7, pVectors + 7);
-
-		COMPUTE_STAGE3_GROUP(res0, pVectors + 0);
-		COMPUTE_STAGE2_GROUP(res4, pVectors + 4);
-		COMPUTE_STAGE3_GROUP(res1, pVectors + 1);
-		COMPUTE_STAGE2_GROUP(res5, pVectors + 5);
-		COMPUTE_STAGE3_GROUP(res2, pVectors + 2);
-		COMPUTE_STAGE2_GROUP(res6, pVectors + 6);
-		COMPUTE_STAGE3_GROUP(res3, pVectors + 3);
-		COMPUTE_STAGE2_GROUP(res7, pVectors + 7);
-
-		COMPUTE_STAGE3_GROUP(res4, pVectors + 4);
-		WRITE_GROUP( pVectors + 0, res0 );
-		COMPUTE_STAGE3_GROUP(res5, pVectors + 5);
-		WRITE_GROUP( pVectors + 1, res1 );
-		COMPUTE_STAGE3_GROUP(res6, pVectors + 6);
-		WRITE_GROUP( pVectors + 2, res2 );
-		COMPUTE_STAGE3_GROUP(res7, pVectors + 7);
-		WRITE_GROUP( pVectors + 3, res3 );
-
-		WRITE_GROUP( pVectors + 4, res4 );
-		WRITE_GROUP( pVectors + 5, res5 );
-		WRITE_GROUP( pVectors + 6, res6 );
-		WRITE_GROUP( pVectors + 7, res7 );
-
-		pVectors += 8;
-	}
-
-
-#undef COMPUTE
-#undef WRITE
-#undef COMPUTE_STAGE1_ROW
-#undef COMPUTE_STAGE2_ROW
-#undef COMPUTE_STAGE3_ROW
-#undef COMPUTE_STAGE1_GROUP
-#undef COMPUTE_STAGE2_GROUP
-#undef COMPUTE_STAGE3_GROUP
-#undef COMPUTE_GROUP
-#undef WRITE_GROUP
-}
-#endif
-
-// In-place version of above. It's necessary to have this, rather than just allowing pOut and pVectors
-// to equal each other, because of the semantics of RESTRICT: pVectors and pOut must not be allowed 
-// to alias. (Simply un-restricting the pointers results in very poor scheduling.)
-void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
-{
-	Assert(numVectors > 0);
-
-#ifdef _X360
-	// The really fast version of this function likes to operate on blocks of eight. So, chug through
-	// groups of eight, then deal with any leftovers.
-	int numVectorsRoundedToNearestEight = numVectors & (~0x07);
-	if (numVectors >= 8)
-	{
-		FourVectors_TransformManyGroupsOfEightBy_InPlace(pVectors, numVectorsRoundedToNearestEight, rotationMatrix);
-		numVectors -= numVectorsRoundedToNearestEight;
-		pVectors += numVectorsRoundedToNearestEight;
-	}
-#endif
-
-	// any left over?
-	if (numVectors > 0)
-	{
-
-		// Splat out each of the entries in the matrix to a fltx4. Do this
-		// in the order that we will need them, to hide latency. I'm
-		// avoiding making an array of them, so that they'll remain in 
-		// registers.
-		fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
-			matSplat10, matSplat11, matSplat12, matSplat13,
-			matSplat20, matSplat21, matSplat22, matSplat23;
-
-		{
-			// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
-			// often unaligned. The w components will be the transpose row of
-			// the matrix.
-			fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
-			fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
-			fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
-
-			matSplat00 = SplatXSIMD(matCol0);
-			matSplat01 = SplatYSIMD(matCol0);
-			matSplat02 = SplatZSIMD(matCol0);
-			matSplat03 = SplatWSIMD(matCol0);
-
-			matSplat10 = SplatXSIMD(matCol1);
-			matSplat11 = SplatYSIMD(matCol1);
-			matSplat12 = SplatZSIMD(matCol1);
-			matSplat13 = SplatWSIMD(matCol1);
-
-			matSplat20 = SplatXSIMD(matCol2);
-			matSplat21 = SplatYSIMD(matCol2);
-			matSplat22 = SplatZSIMD(matCol2);
-			matSplat23 = SplatWSIMD(matCol2);
-		}
-
-		do 
-		{
-			fltx4 resultX, resultY, resultZ;
-			// Trust in the compiler to schedule these operations correctly:
-			resultX = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
-			resultY = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat00, matSplat13)));
-			resultZ = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat00, matSplat23)));
-
-			pVectors->x = resultX;
-			pVectors->y = resultY;
-			pVectors->z = resultZ;
-
-			++pVectors;
-			--numVectors;
-		} while(numVectors > 0);
-	}
-}
-
-
-#endif
-
-// Transform many (horizontal) points in-place by a 3x4 matrix,
-// here already loaded onto three fltx4 registers but not transposed. 
-// The points must be stored as 16-byte aligned. They are points
-// and not vectors because we assume the w-component to be 1. 
-#ifdef _X360
-void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow0, FLTX4 mRow1, FLTX4 mRow2)
-{
-	/**************************************************
-	 *  Here is an elaborate and carefully scheduled  *
-	 *  algorithm nicked from xboxmath.inl and hacked *
-	 *  up for 3x4 matrices.                          *
-	 **************************************************/
-
-	COMPILE_TIME_ASSERT(sizeof(VectorAligned) == sizeof(XMFLOAT4)); // VectorAligned's need to be 16 bytes
-
-	XMVECTOR R0[8], R1[8], R2[8];
-	XMVECTOR vIn[8];
-
-	//    C_ASSERT(UnrollCount == 8);
-	//    C_ASSERT(sizeof(XMFLOAT4) == 16);
-	Assert(pVectors);
-	Assert(((UINT_PTR)pVectors & 3) == 0); // assert alignment 
-
-	UINT GroupIndex;
-
-	VectorAligned * RESTRICT vCurrent = pVectors;
-	// sentinel pointers
-	VectorAligned * vStreamEnd, *vStreamGroupBase, *vStreamGroupEnd;
-
-	{
-		// cook up the pointers from integer math. Necessary because otherwise we LHS all over
-		// the place. (Odd that this doesn't happen to the xbox math.) 
-
-		UINT_PTR InputVector = (UINT_PTR)pVectors;
-		UINT_PTR InputStreamEnd = InputVector + numVectors * sizeof(XMFLOAT4);
-		// compute start and end points on 128-byte alignment
-		UINT_PTR InputStreamCGroupBase = XMMin(InputVector + (XM_CACHE_LINE_SIZE - 1), InputStreamEnd) & ~(XM_CACHE_LINE_SIZE - 1);
-		UINT_PTR InputStreamCGroupEnd = InputStreamCGroupBase + ((InputStreamEnd - InputStreamCGroupBase) & ~(4 * XM_CACHE_LINE_SIZE - 1));
-
-		vStreamEnd = (VectorAligned *)InputStreamEnd;
-		vStreamGroupBase = (VectorAligned *)InputStreamCGroupBase;
-		vStreamGroupEnd = (VectorAligned *)InputStreamCGroupEnd;
-	}
-
-
-	__dcbt(0,                      vStreamGroupBase);
-	__dcbt(XM_CACHE_LINE_SIZE,     vStreamGroupBase);
-	__dcbt(XM_CACHE_LINE_SIZE * 2, vStreamGroupBase);
-	__dcbt(XM_CACHE_LINE_SIZE * 3, vStreamGroupBase);
-
-	while (vCurrent < vStreamGroupBase)
-	{
-		fltx4 vec = __lvx(vCurrent->Base(), 0);
-
-		R0[0] = __vmsum4fp(vec, mRow0);
-		R1[0] = __vmsum4fp(vec, mRow1);
-		R2[0] = __vmsum4fp(vec, mRow2);
-
-		__stvewx(R0[0], vCurrent->Base(), 0);
-		__stvewx(R1[0], vCurrent->Base(), 4);
-		__stvewx(R2[0], vCurrent->Base(), 8);
-
-		vCurrent++; 
-	}
-
-	while (vCurrent < vStreamGroupEnd)
-	{
-		__dcbt(XM_CACHE_LINE_SIZE * 4, vCurrent);
-		__dcbt(XM_CACHE_LINE_SIZE * 5, vCurrent);
-		__dcbt(XM_CACHE_LINE_SIZE * 6, vCurrent);
-		__dcbt(XM_CACHE_LINE_SIZE * 7, vCurrent);
-
-		for (GroupIndex = 0; GroupIndex < 4; GroupIndex++)
-		{
-			// all kinds of LHS on this pointer. Why?
-			VectorAligned* OutputVector = vCurrent;
-
-			vIn[0] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-			vIn[1] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-			vIn[2] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-			vIn[3] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-			vIn[4] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-			vIn[5] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-			vIn[6] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-			vIn[7] = __lvx(vCurrent->Base(), 0);
-			vCurrent++;
-
-			R0[0] = __vmsum4fp(vIn[0], mRow0);
-			R1[0] = __vmsum4fp(vIn[0], mRow1);
-			R2[0] = __vmsum4fp(vIn[0], mRow2);
-
-			R0[1] = __vmsum4fp(vIn[1], mRow0);
-			R1[1] = __vmsum4fp(vIn[1], mRow1);
-			R2[1] = __vmsum4fp(vIn[1], mRow2);
-
-			R0[2] = __vmsum4fp(vIn[2], mRow0);
-			R1[2] = __vmsum4fp(vIn[2], mRow1);
-			R2[2] = __vmsum4fp(vIn[2], mRow2);
-
-			R0[3] = __vmsum4fp(vIn[3], mRow0);
-			R1[3] = __vmsum4fp(vIn[3], mRow1);
-			R2[3] = __vmsum4fp(vIn[3], mRow2);
-
-			R0[4] = __vmsum4fp(vIn[4], mRow0);
-			R1[4] = __vmsum4fp(vIn[4], mRow1);
-			R2[4] = __vmsum4fp(vIn[4], mRow2);
-
-			R0[5] = __vmsum4fp(vIn[5], mRow0);
-			R1[5] = __vmsum4fp(vIn[5], mRow1);
-			R2[5] = __vmsum4fp(vIn[5], mRow2);
-
-			R0[6] = __vmsum4fp(vIn[6], mRow0);
-			R1[6] = __vmsum4fp(vIn[6], mRow1);
-			R2[6] = __vmsum4fp(vIn[6], mRow2);
-
-			R0[7] = __vmsum4fp(vIn[7], mRow0);
-			R1[7] = __vmsum4fp(vIn[7], mRow1);
-			R2[7] = __vmsum4fp(vIn[7], mRow2);
-
-			__stvewx(R0[0], OutputVector, 0);
-			__stvewx(R1[0], OutputVector, 4);
-			__stvewx(R2[0], OutputVector, 8);
-			OutputVector++;
-
-			__stvewx(R0[1], OutputVector, 0);
-			__stvewx(R1[1], OutputVector, 4);
-			__stvewx(R2[1], OutputVector, 8);
-			OutputVector++;
-
-			__stvewx(R0[2], OutputVector, 0);
-			__stvewx(R1[2], OutputVector, 4);
-			__stvewx(R2[2], OutputVector, 8);
-			OutputVector++;
-
-			__stvewx(R0[3], OutputVector, 0);
-			__stvewx(R1[3], OutputVector, 4);
-			__stvewx(R2[3], OutputVector, 8);
-			OutputVector++;
-
-			__stvewx(R0[4], OutputVector, 0);
-			__stvewx(R1[4], OutputVector, 4);
-			__stvewx(R2[4], OutputVector, 8);
-			OutputVector++;
-
-			__stvewx(R0[5], OutputVector, 0);
-			__stvewx(R1[5], OutputVector, 4);
-			__stvewx(R2[5], OutputVector, 8);
-			OutputVector++;
-
-			__stvewx(R0[6], OutputVector, 0);
-			__stvewx(R1[6], OutputVector, 4);
-			__stvewx(R2[6], OutputVector, 8);
-			OutputVector++;
-
-			__stvewx(R0[7], OutputVector, 0);
-			__stvewx(R1[7], OutputVector, 4);
-			__stvewx(R2[7], OutputVector, 8);
-			OutputVector++;
-		}
-	}
-
-	while (vCurrent < vStreamEnd)
-	{
-		vIn[0] = __lvx(vCurrent->Base(), 0);
-
-		R0[0] = __vmsum4fp(vIn[0], mRow0);
-		R1[0] = __vmsum4fp(vIn[0], mRow1);
-		R2[0] = __vmsum4fp(vIn[0], mRow2);
-
-		__stvewx(R0[0], vCurrent->Base(), 0);
-		__stvewx(R1[0], vCurrent->Base(), 4);
-		__stvewx(R2[0], vCurrent->Base(), 8);
-
-		vCurrent++;
-	}
-	
-
-}
-#endif
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//===========================================================================//
+
+#include "mathlib/ssemath.h"
+#include "mathlib/ssequaternion.h"
+
+const fltx4 Four_PointFives={0.5,0.5,0.5,0.5};	// shared SIMD constants: one value replicated in all four lanes unless noted
+#ifndef _X360
+const fltx4 Four_Zeros={0.0,0.0,0.0,0.0};	// this and Four_Ones are compiled out when _X360 is defined
+const fltx4 Four_Ones={1.0,1.0,1.0,1.0};
+#endif
+const fltx4 Four_Twos={2.0,2.0,2.0,2.0};
+const fltx4 Four_Threes={3.0,3.0,3.0,3.0};
+const fltx4 Four_Fours={4.0,4.0,4.0,4.0};
+const fltx4 Four_Origin={0,0,0,1};	// not a splat: x,y,z are 0 and w is 1
+const fltx4 Four_NegativeOnes={-1,-1,-1,-1};
+
+const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) };	// 2^21 as a float, in every lane
+const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) };	// 2^22
+const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) };	// 2^23
+const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) };	// 2^24
+
+const fltx4 Four_Point225s={ .225, .225, .225, .225 };
+const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON};
+
+const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX};
+const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. };	// not a splat: lane i holds the value i
+
+const fltx4 g_QuatMultRowSign[4] =	// four per-lane +/-1 sign patterns; the w lane is +1 in every row
+{	// NOTE(review): presumably consumed by the quaternion multiply in ssequaternion.h -- confirm
+	{  1.0f,  1.0f, -1.0f, 1.0f },
+	{ -1.0f,  1.0f,  1.0f, 1.0f },
+	{  1.0f, -1.0f,  1.0f, 1.0f },
+	{ -1.0f, -1.0f, -1.0f, 1.0f }
+};
+// Integer bit masks, declared 16-byte aligned via ALIGN16 / ALIGN16_POST; each element is one 32-bit lane.
+const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};	// every bit except the top bit of each lane
+const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };	// only the top bit of each lane
+const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe };	// every bit except bit 0 of each lane
+const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };	// all bits set in the first three lanes, none in the fourth
+const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0
+const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4: the low 16 bits of each lane
+
+const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST =	// row i has all bits set in lane i and none elsewhere
+{
+	{ 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF }
+};
+
+const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST =	// row 0 is all-ones; row k (1..3) has only its first k lanes set
+{	// NOTE(review): row 0 presumably means "nothing to skip" for a full group of four -- confirm against the users of this table
+	{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
+	{ 0xffffffff, 0x00000000, 0x00000000, 0x00000000 },
+	{ 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
+	{ 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 },
+};
+
+
+	// FUNCTIONS
+	// NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE
+// Generally speaking, you want to make sure SIMD math functions
+// are inlined, because that gives the compiler much more latitude
+// in instruction scheduling. It's not that the overhead of calling
+// the function is particularly great; rather, many of the SIMD 
+// opcodes have long latencies, and if you have a sequence of 
+// several dependent ones inside a function call, the latencies 
+// stack up to create a big penalty. If the function is inlined,
+// the compiler can interleave its operations with ones from the
+// caller to better hide those latencies. Finally, on the 360,
+// putting parameters or return values on the stack, and then 
+// reading them back within the next forty cycles, is a very 
+// severe penalty. So, as much as possible, you want to leave your
+// data on the registers.
+
+// That said, there are certain occasions where it is appropriate
+// to call into functions -- particularly for very large blocks
+// of code that will spill most of the registers anyway. Unless your
+// function is more than one screen long, yours is probably not one
+// of those occasions.
+
+
+
+/// Rotates an array of FourVectors in place by rotationMatrix; only the x/y/z entries of
+/// each matrix row are used (the fourth entry of each row is ignored). pVectors is the head
+/// of the array and numVectors its length; a zero count trips the Assert and returns early.
+void FourVectors::RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
+{
+	Assert(numVectors > 0);
+	if ( numVectors == 0 )
+		return;
+
+	// Splat out each of the entries in the matrix to a fltx4. Do this
+	// in the order that we will need them, to hide latency. I'm
+	// avoiding making an array of them, so that they'll remain in
+	// registers.
+	fltx4 matSplat00, matSplat01, matSplat02,
+		matSplat10, matSplat11, matSplat12,
+		matSplat20, matSplat21, matSplat22;
+
+	{
+		// Load the matrix into local vectors. Sadly, matrix3x4_ts are
+		// often unaligned, hence the unaligned loads. The w component of each
+		// load is never splatted below: a pure rotation has no use for it.
+		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+		matSplat00 = SplatXSIMD(matCol0);
+		matSplat01 = SplatYSIMD(matCol0);
+		matSplat02 = SplatZSIMD(matCol0);
+
+		matSplat10 = SplatXSIMD(matCol1);
+		matSplat11 = SplatYSIMD(matCol1);
+		matSplat12 = SplatZSIMD(matCol1);
+
+		matSplat20 = SplatXSIMD(matCol2);
+		matSplat21 = SplatYSIMD(matCol2);
+		matSplat22 = SplatZSIMD(matCol2);
+	}
+
+#ifdef _X360
+	// Same computation as the plain loop in the #else branch below, but unrolled two-wide:
+	// each bank's stores are deferred until after the other bank's math, to hide data-hazard
+	// latencies. Named variables are deliberately used instead of arrays to ensure that the
+	// values live in registers instead of on the stack (stack load/store is a serious penalty
+	// on 360). Nb: this function issues no prefetches today. For prefetching to be most
+	// efficient, the loop should be unrolled to 8 FourVectors per iteration; each FourVectors
+	// is 48 bytes long, and 48 * 8 = 384 is its least common multiple with the 128-byte cache
+	// line. That way you can fetch the next 3 cache lines while you work on these three.
+	// If you do go this route, be sure to disassemble and make sure it doesn't spill registers
+	// to stack; the cost of that will be excessive. Unroll the loop a little and just live
+	// with a couple of redundant dcbts (they don't cost you anything). Be aware that all three
+	// cores share L2 and it can only have eight cache lines fetching at a time.
+	fltx4 outX0, outY0, outZ0; // bank one of outputs
+	fltx4 outX1, outY1, outZ1; // bank two of outputs
+
+
+	// NOTE(review): the original comment says adds+muls timed faster than madds here, yet only
+	// the priming code below is madd-free; the steady-state loop uses MaddSIMD for the z term.
+	const FourVectors * stop = pVectors + numVectors;
+	FourVectors * RESTRICT pVectNext;
+	// prime the pump: compute the first result up front so the loop can start mid-pipeline.
+	if (numVectors & 0x01)
+	{
+		// odd number of vectors to process:
+		// compute the first vector into bank 1, advance, and enter the loop from the top
+		pVectNext = pVectors++;
+		outX1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ), MulSIMD( pVectNext->z, matSplat02 ) );
+		outY1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ), MulSIMD( pVectNext->z, matSplat12 ) );
+		outZ1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ), MulSIMD( pVectNext->z, matSplat22 ) );
+	}
+	else
+	{
+		// even number of total vectors to process;
+		// compute the first vector into bank 0 and jump into the middle of the loop
+		outX0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ), MulSIMD( pVectors->z, matSplat02 ) );
+		outY0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ), MulSIMD( pVectors->z, matSplat12 ) );
+		outZ0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ), MulSIMD( pVectors->z, matSplat22 ) );
+		goto EVEN_CASE;
+	}
+
+	// each pass handles two vectors: bank 0 for pVectors, then bank 1 for pVectors+1.
+	while (pVectors < stop)
+	{
+		outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
+		outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
+		outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
+
+		pVectNext->x = outX1;
+		pVectNext->y = outY1;
+		pVectNext->z = outZ1;
+
+EVEN_CASE:
+		pVectNext = pVectors+1;
+
+		outX1 = MaddSIMD( pVectNext->z, matSplat02, AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ) );
+		outY1 = MaddSIMD( pVectNext->z, matSplat12, AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ) );
+		outZ1 = MaddSIMD( pVectNext->z, matSplat22, AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ) );
+
+		pVectors->x = outX0;
+		pVectors->y = outY0;
+		pVectors->z = outZ0;
+
+		pVectors += 2;
+	}
+
+	// store the final bank-1 result, which the loop (or the priming code) left pending
+	pVectNext->x = outX1;
+	pVectNext->y = outY1;
+	pVectNext->z = outZ1;
+#else
+	// PC does not benefit from the unroll/scheduling above
+	fltx4 outX0, outY0, outZ0; // the only bank of outputs on this path
+
+
+	// NOTE(review): the original comment says adds+muls timed faster than madds, but this loop
+	// folds the z term in with MaddSIMD; confirm which form is actually intended.
+	const FourVectors * stop = pVectors + numVectors;
+
+	// one iteration per FourVectors; each result is written back over its input.
+	while (pVectors < stop)
+	{
+		outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
+		outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
+		outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
+
+		pVectors->x = outX0;
+		pVectors->y = outY0;
+		pVectors->z = outZ0;
+		pVectors++;
+	}
+#endif
+}
+
+#ifdef _X360
+// Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
+void FourVectors_TransformManyGroupsOfEightBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
+{
+	Assert(numVectors > 0);
+	if ( numVectors == 0 )
+		return;
+
+	AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
+			   (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
+
+	// Splat out each of the entries in the matrix to a fltx4. Do this
+	// in the order that we will need them, to hide latency. I'm
+	// avoiding making an array of them, so that they'll remain in 
+	// registers.
+	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
+		  matSplat10, matSplat11, matSplat12, matSplat13,
+		  matSplat20, matSplat21, matSplat22, matSplat23;
+
+	{
+		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
+		// often unaligned. The w components will be the tranpose row of
+		// the matrix.
+		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+		matSplat00 = SplatXSIMD(matCol0);
+		matSplat01 = SplatYSIMD(matCol0);
+		matSplat02 = SplatZSIMD(matCol0);
+		matSplat03 = SplatWSIMD(matCol0);
+
+		matSplat10 = SplatXSIMD(matCol1);
+		matSplat11 = SplatYSIMD(matCol1);
+		matSplat12 = SplatZSIMD(matCol1);
+		matSplat13 = SplatWSIMD(matCol1);
+
+		matSplat20 = SplatXSIMD(matCol2);
+		matSplat21 = SplatYSIMD(matCol2);
+		matSplat22 = SplatZSIMD(matCol2);
+		matSplat23 = SplatWSIMD(matCol2);
+	}
+
+	// this macro defines how to compute a specific row from an input and certain splat columns
+#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
+#define WRITE(term, reg, toptr) toptr->term = reg
+
+	// define result groups (we're going to have an eight-way unroll)
+	
+	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
+	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
+	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
+	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
+	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
+	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
+	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
+	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
+	
+
+// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
+#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
+#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
+
+	/*
+	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
+	res0X = MulSIMD( (invec)->y, matSplat01 );
+	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
+	// stage 2 -- 3 clocks for xyz
+	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
+	// stage 3 -- 3 clocks for xyz
+	res0X = AddSIMD(res0X, res0Temp);
+	*/
+#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
+#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
+#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar
+
+#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+										COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+										COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+										COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+										COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+										COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+										COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+	FourVectors * RESTRICT inData = pVectors;
+	FourVectors * RESTRICT outData = pOut;
+	const FourVectors * const RESTRICT STOP = pVectors + numVectors;
+
+	// Use techniques of loop scheduling to eliminate data hazards; process
+	// eight groups simultaneously so that we never have any operations stalling
+	// waiting for data.
+	// Note: this loop, while pretty fast, could be faster still -- you'll notice
+	// that it does all of its loads, then all computation, then writes everything
+	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
+	// stage 3, and write, then throughput could be higher (probably by about 50%). 
+	while (inData < STOP)
+	{
+		// start prefetching the three cache lines
+		// we'll hit two iterations from now
+		__dcbt( sizeof(FourVectors) * 16,       inData );
+		__dcbt( sizeof(FourVectors) * 16 + 128, inData );
+		__dcbt( sizeof(FourVectors) * 16 + 256, inData );
+
+		// synchro
+		COMPUTE_STAGE1_GROUP(res0, inData + 0);
+		COMPUTE_STAGE1_GROUP(res1, inData + 1);
+		COMPUTE_STAGE1_GROUP(res2, inData + 2);
+		COMPUTE_STAGE1_GROUP(res3, inData + 3);
+
+			COMPUTE_STAGE2_GROUP(res0, inData + 0);
+		COMPUTE_STAGE1_GROUP(res4, inData + 4);
+			COMPUTE_STAGE2_GROUP(res1, inData + 1);
+		COMPUTE_STAGE1_GROUP(res5, inData + 5);
+			COMPUTE_STAGE2_GROUP(res2, inData + 2);
+		COMPUTE_STAGE1_GROUP(res6, inData + 6);
+			COMPUTE_STAGE2_GROUP(res3, inData + 3);
+		COMPUTE_STAGE1_GROUP(res7, inData + 7);
+
+				COMPUTE_STAGE3_GROUP(res0, inData + 0);
+			COMPUTE_STAGE2_GROUP(res4, inData + 4);
+				COMPUTE_STAGE3_GROUP(res1, inData + 1);
+			COMPUTE_STAGE2_GROUP(res5, inData + 5);
+				COMPUTE_STAGE3_GROUP(res2, inData + 2);
+			COMPUTE_STAGE2_GROUP(res6, inData + 6);
+				COMPUTE_STAGE3_GROUP(res3, inData + 3);
+			COMPUTE_STAGE2_GROUP(res7, inData + 7);
+
+				COMPUTE_STAGE3_GROUP(res4, inData + 4);
+					WRITE_GROUP( outData + 0, res0 );
+				COMPUTE_STAGE3_GROUP(res5, inData + 5);
+					WRITE_GROUP( outData + 1, res1 );
+				COMPUTE_STAGE3_GROUP(res6, inData + 6);
+					WRITE_GROUP( outData + 2, res2 );
+				COMPUTE_STAGE3_GROUP(res7, inData + 7);
+					WRITE_GROUP( outData + 3, res3 );
+		
+
+					WRITE_GROUP( outData + 4, res4 );
+					WRITE_GROUP( outData + 5, res5 );
+					WRITE_GROUP( outData + 6, res6 );
+					WRITE_GROUP( outData + 7, res7 );
+		
+		inData += 8;
+		outData += 8;
+	}
+
+
+#undef COMPUTE
+#undef WRITE
+#undef COMPUTE_STAGE1_ROW
+#undef COMPUTE_STAGE2_ROW
+#undef COMPUTE_STAGE3_ROW
+#undef COMPUTE_STAGE1_GROUP
+#undef COMPUTE_STAGE2_GROUP
+#undef COMPUTE_STAGE3_GROUP
+#undef COMPUTE_GROUP
+#undef WRITE_GROUP
+}
+
+#ifdef _X360
+// Loop-scheduled code to process FourVectors in groups of eight quite efficiently. This is the version
+// to call when starting on a 128-byte-aligned address.
+//
+// Reads numVectors FourVectors from pVectors, transforms them by rotationMatrix and stores the
+// results in pOut. For R = 0, 1, 2 (output x, y, z) each result is
+//     x * matSplatR0 + y * matSplatR1 + z * matSplatR2 + matSplatR3
+// where matSplatRC is component C of rotationMatrix[R] splatted across a fltx4.
+//
+// Preconditions (checked by asserts only):
+//   - numVectors is a non-zero multiple of eight;
+//   - pVectors and pOut are both 128-byte aligned;
+//   - the input and output ranges do not overlap.
+// If numVectors is not a multiple of eight, only the complete groups of eight are processed.
+void FourVectors_TransformManyGroupsOfEightBy_128byteAligned(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
+{
+	/* If this has changed, you will need to change all the prefetches, *
+	 * and groups of eight are no longer the ideal unit for iterating   *
+	 * on many vectors.													*/
+	COMPILE_TIME_ASSERT( sizeof(FourVectors) == 48 ) ;
+
+	Assert(numVectors > 0);
+	if ( numVectors == 0 )
+		return;
+
+	AssertMsg((numVectors & 0x07) == 0, "FourVectors_TransformManyGroupsOfEight called with numVectors % 8 != 0!");
+
+	// Assert alignment
+	AssertMsg( ( ( reinterpret_cast<uint32>( pVectors )  & 127 ) == 0) &&
+			   ( ( reinterpret_cast<uint32>(pOut) & 127 ) == 0),
+			   "FourVectors_Transform..aligned called with non-128-byte-aligned buffers." );
+
+	// Assert non overlap
+	AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
+		(pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
+
+	// Only complete groups of eight are processed. The assert above already flags
+	// counts that are not a multiple of eight; rounding down here keeps release
+	// builds from reading or writing past the end of the buffers in that case.
+	const unsigned int numToProcess = numVectors & ~0x07u;
+	if ( numToProcess == 0 )
+		return;
+
+	// Here's the plan. 8 four-vecs = 3 cache lines exactly. It takes about 400 cycles to process a group
+	// of eight, and cache latency is 600 cycles, so we try to prefetch two iterations ahead (eg fetch
+	// iteration 3 while working on iteration 1). In the case of the output, we can simply zero-flush
+	// the cache lines since we are sure to write into them. Because we're reading and fetching two ahead,
+	// we want to stop two away from the last iteration.
+
+	// No matter what, we will need to prefetch the first two groups of eight of input (that's the
+	// first six cache lines)
+	__dcbt( 0, pVectors );
+	__dcbt( 128, pVectors );
+	__dcbt( 256, pVectors );
+	__dcbt( 384, pVectors );
+	__dcbt( 512, pVectors );
+	__dcbt( 640, pVectors );
+
+
+	// Splat out each of the entries in the matrix to a fltx4. Do this
+	// in the order that we will need them, to hide latency. I'm
+	// avoiding making an array of them, so that they'll remain in
+	// registers.
+	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
+		matSplat10, matSplat11, matSplat12, matSplat13,
+		matSplat20, matSplat21, matSplat22, matSplat23;
+
+	{
+		// Load the matrix into local vectors. Sadly, matrix3x4_ts are
+		// often unaligned. The w components will be the transpose row of
+		// the matrix.
+		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+		matSplat00 = SplatXSIMD(matCol0);
+		matSplat01 = SplatYSIMD(matCol0);
+		matSplat02 = SplatZSIMD(matCol0);
+		matSplat03 = SplatWSIMD(matCol0);
+
+		matSplat10 = SplatXSIMD(matCol1);
+		matSplat11 = SplatYSIMD(matCol1);
+		matSplat12 = SplatZSIMD(matCol1);
+		matSplat13 = SplatWSIMD(matCol1);
+
+		matSplat20 = SplatXSIMD(matCol2);
+		matSplat21 = SplatYSIMD(matCol2);
+		matSplat22 = SplatZSIMD(matCol2);
+		matSplat23 = SplatWSIMD(matCol2);
+	}
+
+	// define result groups (we're going to have an eight-way unroll)
+
+	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
+	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
+	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
+	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
+	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
+	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
+	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
+	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
+
+	// Stores the three result registers of a group into the FourVectors at ptr.
+#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
+
+	/*
+	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
+	res0X = MulSIMD( (invec)->y, matSplat01 );
+	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
+	// stage 2 -- 3 clocks for xyz
+	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
+	// stage 3 -- 3 clocks for xyz
+	res0X = AddSIMD(res0X, res0Temp);
+	*/
+#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
+#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
+#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar
+
+#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+	COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+	COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+	COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+	COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+	COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+	COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+
+	// Okay. First do all but the last two turns of the crank; we don't want to overshoot with the flush-to-zero.
+	FourVectors * RESTRICT inData = pVectors;
+	FourVectors * RESTRICT outData = pOut;
+	const FourVectors * RESTRICT STOP;
+	if (numToProcess > 16)
+	{
+		STOP = pVectors + numToProcess - 16;
+		// flush the first block we'll write into (the next one is
+		// flushed from inside the loop, one iteration ahead)
+		__dcbz128( 0, outData );
+		__dcbz128( 128, outData );
+		__dcbz128( 256, outData );
+
+		while (inData < STOP)
+		{
+			// start prefetching the three cache lines
+			// we'll hit two iterations from now
+			__dcbt( sizeof(FourVectors) * 16,       inData );
+			__dcbt( sizeof(FourVectors) * 16 + 128, inData );
+			__dcbt( sizeof(FourVectors) * 16 + 256, inData );
+
+			// synchro
+			COMPUTE_STAGE1_GROUP(res0, inData + 0);
+			COMPUTE_STAGE1_GROUP(res1, inData + 1);
+			COMPUTE_STAGE1_GROUP(res2, inData + 2);
+			COMPUTE_STAGE1_GROUP(res3, inData + 3);
+
+			// pre-zero the three cache lines we'll overwrite
+			// in the next iteration
+			__dcbz128( 384, outData );
+			__dcbz128( 512, outData );
+			__dcbz128( 640, outData );
+
+
+			COMPUTE_STAGE2_GROUP(res0, inData + 0);
+			COMPUTE_STAGE1_GROUP(res4, inData + 4);
+			COMPUTE_STAGE2_GROUP(res1, inData + 1);
+			COMPUTE_STAGE1_GROUP(res5, inData + 5);
+			COMPUTE_STAGE2_GROUP(res2, inData + 2);
+			COMPUTE_STAGE1_GROUP(res6, inData + 6);
+			COMPUTE_STAGE2_GROUP(res3, inData + 3);
+			COMPUTE_STAGE1_GROUP(res7, inData + 7);
+
+			COMPUTE_STAGE3_GROUP(res0, inData + 0);
+			COMPUTE_STAGE2_GROUP(res4, inData + 4);
+			COMPUTE_STAGE3_GROUP(res1, inData + 1);
+			COMPUTE_STAGE2_GROUP(res5, inData + 5);
+			COMPUTE_STAGE3_GROUP(res2, inData + 2);
+			COMPUTE_STAGE2_GROUP(res6, inData + 6);
+			COMPUTE_STAGE3_GROUP(res3, inData + 3);
+			COMPUTE_STAGE2_GROUP(res7, inData + 7);
+
+			COMPUTE_STAGE3_GROUP(res4, inData + 4);
+			WRITE_GROUP( outData + 0, res0 );
+			COMPUTE_STAGE3_GROUP(res5, inData + 5);
+			WRITE_GROUP( outData + 1, res1 );
+			COMPUTE_STAGE3_GROUP(res6, inData + 6);
+			WRITE_GROUP( outData + 2, res2 );
+			COMPUTE_STAGE3_GROUP(res7, inData + 7);
+			WRITE_GROUP( outData + 3, res3 );
+
+
+			WRITE_GROUP( outData + 4, res4 );
+			WRITE_GROUP( outData + 5, res5 );
+			WRITE_GROUP( outData + 6, res6 );
+			WRITE_GROUP( outData + 7, res7 );
+
+			inData += 8;
+			outData += 8;
+		}
+	}
+	else if (numToProcess == 16)
+	{
+		// zero out the exactly six cache lines we will write into
+		__dcbz128( 0, outData );
+		__dcbz128( 128, outData );
+		__dcbz128( 256, outData );
+		__dcbz128( 384, outData );
+		__dcbz128( 512, outData );
+		__dcbz128( 640, outData );
+	}
+	else if (numToProcess == 8)
+	{
+		// zero out the exactly three cache lines we will write into
+		__dcbz128( 0, outData );
+		__dcbz128( 128, outData );
+		__dcbz128( 256, outData );
+	}
+	else
+	{
+		// numToProcess is a non-zero multiple of eight, so one of the
+		// branches above must have been taken.
+		AssertMsg(false, "Can't happen!");
+		return;
+	}
+
+	// deal with the ultimate two groups (or, if we were fed
+	// sixteen vectors or fewer, the whole shebang). The sentinel has to be
+	// the end of the data: the loop above stops sixteen vectors short of it,
+	// so reusing that loop's sentinel here would leave the final groups
+	// untransformed.
+	STOP = pVectors + numToProcess;
+
+
+	// Use techniques of loop scheduling to eliminate data hazards; process
+	// eight groups simultaneously so that we never have any operations stalling
+	// waiting for data.
+	// Note: this loop, while pretty fast, could be faster still -- you'll notice
+	// that it does all of its loads, then all computation, then writes everything
+	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
+	// stage 3, and write, then throughput could be higher (probably by about 50%).
+	// No prefetching or pre-zeroing here: anything further ahead would be
+	// past the end of the buffers.
+	while (inData < STOP)
+	{
+		// synchro
+		COMPUTE_STAGE1_GROUP(res0, inData + 0);
+		COMPUTE_STAGE1_GROUP(res1, inData + 1);
+		COMPUTE_STAGE1_GROUP(res2, inData + 2);
+		COMPUTE_STAGE1_GROUP(res3, inData + 3);
+
+		COMPUTE_STAGE2_GROUP(res0, inData + 0);
+		COMPUTE_STAGE1_GROUP(res4, inData + 4);
+		COMPUTE_STAGE2_GROUP(res1, inData + 1);
+		COMPUTE_STAGE1_GROUP(res5, inData + 5);
+		COMPUTE_STAGE2_GROUP(res2, inData + 2);
+		COMPUTE_STAGE1_GROUP(res6, inData + 6);
+		COMPUTE_STAGE2_GROUP(res3, inData + 3);
+		COMPUTE_STAGE1_GROUP(res7, inData + 7);
+
+		COMPUTE_STAGE3_GROUP(res0, inData + 0);
+		COMPUTE_STAGE2_GROUP(res4, inData + 4);
+		COMPUTE_STAGE3_GROUP(res1, inData + 1);
+		COMPUTE_STAGE2_GROUP(res5, inData + 5);
+		COMPUTE_STAGE3_GROUP(res2, inData + 2);
+		COMPUTE_STAGE2_GROUP(res6, inData + 6);
+		COMPUTE_STAGE3_GROUP(res3, inData + 3);
+		COMPUTE_STAGE2_GROUP(res7, inData + 7);
+
+		COMPUTE_STAGE3_GROUP(res4, inData + 4);
+		WRITE_GROUP( outData + 0, res0 );
+		COMPUTE_STAGE3_GROUP(res5, inData + 5);
+		WRITE_GROUP( outData + 1, res1 );
+		COMPUTE_STAGE3_GROUP(res6, inData + 6);
+		WRITE_GROUP( outData + 2, res2 );
+		COMPUTE_STAGE3_GROUP(res7, inData + 7);
+		WRITE_GROUP( outData + 3, res3 );
+
+
+		WRITE_GROUP( outData + 4, res4 );
+		WRITE_GROUP( outData + 5, res5 );
+		WRITE_GROUP( outData + 6, res6 );
+		WRITE_GROUP( outData + 7, res7 );
+
+		inData += 8;
+		outData += 8;
+	}
+
+
+#undef COMPUTE_STAGE1_ROW
+#undef COMPUTE_STAGE2_ROW
+#undef COMPUTE_STAGE3_ROW
+#undef COMPUTE_STAGE1_GROUP
+#undef COMPUTE_STAGE2_GROUP
+#undef COMPUTE_STAGE3_GROUP
+#undef WRITE_GROUP
+}
+#endif
+
+// Transform a long array of FourVectors by a given matrix, writing the results
+// to a separate output array.
+//
+//   pVectors       - input array of numVectors FourVectors.
+//   numVectors     - number of FourVectors to transform. Asserted to be non-zero;
+//                    a zero count does nothing.
+//   rotationMatrix - 3x4 matrix. For R = 0, 1, 2 (output x, y, z) each result is
+//                    x * m[R][0] + y * m[R][1] + z * m[R][2] + m[R][3].
+//   pOut           - output array of numVectors FourVectors. Asserted not to
+//                    overlap pVectors; use the in-place overload for that.
+void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
+{
+	Assert(numVectors > 0);
+
+	AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
+		(pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
+
+#ifdef _X360
+	// The really fast version of this function likes to operate on blocks of eight. So, chug through
+	// groups of eight, then deal with any leftovers.
+	// (Despite the name, this is numVectors rounded DOWN to a multiple of eight.)
+	const unsigned int numVectorsRoundedToNearestEight = numVectors & ~0x07u;
+	if (numVectors >= 8)
+	{
+		// aligned?
+		if ((reinterpret_cast<unsigned int>(pVectors) & 127) == 0 && (reinterpret_cast<unsigned int>(pOut) & 127) == 0)
+		{
+			FourVectors_TransformManyGroupsOfEightBy_128byteAligned(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
+		}
+		else
+		{
+			FourVectors_TransformManyGroupsOfEightBy(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
+		}
+		numVectors -= numVectorsRoundedToNearestEight;
+		pVectors += numVectorsRoundedToNearestEight;
+		pOut += numVectorsRoundedToNearestEight;
+	}
+#endif
+
+	// any left over?
+	if (numVectors > 0)
+	{
+
+		// Splat out each of the entries in the matrix to a fltx4. Do this
+		// in the order that we will need them, to hide latency. I'm
+		// avoiding making an array of them, so that they'll remain in
+		// registers.
+		fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
+			matSplat10, matSplat11, matSplat12, matSplat13,
+			matSplat20, matSplat21, matSplat22, matSplat23;
+
+		{
+			// Load the matrix into local vectors. Sadly, matrix3x4_ts are
+			// often unaligned. The w components will be the transpose row of
+			// the matrix.
+			fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+			fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+			fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+			matSplat00 = SplatXSIMD(matCol0);
+			matSplat01 = SplatYSIMD(matCol0);
+			matSplat02 = SplatZSIMD(matCol0);
+			matSplat03 = SplatWSIMD(matCol0);
+
+			matSplat10 = SplatXSIMD(matCol1);
+			matSplat11 = SplatYSIMD(matCol1);
+			matSplat12 = SplatZSIMD(matCol1);
+			matSplat13 = SplatWSIMD(matCol1);
+
+			matSplat20 = SplatXSIMD(matCol2);
+			matSplat21 = SplatYSIMD(matCol2);
+			matSplat22 = SplatZSIMD(matCol2);
+			matSplat23 = SplatWSIMD(matCol2);
+		}
+
+		do
+		{
+			// Trust in the compiler to schedule these operations correctly.
+			// Each output row must use its own x coefficient (matSplat00 /
+			// matSplat10 / matSplat20), matching the group-of-eight code paths.
+			pOut->x = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
+			pOut->y = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
+			pOut->z = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));
+
+			++pOut;
+			++pVectors;
+			--numVectors;
+		} while(numVectors > 0);
+	}
+}
+
+#ifdef _X360
+/* Loop-scheduled code to process FourVectors in groups of eight quite efficiently,
+ * overwriting the input. For R = 0, 1, 2 (x, y, z) each component is replaced by
+ *     x * matSplatR0 + y * matSplatR1 + z * matSplatR2 + matSplatR3
+ * where matSplatRC is component C of rotationMatrix[R] splatted across a fltx4.
+ * The loop advances eight vectors per iteration until it reaches
+ * pVectors + numVectors, so numVectors is expected to be a multiple of eight
+ * (the caller in this file passes a count rounded down to a multiple of eight).
+ * A zero count asserts and returns without touching the data. */
+static void FourVectors_TransformManyGroupsOfEightBy_InPlace(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
+{
+	Assert(numVectors > 0);
+	if ( numVectors == 0 )
+		return;
+
+	// Prefetch the first two lines of the input (byte offsets 0 and 128)
+	__dcbt(0,pVectors);
+	__dcbt(128,pVectors);
+
+	// Splat out each of the entries in the matrix to a fltx4. Do this
+	// in the order that we will need them, to hide latency. I'm
+	// avoiding making an array of them, so that they'll remain in 
+	// registers.
+	fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
+		matSplat10, matSplat11, matSplat12, matSplat13,
+		matSplat20, matSplat21, matSplat22, matSplat23;
+
+	{
+		// Load the matrix into local vectors. Sadly, matrix3x4_ts are 
+		// often unaligned. The w components will be the transpose row of
+		// the matrix.
+		fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+		fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+		fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+		matSplat00 = SplatXSIMD(matCol0);
+		matSplat01 = SplatYSIMD(matCol0);
+		matSplat02 = SplatZSIMD(matCol0);
+		matSplat03 = SplatWSIMD(matCol0);
+
+		matSplat10 = SplatXSIMD(matCol1);
+		matSplat11 = SplatYSIMD(matCol1);
+		matSplat12 = SplatZSIMD(matCol1);
+		matSplat13 = SplatWSIMD(matCol1);
+
+		matSplat20 = SplatXSIMD(matCol2);
+		matSplat21 = SplatYSIMD(matCol2);
+		matSplat22 = SplatZSIMD(matCol2);
+		matSplat23 = SplatWSIMD(matCol2);
+	}
+
+	// Single-expression form of one row. COMPUTE, WRITE and COMPUTE_GROUP are defined here but not used by the loop below, which uses the staged macros instead.
+#define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
+#define WRITE(term, reg, toptr) toptr->term = reg
+
+	// define result groups (we're going to have an eight-way unroll)
+
+	fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp;	// 48 REGISTERS
+	fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
+	fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
+	fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
+	fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
+	fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
+	fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
+	fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
+
+
+	// #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
+#define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
+#define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
+
+	/*
+	The row computation is split into three stages so that independent work
+	from other groups can be placed between dependent operations:
+	// stage 1 -- 6 ops for xyz, each w 12 cycle latency
+	res0X = MulSIMD( (invec)->y, matSplat01 );
+	res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
+	// stage 2 -- 3 clocks for xyz
+	res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
+	// stage 3 -- 3 clocks for xyz
+	res0X = AddSIMD(res0X, res0Temp);
+	*/
+#define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
+#define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
+#define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar)  // frees up the tempvar
+
+#define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+	COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+	COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+#define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+	COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+	COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+#define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
+	COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
+	COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
+
+	const FourVectors * const RESTRICT STOP = pVectors + numVectors;	// one past the last vector to process
+
+	// Use techniques of loop scheduling to eliminate data hazards; process
+	// eight groups simultaneously so that we never have any operations stalling
+	// waiting for data.
+	// Note: this loop, while pretty fast, could be faster still -- you'll notice
+	// that it does all of its loads, then all computation, then writes everything
+	// out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
+	// stage 3, and write, then throughput could be higher (probably by about 50%). 
+	while (pVectors < STOP)
+	{
+		// start prefetching the three cache lines
+		// we'll hit two iterations from now
+		__dcbt( sizeof(FourVectors) * 16,       pVectors );
+		__dcbt( sizeof(FourVectors) * 16 + 128, pVectors );
+		__dcbt( sizeof(FourVectors) * 16 + 256, pVectors );
+
+		// synchro
+		COMPUTE_STAGE1_GROUP(res0, pVectors + 0);
+		COMPUTE_STAGE1_GROUP(res1, pVectors + 1);
+		COMPUTE_STAGE1_GROUP(res2, pVectors + 2);
+		COMPUTE_STAGE1_GROUP(res3, pVectors + 3);
+
+		COMPUTE_STAGE2_GROUP(res0, pVectors + 0);
+		COMPUTE_STAGE1_GROUP(res4, pVectors + 4);
+		COMPUTE_STAGE2_GROUP(res1, pVectors + 1);
+		COMPUTE_STAGE1_GROUP(res5, pVectors + 5);
+		COMPUTE_STAGE2_GROUP(res2, pVectors + 2);
+		COMPUTE_STAGE1_GROUP(res6, pVectors + 6);
+		COMPUTE_STAGE2_GROUP(res3, pVectors + 3);
+		COMPUTE_STAGE1_GROUP(res7, pVectors + 7);
+
+		COMPUTE_STAGE3_GROUP(res0, pVectors + 0);
+		COMPUTE_STAGE2_GROUP(res4, pVectors + 4);
+		COMPUTE_STAGE3_GROUP(res1, pVectors + 1);
+		COMPUTE_STAGE2_GROUP(res5, pVectors + 5);
+		COMPUTE_STAGE3_GROUP(res2, pVectors + 2);
+		COMPUTE_STAGE2_GROUP(res6, pVectors + 6);
+		COMPUTE_STAGE3_GROUP(res3, pVectors + 3);
+		COMPUTE_STAGE2_GROUP(res7, pVectors + 7);
+
+		COMPUTE_STAGE3_GROUP(res4, pVectors + 4);
+		WRITE_GROUP( pVectors + 0, res0 );	// stage 3 reads only registers, so each element's inputs were fully read before it is overwritten
+		COMPUTE_STAGE3_GROUP(res5, pVectors + 5);
+		WRITE_GROUP( pVectors + 1, res1 );
+		COMPUTE_STAGE3_GROUP(res6, pVectors + 6);
+		WRITE_GROUP( pVectors + 2, res2 );
+		COMPUTE_STAGE3_GROUP(res7, pVectors + 7);
+		WRITE_GROUP( pVectors + 3, res3 );
+
+		WRITE_GROUP( pVectors + 4, res4 );
+		WRITE_GROUP( pVectors + 5, res5 );
+		WRITE_GROUP( pVectors + 6, res6 );
+		WRITE_GROUP( pVectors + 7, res7 );
+
+		pVectors += 8;	// advance to the next group of eight
+	}
+
+
+#undef COMPUTE
+#undef WRITE
+#undef COMPUTE_STAGE1_ROW
+#undef COMPUTE_STAGE2_ROW
+#undef COMPUTE_STAGE3_ROW
+#undef COMPUTE_STAGE1_GROUP
+#undef COMPUTE_STAGE2_GROUP
+#undef COMPUTE_STAGE3_GROUP
+#undef COMPUTE_GROUP
+#undef WRITE_GROUP
+}
+#endif
+
+// In-place version of above. It's necessary to have this, rather than just allowing pOut and pVectors
+// to equal each other, because of the semantics of RESTRICT: pVectors and pOut must not be allowed 
+// to alias. (Simply un-restricting the pointers results in very poor scheduling.)
+//
+//   pVectors       - array of numVectors FourVectors, overwritten with the results.
+//   numVectors     - number of FourVectors to transform. Asserted to be non-zero;
+//                    a zero count does nothing.
+//   rotationMatrix - 3x4 matrix. For R = 0, 1, 2 (output x, y, z) each result is
+//                    x * m[R][0] + y * m[R][1] + z * m[R][2] + m[R][3].
+void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
+{
+	Assert(numVectors > 0);
+
+#ifdef _X360
+	// The really fast version of this function likes to operate on blocks of eight. So, chug through
+	// groups of eight, then deal with any leftovers.
+	// (Despite the name, this is numVectors rounded DOWN to a multiple of eight.)
+	const unsigned int numVectorsRoundedToNearestEight = numVectors & ~0x07u;
+	if (numVectors >= 8)
+	{
+		FourVectors_TransformManyGroupsOfEightBy_InPlace(pVectors, numVectorsRoundedToNearestEight, rotationMatrix);
+		numVectors -= numVectorsRoundedToNearestEight;
+		pVectors += numVectorsRoundedToNearestEight;
+	}
+#endif
+
+	// any left over?
+	if (numVectors > 0)
+	{
+
+		// Splat out each of the entries in the matrix to a fltx4. Do this
+		// in the order that we will need them, to hide latency. I'm
+		// avoiding making an array of them, so that they'll remain in
+		// registers.
+		fltx4 matSplat00, matSplat01, matSplat02, matSplat03,	// TWELVE REGISTERS
+			matSplat10, matSplat11, matSplat12, matSplat13,
+			matSplat20, matSplat21, matSplat22, matSplat23;
+
+		{
+			// Load the matrix into local vectors. Sadly, matrix3x4_ts are
+			// often unaligned. The w components will be the transpose row of
+			// the matrix.
+			fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
+			fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
+			fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
+
+			matSplat00 = SplatXSIMD(matCol0);
+			matSplat01 = SplatYSIMD(matCol0);
+			matSplat02 = SplatZSIMD(matCol0);
+			matSplat03 = SplatWSIMD(matCol0);
+
+			matSplat10 = SplatXSIMD(matCol1);
+			matSplat11 = SplatYSIMD(matCol1);
+			matSplat12 = SplatZSIMD(matCol1);
+			matSplat13 = SplatWSIMD(matCol1);
+
+			matSplat20 = SplatXSIMD(matCol2);
+			matSplat21 = SplatYSIMD(matCol2);
+			matSplat22 = SplatZSIMD(matCol2);
+			matSplat23 = SplatWSIMD(matCol2);
+		}
+
+		do
+		{
+			// All three results are computed into temporaries before anything
+			// is stored, because every row reads the original x, y and z.
+			fltx4 resultX, resultY, resultZ;
+			// Trust in the compiler to schedule these operations correctly.
+			// Each output row must use its own x coefficient (matSplat00 /
+			// matSplat10 / matSplat20), matching the group-of-eight code path.
+			resultX = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
+			resultY = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
+			resultZ = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));
+
+			pVectors->x = resultX;
+			pVectors->y = resultY;
+			pVectors->z = resultZ;
+
+			++pVectors;
+			--numVectors;
+		} while(numVectors > 0);
+	}
+}
+
+
+#endif
+
+// Transform many (horizontal) points in-place by a 3x4 matrix,
+// here already loaded onto three fltx4 registers but not transposed. 
+// The points must be stored as 16-byte aligned. They are points
+// and not vectors because we assume the w-component to be 1. 
+//
+//   pVectors   - array of numVectors points, transformed in place. Only the words
+//                at byte offsets 0, 4 and 8 of each element (x, y, z) are written.
+//   numVectors - number of points in the array.
+//   mRow0..2   - the matrix rows; the new x, y and z of a point are
+//                __vmsum4fp( point, mRowN ) for N = 0, 1, 2, computed from the
+//                point as loaded (including its w component).
+//
+// The array is walked in three phases: single points up to the first cache-line
+// boundary, then unrolled groups of eight covering four cache lines per outer
+// iteration, then single points for whatever remains.
+#ifdef _X360
+void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow0, FLTX4 mRow1, FLTX4 mRow2)
+{
+	/**************************************************
+	 *  Here is an elaborate and carefully scheduled  *
+	 *  algorithm nicked from xboxmath.inl and hacked *
+	 *  up for 3x4 matrices.                          *
+	 **************************************************/
+
+	COMPILE_TIME_ASSERT(sizeof(VectorAligned) == sizeof(XMFLOAT4)); // VectorAligned's need to be 16 bytes
+
+	XMVECTOR R0[8], R1[8], R2[8];
+	XMVECTOR vIn[8];
+
+	//    C_ASSERT(UnrollCount == 8);
+	//    C_ASSERT(sizeof(XMFLOAT4) == 16);
+	Assert(pVectors);
+	// The vector loads and element stores below address whole 16-byte
+	// quadwords, so the data must be 16-byte aligned, not merely 4-byte aligned.
+	Assert(((UINT_PTR)pVectors & 15) == 0); // assert 16-byte alignment
+
+	UINT GroupIndex;
+
+	VectorAligned * RESTRICT vCurrent = pVectors;
+	// sentinel pointers
+	VectorAligned * vStreamEnd, *vStreamGroupBase, *vStreamGroupEnd;
+
+	{
+		// cook up the pointers from integer math. Necessary because otherwise we LHS all over
+		// the place. (Odd that this doesn't happen to the xbox math.) 
+
+		UINT_PTR InputVector = (UINT_PTR)pVectors;
+		UINT_PTR InputStreamEnd = InputVector + numVectors * sizeof(XMFLOAT4);
+		// compute start and end points on 128-byte alignment:
+		// GroupBase is the first cache-line boundary at or after the start
+		// (clamped to the end of the stream), GroupEnd is GroupBase plus as
+		// many whole four-cache-line chunks as fit before the end.
+		UINT_PTR InputStreamCGroupBase = XMMin(InputVector + (XM_CACHE_LINE_SIZE - 1), InputStreamEnd) & ~(XM_CACHE_LINE_SIZE - 1);
+		UINT_PTR InputStreamCGroupEnd = InputStreamCGroupBase + ((InputStreamEnd - InputStreamCGroupBase) & ~(4 * XM_CACHE_LINE_SIZE - 1));
+
+		vStreamEnd = (VectorAligned *)InputStreamEnd;
+		vStreamGroupBase = (VectorAligned *)InputStreamCGroupBase;
+		vStreamGroupEnd = (VectorAligned *)InputStreamCGroupEnd;
+	}
+
+
+	// prefetch the first four cache lines of the aligned region
+	__dcbt(0,                      vStreamGroupBase);
+	__dcbt(XM_CACHE_LINE_SIZE,     vStreamGroupBase);
+	__dcbt(XM_CACHE_LINE_SIZE * 2, vStreamGroupBase);
+	__dcbt(XM_CACHE_LINE_SIZE * 3, vStreamGroupBase);
+
+	// Phase 1: one point at a time until we reach the cache-line boundary.
+	while (vCurrent < vStreamGroupBase)
+	{
+		fltx4 vec = __lvx(vCurrent->Base(), 0);
+
+		R0[0] = __vmsum4fp(vec, mRow0);
+		R1[0] = __vmsum4fp(vec, mRow1);
+		R2[0] = __vmsum4fp(vec, mRow2);
+
+		// store x, y and z only; w is left as it was
+		__stvewx(R0[0], vCurrent->Base(), 0);
+		__stvewx(R1[0], vCurrent->Base(), 4);
+		__stvewx(R2[0], vCurrent->Base(), 8);
+
+		vCurrent++; 
+	}
+
+	// Phase 2: four groups of eight points per iteration.
+	while (vCurrent < vStreamGroupEnd)
+	{
+		// prefetch the four cache lines the next iteration will use
+		__dcbt(XM_CACHE_LINE_SIZE * 4, vCurrent);
+		__dcbt(XM_CACHE_LINE_SIZE * 5, vCurrent);
+		__dcbt(XM_CACHE_LINE_SIZE * 6, vCurrent);
+		__dcbt(XM_CACHE_LINE_SIZE * 7, vCurrent);
+
+		for (GroupIndex = 0; GroupIndex < 4; GroupIndex++)
+		{
+			// all kinds of LHS on this pointer. Why?
+			VectorAligned* OutputVector = vCurrent;
+
+			// load all eight points first...
+			vIn[0] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+			vIn[1] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+			vIn[2] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+			vIn[3] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+			vIn[4] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+			vIn[5] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+			vIn[6] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+			vIn[7] = __lvx(vCurrent->Base(), 0);
+			vCurrent++;
+
+			// ...then compute every row of every point...
+			R0[0] = __vmsum4fp(vIn[0], mRow0);
+			R1[0] = __vmsum4fp(vIn[0], mRow1);
+			R2[0] = __vmsum4fp(vIn[0], mRow2);
+
+			R0[1] = __vmsum4fp(vIn[1], mRow0);
+			R1[1] = __vmsum4fp(vIn[1], mRow1);
+			R2[1] = __vmsum4fp(vIn[1], mRow2);
+
+			R0[2] = __vmsum4fp(vIn[2], mRow0);
+			R1[2] = __vmsum4fp(vIn[2], mRow1);
+			R2[2] = __vmsum4fp(vIn[2], mRow2);
+
+			R0[3] = __vmsum4fp(vIn[3], mRow0);
+			R1[3] = __vmsum4fp(vIn[3], mRow1);
+			R2[3] = __vmsum4fp(vIn[3], mRow2);
+
+			R0[4] = __vmsum4fp(vIn[4], mRow0);
+			R1[4] = __vmsum4fp(vIn[4], mRow1);
+			R2[4] = __vmsum4fp(vIn[4], mRow2);
+
+			R0[5] = __vmsum4fp(vIn[5], mRow0);
+			R1[5] = __vmsum4fp(vIn[5], mRow1);
+			R2[5] = __vmsum4fp(vIn[5], mRow2);
+
+			R0[6] = __vmsum4fp(vIn[6], mRow0);
+			R1[6] = __vmsum4fp(vIn[6], mRow1);
+			R2[6] = __vmsum4fp(vIn[6], mRow2);
+
+			R0[7] = __vmsum4fp(vIn[7], mRow0);
+			R1[7] = __vmsum4fp(vIn[7], mRow1);
+			R2[7] = __vmsum4fp(vIn[7], mRow2);
+
+			// ...and finally store x, y and z of each point back in place.
+			__stvewx(R0[0], OutputVector, 0);
+			__stvewx(R1[0], OutputVector, 4);
+			__stvewx(R2[0], OutputVector, 8);
+			OutputVector++;
+
+			__stvewx(R0[1], OutputVector, 0);
+			__stvewx(R1[1], OutputVector, 4);
+			__stvewx(R2[1], OutputVector, 8);
+			OutputVector++;
+
+			__stvewx(R0[2], OutputVector, 0);
+			__stvewx(R1[2], OutputVector, 4);
+			__stvewx(R2[2], OutputVector, 8);
+			OutputVector++;
+
+			__stvewx(R0[3], OutputVector, 0);
+			__stvewx(R1[3], OutputVector, 4);
+			__stvewx(R2[3], OutputVector, 8);
+			OutputVector++;
+
+			__stvewx(R0[4], OutputVector, 0);
+			__stvewx(R1[4], OutputVector, 4);
+			__stvewx(R2[4], OutputVector, 8);
+			OutputVector++;
+
+			__stvewx(R0[5], OutputVector, 0);
+			__stvewx(R1[5], OutputVector, 4);
+			__stvewx(R2[5], OutputVector, 8);
+			OutputVector++;
+
+			__stvewx(R0[6], OutputVector, 0);
+			__stvewx(R1[6], OutputVector, 4);
+			__stvewx(R2[6], OutputVector, 8);
+			OutputVector++;
+
+			__stvewx(R0[7], OutputVector, 0);
+			__stvewx(R1[7], OutputVector, 4);
+			__stvewx(R2[7], OutputVector, 8);
+			OutputVector++;
+		}
+	}
+
+	// Phase 3: whatever is left after the last whole chunk, one point at a time.
+	while (vCurrent < vStreamEnd)
+	{
+		vIn[0] = __lvx(vCurrent->Base(), 0);
+
+		R0[0] = __vmsum4fp(vIn[0], mRow0);
+		R1[0] = __vmsum4fp(vIn[0], mRow1);
+		R2[0] = __vmsum4fp(vIn[0], mRow2);
+
+		__stvewx(R0[0], vCurrent->Base(), 0);
+		__stvewx(R1[0], vCurrent->Base(), 4);
+		__stvewx(R2[0], vCurrent->Base(), 8);
+
+		vCurrent++;
+	}
+	
+
+}
+#endif
diff --git a/mp/src/mathlib/ssenoise.cpp b/mp/src/mathlib/ssenoise.cpp
index 60c2eb82..244a1e59 100644
--- a/mp/src/mathlib/ssenoise.cpp
+++ b/mp/src/mathlib/ssenoise.cpp
@@ -1,105 +1,105 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: Fast low quality noise suitable for real time use
-//
-//=====================================================================================//
-
-#include <math.h>
-#include <float.h>	// Needed for FLT_EPSILON
-#include "basetypes.h"
-#include <memory.h>
-#include "tier0/dbg.h"
-#include "mathlib/mathlib.h"
-#include "mathlib/vector.h"
-#include "mathlib/ssemath.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-#include "noisedata.h"
-
-
-#define MAGIC_NUMBER (1<<15)								// gives 8 bits of fraction
-
-static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER };
-
-
-static ALIGN16 int32 idx_mask[4]= {0xffff, 0xffff, 0xffff, 0xffff};
-
-#define MASK255 (*((fltx4 *)(& idx_mask )))
-
-// returns 0..1
-static inline float GetLatticePointValue( int idx_x, int idx_y, int idx_z )
-{
-	int ret_idx = perm_a[idx_x & 0xff];
-	ret_idx = perm_b[( idx_y + ret_idx ) & 0xff];
-	ret_idx = perm_c[( idx_z + ret_idx ) & 0xff];
-	return impulse_xcoords[ret_idx];
-
-}
-
-fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z )
-{
-	// use magic to convert to integer index
-	fltx4 x_idx = AndSIMD( MASK255, AddSIMD( x, Four_MagicNumbers ) );
-	fltx4 y_idx = AndSIMD( MASK255, AddSIMD( y, Four_MagicNumbers ) );
-	fltx4 z_idx = AndSIMD( MASK255, AddSIMD( z, Four_MagicNumbers ) );
-
-	fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros;
-	fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros;
-
-	// FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes)
-	//        Converting the indexed noise values back to vectors will cause more (128 bytes)
-	//        The noise table could store vectors if we chunked it into 2x2x2 blocks.
-	fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros;
-#define DOPASS(i)															\
-    {	unsigned int xi = SubInt( x_idx, i );								\
-		unsigned int yi = SubInt( y_idx, i );								\
-		unsigned int zi = SubInt( z_idx, i );								\
-		SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0);						\
-		SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0);						\
-		SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0);						\
-		xi>>=8;																\
-		yi>>=8;																\
-		zi>>=8;																\
-																			\
-		SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi );		\
-		SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 );		\
-		SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi );		\
-		SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 );	\
-		SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi );		\
-		SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 );	\
-		SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi );	\
-		SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 );	\
-    }
-
-	DOPASS( 0 );
-	DOPASS( 1 );
-	DOPASS( 2 );
-	DOPASS( 3 );
-
-	// now, we have 8 lattice values for each of four points as m128s, and interpolant values for
-	// each axis in m128 form in [xyz]frac. Perfom the trilinear interpolation as SIMD ops
-
-	// first, do x interpolation
-	fltx4 l2d00 = AddSIMD( lattice000, MulSIMD( xfrac, SubSIMD( lattice100, lattice000 ) ) );
-	fltx4 l2d01 = AddSIMD( lattice001, MulSIMD( xfrac, SubSIMD( lattice101, lattice001 ) ) );
-	fltx4 l2d10 = AddSIMD( lattice010, MulSIMD( xfrac, SubSIMD( lattice110, lattice010 ) ) );
-	fltx4 l2d11 = AddSIMD( lattice011, MulSIMD( xfrac, SubSIMD( lattice111, lattice011 ) ) );
-
-	// now, do y interpolation
-	fltx4 l1d0 = AddSIMD( l2d00, MulSIMD( yfrac, SubSIMD( l2d10, l2d00 ) ) );
-	fltx4 l1d1 = AddSIMD( l2d01, MulSIMD( yfrac, SubSIMD( l2d11, l2d01 ) ) );
-
-	// final z interpolation
-	fltx4 rslt = AddSIMD( l1d0, MulSIMD( zfrac, SubSIMD( l1d1, l1d0 ) ) );
-
-	// map to 0..1
-	return MulSIMD( Four_Twos, SubSIMD( rslt, Four_PointFives ) );
-
-
-}
-
-fltx4 NoiseSIMD( FourVectors const &pos )
-{
-	return NoiseSIMD( pos.x, pos.y, pos.z );
-}
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Fast low quality noise suitable for real time use
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/ssemath.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+#include "noisedata.h"
+
+
+#define MAGIC_NUMBER (1<<15)								// 2^15 bias added to each coordinate; gives 8 bits of fraction
+// Bias vector that NoiseSIMD adds to every coordinate before masking out the low bits.
+static fltx4 Four_MagicNumbers = { MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER };
+
+// NOTE: despite the MASK255 name below, every lane of this mask is 0xffff (16 bits), not 0xff.
+static ALIGN16 int32 idx_mask[4]= {0xffff, 0xffff, 0xffff, 0xffff};
+// Reinterprets idx_mask as a fltx4 so it can be passed to AndSIMD.
+#define MASK255 (*((fltx4 *)(& idx_mask )))
+
+// Lattice hash: chains perm_a -> perm_b -> perm_c (each index wrapped to 0..255) and returns impulse_xcoords[] at the result. Documented as returning 0..1.
+static inline float GetLatticePointValue( int idx_x, int idx_y, int idx_z )
+{
+	int ret_idx = perm_a[idx_x & 0xff];	// hash x on its own
+	ret_idx = perm_b[( idx_y + ret_idx ) & 0xff];	// fold y into the running hash
+	ret_idx = perm_c[( idx_z + ret_idx ) & 0xff];	// fold z into the running hash
+	return impulse_xcoords[ret_idx];
+	// NOTE(review): the tables come from noisedata.h; the 0..1 range cannot be verified from this file.
+}
+// Lattice noise for four points at once: lane i of x/y/z is one point; returns the trilinear blend of its 8 surrounding lattice values, remapped by 2*(v-0.5).
+fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z )
+{
+	// Add the 2^15 bias and keep the low 16 bits of each lane; DOPASS reads them as 8.8 fixed point (presumably needs 0 <= coord < 32768 - TODO confirm).
+	fltx4 x_idx = AndSIMD( MASK255, AddSIMD( x, Four_MagicNumbers ) );
+	fltx4 y_idx = AndSIMD( MASK255, AddSIMD( y, Four_MagicNumbers ) );
+	fltx4 z_idx = AndSIMD( MASK255, AddSIMD( z, Four_MagicNumbers ) );
+
+	fltx4 lattice000 = Four_Zeros, lattice001 = Four_Zeros, lattice010 = Four_Zeros, lattice011 = Four_Zeros;
+	fltx4 lattice100 = Four_Zeros, lattice101 = Four_Zeros, lattice110 = Four_Zeros, lattice111 = Four_Zeros;
+
+	// FIXME: Converting the input vectors to int indices will cause load-hit-stores (48 bytes)
+	//        Converting the indexed noise values back to vectors will cause more (128 bytes)
+	//        The noise table could store vectors if we chunked it into 2x2x2 blocks.
+	fltx4 xfrac = Four_Zeros, yfrac = Four_Zeros, zfrac = Four_Zeros;
+#define DOPASS(i)															\
+    {	unsigned int xi = SubInt( x_idx, i );								\
+		unsigned int yi = SubInt( y_idx, i );								\
+		unsigned int zi = SubInt( z_idx, i );								\
+		SubFloat( xfrac, i ) = (xi & 0xff)*(1.0/256.0);						\
+		SubFloat( yfrac, i ) = (yi & 0xff)*(1.0/256.0);						\
+		SubFloat( zfrac, i ) = (zi & 0xff)*(1.0/256.0);						\
+		xi>>=8;																\
+		yi>>=8;																\
+		zi>>=8;																\
+																			\
+		SubFloat( lattice000, i ) = GetLatticePointValue( xi,yi,zi );		\
+		SubFloat( lattice001, i ) = GetLatticePointValue( xi,yi,zi+1 );		\
+		SubFloat( lattice010, i ) = GetLatticePointValue( xi,yi+1,zi );		\
+		SubFloat( lattice011, i ) = GetLatticePointValue( xi,yi+1,zi+1 );	\
+		SubFloat( lattice100, i ) = GetLatticePointValue( xi+1,yi,zi );		\
+		SubFloat( lattice101, i ) = GetLatticePointValue( xi+1,yi,zi+1 );	\
+		SubFloat( lattice110, i ) = GetLatticePointValue( xi+1,yi+1,zi );	\
+		SubFloat( lattice111, i ) = GetLatticePointValue( xi+1,yi+1,zi+1 );	\
+    }
+
+	DOPASS( 0 );	// one scalar pass per SIMD lane: low 8 bits -> fraction/256, next 8 bits -> lattice cell
+	DOPASS( 1 );
+	DOPASS( 2 );
+	DOPASS( 3 );
+
+	// now, we have 8 lattice values for each of four points as m128s, and interpolant values for
+	// each axis in m128 form in [xyz]frac. Perform the trilinear interpolation as SIMD ops
+
+	// first, do x interpolation
+	fltx4 l2d00 = AddSIMD( lattice000, MulSIMD( xfrac, SubSIMD( lattice100, lattice000 ) ) );
+	fltx4 l2d01 = AddSIMD( lattice001, MulSIMD( xfrac, SubSIMD( lattice101, lattice001 ) ) );
+	fltx4 l2d10 = AddSIMD( lattice010, MulSIMD( xfrac, SubSIMD( lattice110, lattice010 ) ) );
+	fltx4 l2d11 = AddSIMD( lattice011, MulSIMD( xfrac, SubSIMD( lattice111, lattice011 ) ) );
+
+	// now, do y interpolation
+	fltx4 l1d0 = AddSIMD( l2d00, MulSIMD( yfrac, SubSIMD( l2d10, l2d00 ) ) );
+	fltx4 l1d1 = AddSIMD( l2d01, MulSIMD( yfrac, SubSIMD( l2d11, l2d01 ) ) );
+
+	// final z interpolation
+	fltx4 rslt = AddSIMD( l1d0, MulSIMD( zfrac, SubSIMD( l1d1, l1d0 ) ) );
+
+	// remap with 2*(rslt - 0.5): a 0..1 interpolated value becomes -1..1 (the old "map to 0..1" note was inaccurate)
+	return MulSIMD( Four_Twos, SubSIMD( rslt, Four_PointFives ) );
+
+
+}
+// FourVectors overload: forwards pos.x, pos.y and pos.z to the three-argument NoiseSIMD.
+fltx4 NoiseSIMD( FourVectors const &pos )
+{
+	return NoiseSIMD( pos.x, pos.y, pos.z );
+}
diff --git a/mp/src/mathlib/vector.cpp b/mp/src/mathlib/vector.cpp
index 420486d4..5cb72d49 100644
--- a/mp/src/mathlib/vector.cpp
+++ b/mp/src/mathlib/vector.cpp
@@ -1,12 +1,12 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-// $NoKeywords: $
-//
-//=============================================================================//
-
-#include "mathlib/vector.h"
-
-Vector vec3_origin(0,0,0);
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#include "mathlib/vector.h"
+// Global zero vector (0,0,0).
+Vector vec3_origin(0,0,0);
+
diff --git a/mp/src/mathlib/vmatrix.cpp b/mp/src/mathlib/vmatrix.cpp
index 77c0656f..1cd316f3 100644
--- a/mp/src/mathlib/vmatrix.cpp
+++ b/mp/src/mathlib/vmatrix.cpp
@@ -1,1273 +1,1273 @@
-//========= Copyright Valve Corporation, All rights reserved. ============//
-//
-// Purpose: 
-//
-// $NoKeywords: $
-//
-//=============================================================================//
-
-#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
-
-#include "basetypes.h"
-#include "mathlib/vmatrix.h"
-#include "mathlib/mathlib.h"
-#include <string.h>
-#include "mathlib/vector4d.h"
-#include "tier0/dbg.h"
-
-// memdbgon must be the last include file in a .cpp file!!!
-#include "tier0/memdbgon.h"
-
-#pragma warning (disable : 4700) // local variable 'x' used without having been initialized
-
-// ------------------------------------------------------------------------------------------- //
-// Helper functions.
-// ------------------------------------------------------------------------------------------- //
-
-#ifndef VECTOR_NO_SLOW_OPERATIONS
-
-VMatrix SetupMatrixIdentity()
-{
-	return VMatrix(
-		1.0f, 0.0f, 0.0f, 0.0f,
-		0.0f, 1.0f, 0.0f, 0.0f,
-		0.0f, 0.0f, 1.0f, 0.0f,
-		0.0f, 0.0f, 0.0f, 1.0f);
-}
-
-VMatrix SetupMatrixTranslation(const Vector &vTranslation)
-{
-	return VMatrix(
-		1.0f, 0.0f, 0.0f, vTranslation.x,
-		0.0f, 1.0f, 0.0f, vTranslation.y,
-		0.0f, 0.0f, 1.0f, vTranslation.z,
-		0.0f, 0.0f, 0.0f, 1.0f
-		);
-}
-
-VMatrix SetupMatrixScale(const Vector &vScale)
-{
-	return VMatrix(
-		vScale.x, 0.0f,     0.0f,     0.0f,
-		0.0f,     vScale.y, 0.0f,     0.0f,
-		0.0f,     0.0f,     vScale.z, 0.0f,
-		0.0f,     0.0f,     0.0f,     1.0f
-		);
-}
-
-VMatrix SetupMatrixReflection(const VPlane &thePlane)
-{
-	VMatrix mReflect, mBack, mForward;
-	Vector vOrigin, N;
-
-	N = thePlane.m_Normal;
-
-	mReflect.Init( 
-		-2.0f*N.x*N.x + 1.0f,	-2.0f*N.x*N.y,			-2.0f*N.x*N.z,			0.0f,
-		-2.0f*N.y*N.x,			-2.0f*N.y*N.y + 1.0f,	-2.0f*N.y*N.z,			0.0f,
-		-2.0f*N.z*N.x,			-2.0f*N.z*N.y,			-2.0f*N.z*N.z + 1.0f,	0.0f,
-		0.0f,					0.0f,					0.0f,					1.0f
-		);
-
-	vOrigin = thePlane.GetPointOnPlane();
-
-	mBack.Identity();
-	mBack.SetTranslation(-vOrigin);
-
-	mForward.Identity();
-	mForward.SetTranslation(vOrigin);
-
-	// (multiplied in reverse order, so it translates to the origin point,
-	// reflects, and translates back).
-	return mForward * mReflect * mBack;
-}
-
-VMatrix SetupMatrixProjection(const Vector &vOrigin, const VPlane &thePlane)
-{
-	vec_t dot;
-	VMatrix mRet;
-
-
-	#define PN thePlane.m_Normal
-	#define PD thePlane.m_Dist;
-
-		dot = PN[0]*vOrigin.x + PN[1]*vOrigin.y + PN[2]*vOrigin.z - PD;
-
-		mRet.m[0][0] = dot - vOrigin.x * PN[0];
-		mRet.m[0][1] = -vOrigin.x * PN[1];
-		mRet.m[0][2] = -vOrigin.x * PN[2];
-		mRet.m[0][3] = -vOrigin.x * -PD;
-
-		mRet.m[1][0] = -vOrigin.y * PN[0];
-		mRet.m[1][1] = dot - vOrigin.y * PN[1];
-		mRet.m[1][2] = -vOrigin.y * PN[2];
-		mRet.m[1][3] = -vOrigin.y * -PD;
-
-		mRet.m[2][0] = -vOrigin.z * PN[0];
-		mRet.m[2][1] = -vOrigin.z * PN[1];
-		mRet.m[2][2] = dot - vOrigin.z * PN[2];
-		mRet.m[2][3] = -vOrigin.z * -PD;
-
-		mRet.m[3][0] = -PN[0];
-		mRet.m[3][1] = -PN[1];
-		mRet.m[3][2] = -PN[2];
-		mRet.m[3][3] = dot + PD;
-
-	#undef PN
-	#undef PD	
-
-	return mRet;
-}
-
-VMatrix SetupMatrixAxisRot(const Vector &vAxis, vec_t fDegrees)
-{
-	vec_t s, c, t;
-	vec_t tx, ty, tz;
-	vec_t sx, sy, sz;
-	vec_t fRadians;
-
-
-	fRadians = fDegrees * (M_PI / 180.0f);
-	
-	s = (vec_t)sin(fRadians);
-	c = (vec_t)cos(fRadians);
-	t = 1.0f - c;
-
-	tx = t * vAxis.x;	ty = t * vAxis.y;	tz = t * vAxis.z;
-	sx = s * vAxis.x;	sy = s * vAxis.y;	sz = s * vAxis.z;
-
-	return VMatrix(
-		tx*vAxis.x + c,  tx*vAxis.y - sz, tx*vAxis.z + sy, 0.0f,
-		tx*vAxis.y + sz, ty*vAxis.y + c,  ty*vAxis.z - sx, 0.0f,
-		tx*vAxis.z - sy, ty*vAxis.z + sx, tz*vAxis.z + c,  0.0f,
-		0.0f, 0.0f, 0.0f, 1.0f);
-}
-
-VMatrix SetupMatrixAngles(const QAngle &vAngles)
-{
-	VMatrix mRet;
-	MatrixFromAngles( vAngles, mRet );
-	return mRet;
-}
-
-VMatrix SetupMatrixOrgAngles(const Vector &origin, const QAngle &vAngles)
-{
-	VMatrix mRet;
-	mRet.SetupMatrixOrgAngles( origin, vAngles );
-	return mRet;
-}
-
-#endif // VECTOR_NO_SLOW_OPERATIONS
-
-
-bool PlaneIntersection( const VPlane &vp1, const VPlane &vp2, const VPlane &vp3, Vector &vOut )
-{
-	VMatrix mMat, mInverse;
-
-	mMat.Init(
-		vp1.m_Normal.x, vp1.m_Normal.y, vp1.m_Normal.z, -vp1.m_Dist,
-		vp2.m_Normal.x, vp2.m_Normal.y, vp2.m_Normal.z, -vp2.m_Dist,
-		vp3.m_Normal.x, vp3.m_Normal.y, vp3.m_Normal.z, -vp3.m_Dist,
-		0.0f, 0.0f, 0.0f, 1.0f
-		);
-	
-	if(mMat.InverseGeneral(mInverse))
-	{
-		//vOut = mInverse * Vector(0.0f, 0.0f, 0.0f);
-		mInverse.GetTranslation( vOut );
-		return true;
-	}
-	else
-	{
-		return false;
-	}
-}
-
-
-
-// ------------------------------------------------------------------------------------------- //
-// VMatrix functions.
-// ------------------------------------------------------------------------------------------- //
-
-VMatrix& VMatrix::operator=(const VMatrix &mOther)
-{
-	m[0][0] = mOther.m[0][0];
-	m[0][1] = mOther.m[0][1];
-	m[0][2] = mOther.m[0][2];
-	m[0][3] = mOther.m[0][3];
-
-	m[1][0] = mOther.m[1][0];
-	m[1][1] = mOther.m[1][1];
-	m[1][2] = mOther.m[1][2];
-	m[1][3] = mOther.m[1][3];
-
-	m[2][0] = mOther.m[2][0];
-	m[2][1] = mOther.m[2][1];
-	m[2][2] = mOther.m[2][2];
-	m[2][3] = mOther.m[2][3];
-
-	m[3][0] = mOther.m[3][0];
-	m[3][1] = mOther.m[3][1];
-	m[3][2] = mOther.m[3][2];
-	m[3][3] = mOther.m[3][3];
-
-	return *this;
-}
-
-bool VMatrix::operator==( const VMatrix& src ) const
-{
-	return !memcmp( src.m, m, sizeof(m) );
-}
-
-void VMatrix::MatrixMul( const VMatrix &vm, VMatrix &out ) const
-{
-	out.Init(
-		m[0][0]*vm.m[0][0] + m[0][1]*vm.m[1][0] + m[0][2]*vm.m[2][0] + m[0][3]*vm.m[3][0],
-		m[0][0]*vm.m[0][1] + m[0][1]*vm.m[1][1] + m[0][2]*vm.m[2][1] + m[0][3]*vm.m[3][1],
-		m[0][0]*vm.m[0][2] + m[0][1]*vm.m[1][2] + m[0][2]*vm.m[2][2] + m[0][3]*vm.m[3][2],
-		m[0][0]*vm.m[0][3] + m[0][1]*vm.m[1][3] + m[0][2]*vm.m[2][3] + m[0][3]*vm.m[3][3],
-
-		m[1][0]*vm.m[0][0] + m[1][1]*vm.m[1][0] + m[1][2]*vm.m[2][0] + m[1][3]*vm.m[3][0],
-		m[1][0]*vm.m[0][1] + m[1][1]*vm.m[1][1] + m[1][2]*vm.m[2][1] + m[1][3]*vm.m[3][1],
-		m[1][0]*vm.m[0][2] + m[1][1]*vm.m[1][2] + m[1][2]*vm.m[2][2] + m[1][3]*vm.m[3][2],
-		m[1][0]*vm.m[0][3] + m[1][1]*vm.m[1][3] + m[1][2]*vm.m[2][3] + m[1][3]*vm.m[3][3],
-
-		m[2][0]*vm.m[0][0] + m[2][1]*vm.m[1][0] + m[2][2]*vm.m[2][0] + m[2][3]*vm.m[3][0],
-		m[2][0]*vm.m[0][1] + m[2][1]*vm.m[1][1] + m[2][2]*vm.m[2][1] + m[2][3]*vm.m[3][1],
-		m[2][0]*vm.m[0][2] + m[2][1]*vm.m[1][2] + m[2][2]*vm.m[2][2] + m[2][3]*vm.m[3][2],
-		m[2][0]*vm.m[0][3] + m[2][1]*vm.m[1][3] + m[2][2]*vm.m[2][3] + m[2][3]*vm.m[3][3],
-
-		m[3][0]*vm.m[0][0] + m[3][1]*vm.m[1][0] + m[3][2]*vm.m[2][0] + m[3][3]*vm.m[3][0],
-		m[3][0]*vm.m[0][1] + m[3][1]*vm.m[1][1] + m[3][2]*vm.m[2][1] + m[3][3]*vm.m[3][1],
-		m[3][0]*vm.m[0][2] + m[3][1]*vm.m[1][2] + m[3][2]*vm.m[2][2] + m[3][3]*vm.m[3][2],
-		m[3][0]*vm.m[0][3] + m[3][1]*vm.m[1][3] + m[3][2]*vm.m[2][3] + m[3][3]*vm.m[3][3]
-		);
-}
-
-#ifndef VECTOR_NO_SLOW_OPERATIONS
-
-VMatrix VMatrix::operator*(const VMatrix &vm) const
-{
-	VMatrix ret;
-	MatrixMul( vm, ret );
-	return ret;
-}
-
-#endif
-
-bool VMatrix::InverseGeneral(VMatrix &vInverse) const
-{
-	return MatrixInverseGeneral( *this, vInverse );
-}
-
-
-bool MatrixInverseGeneral(const VMatrix& src, VMatrix& dst)
-{
-	int iRow, i, j, iTemp, iTest;
-	vec_t mul, fTest, fLargest;
-	vec_t mat[4][8];
-	int rowMap[4], iLargest;
-	vec_t *pOut, *pRow, *pScaleRow;
-
-
-	// How it's done.
-	// AX = I
-	// A = this
-	// X = the matrix we're looking for
-	// I = identity
-
-	// Setup AI
-	for(i=0; i < 4; i++)
-	{
-		const vec_t *pIn = src[i];
-		pOut = mat[i];
-
-		for(j=0; j < 4; j++)
-		{
-			pOut[j] = pIn[j];
-		}
-
-		pOut[4] = 0.0f;
-		pOut[5] = 0.0f;
-		pOut[6] = 0.0f;
-		pOut[7] = 0.0f;
-		pOut[i+4] = 1.0f;
-
-		rowMap[i] = i;
-	}
-
-	// Use row operations to get to reduced row-echelon form using these rules:
-	// 1. Multiply or divide a row by a nonzero number.
-	// 2. Add a multiple of one row to another.
-	// 3. Interchange two rows.
-
-	for(iRow=0; iRow < 4; iRow++)
-	{
-		// Find the row with the largest element in this column.
-		fLargest = 0.00001f;
-		iLargest = -1;
-		for(iTest=iRow; iTest < 4; iTest++)
-		{
-			fTest = (vec_t)FloatMakePositive(mat[rowMap[iTest]][iRow]);
-			if(fTest > fLargest)
-			{
-				iLargest = iTest;
-				fLargest = fTest;
-			}
-		}
-
-		// They're all too small.. sorry.
-		if(iLargest == -1)
-		{
-			return false;
-		}
-
-		// Swap the rows.
-		iTemp = rowMap[iLargest];
-		rowMap[iLargest] = rowMap[iRow];
-		rowMap[iRow] = iTemp;
-
-		pRow = mat[rowMap[iRow]];
-
-		// Divide this row by the element.
-		mul = 1.0f / pRow[iRow];
-		for(j=0; j < 8; j++)
-			pRow[j] *= mul;
-
-		pRow[iRow] = 1.0f; // Preserve accuracy...
-		
-		// Eliminate this element from the other rows using operation 2.
-		for(i=0; i < 4; i++)
-		{
-			if(i == iRow)
-				continue;
-
-			pScaleRow = mat[rowMap[i]];
-		
-			// Multiply this row by -(iRow*the element).
-			mul = -pScaleRow[iRow];
-			for(j=0; j < 8; j++)
-			{
-				pScaleRow[j] += pRow[j] * mul;
-			}
-
-			pScaleRow[iRow] = 0.0f; // Preserve accuracy...
-		}
-	}
-
-	// The inverse is on the right side of AX now (the identity is on the left).
-	for(i=0; i < 4; i++)
-	{
-		const vec_t *pIn = mat[rowMap[i]] + 4;
-		pOut = dst.m[i];
-
-		for(j=0; j < 4; j++)
-		{
-			pOut[j] = pIn[j];
-		}
-	}
-
-	return true;
-}
-
-
-//-----------------------------------------------------------------------------
-// Does a fast inverse, assuming the matrix only contains translation and rotation.
-//-----------------------------------------------------------------------------
-void MatrixInverseTR( const VMatrix& src, VMatrix &dst )
-{
-	Vector vTrans, vNewTrans;
-
-	// Transpose the upper 3x3.
-	dst.m[0][0] = src.m[0][0];  dst.m[0][1] = src.m[1][0]; dst.m[0][2] = src.m[2][0];
-	dst.m[1][0] = src.m[0][1];  dst.m[1][1] = src.m[1][1]; dst.m[1][2] = src.m[2][1];
-	dst.m[2][0] = src.m[0][2];  dst.m[2][1] = src.m[1][2]; dst.m[2][2] = src.m[2][2];
-
-	// Transform the translation.
-	vTrans.Init( -src.m[0][3], -src.m[1][3], -src.m[2][3] );
-	Vector3DMultiply( dst, vTrans, vNewTrans );
-	MatrixSetColumn( dst, 3, vNewTrans );
-
-	// Fill in the bottom row.
-	dst.m[3][0] = dst.m[3][1] = dst.m[3][2] = 0.0f;
-	dst.m[3][3] = 1.0f;
-}
-
-
-void VMatrix::InverseTR( VMatrix &ret ) const
-{
-	MatrixInverseTR( *this, ret );
-}
-
-void MatrixInverseTranspose( const VMatrix& src, VMatrix& dst )
-{
-	src.InverseGeneral( dst );
-	MatrixTranspose( dst, dst );
-}
-
-//-----------------------------------------------------------------------------
-// Computes the inverse transpose
-//-----------------------------------------------------------------------------
-void MatrixInverseTranspose( const matrix3x4_t& src, matrix3x4_t& dst )
-{
-	VMatrix tmp, out;
-	tmp.CopyFrom3x4( src );
-	::MatrixInverseTranspose( tmp, out );
-	out.Set3x4( dst );
-}
-
-
-#ifndef VECTOR_NO_SLOW_OPERATIONS
-
-VMatrix VMatrix::InverseTR() const
-{
-	VMatrix ret;
-	MatrixInverseTR( *this, ret );
-	return ret;
-}
-
-Vector VMatrix::GetScale() const
-{
-	Vector vecs[3];
-
-	GetBasisVectors(vecs[0], vecs[1], vecs[2]);
-
-	return Vector(
-		vecs[0].Length(),
-		vecs[1].Length(),
-		vecs[2].Length()
-		);
-}
-
-VMatrix VMatrix::Scale(const Vector &vScale)
-{
-	return VMatrix(
-		m[0][0]*vScale.x, m[0][1]*vScale.y, m[0][2]*vScale.z, m[0][3],
-		m[1][0]*vScale.x, m[1][1]*vScale.y, m[1][2]*vScale.z, m[1][3],
-		m[2][0]*vScale.x, m[2][1]*vScale.y, m[2][2]*vScale.z, m[2][3],
-		m[3][0]*vScale.x, m[3][1]*vScale.y, m[3][2]*vScale.z, 1.0f
-		);
-}
-
-VMatrix VMatrix::NormalizeBasisVectors() const
-{
-	Vector vecs[3];
-	VMatrix mRet;
-
-
-	GetBasisVectors(vecs[0], vecs[1], vecs[2]);
-	
-	VectorNormalize( vecs[0] );
-	VectorNormalize( vecs[1] );
-	VectorNormalize( vecs[2] );
-
-	mRet.SetBasisVectors(vecs[0], vecs[1], vecs[2]);
-	
-	// Set everything but basis vectors to identity.
-	mRet.m[3][0] = mRet.m[3][1] = mRet.m[3][2] = 0.0f;
-	mRet.m[3][3] = 1.0f;
-
-	return mRet;
-}
-
-VMatrix VMatrix::Transpose() const
-{
-	return VMatrix(
-		m[0][0], m[1][0], m[2][0], m[3][0],
-		m[0][1], m[1][1], m[2][1], m[3][1],
-		m[0][2], m[1][2], m[2][2], m[3][2],
-		m[0][3], m[1][3], m[2][3], m[3][3]);
-}
-
-// Transpose upper-left 3x3.
-VMatrix VMatrix::Transpose3x3() const
-{
-	return VMatrix(
-		m[0][0], m[1][0], m[2][0], m[0][3],
-		m[0][1], m[1][1], m[2][1], m[1][3],
-		m[0][2], m[1][2], m[2][2], m[2][3],
-		m[3][0], m[3][1], m[3][2], m[3][3]);
-}
-
-#endif // VECTOR_NO_SLOW_OPERATIONS
-
-
-bool VMatrix::IsRotationMatrix() const
-{
-	Vector &v1 = (Vector&)m[0][0];
-	Vector &v2 = (Vector&)m[1][0];
-	Vector &v3 = (Vector&)m[2][0];
-
-	return 
-		FloatMakePositive( 1 - v1.Length() ) < 0.01f && 
-		FloatMakePositive( 1 - v2.Length() ) < 0.01f && 
-		FloatMakePositive( 1 - v3.Length() ) < 0.01f && 
-		FloatMakePositive( v1.Dot(v2) ) < 0.01f &&
-		FloatMakePositive( v1.Dot(v3) ) < 0.01f &&
-		FloatMakePositive( v2.Dot(v3) ) < 0.01f;
-}
-
-void VMatrix::SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles )
-{
-	float		sr, sp, sy, cr, cp, cy;
-
-	SinCos( DEG2RAD( vAngles[YAW] ), &sy, &cy );
-	SinCos( DEG2RAD( vAngles[PITCH] ), &sp, &cp );
-	SinCos( DEG2RAD( vAngles[ROLL] ), &sr, &cr );
-
-	// matrix = (YAW * PITCH) * ROLL
-	m[0][0] = cp*cy;
-	m[1][0] = cp*sy;
-	m[2][0] = -sp;
-	m[0][1] = sr*sp*cy+cr*-sy;
-	m[1][1] = sr*sp*sy+cr*cy;
-	m[2][1] = sr*cp;
-	m[0][2] = (cr*sp*cy+-sr*-sy);
-	m[1][2] = (cr*sp*sy+-sr*cy);
-	m[2][2] = cr*cp;
-	m[0][3] = 0.f;
-	m[1][3] = 0.f;
-	m[2][3] = 0.f;
-	
-	// Add translation
-	m[0][3] = origin.x;
-	m[1][3] = origin.y;
-	m[2][3] = origin.z;
-	m[3][0] = 0.0f;
-	m[3][1] = 0.0f;
-	m[3][2] = 0.0f;
-	m[3][3] = 1.0f;
-}
-
-
-//-----------------------------------------------------------------------------
-// Sets matrix to identity
-//-----------------------------------------------------------------------------
-void MatrixSetIdentity( VMatrix &dst )
-{
-	dst[0][0] = 1.0f; dst[0][1] = 0.0f; dst[0][2] = 0.0f; dst[0][3] = 0.0f;
-	dst[1][0] = 0.0f; dst[1][1] = 1.0f; dst[1][2] = 0.0f; dst[1][3] = 0.0f;
-	dst[2][0] = 0.0f; dst[2][1] = 0.0f; dst[2][2] = 1.0f; dst[2][3] = 0.0f;
-	dst[3][0] = 0.0f; dst[3][1] = 0.0f; dst[3][2] = 0.0f; dst[3][3] = 1.0f;
-}
-
-
-//-----------------------------------------------------------------------------
-// Setup a matrix from euler angles. 
-//-----------------------------------------------------------------------------
-void MatrixFromAngles( const QAngle& vAngles, VMatrix& dst )
-{
-	dst.SetupMatrixOrgAngles( vec3_origin, vAngles );
-}
-
-
-//-----------------------------------------------------------------------------
-// Creates euler angles from a matrix 
-//-----------------------------------------------------------------------------
-void MatrixToAngles( const VMatrix& src, QAngle& vAngles )
-{
-	float forward[3];
-	float left[3];
-	float up[3];
-
-	// Extract the basis vectors from the matrix. Since we only need the Z
-	// component of the up vector, we don't get X and Y.
-	forward[0] = src[0][0];
-	forward[1] = src[1][0];
-	forward[2] = src[2][0];
-	left[0] = src[0][1];
-	left[1] = src[1][1];
-	left[2] = src[2][1];
-	up[2] = src[2][2];
-
-	float xyDist = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );
-	
-	// enough here to get angles?
-	if ( xyDist > 0.001f )
-	{
-		// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
-		vAngles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );
-
-		// The engine does pitch inverted from this, but we always end up negating it in the DLL
-		// UNDONE: Fix the engine to make it consistent
-		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
-		vAngles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
-
-		// (roll)	z = ATAN( left.z, up.z );
-		vAngles[2] = RAD2DEG( atan2f( left[2], up[2] ) );
-	}
-	else	// forward is mostly Z, gimbal lock-
-	{
-		// (yaw)	y = ATAN( -left.x, left.y );			-- forward is mostly z, so use right for yaw
-		vAngles[1] = RAD2DEG( atan2f( -left[0], left[1] ) );
-
-		// The engine does pitch inverted from this, but we always end up negating it in the DLL
-		// UNDONE: Fix the engine to make it consistent
-		// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
-		vAngles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
-
-		// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
-		vAngles[2] = 0;
-	}
-}
-
-
-//-----------------------------------------------------------------------------
-// Transpose
-//-----------------------------------------------------------------------------
-inline void Swap( float& a, float& b )
-{
-	float tmp = a;
-	a = b;
-	b = tmp;
-}
-
-void MatrixTranspose( const VMatrix& src, VMatrix& dst )
-{
-	if (&src == &dst)
-	{
-		Swap( dst[0][1], dst[1][0] );
-		Swap( dst[0][2], dst[2][0] );
-		Swap( dst[0][3], dst[3][0] );
-		Swap( dst[1][2], dst[2][1] );
-		Swap( dst[1][3], dst[3][1] );
-		Swap( dst[2][3], dst[3][2] );
-	}
-	else
-	{
-		dst[0][0] = src[0][0]; dst[0][1] = src[1][0]; dst[0][2] = src[2][0]; dst[0][3] = src[3][0];
-		dst[1][0] = src[0][1]; dst[1][1] = src[1][1]; dst[1][2] = src[2][1]; dst[1][3] = src[3][1];
-		dst[2][0] = src[0][2]; dst[2][1] = src[1][2]; dst[2][2] = src[2][2]; dst[2][3] = src[3][2];
-		dst[3][0] = src[0][3]; dst[3][1] = src[1][3]; dst[3][2] = src[2][3]; dst[3][3] = src[3][3];
-	}
-}
-
-
-//-----------------------------------------------------------------------------
-// Matrix copy
-//-----------------------------------------------------------------------------
-
-void MatrixCopy( const VMatrix& src, VMatrix& dst )
-{
-	if (&src != &dst)
-	{
-		memcpy( dst.m, src.m, 16 * sizeof(float) );
-	}
-}
-
-//-----------------------------------------------------------------------------
-// Matrix multiply
-//-----------------------------------------------------------------------------
-typedef float VMatrixRaw_t[4];
-
-void MatrixMultiply( const VMatrix& src1, const VMatrix& src2, VMatrix& dst )
-{
-	// Make sure it works if src1 == dst or src2 == dst
-	VMatrix tmp1, tmp2;
-	const VMatrixRaw_t* s1 = (&src1 == &dst) ? tmp1.m : src1.m;
-	const VMatrixRaw_t* s2 = (&src2 == &dst) ? tmp2.m : src2.m;
-
-	if (&src1 == &dst)
-	{
-		MatrixCopy( src1, tmp1 );
-	}
-	if (&src2 == &dst)
-	{
-		MatrixCopy( src2, tmp2 );
-	}
-
-	dst[0][0] = s1[0][0] * s2[0][0] + s1[0][1] * s2[1][0] + s1[0][2] * s2[2][0] + s1[0][3] * s2[3][0];
-	dst[0][1] = s1[0][0] * s2[0][1] + s1[0][1] * s2[1][1] + s1[0][2] * s2[2][1] + s1[0][3] * s2[3][1];
-	dst[0][2] = s1[0][0] * s2[0][2] + s1[0][1] * s2[1][2] + s1[0][2] * s2[2][2] + s1[0][3] * s2[3][2];
-	dst[0][3] = s1[0][0] * s2[0][3] + s1[0][1] * s2[1][3] + s1[0][2] * s2[2][3] + s1[0][3] * s2[3][3];
-
-	dst[1][0] = s1[1][0] * s2[0][0] + s1[1][1] * s2[1][0] + s1[1][2] * s2[2][0] + s1[1][3] * s2[3][0];
-	dst[1][1] = s1[1][0] * s2[0][1] + s1[1][1] * s2[1][1] + s1[1][2] * s2[2][1] + s1[1][3] * s2[3][1];
-	dst[1][2] = s1[1][0] * s2[0][2] + s1[1][1] * s2[1][2] + s1[1][2] * s2[2][2] + s1[1][3] * s2[3][2];
-	dst[1][3] = s1[1][0] * s2[0][3] + s1[1][1] * s2[1][3] + s1[1][2] * s2[2][3] + s1[1][3] * s2[3][3];
-
-	dst[2][0] = s1[2][0] * s2[0][0] + s1[2][1] * s2[1][0] + s1[2][2] * s2[2][0] + s1[2][3] * s2[3][0];
-	dst[2][1] = s1[2][0] * s2[0][1] + s1[2][1] * s2[1][1] + s1[2][2] * s2[2][1] + s1[2][3] * s2[3][1];
-	dst[2][2] = s1[2][0] * s2[0][2] + s1[2][1] * s2[1][2] + s1[2][2] * s2[2][2] + s1[2][3] * s2[3][2];
-	dst[2][3] = s1[2][0] * s2[0][3] + s1[2][1] * s2[1][3] + s1[2][2] * s2[2][3] + s1[2][3] * s2[3][3];
-
-	dst[3][0] = s1[3][0] * s2[0][0] + s1[3][1] * s2[1][0] + s1[3][2] * s2[2][0] + s1[3][3] * s2[3][0];
-	dst[3][1] = s1[3][0] * s2[0][1] + s1[3][1] * s2[1][1] + s1[3][2] * s2[2][1] + s1[3][3] * s2[3][1];
-	dst[3][2] = s1[3][0] * s2[0][2] + s1[3][1] * s2[1][2] + s1[3][2] * s2[2][2] + s1[3][3] * s2[3][2];
-	dst[3][3] = s1[3][0] * s2[0][3] + s1[3][1] * s2[1][3] + s1[3][2] * s2[2][3] + s1[3][3] * s2[3][3];
-}
-
-//-----------------------------------------------------------------------------
-// Matrix/vector multiply
-//-----------------------------------------------------------------------------
-
-void Vector4DMultiply( const VMatrix& src1, Vector4D const& src2, Vector4D& dst )
-{
-	// Make sure it works if src2 == dst
-	Vector4D tmp;
-	Vector4D const&v = (&src2 == &dst) ? tmp : src2;
-
-	if (&src2 == &dst)
-	{
-		Vector4DCopy( src2, tmp );
-	}
-
-	dst[0] = src1[0][0] * v[0] + src1[0][1] * v[1] + src1[0][2] * v[2] + src1[0][3] * v[3];
-	dst[1] = src1[1][0] * v[0] + src1[1][1] * v[1] + src1[1][2] * v[2] + src1[1][3] * v[3];
-	dst[2] = src1[2][0] * v[0] + src1[2][1] * v[1] + src1[2][2] * v[2] + src1[2][3] * v[3];
-	dst[3] = src1[3][0] * v[0] + src1[3][1] * v[1] + src1[3][2] * v[2] + src1[3][3] * v[3];
-}
-
-//-----------------------------------------------------------------------------
-// Matrix/vector multiply
-//-----------------------------------------------------------------------------
-
-void Vector4DMultiplyPosition( const VMatrix& src1, Vector const& src2, Vector4D& dst )
-{
-	// Make sure it works if src2 == dst
-	Vector tmp;
-	Vector const&v = ( &src2 == &dst.AsVector3D() ) ? static_cast<const Vector&>(tmp) : src2;
-
-	if (&src2 == &dst.AsVector3D())
-	{
-		VectorCopy( src2, tmp );
-	}
-
-	dst[0] = src1[0][0] * v[0] + src1[0][1] * v[1] + src1[0][2] * v[2] + src1[0][3];
-	dst[1] = src1[1][0] * v[0] + src1[1][1] * v[1] + src1[1][2] * v[2] + src1[1][3];
-	dst[2] = src1[2][0] * v[0] + src1[2][1] * v[1] + src1[2][2] * v[2] + src1[2][3];
-	dst[3] = src1[3][0] * v[0] + src1[3][1] * v[1] + src1[3][2] * v[2] + src1[3][3];
-}
-
-
-
-//-----------------------------------------------------------------------------
-// Matrix/vector multiply
-//-----------------------------------------------------------------------------
-
-void Vector3DMultiply( const VMatrix &src1, const Vector &src2, Vector &dst )
-{
-	// Make sure it works if src2 == dst
-	Vector tmp;
-	const Vector &v = (&src2 == &dst) ?  static_cast<const Vector&>(tmp) : src2;
-
-	if( &src2 == &dst )
-	{
-		VectorCopy( src2, tmp );
-	}
-
-	dst[0] = src1[0][0] * v[0] + src1[0][1] * v[1] + src1[0][2] * v[2];
-	dst[1] = src1[1][0] * v[0] + src1[1][1] * v[1] + src1[1][2] * v[2];
-	dst[2] = src1[2][0] * v[0] + src1[2][1] * v[1] + src1[2][2] * v[2];
-}
-
-
-//-----------------------------------------------------------------------------
-// Vector3DMultiplyPositionProjective treats src2 as if it's a point 
-// and does the perspective divide at the end
-//-----------------------------------------------------------------------------
-void Vector3DMultiplyPositionProjective( const VMatrix& src1, const Vector &src2, Vector& dst )
-{
-	// Make sure it works if src2 == dst
-	Vector tmp;
-	const Vector &v = (&src2 == &dst) ? static_cast<const Vector&>(tmp): src2;
-	if( &src2 == &dst )
-	{
-		VectorCopy( src2, tmp );
-	}
-
-	float w = src1[3][0] * v[0] + src1[3][1] * v[1] + src1[3][2] * v[2] + src1[3][3];
-	if ( w != 0.0f ) 
-	{
-		w = 1.0f / w;
-	}
-
-	dst[0] = src1[0][0] * v[0] + src1[0][1] * v[1] + src1[0][2] * v[2] + src1[0][3];
-	dst[1] = src1[1][0] * v[0] + src1[1][1] * v[1] + src1[1][2] * v[2] + src1[1][3];
-	dst[2] = src1[2][0] * v[0] + src1[2][1] * v[1] + src1[2][2] * v[2] + src1[2][3];
-	dst *= w;
-}
-
-
-//-----------------------------------------------------------------------------
-// Vector3DMultiplyProjective treats src2 as if it's a direction 
-// and does the perspective divide at the end
-//-----------------------------------------------------------------------------
-void Vector3DMultiplyProjective( const VMatrix& src1, const Vector &src2, Vector& dst )
-{
-	// Make sure it works if src2 == dst
-	Vector tmp;
-	const Vector &v = (&src2 == &dst) ? static_cast<const Vector&>(tmp) : src2;
-	if( &src2 == &dst )
-	{
-		VectorCopy( src2, tmp );
-	}
-
-	float w;
-	dst[0] = src1[0][0] * v[0] + src1[0][1] * v[1] + src1[0][2] * v[2];
-	dst[1] = src1[1][0] * v[0] + src1[1][1] * v[1] + src1[1][2] * v[2];
-	dst[2] = src1[2][0] * v[0] + src1[2][1] * v[1] + src1[2][2] * v[2];
-	w = src1[3][0] * v[0] + src1[3][1] * v[1] + src1[3][2] * v[2];
-	if (w != 0.0f)
-	{
-		dst /= w;
-	}
-	else
-	{
-		dst = vec3_origin;
-	}
-}
-
-
-//-----------------------------------------------------------------------------
-// Multiplies the vector by the transpose of the matrix
-//-----------------------------------------------------------------------------
-void Vector4DMultiplyTranspose( const VMatrix& src1, Vector4D const& src2, Vector4D& dst )
-{
-	// Make sure it works if src2 == dst
-	bool srcEqualsDst = (&src2 == &dst);
-
-	Vector4D tmp;
-	Vector4D const&v = srcEqualsDst ? tmp : src2;
-
-	if (srcEqualsDst)
-	{
-		Vector4DCopy( src2, tmp );
-	}
-
-	dst[0] = src1[0][0] * v[0] + src1[1][0] * v[1] + src1[2][0] * v[2] + src1[3][0] * v[3];
-	dst[1] = src1[0][1] * v[0] + src1[1][1] * v[1] + src1[2][1] * v[2] + src1[3][1] * v[3];
-	dst[2] = src1[0][2] * v[0] + src1[1][2] * v[1] + src1[2][2] * v[2] + src1[3][2] * v[3];
-	dst[3] = src1[0][3] * v[0] + src1[1][3] * v[1] + src1[2][3] * v[2] + src1[3][3] * v[3];
-}
-
-//-----------------------------------------------------------------------------
-// Multiplies the vector by the transpose of the matrix
-//-----------------------------------------------------------------------------
-void Vector3DMultiplyTranspose( const VMatrix& src1, const Vector& src2, Vector& dst )
-{
-	// Make sure it works if src2 == dst
-	bool srcEqualsDst = (&src2 == &dst);
-
-	Vector tmp;
-	const Vector&v = srcEqualsDst ? static_cast<const Vector&>(tmp) : src2;
-
-	if (srcEqualsDst)
-	{
-		VectorCopy( src2, tmp );
-	}
-
-	dst[0] = src1[0][0] * v[0] + src1[1][0] * v[1] + src1[2][0] * v[2];
-	dst[1] = src1[0][1] * v[0] + src1[1][1] * v[1] + src1[2][1] * v[2];
-	dst[2] = src1[0][2] * v[0] + src1[1][2] * v[1] + src1[2][2] * v[2];
-}
-
-
-//-----------------------------------------------------------------------------
-// Transform a plane
-//-----------------------------------------------------------------------------
-void MatrixTransformPlane( const VMatrix &src, const cplane_t &inPlane, cplane_t &outPlane )
-{
-	// What we want to do is the following:
-	// 1) transform the normal into the new space.
-	// 2) Determine a point on the old plane given by plane dist * plane normal
-	// 3) Transform that point into the new space
-	// 4) Plane dist = DotProduct( new normal, new point )
-
-	// An optimized version, which works if the plane is orthogonal.
-	// 1) Transform the normal into the new space
-	// 2) Realize that transforming the old plane point into the new space
-	// is given by [ d * n'x + Tx, d * n'y + Ty, d * n'z + Tz ]
-	// where d = old plane dist, n' = transformed normal, Tn = translational component of transform
-	// 3) Compute the new plane dist using the dot product of the normal result of #2
-
-	// For a correct result, this should be an inverse-transpose matrix
-	// but that only matters if there are nonuniform scale or skew factors in this matrix.
-	Vector vTrans;
-	Vector3DMultiply( src, inPlane.normal, outPlane.normal );
-	outPlane.dist = inPlane.dist * DotProduct( outPlane.normal, outPlane.normal );
-	outPlane.dist += DotProduct( outPlane.normal, src.GetTranslation(vTrans) );
-}
-
-
-#ifndef VECTOR_NO_SLOW_OPERATIONS
-
-VPlane VMatrix::operator*(const VPlane &thePlane) const
-{
-	VPlane ret;
-	TransformPlane( thePlane, ret );
-	return ret;
-}
-
-#endif
-
-
-//-----------------------------------------------------------------------------
-// Builds a rotation matrix that rotates one direction vector into another
-//-----------------------------------------------------------------------------
-void MatrixBuildTranslation( VMatrix& dst, float x, float y, float z )
-{
-	MatrixSetIdentity( dst );
-	dst[0][3] = x;
-	dst[1][3] = y;
-	dst[2][3] = z;
-}
-
-void MatrixBuildTranslation( VMatrix& dst, const Vector &translation )
-{
-	MatrixSetIdentity( dst );
-	dst[0][3] = translation[0];
-	dst[1][3] = translation[1];
-	dst[2][3] = translation[2];
-}
-
-
-//-----------------------------------------------------------------------------
-// Purpose: Builds the matrix for a counterclockwise rotation about an arbitrary axis.
-//
-//		   | ax2 + (1 - ax2)cosQ		axay(1 - cosQ) - azsinQ		azax(1 - cosQ) + aysinQ |
-// Ra(Q) = | axay(1 - cosQ) + azsinQ	ay2 + (1 - ay2)cosQ			ayaz(1 - cosQ) - axsinQ |
-//		   | azax(1 - cosQ) - aysinQ	ayaz(1 - cosQ) + axsinQ		az2 + (1 - az2)cosQ     |
-//          
-// Input  : mat - 
-//			vAxisOrRot - 
-//			angle - 
-//-----------------------------------------------------------------------------
-void MatrixBuildRotationAboutAxis( VMatrix &dst, const Vector &vAxisOfRot, float angleDegrees )
-{
-	MatrixBuildRotationAboutAxis( vAxisOfRot, angleDegrees, dst.As3x4() );
-	dst[3][0] = 0;
-	dst[3][1] = 0;
-	dst[3][2] = 0;
-	dst[3][3] = 1;
-}
-
-
-//-----------------------------------------------------------------------------
-// Builds a rotation matrix that rotates one direction vector into another
-//-----------------------------------------------------------------------------
-void MatrixBuildRotation( VMatrix &dst, const Vector& initialDirection, const Vector& finalDirection )
-{
-	float angle = DotProduct( initialDirection, finalDirection );
-	Assert( IsFinite(angle) );
-	
-	Vector axis;
-
-	// No rotation required
-	if (angle - 1.0 > -1e-3)
-	{
-		// parallel case
-		MatrixSetIdentity(dst);
-		return;
-	}
-	else if (angle + 1.0 < 1e-3)
-	{
-		// antiparallel case, pick any axis in the plane
-		// perpendicular to the final direction. Choose the direction (x,y,z)
-		// which has the minimum component of the final direction, use that
-		// as an initial guess, then subtract out the component which is 
-		// parallel to the final direction
-		int idx = 0;
-		if (FloatMakePositive(finalDirection[1]) < FloatMakePositive(finalDirection[idx]))
-			idx = 1;
-		if (FloatMakePositive(finalDirection[2]) < FloatMakePositive(finalDirection[idx]))
-			idx = 2;
-
-		axis.Init( 0, 0, 0 );
-		axis[idx] = 1.0f;
-		VectorMA( axis, -DotProduct( axis, finalDirection ), finalDirection, axis );
-		VectorNormalize(axis);
-		angle = 180.0f;
-	}
-	else
-	{
-		CrossProduct( initialDirection, finalDirection, axis );
-		VectorNormalize( axis );
-		angle = acos(angle) * 180 / M_PI;
-	}
-
-	MatrixBuildRotationAboutAxis( dst, axis, angle );
-
-#ifdef _DEBUG
-	Vector test;
-	Vector3DMultiply( dst, initialDirection, test );
-	test -= finalDirection;
-	Assert( test.LengthSqr() < 1e-3 );
-#endif
-}
-
-//-----------------------------------------------------------------------------
-//-----------------------------------------------------------------------------
-void MatrixBuildRotateZ( VMatrix &dst, float angleDegrees )
-{
-	float radians = angleDegrees * ( M_PI / 180.0f );
-
-	float fSin = ( float )sin( radians );
-	float fCos = ( float )cos( radians );
-
-	dst[0][0] = fCos; dst[0][1] = -fSin; dst[0][2] = 0.0f; dst[0][3] = 0.0f;
-	dst[1][0] = fSin; dst[1][1] =  fCos; dst[1][2] = 0.0f; dst[1][3] = 0.0f;
-	dst[2][0] = 0.0f; dst[2][1] =  0.0f; dst[2][2] = 1.0f; dst[2][3] = 0.0f;
-	dst[3][0] = 0.0f; dst[3][1] =  0.0f; dst[3][2] = 0.0f; dst[3][3] = 1.0f;
-}
-
-// Builds a scale matrix
-void MatrixBuildScale( VMatrix &dst, float x, float y, float z )
-{
-	dst[0][0] = x;		dst[0][1] = 0.0f;	dst[0][2] = 0.0f;	dst[0][3] = 0.0f;
-	dst[1][0] = 0.0f;	dst[1][1] = y;		dst[1][2] = 0.0f;	dst[1][3] = 0.0f;
-	dst[2][0] = 0.0f;	dst[2][1] = 0.0f;	dst[2][2] = z;		dst[2][3] = 0.0f;
-	dst[3][0] = 0.0f;	dst[3][1] = 0.0f;	dst[3][2] = 0.0f;	dst[3][3] = 1.0f;
-}
-
-void MatrixBuildScale( VMatrix &dst, const Vector& scale )
-{
-	MatrixBuildScale( dst, scale.x, scale.y, scale.z );
-}
-
-void MatrixBuildPerspective( VMatrix &dst, float fovX, float fovY, float zNear, float zFar )
-{
-	// FIXME: collapse all of this into one matrix after we figure out what all should be in here.
-	float width = 2 * zNear * tan( fovX * ( M_PI/180.0f ) * 0.5f );
-	float height = 2 * zNear * tan( fovY * ( M_PI/180.0f ) * 0.5f );
-
-	memset( dst.Base(), 0, sizeof( dst ) );
-	dst[0][0]  = 2.0F * zNear / width;
-	dst[1][1]  = 2.0F * zNear / height;
-	dst[2][2] = -zFar / ( zNear - zFar );
-	dst[3][2] = 1.0f;
-	dst[2][3] = zNear * zFar / ( zNear - zFar );
-
-	// negate X and Y so that X points right, and Y points up.
-	VMatrix negateXY;
-	negateXY.Identity();
-	negateXY[0][0] = -1.0f;
-	negateXY[1][1] = -1.0f;
-	MatrixMultiply( negateXY, dst, dst );
-	
-	VMatrix addW;
-	addW.Identity();
-	addW[0][3] = 1.0f;
-	addW[1][3] = 1.0f;
-	addW[2][3] = 0.0f;
-	MatrixMultiply( addW, dst, dst );
-	
-	VMatrix scaleHalf;
-	scaleHalf.Identity();
-	scaleHalf[0][0] = 0.5f;
-	scaleHalf[1][1] = 0.5f;
-	MatrixMultiply( scaleHalf, dst, dst );
-}
-
-static inline void CalculateAABBForNormalizedFrustum_Helper( float x, float y, float z, const VMatrix &volumeToWorld, Vector &mins, Vector &maxs )
-{
-	Vector volumeSpacePos( x, y, z );
-
-	// Make sure it's been clipped
-	Assert( volumeSpacePos[0] >= -1e-3f );
-	Assert( volumeSpacePos[0] - 1.0f <= 1e-3f );
-	Assert( volumeSpacePos[1] >= -1e-3f );
-	Assert( volumeSpacePos[1] - 1.0f <= 1e-3f );
-	Assert( volumeSpacePos[2] >= -1e-3f );
-	Assert( volumeSpacePos[2] - 1.0f <= 1e-3f );
-
-	Vector worldPos;
-	Vector3DMultiplyPositionProjective( volumeToWorld, volumeSpacePos, worldPos );
-	AddPointToBounds( worldPos, mins, maxs );
-}
-
-//-----------------------------------------------------------------------------
-// Given an inverse projection matrix, take the extremes of the space in transformed into world space and
-// get a bounding box.
-//-----------------------------------------------------------------------------
-void CalculateAABBFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pMins, Vector *pMaxs )
-{
-	// FIXME: Could maybe do better than the compile with all of these multiplies by 0 and 1.
-	ClearBounds( *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 0, 0, 0, volumeToWorld, *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 0, 0, 1, volumeToWorld, *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 0, 1, 0, volumeToWorld, *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 0, 1, 1, volumeToWorld, *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 1, 0, 0, volumeToWorld, *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 1, 0, 1, volumeToWorld, *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 1, 1, 0, volumeToWorld, *pMins, *pMaxs );
-	CalculateAABBForNormalizedFrustum_Helper( 1, 1, 1, volumeToWorld, *pMins, *pMaxs );
-}
-
-void CalculateAABBFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pMins, Vector *pMaxs )
-{
-	VMatrix volumeToWorld;
-	MatrixInverseGeneral( worldToVolume, volumeToWorld );
-	CalculateAABBFromProjectionMatrixInverse( volumeToWorld, pMins, pMaxs );
-}
-
-//-----------------------------------------------------------------------------
-// Given an inverse projection matrix, take the extremes of the space in transformed into world space and
-// get a bounding sphere.
-//-----------------------------------------------------------------------------
-void CalculateSphereFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pCenter, float *pflRadius )
-{
-	// FIXME: Could maybe do better than the compile with all of these multiplies by 0 and 1.
-
-	// Need 3 points: the endpoint of the line through the center of the near + far planes,
-	// and one point on the far plane. From that, we can derive a point somewhere on the center	line
-	// which would produce the smallest bounding sphere.
-	Vector vecCenterNear, vecCenterFar, vecNearEdge, vecFarEdge;
-	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.5f, 0.5f, 0.0f ), vecCenterNear );
-	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.5f, 0.5f, 1.0f ), vecCenterFar );
-	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.0f, 0.0f, 0.0f ), vecNearEdge );
-	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.0f, 0.0f, 1.0f ), vecFarEdge );
-
-	// Let the distance between the near + far center points = l
-	// Let the distance between the near center point + near edge point = h1
-	// Let the distance between the far center point + far edge point = h2
-	// Let the distance along the center line from the near point to the sphere center point = x
-	// Then let the distance between the sphere center point + near edge point == 
-	//	the distance between the sphere center point + far edge point == r == radius of sphere
-	// Then h1^2 + x^2 == r^2 == (l-x)^2 + h2^2
-	// h1^x + x^2 = l^2 - 2 * l * x + x^2 + h2^2
-	// 2 * l * x = l^2 + h2^2 - h1^2
-	// x = (l^2 + h2^2 - h1^2) / (2 * l)
-	// r = sqrt( hl^1 + x^2 )
-	Vector vecDelta;
-	VectorSubtract( vecCenterFar, vecCenterNear, vecDelta );
-	float l = vecDelta.Length();
-	float h1Sqr = vecCenterNear.DistToSqr( vecNearEdge );
-	float h2Sqr = vecCenterFar.DistToSqr( vecFarEdge );
-	float x = (l*l + h2Sqr - h1Sqr) / (2.0f * l);
-	VectorMA( vecCenterNear, (x / l), vecDelta, *pCenter );
-	*pflRadius = sqrt( h1Sqr + x*x );
-}
-
-//-----------------------------------------------------------------------------
-// Given a projection matrix, take the extremes of the space in transformed into world space and
-// get a bounding sphere.
-//-----------------------------------------------------------------------------
-void CalculateSphereFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pCenter, float *pflRadius )
-{
-	VMatrix volumeToWorld;
-	MatrixInverseGeneral( worldToVolume, volumeToWorld );
-	CalculateSphereFromProjectionMatrixInverse( volumeToWorld, pCenter, pflRadius );
-}
-
-
-static inline void FrustumPlanesFromMatrixHelper( const VMatrix &shadowToWorld, const Vector &p1, const Vector &p2, const Vector &p3, 
-												 Vector &normal, float &dist )
-{
-	Vector world1, world2, world3;
-	Vector3DMultiplyPositionProjective( shadowToWorld, p1, world1 );
-	Vector3DMultiplyPositionProjective( shadowToWorld, p2, world2 );
-	Vector3DMultiplyPositionProjective( shadowToWorld, p3, world3 );
-
-	Vector v1, v2;
-	VectorSubtract( world2, world1, v1 );
-	VectorSubtract( world3, world1, v2 );
-
-	CrossProduct( v1, v2, normal );
-	VectorNormalize( normal );
-	dist = DotProduct( normal, world1 );	
-}
-
-void FrustumPlanesFromMatrix( const VMatrix &clipToWorld, Frustum_t &frustum )
-{
-	Vector normal;
-	float dist;
-
-	FrustumPlanesFromMatrixHelper( clipToWorld, 
-		Vector( 0.0f, 0.0f, 0.0f ), Vector( 1.0f, 0.0f, 0.0f ), Vector( 0.0f, 1.0f, 0.0f ), normal, dist );
-	frustum.SetPlane( FRUSTUM_NEARZ, PLANE_ANYZ, normal, dist );
-
-	FrustumPlanesFromMatrixHelper( clipToWorld, 
-		Vector( 0.0f, 0.0f, 1.0f ), Vector( 0.0f, 1.0f, 1.0f ), Vector( 1.0f, 0.0f, 1.0f ), normal, dist );
-	frustum.SetPlane( FRUSTUM_FARZ, PLANE_ANYZ, normal, dist );
-
-	FrustumPlanesFromMatrixHelper( clipToWorld, 
-		Vector( 1.0f, 0.0f, 0.0f ), Vector( 1.0f, 1.0f, 1.0f ), Vector( 1.0f, 1.0f, 0.0f ), normal, dist );
-	frustum.SetPlane( FRUSTUM_RIGHT, PLANE_ANYZ, normal, dist );
-
-	FrustumPlanesFromMatrixHelper( clipToWorld, 
-		Vector( 0.0f, 0.0f, 0.0f ), Vector( 0.0f, 1.0f, 1.0f ), Vector( 0.0f, 0.0f, 1.0f ), normal, dist );
-	frustum.SetPlane( FRUSTUM_LEFT, PLANE_ANYZ, normal, dist );
-
-	FrustumPlanesFromMatrixHelper( clipToWorld, 
-		Vector( 1.0f, 1.0f, 0.0f ), Vector( 1.0f, 1.0f, 1.0f ), Vector( 0.0f, 1.0f, 1.0f ), normal, dist );
-	frustum.SetPlane( FRUSTUM_TOP, PLANE_ANYZ, normal, dist );
-
-	FrustumPlanesFromMatrixHelper( clipToWorld, 
-		Vector( 1.0f, 0.0f, 0.0f ), Vector( 0.0f, 0.0f, 1.0f ), Vector( 1.0f, 0.0f, 1.0f ), normal, dist );
-	frustum.SetPlane( FRUSTUM_BOTTOM, PLANE_ANYZ, normal, dist );
-}
-
-void MatrixBuildOrtho( VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar )
-{
-	// FIXME: This is being used incorrectly! Should read:
-	// D3DXMatrixOrthoOffCenterRH( &matrix, left, right, bottom, top, zNear, zFar );
-	// Which is certainly why we need these extra -1 scales in y. Bleah
-
-	// NOTE: The camera can be imagined as the following diagram:
-	//		/z
-	//	   /
-	//	  /____ x	Z is going into the screen
-	//	  |
-	//	  |
-	//	  |y
-	//
-	// (0,0,z) represents the upper-left corner of the screen.
-	// Our projection transform needs to transform from this space to a LH coordinate
-	// system that looks thusly:
-	// 
-	//	y|  /z
-	//	 | /
-	//	 |/____ x	Z is going into the screen
-	//
-	// Where x,y lies between -1 and 1, and z lies from 0 to 1
-	// This is because the viewport transformation from projection space to pixels
-	// introduces a -1 scale in the y coordinates
-	//		D3DXMatrixOrthoOffCenterRH( &matrix, left, right, top, bottom, zNear, zFar );
-
-	dst.Init(	 2.0f / ( right - left ),						0.0f,						0.0f, ( left + right ) / ( left - right ),
-				0.0f,	 2.0f / ( bottom - top ),						0.0f, ( bottom + top ) / ( top - bottom ),
-				0.0f,						0.0f,	 1.0f / ( zNear - zFar ),			 zNear / ( zNear - zFar ),
-				0.0f,						0.0f,						0.0f,								1.0f );
-}
-
-void MatrixBuildPerspectiveZRange( VMatrix& dst, double flZNear, double flZFar )
-{
-	dst.m[2][0] = 0.0f;
-	dst.m[2][1] = 0.0f;
-	dst.m[2][2] = flZFar / ( flZNear - flZFar );
-	dst.m[2][3] = flZNear * flZFar / ( flZNear - flZFar );
-}
-
-void MatrixBuildPerspectiveX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar )
-{
-	float flWidthScale = 1.0f / tanf( flFovX * M_PI / 360.0f );
-	float flHeightScale = flAspect * flWidthScale;
-	dst.Init(   flWidthScale,				0.0f,							0.0f,										0.0f,
-				0.0f,						flHeightScale,					0.0f,										0.0f,
-				0.0f,						0.0f,							0.0f,										0.0f,
-				0.0f,						0.0f,						   -1.0f,										0.0f );
-
-	MatrixBuildPerspectiveZRange ( dst, flZNear, flZFar );
-}
-
-void MatrixBuildPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right )
-{
-	float flWidth = tanf( flFovX * M_PI / 360.0f );
-	float flHeight = flWidth / flAspect;
-
-	// bottom, top, left, right are 0..1 so convert to -<val>/2..<val>/2
-	float flLeft   = -(flWidth/2.0f)  * (1.0f - left)   + left   * (flWidth/2.0f);
-	float flRight  = -(flWidth/2.0f)  * (1.0f - right)  + right  * (flWidth/2.0f);
-	float flBottom = -(flHeight/2.0f) * (1.0f - bottom) + bottom * (flHeight/2.0f);
-	float flTop    = -(flHeight/2.0f) * (1.0f - top)    + top    * (flHeight/2.0f);
-
-	dst.Init(   1.0f / (flRight-flLeft),        0.0f,                           (flLeft+flRight)/(flRight-flLeft),  0.0f,
-				0.0f,                           1.0f /(flTop-flBottom),         (flTop+flBottom)/(flTop-flBottom),  0.0f,
-				0.0f,                           0.0f,							0.0f,								0.0f,
-				0.0f,                           0.0f,                           -1.0f,								0.0f );
-
-	MatrixBuildPerspectiveZRange ( dst, flZNear, flZFar );
-}
-#endif // !_STATIC_LINKED || _SHARED_LIB
-
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Implementation of VMatrix (4x4 matrix) member functions and free helper functions.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#if !defined(_STATIC_LINKED) || defined(_SHARED_LIB)
+
+#include "basetypes.h"
+#include "mathlib/vmatrix.h"
+#include "mathlib/mathlib.h"
+#include <string.h>
+#include "mathlib/vector4d.h"
+#include "tier0/dbg.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+#pragma warning (disable : 4700) // local variable 'x' used without having been initialized
+
+// ------------------------------------------------------------------------------------------- //
+// Helper functions.
+// ------------------------------------------------------------------------------------------- //
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+VMatrix SetupMatrixIdentity()	// Returns, by value, a matrix with 1 on the diagonal and 0 elsewhere.
+{
+	return VMatrix(
+		1.0f, 0.0f, 0.0f, 0.0f,
+		0.0f, 1.0f, 0.0f, 0.0f,
+		0.0f, 0.0f, 1.0f, 0.0f,
+		0.0f, 0.0f, 0.0f, 1.0f);
+}
+
+VMatrix SetupMatrixTranslation(const Vector &vTranslation)	// Identity matrix carrying vTranslation.
+{
+	return VMatrix(
+		1.0f, 0.0f, 0.0f, vTranslation.x,	// x, y and z go in as the fourth value
+		0.0f, 1.0f, 0.0f, vTranslation.y,	// of the first three argument rows;
+		0.0f, 0.0f, 1.0f, vTranslation.z,	// the remaining values are the identity's.
+		0.0f, 0.0f, 0.0f, 1.0f
+		);
+}
+
+VMatrix SetupMatrixScale(const Vector &vScale)	// Diagonal is (vScale.x, vScale.y, vScale.z, 1); every other value is 0.
+{
+	return VMatrix(
+		vScale.x, 0.0f,     0.0f,     0.0f,
+		0.0f,     vScale.y, 0.0f,     0.0f,
+		0.0f,     0.0f,     vScale.z, 0.0f,
+		0.0f,     0.0f,     0.0f,     1.0f
+		);
+}
+
+// Builds the matrix that reflects across thePlane: shift a point of the plane to
+// the origin, apply (I - 2*N*N^T) for the plane normal N, then shift back.
+VMatrix SetupMatrixReflection(const VPlane &thePlane)
+{
+	const Vector &vNormal = thePlane.m_Normal;
+	const vec_t nx = vNormal.x, ny = vNormal.y, nz = vNormal.z;
+
+	// I - 2*N*N^T; this is an exact mirror only when N has unit length.
+	VMatrix mMirror;
+	mMirror.Init(
+		-2.0f*nx*nx + 1.0f,	-2.0f*nx*ny,		-2.0f*nx*nz,		0.0f,
+		-2.0f*ny*nx,		-2.0f*ny*ny + 1.0f,	-2.0f*ny*nz,		0.0f,
+		-2.0f*nz*nx,		-2.0f*nz*ny,		-2.0f*nz*nz + 1.0f,	0.0f,
+		0.0f,			0.0f,			0.0f,			1.0f );
+
+	// Translations taking a point on the plane to the origin and back again.
+	Vector vPlanePoint = thePlane.GetPointOnPlane();
+	VMatrix mToOrigin, mFromOrigin;
+	mToOrigin.Identity();
+	mToOrigin.SetTranslation(-vPlanePoint);
+	mFromOrigin.Identity();
+	mFromOrigin.SetTranslation(vPlanePoint);
+
+	// The rightmost factor acts first: move to the origin, reflect, move back.
+	return mFromOrigin * mMirror * mToOrigin;
+}
+
+// Builds M = dot*I - L*P^T for L = (vOrigin, 1), P = (normal, -dist), dot = L.P.
+VMatrix SetupMatrixProjection(const Vector &vOrigin, const VPlane &thePlane)
+{
+	vec_t dot;
+	VMatrix mRet;
+	// Taking the plane as normal.x = dist, M sends p to where the line vOrigin->p meets it (after the w divide).
+	#define PN thePlane.m_Normal
+	#define PD thePlane.m_Dist	// no trailing ';': PD has to be usable inside an expression
+
+		dot = PN[0]*vOrigin.x + PN[1]*vOrigin.y + PN[2]*vOrigin.z - PD;
+
+		mRet.m[0][0] = dot - vOrigin.x * PN[0];
+		mRet.m[0][1] = -vOrigin.x * PN[1];
+		mRet.m[0][2] = -vOrigin.x * PN[2];
+		mRet.m[0][3] = -vOrigin.x * -PD;
+
+		mRet.m[1][0] = -vOrigin.y * PN[0];
+		mRet.m[1][1] = dot - vOrigin.y * PN[1];
+		mRet.m[1][2] = -vOrigin.y * PN[2];
+		mRet.m[1][3] = -vOrigin.y * -PD;
+
+		mRet.m[2][0] = -vOrigin.z * PN[0];
+		mRet.m[2][1] = -vOrigin.z * PN[1];
+		mRet.m[2][2] = dot - vOrigin.z * PN[2];
+		mRet.m[2][3] = -vOrigin.z * -PD;
+
+		mRet.m[3][0] = -PN[0];	// bottom row is -P, plus dot on the diagonal
+		mRet.m[3][1] = -PN[1];
+		mRet.m[3][2] = -PN[2];
+		mRet.m[3][3] = dot + PD;
+
+	#undef PN
+	#undef PD
+
+	return mRet;
+}
+
+// Builds the matrix for a rotation of fDegrees about vAxis as
+// c*I + (1-c)*a*a^T + s*[a]x, where c and s are the cosine and sine of the angle.
+// vAxis is used exactly as passed in; nothing here normalizes it.
+VMatrix SetupMatrixAxisRot(const Vector &vAxis, vec_t fDegrees)
+{
+	const vec_t flRadians = fDegrees * (M_PI / 180.0f);
+	const vec_t flSin = (vec_t)sin(flRadians);
+	const vec_t flCos = (vec_t)cos(flRadians);
+	const vec_t flOneMinusCos = 1.0f - flCos;
+
+	// Axis components scaled by (1 - cos) and by sin.
+	const vec_t cx = flOneMinusCos * vAxis.x, cy = flOneMinusCos * vAxis.y, cz = flOneMinusCos * vAxis.z;
+	const vec_t sx = flSin * vAxis.x, sy = flSin * vAxis.y, sz = flSin * vAxis.z;
+
+	// Symmetric (1-c)*a_i*a_j terms, cos on the diagonal, +/- sin terms off it.
+	VMatrix mRotation(
+		cx*vAxis.x + flCos,	cx*vAxis.y - sz,	cx*vAxis.z + sy,	0.0f,
+		cx*vAxis.y + sz,	cy*vAxis.y + flCos,	cy*vAxis.z - sx,	0.0f,
+		cx*vAxis.z - sy,	cy*vAxis.z + sx,	cz*vAxis.z + flCos,	0.0f,
+		0.0f,			0.0f,			0.0f,			1.0f );
+
+	return mRotation;
+}
+
+VMatrix SetupMatrixAngles(const QAngle &vAngles)	// By-value convenience wrapper around MatrixFromAngles().
+{
+	VMatrix mRet;
+	MatrixFromAngles( vAngles, mRet );	// mRet is the output argument
+	return mRet;
+}
+
+VMatrix SetupMatrixOrgAngles(const Vector &origin, const QAngle &vAngles)	// By-value wrapper around VMatrix::SetupMatrixOrgAngles().
+{
+	VMatrix mRet;
+	mRet.SetupMatrixOrgAngles( origin, vAngles );	// rotation from vAngles, translation from origin
+	return mRet;
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+
+// Solves for the point shared by three planes: inverts the matrix whose first three
+// rows are (normal, -dist) and reads the inverse's translation into vOut.
+// Returns false without writing vOut when InverseGeneral() fails.
+bool PlaneIntersection( const VPlane &vp1, const VPlane &vp2, const VPlane &vp3, Vector &vOut )
+{
+	VMatrix mPlanes;
+	mPlanes.Init(
+		vp1.m_Normal.x, vp1.m_Normal.y, vp1.m_Normal.z, -vp1.m_Dist,
+		vp2.m_Normal.x, vp2.m_Normal.y, vp2.m_Normal.z, -vp2.m_Dist,
+		vp3.m_Normal.x, vp3.m_Normal.y, vp3.m_Normal.z, -vp3.m_Dist,
+		0.0f, 0.0f, 0.0f, 1.0f );
+
+	VMatrix mSolved;
+	if( !mPlanes.InverseGeneral(mSolved) )
+	{
+		return false;
+	}
+
+	// Originally noted as: vOut = mInverse * Vector(0.0f, 0.0f, 0.0f)
+	mSolved.GetTranslation( vOut );
+	return true;
+}
+
+
+
+// ------------------------------------------------------------------------------------------- //
+// VMatrix functions.
+// ------------------------------------------------------------------------------------------- //
+
+VMatrix& VMatrix::operator=(const VMatrix &mOther)	// Element-by-element copy of all 16 values; returns *this.
+{
+	m[0][0] = mOther.m[0][0];
+	m[0][1] = mOther.m[0][1];
+	m[0][2] = mOther.m[0][2];
+	m[0][3] = mOther.m[0][3];
+
+	m[1][0] = mOther.m[1][0];
+	m[1][1] = mOther.m[1][1];
+	m[1][2] = mOther.m[1][2];
+	m[1][3] = mOther.m[1][3];
+
+	m[2][0] = mOther.m[2][0];
+	m[2][1] = mOther.m[2][1];
+	m[2][2] = mOther.m[2][2];
+	m[2][3] = mOther.m[2][3];
+
+	m[3][0] = mOther.m[3][0];
+	m[3][1] = mOther.m[3][1];
+	m[3][2] = mOther.m[3][2];
+	m[3][3] = mOther.m[3][3];
+
+	return *this;	// self-assignment is harmless: each element is copied onto itself
+}
+
+bool VMatrix::operator==( const VMatrix& src ) const	// Bytewise comparison of the two element arrays.
+{
+	return memcmp( m, src.m, sizeof(m) ) == 0;	// so 0.0f and -0.0f compare as different
+}
+
+// Computes out = (*this) * vm. Every element of the product is evaluated into a
+// local array before out is written, so out may be the same object as *this
+// or vm.
+void VMatrix::MatrixMul( const VMatrix &vm, VMatrix &out ) const
+{
+	vec_t prod[4][4];
+
+	for ( int iRow = 0; iRow < 4; ++iRow )
+	{
+		for ( int iCol = 0; iCol < 4; ++iCol )
+		{
+			// Row iRow of this matrix dotted with column iCol of vm.
+			prod[iRow][iCol] = m[iRow][0]*vm.m[0][iCol] + m[iRow][1]*vm.m[1][iCol] +
+				m[iRow][2]*vm.m[2][iCol] + m[iRow][3]*vm.m[3][iCol];
+		}
+	}
+
+	out.Init(
+		prod[0][0], prod[0][1], prod[0][2], prod[0][3],
+		prod[1][0], prod[1][1], prod[1][2], prod[1][3],
+		prod[2][0], prod[2][1], prod[2][2], prod[2][3],
+		prod[3][0], prod[3][1], prod[3][2], prod[3][3]
+		);
+}
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+VMatrix VMatrix::operator*(const VMatrix &vm) const	// Returns (*this) * vm by value; the work is done by MatrixMul().
+{
+	VMatrix ret;
+	MatrixMul( vm, ret );
+	return ret;
+}
+
+#endif
+
+bool VMatrix::InverseGeneral(VMatrix &vInverse) const	// Member form of MatrixInverseGeneral( *this, vInverse ).
+{
+	return MatrixInverseGeneral( *this, vInverse );	// false means no inverse was produced
+}
+
+
+bool MatrixInverseGeneral(const VMatrix& src, VMatrix& dst)
+{
+	int iRow, i, j, iTemp, iTest;
+	vec_t mul, fTest, fLargest;
+	vec_t mat[4][8];
+	int rowMap[4], iLargest;
+	vec_t *pOut, *pRow, *pScaleRow;
+
+
+	// Gauss-Jordan elimination with partial pivoting on the augmented matrix [A | I]:
+	// A = src, and we solve AX = I for X (the inverse).
+	// Row swaps are tracked through rowMap instead of moving row data.
+	// Returns false, leaving dst untouched, when a column has no pivot larger than 0.00001.
+	// src and dst may be the same matrix: src is copied into mat before dst is written.
+
+	// Setup AI
+	for(i=0; i < 4; i++)
+	{
+		const vec_t *pIn = src[i];
+		pOut = mat[i];
+
+		for(j=0; j < 4; j++)
+		{
+			pOut[j] = pIn[j];
+		}
+
+		pOut[4] = 0.0f;
+		pOut[5] = 0.0f;
+		pOut[6] = 0.0f;
+		pOut[7] = 0.0f;
+		pOut[i+4] = 1.0f;
+
+		rowMap[i] = i;
+	}
+
+	// Use row operations to get to reduced row-echelon form using these rules:
+	// 1. Multiply or divide a row by a nonzero number.
+	// 2. Add a multiple of one row to another.
+	// 3. Interchange two rows.
+
+	for(iRow=0; iRow < 4; iRow++)
+	{
+		// Find the row (at or below iRow) with the largest magnitude in this column.
+		fLargest = 0.00001f;
+		iLargest = -1;
+		for(iTest=iRow; iTest < 4; iTest++)
+		{
+			fTest = (vec_t)FloatMakePositive(mat[rowMap[iTest]][iRow]);
+			if(fTest > fLargest)
+			{
+				iLargest = iTest;
+				fLargest = fTest;
+			}
+		}
+
+		// They're all too small.. sorry.
+		if(iLargest == -1)
+		{
+			return false;
+		}
+
+		// Swap the rows (only their rowMap entries are exchanged).
+		iTemp = rowMap[iLargest];
+		rowMap[iLargest] = rowMap[iRow];
+		rowMap[iRow] = iTemp;
+
+		pRow = mat[rowMap[iRow]];
+
+		// Divide this row by the pivot element.
+		mul = 1.0f / pRow[iRow];
+		for(j=0; j < 8; j++)
+			pRow[j] *= mul;
+
+		pRow[iRow] = 1.0f; // Preserve accuracy...
+		
+		// Eliminate this element from the other rows using operation 2.
+		for(i=0; i < 4; i++)
+		{
+			if(i == iRow)
+				continue;
+
+			pScaleRow = mat[rowMap[i]];
+		
+			// Add -(this row's element in column iRow) times the pivot row to this row.
+			mul = -pScaleRow[iRow];
+			for(j=0; j < 8; j++)
+			{
+				pScaleRow[j] += pRow[j] * mul;
+			}
+
+			pScaleRow[iRow] = 0.0f; // Preserve accuracy...
+		}
+	}
+
+	// The inverse is on the right side of AX now (the identity is on the left).
+	for(i=0; i < 4; i++)
+	{
+		const vec_t *pIn = mat[rowMap[i]] + 4;
+		pOut = dst.m[i];
+
+		for(j=0; j < 4; j++)
+		{
+			pOut[j] = pIn[j];
+		}
+	}
+
+	return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Fast inverse for a matrix holding only rotation + translation; src and dst must be different objects.
+//-----------------------------------------------------------------------------
+void MatrixInverseTR( const VMatrix& src, VMatrix &dst )
+{
+	Vector vTrans, vNewTrans;
+
+	// Transpose the upper 3x3 (element by element, which is why dst must not alias src).
+	dst.m[0][0] = src.m[0][0];  dst.m[0][1] = src.m[1][0]; dst.m[0][2] = src.m[2][0];
+	dst.m[1][0] = src.m[0][1];  dst.m[1][1] = src.m[1][1]; dst.m[1][2] = src.m[2][1];
+	dst.m[2][0] = src.m[0][2];  dst.m[2][1] = src.m[1][2]; dst.m[2][2] = src.m[2][2];
+
+	// Transform the negated translation by the transposed 3x3 just written to dst.
+	vTrans.Init( -src.m[0][3], -src.m[1][3], -src.m[2][3] );
+	Vector3DMultiply( dst, vTrans, vNewTrans );
+	MatrixSetColumn( dst, 3, vNewTrans );
+
+	// Fill in the bottom row.
+	dst.m[3][0] = dst.m[3][1] = dst.m[3][2] = 0.0f;
+	dst.m[3][3] = 1.0f;
+}
+
+
+void VMatrix::InverseTR( VMatrix &ret ) const	// Member form of MatrixInverseTR( *this, ret ).
+{
+	MatrixInverseTR( *this, ret );	// ret should not be *this: MatrixInverseTR writes dst while still reading src
+}
+
+void MatrixInverseTranspose( const VMatrix& src, VMatrix& dst )	// dst = transpose( inverse( src ) )
+{
+	src.InverseGeneral( dst );	// NOTE(review): the bool result is ignored; on failure dst is transposed as it was
+	MatrixTranspose( dst, dst );
+}
+
+//-----------------------------------------------------------------------------
+// Computes the inverse transpose of a matrix3x4_t by round-tripping through VMatrix
+//-----------------------------------------------------------------------------
+void MatrixInverseTranspose( const matrix3x4_t& src, matrix3x4_t& dst )
+{
+	VMatrix tmp, out;
+	tmp.CopyFrom3x4( src );	// load src into a VMatrix
+	::MatrixInverseTranspose( tmp, out );	// the VMatrix overload does the work
+	out.Set3x4( dst );	// presumably copies the 3x4 part of out into dst - confirm against VMatrix::Set3x4
+}
+
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+VMatrix VMatrix::InverseTR() const	// By-value form of MatrixInverseTR( *this, ret ).
+{
+	VMatrix ret;
+	MatrixInverseTR( *this, ret );
+	return ret;
+}
+
+// Returns the lengths of the three vectors produced by GetBasisVectors(), packed
+// into a Vector in the same order.
+Vector VMatrix::GetScale() const
+{
+	Vector vBasis0, vBasis1, vBasis2;
+	GetBasisVectors( vBasis0, vBasis1, vBasis2 );
+
+	const vec_t flLength0 = vBasis0.Length();
+	const vec_t flLength1 = vBasis1.Length();
+	const vec_t flLength2 = vBasis2.Length();
+	return Vector( flLength0, flLength1, flLength2 );
+}
+
+VMatrix VMatrix::Scale(const Vector &vScale)	// Returns a scaled copy; does not write to *this (though not declared const).
+{
+	return VMatrix(
+		m[0][0]*vScale.x, m[0][1]*vScale.y, m[0][2]*vScale.z, m[0][3],	// columns 0, 1, 2 are multiplied by
+		m[1][0]*vScale.x, m[1][1]*vScale.y, m[1][2]*vScale.z, m[1][3],	// vScale.x, .y, .z respectively;
+		m[2][0]*vScale.x, m[2][1]*vScale.y, m[2][2]*vScale.z, m[2][3],	// column 3 is copied as-is,
+		m[3][0]*vScale.x, m[3][1]*vScale.y, m[3][2]*vScale.z, 1.0f	// except m[3][3], which is replaced by 1.0f
+		);
+}
+
+// Returns a matrix whose basis vectors are this matrix's, each normalized, with
+// every remaining element taken from the identity.
+VMatrix VMatrix::NormalizeBasisVectors() const
+{
+	Vector vecs[3];
+	VMatrix mRet;
+
+	GetBasisVectors(vecs[0], vecs[1], vecs[2]);
+	VectorNormalize( vecs[0] );
+	VectorNormalize( vecs[1] );
+	VectorNormalize( vecs[2] );
+	mRet.SetBasisVectors(vecs[0], vecs[1], vecs[2]);
+
+	// Set everything but basis vectors to identity. Three Vectors cannot carry the
+	// translation column, so it is cleared here along with the bottom row.
+	mRet.m[0][3] = mRet.m[1][3] = mRet.m[2][3] = 0.0f;
+	mRet.m[3][0] = mRet.m[3][1] = mRet.m[3][2] = 0.0f;
+	mRet.m[3][3] = 1.0f;
+	return mRet;
+}
+
+// Returns the full 4x4 transpose of this matrix.
+VMatrix VMatrix::Transpose() const
+{
+	const float *r0 = m[0];
+	const float *r1 = m[1];
+	const float *r2 = m[2];
+	const float *r3 = m[3];
+
+	// Each argument row below is one column of the source matrix.
+	return VMatrix(
+		r0[0], r1[0], r2[0], r3[0],
+		r0[1], r1[1], r2[1], r3[1],
+		r0[2], r1[2], r2[2], r3[2],
+		r0[3], r1[3], r2[3], r3[3]);
+}
+
+// Returns a copy of this matrix with only the upper-left 3x3 transposed;
+// the fourth column and the fourth row are passed through as they are.
+VMatrix VMatrix::Transpose3x3() const
+{
+	const float *r0 = m[0];
+	const float *r1 = m[1];
+	const float *r2 = m[2];
+	const float *r3 = m[3];
+
+	return VMatrix(
+		r0[0], r1[0], r2[0], r0[3],
+		r0[1], r1[1], r2[1], r1[3],
+		r0[2], r1[2], r2[2], r2[3],
+		r3[0], r3[1], r3[2], r3[3]);
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+
+// Returns true when the first three elements of rows 0, 1 and 2 each have
+// length within 0.01 of 1 and are pairwise perpendicular to within 0.01.
+bool VMatrix::IsRotationMatrix() const
+{
+	// NOTE(review): these C-style casts also drop the const that m[][] has
+	// inside a const member function; the rows are only read here.
+	Vector &row0 = (Vector&)m[0][0];
+	Vector &row1 = (Vector&)m[1][0];
+	Vector &row2 = (Vector&)m[2][0];
+
+	// Unit length checks.
+	if ( !( FloatMakePositive( 1 - row0.Length() ) < 0.01f ) )
+		return false;
+	if ( !( FloatMakePositive( 1 - row1.Length() ) < 0.01f ) )
+		return false;
+	if ( !( FloatMakePositive( 1 - row2.Length() ) < 0.01f ) )
+		return false;
+
+	// Mutual orthogonality checks.
+	if ( !( FloatMakePositive( row0.Dot(row1) ) < 0.01f ) )
+		return false;
+	if ( !( FloatMakePositive( row0.Dot(row2) ) < 0.01f ) )
+		return false;
+
+	return FloatMakePositive( row1.Dot(row2) ) < 0.01f;
+}
+
+// Fills this matrix from an origin and a set of angles given in degrees.
+// The upper-left 3x3 receives the rotation, column 3 receives the origin and
+// the bottom row is set to (0, 0, 0, 1).
+void VMatrix::SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles )
+{
+	float		sr, sp, sy, cr, cp, cy;
+
+	SinCos( DEG2RAD( vAngles[YAW] ), &sy, &cy );
+	SinCos( DEG2RAD( vAngles[PITCH] ), &sp, &cp );
+	SinCos( DEG2RAD( vAngles[ROLL] ), &sr, &cr );
+
+	// matrix = (YAW * PITCH) * ROLL
+	m[0][0] = cp*cy;
+	m[1][0] = cp*sy;
+	m[2][0] = -sp;
+	m[0][1] = sr*sp*cy+cr*-sy;
+	m[1][1] = sr*sp*sy+cr*cy;
+	m[2][1] = sr*cp;
+	m[0][2] = (cr*sp*cy+-sr*-sy);
+	m[1][2] = (cr*sp*sy+-sr*cy);
+	m[2][2] = cr*cp;
+
+	// Translation column. (The stores of 0 to these three elements that used
+	// to precede this were dead: they were overwritten right away.)
+	m[0][3] = origin.x;
+	m[1][3] = origin.y;
+	m[2][3] = origin.z;
+
+	// Bottom row.
+	m[3][0] = 0.0f;
+	m[3][1] = 0.0f;
+	m[3][2] = 0.0f;
+	m[3][3] = 1.0f;
+}
+
+
+//-----------------------------------------------------------------------------
+// Overwrites all 16 elements of dst with the identity matrix
+//-----------------------------------------------------------------------------
+void MatrixSetIdentity( VMatrix &dst )
+{
+	for ( int row = 0; row < 4; ++row )
+	{
+		for ( int col = 0; col < 4; ++col )
+		{
+			// Ones on the diagonal, zeros elsewhere.
+			dst[row][col] = ( row == col ) ? 1.0f : 0.0f;
+		}
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Sets up dst from euler angles, using vec3_origin as the origin.
+//-----------------------------------------------------------------------------
+void MatrixFromAngles( const QAngle& vAngles, VMatrix& dst )
+{
+	const Vector &noOffset = vec3_origin;
+	dst.SetupMatrixOrgAngles( noOffset, vAngles );
+}
+
+
+//-----------------------------------------------------------------------------
+// Extracts euler angles (in degrees) from a matrix.
+// vAngles[0] receives pitch, vAngles[1] yaw and vAngles[2] roll.
+//-----------------------------------------------------------------------------
+void MatrixToAngles( const VMatrix& src, QAngle& vAngles )
+{
+	// Forward is column 0 and left is column 1 of the matrix. Of the up
+	// vector (column 2) only the Z component is ever needed.
+	const float fwdX = src[0][0];
+	const float fwdY = src[1][0];
+	const float fwdZ = src[2][0];
+	const float leftX = src[0][1];
+	const float leftY = src[1][1];
+	const float leftZ = src[2][1];
+	const float upZ = src[2][2];
+
+	// Length of forward projected onto the XY plane.
+	const float xyDist = sqrtf( fwdX * fwdX + fwdY * fwdY );
+
+	// The engine does pitch inverted from this, but we always end up negating it in the DLL
+	// UNDONE: Fix the engine to make it consistent
+	// (pitch)	x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
+	// The pitch formula is the same whether or not we are in gimbal lock.
+	vAngles[0] = RAD2DEG( atan2f( -fwdZ, xyDist ) );
+
+	// enough here to get angles?
+	if ( xyDist > 0.001f )
+	{
+		// (yaw)	y = ATAN( forward.y, forward.x );		-- in our space, forward is the X axis
+		vAngles[1] = RAD2DEG( atan2f( fwdY, fwdX ) );
+
+		// (roll)	z = ATAN( left.z, up.z );
+		vAngles[2] = RAD2DEG( atan2f( leftZ, upZ ) );
+		return;
+	}
+
+	// forward is mostly Z, gimbal lock-
+	// (yaw)	y = ATAN( -left.x, left.y );			-- forward is mostly z, so use right for yaw
+	vAngles[1] = RAD2DEG( atan2f( -leftX, leftY ) );
+
+	// Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
+	vAngles[2] = 0;
+}
+
+
+//-----------------------------------------------------------------------------
+// Transpose
+//-----------------------------------------------------------------------------
+// Exchanges the values of two floats.
+inline void Swap( float& a, float& b )
+{
+	const float savedA = a;
+	a = b;
+	b = savedA;
+}
+
+// Writes the transpose of src into dst. src and dst may be the same matrix,
+// in which case the off-diagonal elements are swapped in place.
+void MatrixTranspose( const VMatrix& src, VMatrix& dst )
+{
+	if (&src == &dst)
+	{
+		// In place: exchange each element above the diagonal with its mirror.
+		for ( int row = 0; row < 4; ++row )
+		{
+			for ( int col = row + 1; col < 4; ++col )
+			{
+				Swap( dst[row][col], dst[col][row] );
+			}
+		}
+		return;
+	}
+
+	// Distinct matrices: straight element-by-element copy with indices flipped.
+	for ( int row = 0; row < 4; ++row )
+	{
+		for ( int col = 0; col < 4; ++col )
+		{
+			dst[row][col] = src[col][row];
+		}
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Copies all 16 floats of src into dst; does nothing when they are the same
+// object.
+//-----------------------------------------------------------------------------
+
+void MatrixCopy( const VMatrix& src, VMatrix& dst )
+{
+	if (&src == &dst)
+		return;
+
+	memcpy( dst.m, src.m, 16 * sizeof(float) );
+}
+
+//-----------------------------------------------------------------------------
+// 4x4 matrix multiply: dst = src1 * src2.
+// Either source may be the same object as dst.
+//-----------------------------------------------------------------------------
+typedef float VMatrixRaw_t[4];
+
+void MatrixMultiply( const VMatrix& src1, const VMatrix& src2, VMatrix& dst )
+{
+	const bool lhsIsDst = (&src1 == &dst);
+	const bool rhsIsDst = (&src2 == &dst);
+
+	// Snapshot any operand that aliases dst so it isn't overwritten while
+	// it is still being read.
+	VMatrix lhsCopy, rhsCopy;
+	if ( lhsIsDst )
+	{
+		MatrixCopy( src1, lhsCopy );
+	}
+	if ( rhsIsDst )
+	{
+		MatrixCopy( src2, rhsCopy );
+	}
+
+	const VMatrixRaw_t* a = lhsIsDst ? lhsCopy.m : src1.m;
+	const VMatrixRaw_t* b = rhsIsDst ? rhsCopy.m : src2.m;
+
+	// Row of a times column of b, summed left to right.
+	for ( int row = 0; row < 4; ++row )
+	{
+		for ( int col = 0; col < 4; ++col )
+		{
+			dst[row][col] = a[row][0] * b[0][col] + a[row][1] * b[1][col] + a[row][2] * b[2][col] + a[row][3] * b[3][col];
+		}
+	}
+}
+
+//-----------------------------------------------------------------------------
+// dst = src1 * src2 for a 4-component vector. src2 and dst may be the same
+// object.
+//-----------------------------------------------------------------------------
+
+void Vector4DMultiply( const VMatrix& src1, Vector4D const& src2, Vector4D& dst )
+{
+	// If the input is also the output, read from a private copy instead.
+	Vector4D snapshot;
+	const Vector4D *pInput = &src2;
+	if ( pInput == &dst )
+	{
+		Vector4DCopy( src2, snapshot );
+		pInput = &snapshot;
+	}
+	const Vector4D &v = *pInput;
+
+	for ( int row = 0; row < 4; ++row )
+	{
+		dst[row] = src1[row][0] * v[0] + src1[row][1] * v[1] + src1[row][2] * v[2] + src1[row][3] * v[3];
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Multiplies a 3-component vector by the matrix as if its fourth component
+// were 1, producing a 4-component result. src2 may be the 3D part of dst.
+//-----------------------------------------------------------------------------
+
+void Vector4DMultiplyPosition( const VMatrix& src1, Vector const& src2, Vector4D& dst )
+{
+	// If the input is the 3D part of the output, read from a private copy.
+	Vector snapshot;
+	const Vector *pInput = &src2;
+	if ( pInput == &dst.AsVector3D() )
+	{
+		VectorCopy( src2, snapshot );
+		pInput = &snapshot;
+	}
+	const Vector &v = *pInput;
+
+	for ( int row = 0; row < 4; ++row )
+	{
+		dst[row] = src1[row][0] * v[0] + src1[row][1] * v[1] + src1[row][2] * v[2] + src1[row][3];
+	}
+}
+
+
+
+//-----------------------------------------------------------------------------
+// Multiplies a 3-component vector by the upper-left 3x3 of the matrix.
+// Column 3 and row 3 of the matrix are not used. src2 and dst may be the
+// same object.
+//-----------------------------------------------------------------------------
+
+void Vector3DMultiply( const VMatrix &src1, const Vector &src2, Vector &dst )
+{
+	// If the input is also the output, read from a private copy instead.
+	Vector snapshot;
+	const Vector *pInput = &src2;
+	if ( pInput == &dst )
+	{
+		VectorCopy( src2, snapshot );
+		pInput = &snapshot;
+	}
+	const Vector &v = *pInput;
+
+	for ( int row = 0; row < 3; ++row )
+	{
+		dst[row] = src1[row][0] * v[0] + src1[row][1] * v[1] + src1[row][2] * v[2];
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Vector3DMultiplyPositionProjective treats src2 as if it's a point
+// (fourth component 1) and does the perspective divide at the end.
+// When the computed w is exactly zero the result is scaled by zero instead.
+//-----------------------------------------------------------------------------
+void Vector3DMultiplyPositionProjective( const VMatrix& src1, const Vector &src2, Vector& dst )
+{
+	// If the input is also the output, read from a private copy instead.
+	Vector snapshot;
+	const Vector *pInput = &src2;
+	if ( pInput == &dst )
+	{
+		VectorCopy( src2, snapshot );
+		pInput = &snapshot;
+	}
+	const Vector &v = *pInput;
+
+	// Homogeneous w from the bottom row; invert it unless it is zero.
+	const float rawW = src1[3][0] * v[0] + src1[3][1] * v[1] + src1[3][2] * v[2] + src1[3][3];
+	const float scale = ( rawW != 0.0f ) ? ( 1.0f / rawW ) : rawW;
+
+	for ( int row = 0; row < 3; ++row )
+	{
+		dst[row] = src1[row][0] * v[0] + src1[row][1] * v[1] + src1[row][2] * v[2] + src1[row][3];
+	}
+	dst *= scale;
+}
+
+
+//-----------------------------------------------------------------------------
+// Vector3DMultiplyProjective treats src2 as if it's a direction
+// (no translation term) and does the perspective divide at the end.
+// When the computed w is exactly zero the result is vec3_origin.
+//-----------------------------------------------------------------------------
+void Vector3DMultiplyProjective( const VMatrix& src1, const Vector &src2, Vector& dst )
+{
+	// If the input is also the output, read from a private copy instead.
+	Vector snapshot;
+	const Vector *pInput = &src2;
+	if ( pInput == &dst )
+	{
+		VectorCopy( src2, snapshot );
+		pInput = &snapshot;
+	}
+	const Vector &v = *pInput;
+
+	for ( int row = 0; row < 3; ++row )
+	{
+		dst[row] = src1[row][0] * v[0] + src1[row][1] * v[1] + src1[row][2] * v[2];
+	}
+
+	const float w = src1[3][0] * v[0] + src1[3][1] * v[1] + src1[3][2] * v[2];
+	if (w == 0.0f)
+	{
+		dst = vec3_origin;
+		return;
+	}
+
+	dst /= w;
+}
+
+
+//-----------------------------------------------------------------------------
+// Multiplies the 4-component vector by the transpose of the matrix.
+// src2 and dst may be the same object.
+//-----------------------------------------------------------------------------
+void Vector4DMultiplyTranspose( const VMatrix& src1, Vector4D const& src2, Vector4D& dst )
+{
+	// If the input is also the output, read from a private copy instead.
+	Vector4D snapshot;
+	const Vector4D *pInput = &src2;
+	if ( pInput == &dst )
+	{
+		Vector4DCopy( src2, snapshot );
+		pInput = &snapshot;
+	}
+	const Vector4D &v = *pInput;
+
+	// Output component i is column i of the matrix dotted with the input.
+	for ( int col = 0; col < 4; ++col )
+	{
+		dst[col] = src1[0][col] * v[0] + src1[1][col] * v[1] + src1[2][col] * v[2] + src1[3][col] * v[3];
+	}
+}
+
+//-----------------------------------------------------------------------------
+// Multiplies the 3-component vector by the transpose of the upper-left 3x3
+// of the matrix. src2 and dst may be the same object.
+//-----------------------------------------------------------------------------
+void Vector3DMultiplyTranspose( const VMatrix& src1, const Vector& src2, Vector& dst )
+{
+	// If the input is also the output, read from a private copy instead.
+	Vector snapshot;
+	const Vector *pInput = &src2;
+	if ( pInput == &dst )
+	{
+		VectorCopy( src2, snapshot );
+		pInput = &snapshot;
+	}
+	const Vector &v = *pInput;
+
+	// Output component i is column i of the 3x3 dotted with the input.
+	for ( int col = 0; col < 3; ++col )
+	{
+		dst[col] = src1[0][col] * v[0] + src1[1][col] * v[1] + src1[2][col] * v[2];
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Transforms a plane (normal + dist) by a matrix
+//-----------------------------------------------------------------------------
+void MatrixTransformPlane( const VMatrix &src, const cplane_t &inPlane, cplane_t &outPlane )
+{
+	// The long-hand approach would be:
+	// 1) transform the normal into the new space.
+	// 2) Determine a point on the old plane given by plane dist * plane normal
+	// 3) Transform that point into the new space
+	// 4) Plane dist = DotProduct( new normal, new point )
+
+	// The optimized version used here, which works if the plane is orthogonal:
+	// 1) Transform the normal into the new space
+	// 2) Realize that transforming the old plane point into the new space
+	// is given by [ d * n'x + Tx, d * n'y + Ty, d * n'z + Tz ]
+	// where d = old plane dist, n' = transformed normal, Tn = translational component of transform
+	// 3) Compute the new plane dist using the dot product of the normal result of #2
+
+	// For a correct result, this should be an inverse-transpose matrix
+	// but that only matters if there are nonuniform scale or skew factors in this matrix.
+	Vector3DMultiply( src, inPlane.normal, outPlane.normal );
+
+	// d * (n' . n')
+	const float flNormalLenSqr = DotProduct( outPlane.normal, outPlane.normal );
+	outPlane.dist = inPlane.dist * flNormalLenSqr;
+
+	// + (n' . T)
+	Vector vTranslation;
+	outPlane.dist += DotProduct( outPlane.normal, src.GetTranslation(vTranslation) );
+}
+
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns thePlane after it has been passed through TransformPlane on this
+// matrix.
+VPlane VMatrix::operator*(const VPlane &thePlane) const
+{
+	VPlane transformed;
+	TransformPlane( thePlane, transformed );
+	return transformed;
+}
+
+#endif
+
+
+//-----------------------------------------------------------------------------
+// Builds a translation matrix: identity with (x, y, z) in column 3
+//-----------------------------------------------------------------------------
+void MatrixBuildTranslation( VMatrix& dst, float x, float y, float z )
+{
+	MatrixSetIdentity( dst );
+
+	// Column 3 of the top three rows carries the offset.
+	const float offset[3] = { x, y, z };
+	for ( int row = 0; row < 3; ++row )
+	{
+		dst[row][3] = offset[row];
+	}
+}
+
+// Vector form: builds an identity matrix with 'translation' in column 3.
+void MatrixBuildTranslation( VMatrix& dst, const Vector &translation )
+{
+	MatrixSetIdentity( dst );
+	for ( int row = 0; row < 3; ++row )
+	{
+		dst[row][3] = translation[row];
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: Builds the matrix for a counterclockwise rotation about an arbitrary axis.
+//
+//		   | ax2 + (1 - ax2)cosQ		axay(1 - cosQ) - azsinQ		azax(1 - cosQ) + aysinQ |
+// Ra(Q) = | axay(1 - cosQ) + azsinQ	ay2 + (1 - ay2)cosQ			ayaz(1 - cosQ) - axsinQ |
+//		   | azax(1 - cosQ) - aysinQ	ayaz(1 - cosQ) + axsinQ		az2 + (1 - az2)cosQ     |
+//          
+// Input  : dst - matrix to fill in
+//			vAxisOfRot - axis to rotate about
+//			angleDegrees - rotation angle, in degrees
+//-----------------------------------------------------------------------------
+void MatrixBuildRotationAboutAxis( VMatrix &dst, const Vector &vAxisOfRot, float angleDegrees )
+{
+	// The 3x4 overload fills the top three rows.
+	MatrixBuildRotationAboutAxis( vAxisOfRot, angleDegrees, dst.As3x4() );
+
+	// Complete the bottom row as (0, 0, 0, 1).
+	dst[3][0] = dst[3][1] = dst[3][2] = 0.0f;
+	dst[3][3] = 1.0f;
+}
+
+
+//-----------------------------------------------------------------------------
+// Builds a rotation matrix that rotates one direction vector into another.
+// NOTE(review): the dot product is fed straight to acos, so both directions
+// are presumably expected to be unit length -- confirm against callers.
+//-----------------------------------------------------------------------------
+void MatrixBuildRotation( VMatrix &dst, const Vector& initialDirection, const Vector& finalDirection )
+{
+	// Starts out as the cosine of the angle, ends up as the angle in degrees.
+	float angle = DotProduct( initialDirection, finalDirection );
+	Assert( IsFinite(angle) );
+
+	// Parallel case: no rotation required.
+	if (angle - 1.0 > -1e-3)
+	{
+		MatrixSetIdentity(dst);
+		return;
+	}
+
+	Vector axis;
+	if (angle + 1.0 < 1e-3)
+	{
+		// antiparallel case, pick any axis in the plane
+		// perpendicular to the final direction. Choose the direction (x,y,z)
+		// which has the minimum component of the final direction, use that
+		// as an initial guess, then subtract out the component which is 
+		// parallel to the final direction
+		int idx = 0;
+		for ( int i = 1; i < 3; ++i )
+		{
+			if (FloatMakePositive(finalDirection[i]) < FloatMakePositive(finalDirection[idx]))
+			{
+				idx = i;
+			}
+		}
+
+		axis.Init( 0, 0, 0 );
+		axis[idx] = 1.0f;
+		VectorMA( axis, -DotProduct( axis, finalDirection ), finalDirection, axis );
+		VectorNormalize(axis);
+		angle = 180.0f;
+	}
+	else
+	{
+		// General case: rotate about the normal of the plane holding both
+		// directions, by the angle between them.
+		CrossProduct( initialDirection, finalDirection, axis );
+		VectorNormalize( axis );
+		angle = acos(angle) * 180 / M_PI;
+	}
+
+	MatrixBuildRotationAboutAxis( dst, axis, angle );
+
+#ifdef _DEBUG
+	// Sanity check: the result should carry the initial direction onto the final one.
+	Vector test;
+	Vector3DMultiply( dst, initialDirection, test );
+	test -= finalDirection;
+	Assert( test.LengthSqr() < 1e-3 );
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Builds a matrix for a rotation of angleDegrees about the Z axis
+//-----------------------------------------------------------------------------
+void MatrixBuildRotateZ( VMatrix &dst, float angleDegrees )
+{
+	float radians = angleDegrees * ( M_PI / 180.0f );
+
+	const float s = ( float )sin( radians );
+	const float c = ( float )cos( radians );
+
+	// Everything outside the upper-left 2x2 is identity.
+	MatrixSetIdentity( dst );
+	dst[0][0] = c;
+	dst[0][1] = -s;
+	dst[1][0] = s;
+	dst[1][1] = c;
+}
+
+// Builds a scale matrix: x, y and z on the diagonal, 1 in the last diagonal
+// element and zeros everywhere else.
+void MatrixBuildScale( VMatrix &dst, float x, float y, float z )
+{
+	MatrixSetIdentity( dst );
+	dst[0][0] = x;
+	dst[1][1] = y;
+	dst[2][2] = z;
+}
+
+// Vector form: forwards the three components to the float overload.
+void MatrixBuildScale( VMatrix &dst, const Vector& scale )
+{
+	const float sx = scale.x;
+	const float sy = scale.y;
+	const float sz = scale.z;
+	MatrixBuildScale( dst, sx, sy, sz );
+}
+
+// Builds a perspective matrix from horizontal/vertical fields of view (in
+// degrees) and near/far distances, then pre-multiplies it by three fix-up
+// matrices: negate X and Y, add W into X and Y, and halve X and Y.
+void MatrixBuildPerspective( VMatrix &dst, float fovX, float fovY, float zNear, float zFar )
+{
+	// FIXME: collapse all of this into one matrix after we figure out what all should be in here.
+	float nearWidth = 2 * zNear * tan( fovX * ( M_PI/180.0f ) * 0.5f );
+	float nearHeight = 2 * zNear * tan( fovY * ( M_PI/180.0f ) * 0.5f );
+
+	// Base projection; every element not listed below is zero.
+	memset( dst.Base(), 0, sizeof( dst ) );
+	dst[0][0]  = 2.0F * zNear / nearWidth;
+	dst[1][1]  = 2.0F * zNear / nearHeight;
+	dst[2][2] = -zFar / ( zNear - zFar );
+	dst[3][2] = 1.0f;
+	dst[2][3] = zNear * zFar / ( zNear - zFar );
+
+	// negate X and Y so that X points right, and Y points up.
+	VMatrix flipXY;
+	flipXY.Identity();
+	flipXY[0][0] = -1.0f;
+	flipXY[1][1] = -1.0f;
+
+	// Adds W into X and Y.
+	VMatrix offsetByW;
+	offsetByW.Identity();
+	offsetByW[0][3] = 1.0f;
+	offsetByW[1][3] = 1.0f;
+	offsetByW[2][3] = 0.0f;
+
+	// Halves X and Y.
+	VMatrix halveXY;
+	halveXY.Identity();
+	halveXY[0][0] = 0.5f;
+	halveXY[1][1] = 0.5f;
+
+	// Apply the fix-ups in order; each one is multiplied on the left.
+	MatrixMultiply( flipXY, dst, dst );
+	MatrixMultiply( offsetByW, dst, dst );
+	MatrixMultiply( halveXY, dst, dst );
+}
+
+// Transforms the volume-space point (x, y, z) into world space, with a
+// perspective divide, and grows mins/maxs to contain the result.
+static inline void CalculateAABBForNormalizedFrustum_Helper( float x, float y, float z, const VMatrix &volumeToWorld, Vector &mins, Vector &maxs )
+{
+	Vector volumeSpacePos;
+	volumeSpacePos.Init( x, y, z );
+
+	// Make sure it's been clipped: each component must lie in [0, 1],
+	// give or take 1e-3.
+	Assert( volumeSpacePos[0] >= -1e-3f );
+	Assert( volumeSpacePos[0] - 1.0f <= 1e-3f );
+	Assert( volumeSpacePos[1] >= -1e-3f );
+	Assert( volumeSpacePos[1] - 1.0f <= 1e-3f );
+	Assert( volumeSpacePos[2] >= -1e-3f );
+	Assert( volumeSpacePos[2] - 1.0f <= 1e-3f );
+
+	Vector worldSpacePos;
+	Vector3DMultiplyPositionProjective( volumeToWorld, volumeSpacePos, worldSpacePos );
+	AddPointToBounds( worldSpacePos, mins, maxs );
+}
+
+//-----------------------------------------------------------------------------
+// Given an inverse projection matrix, transforms the eight corners of the
+// unit volume into world space and returns the axis-aligned box bounding them.
+//-----------------------------------------------------------------------------
+void CalculateAABBFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pMins, Vector *pMaxs )
+{
+	// FIXME: Could maybe do better than the compile with all of these multiplies by 0 and 1.
+	ClearBounds( *pMins, *pMaxs );
+
+	// Walk the corners (0,0,0), (0,0,1), (0,1,0), ... (1,1,1).
+	for ( int ix = 0; ix < 2; ++ix )
+	{
+		for ( int iy = 0; iy < 2; ++iy )
+		{
+			for ( int iz = 0; iz < 2; ++iz )
+			{
+				CalculateAABBForNormalizedFrustum_Helper( ix, iy, iz, volumeToWorld, *pMins, *pMaxs );
+			}
+		}
+	}
+}
+
+// Inverts the given world-to-volume matrix and hands the inverse to
+// CalculateAABBFromProjectionMatrixInverse.
+void CalculateAABBFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pMins, Vector *pMaxs )
+{
+	VMatrix inverseProjection;
+	MatrixInverseGeneral( worldToVolume, inverseProjection );
+
+	CalculateAABBFromProjectionMatrixInverse( inverseProjection, pMins, pMaxs );
+}
+
+//-----------------------------------------------------------------------------
+// Given an inverse projection matrix, transforms the extremes of the unit
+// volume into world space and computes a bounding sphere from them.
+// Outputs: *pCenter receives the sphere center, *pflRadius its radius.
+//-----------------------------------------------------------------------------
+void CalculateSphereFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pCenter, float *pflRadius )
+{
+	// FIXME: Could maybe do better than the compile with all of these multiplies by 0 and 1.
+
+	// Need 3 points: the endpoint of the line through the center of the near + far planes,
+	// and one point on the far plane. From that, we can derive a point somewhere on the center	line
+	// which would produce the smallest bounding sphere.
+	Vector vecCenterNear, vecCenterFar, vecNearEdge, vecFarEdge;
+	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.5f, 0.5f, 0.0f ), vecCenterNear );
+	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.5f, 0.5f, 1.0f ), vecCenterFar );
+	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.0f, 0.0f, 0.0f ), vecNearEdge );
+	Vector3DMultiplyPositionProjective( volumeToWorld, Vector( 0.0f, 0.0f, 1.0f ), vecFarEdge );
+
+	// Let the distance between the near + far center points = l
+	// Let the distance between the near center point + near edge point = h1
+	// Let the distance between the far center point + far edge point = h2
+	// Let the distance along the center line from the near point to the sphere center point = x
+	// Then let the distance between the sphere center point + near edge point == 
+	//	the distance between the sphere center point + far edge point == r == radius of sphere
+	// Then h1^2 + x^2 == r^2 == (l-x)^2 + h2^2
+	// h1^2 + x^2 = l^2 - 2 * l * x + x^2 + h2^2
+	// 2 * l * x = l^2 + h2^2 - h1^2
+	// x = (l^2 + h2^2 - h1^2) / (2 * l)
+	// r = sqrt( h1^2 + x^2 )
+	Vector vecDelta;
+	VectorSubtract( vecCenterFar, vecCenterNear, vecDelta );
+	float l = vecDelta.Length();
+	float h1Sqr = vecCenterNear.DistToSqr( vecNearEdge );
+	float h2Sqr = vecCenterFar.DistToSqr( vecFarEdge );
+
+	// Degenerate volume: the near and far center points coincide, so there is
+	// no center line to slide along and the formula below would divide by
+	// zero. Center the sphere on that point and use the larger edge distance.
+	if ( l == 0.0f )
+	{
+		*pCenter = vecCenterNear;
+		*pflRadius = sqrt( ( h1Sqr > h2Sqr ) ? h1Sqr : h2Sqr );
+		return;
+	}
+
+	float x = (l*l + h2Sqr - h1Sqr) / (2.0f * l);
+	// x / l turns the distance along the center line into a fraction of vecDelta.
+	VectorMA( vecCenterNear, (x / l), vecDelta, *pCenter );
+	*pflRadius = sqrt( h1Sqr + x*x );
+}
+
+//-----------------------------------------------------------------------------
+// Given a projection matrix, inverts it and computes a world-space bounding
+// sphere via CalculateSphereFromProjectionMatrixInverse.
+//-----------------------------------------------------------------------------
+void CalculateSphereFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pCenter, float *pflRadius )
+{
+	VMatrix inverseProjection;
+	MatrixInverseGeneral( worldToVolume, inverseProjection );
+
+	CalculateSphereFromProjectionMatrixInverse( inverseProjection, pCenter, pflRadius );
+}
+
+
+// Transforms three points by shadowToWorld (with perspective divide) and
+// returns the plane through them: 'normal' is the normalized cross product
+// of (p2 - p1) and (p3 - p1) after transformation, and 'dist' is that normal
+// dotted with the transformed p1.
+static inline void FrustumPlanesFromMatrixHelper( const VMatrix &shadowToWorld, const Vector &p1, const Vector &p2, const Vector &p3, 
+												 Vector &normal, float &dist )
+{
+	Vector worldA, worldB, worldC;
+	Vector3DMultiplyPositionProjective( shadowToWorld, p1, worldA );
+	Vector3DMultiplyPositionProjective( shadowToWorld, p2, worldB );
+	Vector3DMultiplyPositionProjective( shadowToWorld, p3, worldC );
+
+	// Two edges of the triangle, both leaving the first point.
+	Vector edgeAB, edgeAC;
+	VectorSubtract( worldB, worldA, edgeAB );
+	VectorSubtract( worldC, worldA, edgeAC );
+
+	CrossProduct( edgeAB, edgeAC, normal );
+	VectorNormalize( normal );
+
+	dist = DotProduct( normal, worldA );	
+}
+
+// Fills in the six planes of 'frustum' from a clip-to-world matrix. Each
+// plane is derived from three corners of the unit volume, listed so that the
+// helper's cross product gives the intended facing.
+void FrustumPlanesFromMatrix( const VMatrix &clipToWorld, Frustum_t &frustum )
+{
+	// One entry per plane: which plane it is, then three unit-volume corners.
+	static const struct
+	{
+		int		nPlane;
+		float	corner[3][3];
+	} s_Planes[] =
+	{
+		{ FRUSTUM_NEARZ,	{ { 0.0f, 0.0f, 0.0f }, { 1.0f, 0.0f, 0.0f }, { 0.0f, 1.0f, 0.0f } } },
+		{ FRUSTUM_FARZ,		{ { 0.0f, 0.0f, 1.0f }, { 0.0f, 1.0f, 1.0f }, { 1.0f, 0.0f, 1.0f } } },
+		{ FRUSTUM_RIGHT,	{ { 1.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, { 1.0f, 1.0f, 0.0f } } },
+		{ FRUSTUM_LEFT,		{ { 0.0f, 0.0f, 0.0f }, { 0.0f, 1.0f, 1.0f }, { 0.0f, 0.0f, 1.0f } } },
+		{ FRUSTUM_TOP,		{ { 1.0f, 1.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, { 0.0f, 1.0f, 1.0f } } },
+		{ FRUSTUM_BOTTOM,	{ { 1.0f, 0.0f, 0.0f }, { 0.0f, 0.0f, 1.0f }, { 1.0f, 0.0f, 1.0f } } },
+	};
+
+	Vector normal;
+	float dist;
+
+	for ( int i = 0; i < 6; ++i )
+	{
+		const float (*c)[3] = s_Planes[i].corner;
+
+		FrustumPlanesFromMatrixHelper( clipToWorld, 
+			Vector( c[0][0], c[0][1], c[0][2] ), Vector( c[1][0], c[1][1], c[1][2] ), Vector( c[2][0], c[2][1], c[2][2] ), normal, dist );
+		frustum.SetPlane( s_Planes[i].nPlane, PLANE_ANYZ, normal, dist );
+	}
+}
+
+// Builds an orthographic projection matrix from the given left/top/right/
+// bottom extents and near/far distances.
+void MatrixBuildOrtho( VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar )
+{
+	// FIXME: This is being used incorrectly! Should read:
+	// D3DXMatrixOrthoOffCenterRH( &matrix, left, right, bottom, top, zNear, zFar );
+	// Which is certainly why we need these extra -1 scales in y. Bleah
+
+	// NOTE: The camera can be imagined as the following diagram:
+	//		/z
+	//	   /
+	//	  /____ x	Z is going into the screen
+	//	  |
+	//	  |
+	//	  |y
+	//
+	// (0,0,z) represents the upper-left corner of the screen.
+	// Our projection transform needs to transform from this space to a LH coordinate
+	// system that looks thusly:
+	// 
+	//	y|  /z
+	//	 | /
+	//	 |/____ x	Z is going into the screen
+	//
+	// Where x,y lies between -1 and 1, and z lies from 0 to 1
+	// This is because the viewport transformation from projection space to pixels
+	// introduces a -1 scale in the y coordinates
+	//		D3DXMatrixOrthoOffCenterRH( &matrix, left, right, top, bottom, zNear, zFar );
+
+	// Per-axis scale (diagonal) and offset (column 3) terms.
+	const double xScale  = 2.0f / ( right - left );
+	const double xOffset = ( left + right ) / ( left - right );
+	const double yScale  = 2.0f / ( bottom - top );
+	const double yOffset = ( bottom + top ) / ( top - bottom );
+	const double zScale  = 1.0f / ( zNear - zFar );
+	const double zOffset = zNear / ( zNear - zFar );
+
+	dst.Init(	xScale,	0.0f,	0.0f,	xOffset,
+				0.0f,	yScale,	0.0f,	yOffset,
+				0.0f,	0.0f,	zScale,	zOffset,
+				0.0f,	0.0f,	0.0f,	1.0f );
+}
+
+// Overwrites row 2 of dst with the depth terms for the given near/far
+// distances. No other row of dst is touched.
+void MatrixBuildPerspectiveZRange( VMatrix& dst, double flZNear, double flZFar )
+{
+	const double flNearMinusFar = flZNear - flZFar;
+
+	dst.m[2][0] = 0.0f;
+	dst.m[2][1] = 0.0f;
+	dst.m[2][2] = flZFar / flNearMinusFar;
+	dst.m[2][3] = flZNear * flZFar / flNearMinusFar;
+}
+
+// Builds a perspective matrix from a horizontal field of view in degrees and
+// an aspect value that multiplies the width scale to give the height scale.
+// Row 2 is filled in by MatrixBuildPerspectiveZRange.
+void MatrixBuildPerspectiveX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar )
+{
+	// flFovX * M_PI / 360 is half the field of view, in radians.
+	float flScaleX = 1.0f / tanf( flFovX * M_PI / 360.0f );
+	float flScaleY = flAspect * flScaleX;
+
+	dst.Init(	flScaleX,	0.0f,		0.0f,	0.0f,
+				0.0f,		flScaleY,	0.0f,	0.0f,
+				0.0f,		0.0f,		0.0f,	0.0f,
+				0.0f,		0.0f,	   -1.0f,	0.0f );
+
+	MatrixBuildPerspectiveZRange ( dst, flZNear, flZFar );
+}
+
+// Builds an off-center perspective matrix covering the sub-rectangle given by
+// bottom/top/left/right of the view described by flFovX and flAspect.
+// Row 2 is filled in by MatrixBuildPerspectiveZRange.
+void MatrixBuildPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right )
+{
+	float flWidth = tanf( flFovX * M_PI / 360.0f );
+	float flHeight = flWidth / flAspect;
+
+	const float flHalfWidth = flWidth/2.0f;
+	const float flHalfHeight = flHeight/2.0f;
+
+	// bottom, top, left, right are 0..1 so convert to -<val>/2..<val>/2
+	float flLeft   = -flHalfWidth  * (1.0f - left)   + left   * flHalfWidth;
+	float flRight  = -flHalfWidth  * (1.0f - right)  + right  * flHalfWidth;
+	float flBottom = -flHalfHeight * (1.0f - bottom) + bottom * flHalfHeight;
+	float flTop    = -flHalfHeight * (1.0f - top)    + top    * flHalfHeight;
+
+	const float flSpanX = flRight-flLeft;
+	const float flSpanY = flTop-flBottom;
+
+	dst.Init(   1.0f / flSpanX,		0.0f,				(flLeft+flRight)/flSpanX,	0.0f,
+				0.0f,				1.0f /flSpanY,		(flTop+flBottom)/flSpanY,	0.0f,
+				0.0f,				0.0f,				0.0f,						0.0f,
+				0.0f,				0.0f,				-1.0f,						0.0f );
+
+	MatrixBuildPerspectiveZRange ( dst, flZNear, flZFar );
+}
+#endif // !_STATIC_LINKED || _SHARED_LIB
+
author	Alan Edwardes <[email protected]>	2013-12-03 10:47:30 +0000
committer	Alan Edwardes <[email protected]>	2013-12-03 10:47:30 +0000
commit	550992aebacbc7586553c15a3c2120f85a879126 (patch)
tree	c814cf654018acd5d69bb6e4be5dc9900391fd37 /mp/src/mathlib
parent	VBSP now checks all search paths for an FGD file. (diff)
parent	Make .xcconfigs text files too. (diff)
download	source-sdk-2013-550992aebacbc7586553c15a3c2120f85a879126.tar.xz source-sdk-2013-550992aebacbc7586553c15a3c2120f85a879126.zip