1HEAD master

author: FluorescentCIAAfricanAmerican <[email protected]> 2020-04-22 12:56:21 -0400
committer: FluorescentCIAAfricanAmerican <[email protected]> 2020-04-22 12:56:21 -0400
commit: 3bf9df6b2785fa6d951086978a3e66f49427166a (patch)
tree: 2c0f1f0c63c4832882bc93814ebd2c2b1c6224e5 /public/mathlib
download: archived-source-engine-2018-hl2-src-master.tar.xz
archived-source-engine-2018-hl2-src-master.zip
24 files changed, 13858 insertions, 0 deletions
diff --git a/public/mathlib/IceKey.H b/public/mathlib/IceKey.H
new file mode 100644
index 0000000..f8641d0
--- /dev/null
+++ b/public/mathlib/IceKey.H
@@ -0,0 +1,62 @@
+// Purpose: Header file for the C++ ICE encryption class.
+//			Taken from public domain code, as written by Matthew Kwan - July 1996
+//			http://www.darkside.com.au/ice/
+
+#ifndef _IceKey_H
+#define _IceKey_H
+
+/*
+The IceKey class is used for encrypting and decrypting 64-bit blocks of data 
+with the ICE (Information Concealment Engine) encryption algorithm. 
+
+The constructor creates a new IceKey object that can be used to encrypt and decrypt data. 
+The level of encryption determines the size of the key, and hence its speed. 
+Level 0 uses the Thin-ICE variant, which is an 8-round cipher taking an 8-byte key. 
+This is the fastest option, and is generally considered to be at least as secure as DES, 
+although it is not yet certain whether it is as secure as its key size. 
+
+For levels n greater than zero, a 16n-round cipher is used, taking 8n-byte keys. 
+Although not as fast as level 0, these are very very secure. 
+
+Before an IceKey can be used to encrypt data, its key schedule must be set with the set() member function. 
+The length of the key required is determined by the level, as described above. 
+
+The member functions encrypt() and decrypt() encrypt and decrypt respectively data 
+in blocks of eight characters, using the specified key. 
+
+Two functions keySize() and blockSize() are provided 
+which return the key and block size respectively, measured in bytes. 
+The key size is determined by the level, while the block size is always 8. 
+
+The destructor zeroes out and frees up all memory associated with the key. 
+*/
+
+class IceSubkey;
+
+class IceKey {
+    public:
+	IceKey (int n);		// n is the ICE level: 0 = Thin-ICE (8 rounds, 8-byte key), n > 0 = 16n rounds, 8n-byte key
+	~IceKey ();		// documented above as zeroing and freeing all memory associated with the key
+
+	void		set (const unsigned char *key);	// builds the key schedule; must be called before encrypt()/decrypt()
+
+	void		encrypt (const unsigned char *plaintext,	// one block of blockSize() bytes in
+					unsigned char *ciphertext) const;	// one block of blockSize() bytes out
+
+	void		decrypt (const unsigned char *ciphertext,	// one block of blockSize() bytes in
+					unsigned char *plaintext) const;	// one block of blockSize() bytes out
+
+	int		keySize () const;	// key length in bytes, determined by the level
+
+	int		blockSize () const;	// block length in bytes; documented above as always 8
+
+    private:
+	void		scheduleBuild (unsigned short *k, int n,	// internal key-schedule helper; presumably driven by set() - confirm in the .cpp
+							const int *keyrot);
+
+	int		_size;		// presumably the key size in 8-byte units - confirm in the .cpp
+	int		_rounds;	// presumably the number of cipher rounds for the chosen level - confirm in the .cpp
+	IceSubkey	*_keysched;	// NOTE(review): raw pointer with no copy ctor/assignment declared; copying an IceKey looks unsafe if the destructor frees this - confirm
+};
+
+#endif
diff --git a/public/mathlib/amd3dx.h b/public/mathlib/amd3dx.h
new file mode 100644
index 0000000..9dab1bf
--- /dev/null
+++ b/public/mathlib/amd3dx.h
@@ -0,0 +1,1188 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+/******************************************************************************
+
+ Copyright (c) 1999 Advanced Micro Devices, Inc.
+
+ LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
+ EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
+ NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
+ PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
+ DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
+ BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
+ INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
+ OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
+ OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
+ NOT APPLY TO YOU.
+
+ AMD does not assume any responsibility for any errors which may appear in the
+ Materials nor any responsibility to support or update the Materials.  AMD retains
+ the right to make changes to its test specifications at any time, without notice.
+
+ NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
+ further information, software, technical information, know-how, or show-how
+ available to you.
+
+ So that all may benefit from your experience, please report  any  problems
+ or  suggestions about this software to [email protected]
+
+ AMD Developer Technologies, M/S 585
+ Advanced Micro Devices, Inc.
+ 5900 E. Ben White Blvd.
+ Austin, TX 78741
+ [email protected]
+
+*******************************************************************************
+
+ AMD3DX.H
+
+ MACRO FORMAT
+ ============
+ This file contains inline assembly macros that
+ generate AMD-3D instructions in binary format.
+ Therefore, C or C++ programmer can use AMD-3D instructions
+ without any penalty in their C or C++ source code.
+
+ The macro name and format conventions are as follows:
+
+
+ 1. First argument of macro is a destination and
+    second argument is a source operand.
+      ex) _asm PFCMPEQ (mm3, mm4)
+                         |    |
+                        dst  src
+
+ 2. The destination operand can be m0 to m7 only.
+    The source operand can be any one of the register
+    m0 to m7 or _eax, _ecx, _edx, _ebx, _esi, or _edi
+    that contains effective address.
+      ex) _asm PFRCP    (MM7, MM6)
+      ex) _asm PFRCPIT2 (mm0, mm4)
+      ex) _asm PFMUL    (mm3, _edi)
+
+  3. The prefetch(w) takes one src operand _eax, _ecx, _edx,
+     _ebx, _esi, or _edi that contains effective address.
+      ex) _asm PREFETCH (_edi)
+
+ For WATCOM C/C++ users, when using #pragma aux instead of 
+ _asm, all macro names should be prefixed by a p_ or P_. 
+ Macros should not be enclosed in quotes.
+              ex) p_pfrcp (MM7,MM6)
+
+ NOTE: Not all instruction macros, nor all possible
+       combinations of operands have been explicitly
+       tested. If any errors are found, please report
+       them.
+
+ EXAMPLE
+ =======
+ Following program doesn't do anything but it shows you
+ how to use inline assembly AMD-3D instructions in C.
+ Note that this will only work in a flat memory model, in which
+ the segment registers cs, ds, ss and es all point to the same
+ linear address space of less than 4GB in total.
+
+ Used Microsoft VC++ 5.0
+
+ #include <stdio.h>
+ #include "amd3dx.h"
+
+ void main ()
+ {
+      float x = (float)1.25;
+      float y = (float)1.25;
+      float z, zz;
+
+     _asm {
+              movd mm1, x
+              movd mm2, y
+              pfmul (mm1, mm2)
+              movd z, mm1
+              femms
+      }
+
+      printf ("value of z = %f\n", z);
+
+      //
+      // Demonstration of using the memory instead of
+      // multimedia register
+      //
+      _asm {
+              movd mm3, x
+              lea esi, y   // load effective address of y
+              pfmul (mm3, _esi)
+              movd zz, mm3
+              femms
+      }
+
+      printf ("value of zz = %f\n", zz);
+  }
+
+ #pragma aux EXAMPLE with WATCOM C/C++ v11.x
+ ===========================================
+
+    extern void Add(float *__Dest, float *__A, float *__B);
+    #pragma aux Add =               \
+            p_femms                 \
+            "movd mm6,[esi]"        \
+            p_pfadd(mm6,_edi)       \
+            "movd [ebx],mm6"        \
+            p_femms                 \
+            parm [ebx] [esi] [edi];
+
+*******************************************************************************/
+
+#ifndef _K3DMACROSINCLUDED_
+#define _K3DMACROSINCLUDED_
+
+#if defined (__WATCOMC__)
+
+// The WATCOM C/C++ version of the 3DNow! macros.
+//
+// The older, combined register style for WATCOM C/C++ macros is not 
+// supported.
+
+/* Operand defines for instructions two operands */
+#define _k3d_mm0_mm0 0xc0
+#define _k3d_mm0_mm1 0xc1
+#define _k3d_mm0_mm2 0xc2
+#define _k3d_mm0_mm3 0xc3
+#define _k3d_mm0_mm4 0xc4
+#define _k3d_mm0_mm5 0xc5
+#define _k3d_mm0_mm6 0xc6
+#define _k3d_mm0_mm7 0xc7
+#define _k3d_mm0_eax 0x00
+#define _k3d_mm0_ecx 0x01
+#define _k3d_mm0_edx 0x02
+#define _k3d_mm0_ebx 0x03
+#define _k3d_mm0_esi 0x06
+#define _k3d_mm0_edi 0x07
+#define _k3d_mm1_mm0 0xc8
+#define _k3d_mm1_mm1 0xc9
+#define _k3d_mm1_mm2 0xca
+#define _k3d_mm1_mm3 0xcb
+#define _k3d_mm1_mm4 0xcc
+#define _k3d_mm1_mm5 0xcd
+#define _k3d_mm1_mm6 0xce
+#define _k3d_mm1_mm7 0xcf
+#define _k3d_mm1_eax 0x08
+#define _k3d_mm1_ecx 0x09
+#define _k3d_mm1_edx 0x0a
+#define _k3d_mm1_ebx 0x0b
+#define _k3d_mm1_esi 0x0e
+#define _k3d_mm1_edi 0x0f
+#define _k3d_mm2_mm0 0xd0
+#define _k3d_mm2_mm1 0xd1
+#define _k3d_mm2_mm2 0xd2
+#define _k3d_mm2_mm3 0xd3
+#define _k3d_mm2_mm4 0xd4
+#define _k3d_mm2_mm5 0xd5
+#define _k3d_mm2_mm6 0xd6
+#define _k3d_mm2_mm7 0xd7
+#define _k3d_mm2_eax 0x10
+#define _k3d_mm2_ecx 0x11
+#define _k3d_mm2_edx 0x12
+#define _k3d_mm2_ebx 0x13
+#define _k3d_mm2_esi 0x16
+#define _k3d_mm2_edi 0x17
+#define _k3d_mm3_mm0 0xd8
+#define _k3d_mm3_mm1 0xd9
+#define _k3d_mm3_mm2 0xda
+#define _k3d_mm3_mm3 0xdb
+#define _k3d_mm3_mm4 0xdc
+#define _k3d_mm3_mm5 0xdd
+#define _k3d_mm3_mm6 0xde
+#define _k3d_mm3_mm7 0xdf
+#define _k3d_mm3_eax 0x18
+#define _k3d_mm3_ecx 0x19
+#define _k3d_mm3_edx 0x1a
+#define _k3d_mm3_ebx 0x1b
+#define _k3d_mm3_esi 0x1e
+#define _k3d_mm3_edi 0x1f
+#define _k3d_mm4_mm0 0xe0
+#define _k3d_mm4_mm1 0xe1
+#define _k3d_mm4_mm2 0xe2
+#define _k3d_mm4_mm3 0xe3
+#define _k3d_mm4_mm4 0xe4
+#define _k3d_mm4_mm5 0xe5
+#define _k3d_mm4_mm6 0xe6
+#define _k3d_mm4_mm7 0xe7
+#define _k3d_mm4_eax 0x20
+#define _k3d_mm4_ecx 0x21
+#define _k3d_mm4_edx 0x22
+#define _k3d_mm4_ebx 0x23
+#define _k3d_mm4_esi 0x26
+#define _k3d_mm4_edi 0x27
+#define _k3d_mm5_mm0 0xe8
+#define _k3d_mm5_mm1 0xe9
+#define _k3d_mm5_mm2 0xea
+#define _k3d_mm5_mm3 0xeb
+#define _k3d_mm5_mm4 0xec
+#define _k3d_mm5_mm5 0xed
+#define _k3d_mm5_mm6 0xee
+#define _k3d_mm5_mm7 0xef
+#define _k3d_mm5_eax 0x28
+#define _k3d_mm5_ecx 0x29
+#define _k3d_mm5_edx 0x2a
+#define _k3d_mm5_ebx 0x2b
+#define _k3d_mm5_esi 0x2e
+#define _k3d_mm5_edi 0x2f
+#define _k3d_mm6_mm0 0xf0
+#define _k3d_mm6_mm1 0xf1
+#define _k3d_mm6_mm2 0xf2
+#define _k3d_mm6_mm3 0xf3
+#define _k3d_mm6_mm4 0xf4
+#define _k3d_mm6_mm5 0xf5
+#define _k3d_mm6_mm6 0xf6
+#define _k3d_mm6_mm7 0xf7
+#define _k3d_mm6_eax 0x30
+#define _k3d_mm6_ecx 0x31
+#define _k3d_mm6_edx 0x32
+#define _k3d_mm6_ebx 0x33
+#define _k3d_mm6_esi 0x36
+#define _k3d_mm6_edi 0x37
+#define _k3d_mm7_mm0 0xf8
+#define _k3d_mm7_mm1 0xf9
+#define _k3d_mm7_mm2 0xfa
+#define _k3d_mm7_mm3 0xfb
+#define _k3d_mm7_mm4 0xfc
+#define _k3d_mm7_mm5 0xfd
+#define _k3d_mm7_mm6 0xfe
+#define _k3d_mm7_mm7 0xff
+#define _k3d_mm7_eax 0x38
+#define _k3d_mm7_ecx 0x39
+#define _k3d_mm7_edx 0x3a
+#define _k3d_mm7_ebx 0x3b
+#define _k3d_mm7_esi 0x3e
+#define _k3d_mm7_edi 0x3f
+
+#define _k3d_name_xlat_m0 _mm0
+#define _k3d_name_xlat_m1 _mm1
+#define _k3d_name_xlat_m2 _mm2
+#define _k3d_name_xlat_m3 _mm3
+#define _k3d_name_xlat_m4 _mm4
+#define _k3d_name_xlat_m5 _mm5
+#define _k3d_name_xlat_m6 _mm6
+#define _k3d_name_xlat_m7 _mm7
+#define _k3d_name_xlat_M0 _mm0
+#define _k3d_name_xlat_M1 _mm1
+#define _k3d_name_xlat_M2 _mm2
+#define _k3d_name_xlat_M3 _mm3
+#define _k3d_name_xlat_M4 _mm4
+#define _k3d_name_xlat_M5 _mm5
+#define _k3d_name_xlat_M6 _mm6
+#define _k3d_name_xlat_M7 _mm7
+#define _k3d_name_xlat_mm0 _mm0
+#define _k3d_name_xlat_mm1 _mm1
+#define _k3d_name_xlat_mm2 _mm2
+#define _k3d_name_xlat_mm3 _mm3
+#define _k3d_name_xlat_mm4 _mm4
+#define _k3d_name_xlat_mm5 _mm5
+#define _k3d_name_xlat_mm6 _mm6
+#define _k3d_name_xlat_mm7 _mm7
+#define _k3d_name_xlat_MM0 _mm0
+#define _k3d_name_xlat_MM1 _mm1
+#define _k3d_name_xlat_MM2 _mm2
+#define _k3d_name_xlat_MM3 _mm3
+#define _k3d_name_xlat_MM4 _mm4
+#define _k3d_name_xlat_MM5 _mm5
+#define _k3d_name_xlat_MM6 _mm6
+#define _k3d_name_xlat_MM7 _mm7
+#define _k3d_name_xlat_eax _eax
+#define _k3d_name_xlat_ebx _ebx
+#define _k3d_name_xlat_ecx _ecx
+#define _k3d_name_xlat_edx _edx
+#define _k3d_name_xlat_esi _esi
+#define _k3d_name_xlat_edi _edi
+#define _k3d_name_xlat_ebp _ebp
+#define _k3d_name_xlat_EAX _eax
+#define _k3d_name_xlat_EBX _ebx
+#define _k3d_name_xlat_ECX _ecx
+#define _k3d_name_xlat_EDX _edx
+#define _k3d_name_xlat_ESI _esi
+#define _k3d_name_xlat_EDI _edi
+#define _k3d_name_xlat_EBP _ebp
+#define _k3d_name_xlat__eax _eax
+#define _k3d_name_xlat__ebx _ebx
+#define _k3d_name_xlat__ecx _ecx
+#define _k3d_name_xlat__edx _edx
+#define _k3d_name_xlat__esi _esi
+#define _k3d_name_xlat__edi _edi
+#define _k3d_name_xlat__ebp _ebp
+#define _k3d_name_xlat__EAX _eax
+#define _k3d_name_xlat__EBX _ebx
+#define _k3d_name_xlat__ECX _ecx
+#define _k3d_name_xlat__EDX _edx
+#define _k3d_name_xlat__ESI _esi
+#define _k3d_name_xlat__EDI _edi
+#define _k3d_name_xlat__EBP _ebp
+
+#define _k3d_xglue3(a,b,c) a##b##c	/* raw three-way token paste */
+#define _k3d_glue3(a,b,c) _k3d_xglue3(a,b,c)	/* extra level so a, b and c are macro-expanded before being pasted */
+#define _k3d_MODRM(dst, src) _k3d_glue3(_k3d,_k3d_name_xlat_##dst,_k3d_name_xlat_##src)	/* normalises operand spellings via _k3d_name_xlat_*, then names the matching _k3d_<dst>_<src> constant */
+
+/* Operand defines for prefetch and prefetchw */
+
+#define _k3d_pref_eax 0x00
+#define _k3d_pref_ecx 0x01
+#define _k3d_pref_edx 0x02
+#define _k3d_pref_ebx 0x03
+#define _k3d_pref_esi 0x06
+#define _k3d_pref_edi 0x07
+#define _k3d_pref_EAX 0x00
+#define _k3d_pref_ECX 0x01
+#define _k3d_pref_EDX 0x02
+#define _k3d_pref_EBX 0x03
+#define _k3d_pref_ESI 0x06
+#define _k3d_pref_EDI 0x07
+#define _k3d_prefw_eax 0x08
+#define _k3d_prefw_ecx 0x09
+#define _k3d_prefw_edx 0x0A
+#define _k3d_prefw_ebx 0x0B
+#define _k3d_prefw_esi 0x0E
+#define _k3d_prefw_edi 0x0F
+#define _k3d_prefw_EAX 0x08
+#define _k3d_prefw_ECX 0x09
+#define _k3d_prefw_EDX 0x0A
+#define _k3d_prefw_EBX 0x0B
+#define _k3d_prefw_ESI 0x0E
+#define _k3d_prefw_EDI 0x0F
+
+/* Defines for 3DNow! instructions */
+#define PF2ID(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x1d
+#define PFACC(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xae
+#define PFADD(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x9e
+#define PFCMPEQ(dst, src)       db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb0
+#define PFCMPGE(dst, src)       db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x90
+#define PFCMPGT(dst, src)       db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa0
+#define PFMAX(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa4
+#define PFMIN(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x94
+#define PFMUL(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb4
+#define PFRCP(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x96
+#define PFRCPIT1(dst, src)      db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa6
+#define PFRCPIT2(dst, src)      db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb6
+#define PFRSQRT(dst, src)       db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x97
+#define PFRSQIT1(dst, src)      db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa7
+#define PFSUB(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x9a
+#define PFSUBR(dst, src)        db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xaa
+#define PI2FD(dst, src)         db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x0d
+#define FEMMS                   db 0x0f, 0x0e
+#define PAVGUSB(dst, src)       db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xbf
+#define PMULHRW(dst, src)       db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb7
+#define PREFETCH(src)           db 0x0f, 0x0d, _k3d_pref_##src
+#define PREFETCHW(src)          db 0x0f, 0x0d, _k3d_prefw_##src
+#define CPUID                   db 0x0f, 0xa2
+
+/* Defines for new, K7 opcodes */
+#define PFNACC(dst,src)         db 0x0f, 0x0f, _k3d_MODRM(dst,src), 0x8a
+#define FPPNACC(dst,src)        db 0x0f, 0x0f, _k3d_MODRM(dst,src), 0x8e
+#define PSWAPD(dst,src)         db 0x0f, 0x0f, _k3d_MODRM(dst,src), 0xbb
+#define PMINUB(dst,src)         db 0x0f, 0xda, _k3d_MODRM(dst,src)
+#define PMAXUB(dst,src)         db 0x0f, 0xde, _k3d_MODRM(dst,src)
+#define PMINSW(dst,src)         db 0x0f, 0xea, _k3d_MODRM(dst,src)
+#define PMAXSW(dst,src)         db 0x0f, 0xee, _k3d_MODRM(dst,src)
+#define PMULHUW(dst,src)        db 0x0f, 0xe4, _k3d_MODRM(dst,src)
+#define PAVGB(dst,src)          db 0x0f, 0xe0, _k3d_MODRM(dst,src)
+#define PAVGW(dst,src)          db 0x0f, 0xe3, _k3d_MODRM(dst,src)
+#define PSADBW(dst,src)         db 0x0f, 0xf6, _k3d_MODRM(dst,src)
+#define PMOVMSKB(dst,src)       db 0x0f, 0xd7, _k3d_MODRM(dst,src)
+#define PMASKMOVQ(dst,src)      db 0x0f, 0xf7, _k3d_MODRM(dst,src)
+#define PINSRW(dst,src,msk)     db 0x0f, 0xc4, _k3d_MODRM(dst,src), msk
+#define PEXTRW(dst,src,msk)     db 0x0f, 0xc5, _k3d_MODRM(dst,src), msk
+#define PSHUFW(dst,src,msk)     db 0x0f, 0x70, _k3d_MODRM(dst,src), msk
+#define MOVNTQ(dst,src)         db 0x0f, 0xe7, _k3d_MODRM(src,dst)
+#define SFENCE                  db 0x0f, 0xae, 0xf8
+
+/* Memory/offset versions of the opcodes */
+#define PF2IDM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x1d
+#define PFACCM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xae
+#define PFADDM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x9e
+#define PFCMPEQM(dst,src,off)   db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb0
+#define PFCMPGEM(dst,src,off)   db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x90
+#define PFCMPGTM(dst,src,off)   db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa0
+#define PFMAXM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa4
+#define PFMINM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x94
+#define PFMULM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb4
+#define PFRCPM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x96
+#define PFRCPIT1M(dst,src,off)  db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa6
+#define PFRCPIT2M(dst,src,off)  db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb6
+#define PFRSQRTM(dst,src,off)   db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x97
+#define PFRSQIT1M(dst,src,off)  db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa7
+#define PFSUBM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x9a
+#define PFSUBRM(dst,src,off)    db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xaa
+#define PI2FDM(dst,src,off)     db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x0d
+#define PAVGUSBM(dst,src,off)   db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xbf
+#define PMULHRWM(dst,src,off)   db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb7
+
+
+/* Memory/offset versions of the new, K7 opcodes */
+#define PFNACCM(dst,src,off)        db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x8a
+#define FPPNACCM(dst,src,off)       db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x8e
+#define PSWAPDM(dst,src,off)        db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xbb
+#define PMINUBM(dst,src,off)        db 0x0f, 0xda, _k3d_MODRM(dst,src) | 0x40, off
+#define PMAXUBM(dst,src,off)        db 0x0f, 0xde, _k3d_MODRM(dst,src) | 0x40, off
+#define PMINSWM(dst,src,off)        db 0x0f, 0xea, _k3d_MODRM(dst,src) | 0x40, off
+#define PMAXSWM(dst,src,off)        db 0x0f, 0xee, _k3d_MODRM(dst,src) | 0x40, off
+#define PMULHUWM(dst,src,off)       db 0x0f, 0xe4, _k3d_MODRM(dst,src) | 0x40, off
+#define PAVGBM(dst,src,off)         db 0x0f, 0xe0, _k3d_MODRM(dst,src) | 0x40, off
+#define PAVGWM(dst,src,off)         db 0x0f, 0xe3, _k3d_MODRM(dst,src) | 0x40, off
+#define PSADBWM(dst,src,off)        db 0x0f, 0xf6, _k3d_MODRM(dst,src) | 0x40, off
+#define PMOVMSKBM(dst,src,off)      db 0x0f, 0xd7, _k3d_MODRM(dst,src) | 0x40, off
+#define PMASKMOVQM(dst,src,off)     db 0x0f, 0xf7, _k3d_MODRM(dst,src) | 0x40, off
+#define MOVNTQM(dst,src,off)        db 0x0f, 0xe7, _k3d_MODRM(src,dst) | 0x40, off
+#define PINSRWM(dst,src,off,msk)    db 0x0f, 0xc4, _k3d_MODRM(dst,src) | 0x40, off, msk
+#define PSHUFWM(dst,src,off,msk)    db 0x0f, 0x70, _k3d_MODRM(dst,src) | 0x40, off, msk
+
+
+/* Defines for 3DNow! instructions for use in pragmas */
+#define p_pf2id(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0x1d
+#define p_pfacc(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0xae
+#define p_pfadd(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0x9e
+#define p_pfcmpeq(dst,src)      0x0f 0x0f _k3d_MODRM(dst,src) 0xb0
+#define p_pfcmpge(dst,src)      0x0f 0x0f _k3d_MODRM(dst,src) 0x90
+#define p_pfcmpgt(dst,src)      0x0f 0x0f _k3d_MODRM(dst,src) 0xa0
+#define p_pfmax(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0xa4
+#define p_pfmin(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0x94
+#define p_pfmul(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0xb4
+#define p_pfrcp(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0x96
+#define p_pfrcpit1(dst,src)     0x0f 0x0f _k3d_MODRM(dst,src) 0xa6
+#define p_pfrcpit2(dst,src)     0x0f 0x0f _k3d_MODRM(dst,src) 0xb6
+#define p_pfrsqrt(dst,src)      0x0f 0x0f _k3d_MODRM(dst,src) 0x97
+#define p_pfrsqit1(dst,src)     0x0f 0x0f _k3d_MODRM(dst,src) 0xa7
+#define p_pfsub(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0x9a
+#define p_pfsubr(dst,src)       0x0f 0x0f _k3d_MODRM(dst,src) 0xaa
+#define p_pi2fd(dst,src)        0x0f 0x0f _k3d_MODRM(dst,src) 0x0d
+#define p_femms                 0x0f 0x0e
+#define p_pavgusb(dst,src)      0x0f 0x0f _k3d_MODRM(dst,src) 0xbf
+#define p_pmulhrw(dst,src)      0x0f 0x0f _k3d_MODRM(dst,src) 0xb7
+#define p_prefetch(src)         0x0f 0x0d _k3d_pref_##src
+#define p_prefetchw(src)        0x0f 0x0d _k3d_prefw_##src
+#define p_pfnacc(dst,src)       0x0f 0x0f _k3d_MODRM(dst,src) 0x8a /* K7 register forms start here; same bytes as the _asm macros */
+#define p_fppnacc(dst,src)      0x0f 0x0f _k3d_MODRM(dst,src) 0x8e
+#define p_pswapd(dst,src)       0x0f 0x0f _k3d_MODRM(dst,src) 0xbb
+#define p_pminub(dst,src)       0x0f 0xda _k3d_MODRM(dst,src)
+#define p_pmaxub(dst,src)       0x0f 0xde _k3d_MODRM(dst,src)
+#define p_pminsw(dst,src)       0x0f 0xea _k3d_MODRM(dst,src)
+#define p_pmaxsw(dst,src)       0x0f 0xee _k3d_MODRM(dst,src)
+#define p_pmulhuw(dst,src)      0x0f 0xe4 _k3d_MODRM(dst,src)
+#define p_pavgb(dst,src)        0x0f 0xe0 _k3d_MODRM(dst,src)
+#define p_pavgw(dst,src)        0x0f 0xe3 _k3d_MODRM(dst,src)
+#define p_psadbw(dst,src)       0x0f 0xf6 _k3d_MODRM(dst,src)
+#define p_pmovmskb(dst,src)     0x0f 0xd7 _k3d_MODRM(dst,src)
+#define p_pmaskmovq(dst,src)    0x0f 0xf7 _k3d_MODRM(dst,src)
+#define p_pinsrw(dst,src,msk)   0x0f 0xc4 _k3d_MODRM(dst,src) msk
+#define p_pextrw(dst,src,msk)   0x0f 0xc5 _k3d_MODRM(dst,src) msk
+#define p_pshufw(dst,src,msk)   0x0f 0x70 _k3d_MODRM(dst,src) msk
+#define p_movntq(dst,src)       0x0f 0xe7 _k3d_MODRM(src,dst)
+/* Memory/offset forms for use in pragmas: ModRM is OR-ed with 0x40 and followed by the one-byte offset off */
+#define P_PF2IDM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x1d
+#define P_PFACCM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xae
+#define P_PFADDM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x9e
+#define P_PFCMPEQM(dst,src,off)  0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb0
+#define P_PFCMPGEM(dst,src,off)  0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x90
+#define P_PFCMPGTM(dst,src,off)  0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa0
+#define P_PFMAXM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa4
+#define P_PFMINM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x94
+#define P_PFMULM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb4
+#define P_PFRCPM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x96
+#define P_PFRCPIT1M(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa6
+#define P_PFRCPIT2M(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb6
+#define P_PFRSQRTM(dst,src,off)  0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x97
+#define P_PFRSQIT1M(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa7
+#define P_PFSUBM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x9a
+#define P_PFSUBRM(dst,src,off)   0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xaa
+#define P_PI2FDM(dst,src,off)    0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x0d
+#define P_PAVGUSBM(dst,src,off)  0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xbf
+#define P_PMULHRWM(dst,src,off)  0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb7
+#define P_PFNACCM(dst,src,off)   0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x8a
+#define P_FPPNACCM(dst,src,off)  0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x8e
+#define P_PSWAPDM(dst,src,off)   0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xbb
+#define P_PMINUBM(dst,src,off)   0x0f 0xda (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PMAXUBM(dst,src,off)   0x0f 0xde (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PMINSWM(dst,src,off)   0x0f 0xea (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PMAXSWM(dst,src,off)   0x0f 0xee (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PMULHUWM(dst,src,off)  0x0f 0xe4 (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PAVGBM(dst,src,off)    0x0f 0xe0 (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PAVGWM(dst,src,off)    0x0f 0xe3 (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PSADBWM(dst,src,off)   0x0f 0xf6 (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PMOVMSKBM(dst,src,off) 0x0f 0xd7 (_k3d_MODRM(dst,src) | 0x40) off
+#define P_MOVNTQM(dst,src,off)   0x0f 0xe7 (_k3d_MODRM(src,dst) | 0x40) off
+#define P_PMASKMOVQM(dst,src,off)   0x0f 0xf7 (_k3d_MODRM(dst,src) | 0x40) off
+#define P_PINSRWM(dst,src,off,msk)  0x0f 0xc4 (_k3d_MODRM(dst,src) | 0x40) off msk
+#define P_PSHUFWM(dst,src,off,msk)  0x0f 0x70 (_k3d_MODRM(dst,src) | 0x40) off msk
+
+/* Case aliases: each pragma macro is usable as p_name or P_NAME; the alias forwards to whichever spelling holds the bytes */
+#define P_PF2ID(dst,src)            p_pf2id(dst,src)
+#define P_PFACC(dst,src)            p_pfacc(dst,src)
+#define P_PFADD(dst,src)            p_pfadd(dst,src)
+#define P_PFCMPEQ(dst,src)          p_pfcmpeq(dst,src)
+#define P_PFCMPGE(dst,src)          p_pfcmpge(dst,src)
+#define P_PFCMPGT(dst,src)          p_pfcmpgt(dst,src)
+#define P_PFMAX(dst,src)            p_pfmax(dst,src)
+#define P_PFMIN(dst,src)            p_pfmin(dst,src)
+#define P_PFMUL(dst,src)            p_pfmul(dst,src)
+#define P_PFRCP(dst,src)            p_pfrcp(dst,src)
+#define P_PFRCPIT1(dst,src)         p_pfrcpit1(dst,src)
+#define P_PFRCPIT2(dst,src)         p_pfrcpit2(dst,src)
+#define P_PFRSQRT(dst,src)          p_pfrsqrt(dst,src)
+#define P_PFRSQIT1(dst,src)         p_pfrsqit1(dst,src)
+#define P_PFSUB(dst,src)            p_pfsub(dst,src)
+#define P_PFSUBR(dst,src)           p_pfsubr(dst,src)
+#define P_PI2FD(dst,src)            p_pi2fd(dst,src)
+#define P_FEMMS                     p_femms
+#define P_PAVGUSB(dst,src)          p_pavgusb(dst,src)
+#define P_PMULHRW(dst,src)          p_pmulhrw(dst,src)
+#define P_PREFETCH(src)             p_prefetch(src)
+#define P_PREFETCHW(src)            p_prefetchw(src)
+#define p_CPUID                     0x0f 0xa2
+#define p_pf2idm(dst,src,off)       P_PF2IDM(dst,src,off)
+#define p_pfaccm(dst,src,off)       P_PFACCM(dst,src,off)
+#define p_pfaddm(dst,src,off)       P_PFADDM(dst,src,off)
+#define p_pfcmpeqm(dst,src,off)     P_PFCMPEQM(dst,src,off)
+#define p_pfcmpgem(dst,src,off)     P_PFCMPGEM(dst,src,off)
+#define p_pfcmpgtm(dst,src,off)     P_PFCMPGTM(dst,src,off)
+#define p_pfmaxm(dst,src,off)       P_PFMAXM(dst,src,off)
+#define p_pfminm(dst,src,off)       P_PFMINM(dst,src,off)
+#define p_pfmulm(dst,src,off)       P_PFMULM(dst,src,off)
+#define p_pfrcpm(dst,src,off)       P_PFRCPM(dst,src,off)
+#define p_pfrcpit1m(dst,src,off)    P_PFRCPIT1M(dst,src,off)
+#define p_pfrcpit2m(dst,src,off)    P_PFRCPIT2M(dst,src,off)
+#define p_pfrsqrtm(dst,src,off)     P_PFRSQRTM(dst,src,off)
+#define p_pfrsqit1m(dst,src,off)    P_PFRSQIT1M(dst,src,off)
+#define p_pfsubm(dst,src,off)       P_PFSUBM(dst,src,off)
+#define p_pfsubrm(dst,src,off)      P_PFSUBRM(dst,src,off)
+#define p_pi2fdm(dst,src,off)       P_PI2FDM(dst,src,off)
+#define p_pavgusbm(dst,src,off)     P_PAVGUSBM(dst,src,off)
+#define p_pmulhrwm(dst,src,off)     P_PMULHRWM(dst,src,off)
+/* Upper-case aliases for the K7 register forms, whose bytes live in the lower-case p_ macros above */
+#define P_PFNACC(dst,src)           p_pfnacc(dst,src)
+#define P_FPPNACC(dst,src)          p_fppnacc(dst,src)
+#define P_PSWAPD(dst,src)           p_pswapd(dst,src)
+#define P_PMINUB(dst,src)           p_pminub(dst,src)
+#define P_PMAXUB(dst,src)           p_pmaxub(dst,src)
+#define P_PMINSW(dst,src)           p_pminsw(dst,src)
+#define P_PMAXSW(dst,src)           p_pmaxsw(dst,src)
+#define P_PMULHUW(dst,src)          p_pmulhuw(dst,src)
+#define P_PAVGB(dst,src)            p_pavgb(dst,src)
+#define P_PAVGW(dst,src)            p_pavgw(dst,src)
+#define P_PSADBW(dst,src)           p_psadbw(dst,src)
+#define P_PMOVMSKB(dst,src)         p_pmovmskb(dst,src)
+#define P_PMASKMOVQ(dst,src)        p_pmaskmovq(dst,src)
+#define P_PINSRW(dst,src,msk)       p_pinsrw(dst,src,msk)
+#define P_PEXTRW(dst,src,msk)       p_pextrw(dst,src,msk)
+#define P_PSHUFW(dst,src,msk)       p_pshufw(dst,src,msk)
+#define P_MOVNTQ(dst,src)           p_movntq(dst,src)
+/* Lower-case aliases for the K7 memory/offset forms, whose bytes live in the upper-case P_...M macros above */
+#define p_pfnaccm(dst,src,off)          P_PFNACCM(dst,src,off)
+#define p_fppnaccm(dst,src,off)         P_FPPNACCM(dst,src,off)
+#define p_pswapdm(dst,src,off)          P_PSWAPDM(dst,src,off)
+#define p_pminubm(dst,src,off)          P_PMINUBM(dst,src,off)
+#define p_pmaxubm(dst,src,off)          P_PMAXUBM(dst,src,off)
+#define p_pminswm(dst,src,off)          P_PMINSWM(dst,src,off)
+#define p_pmaxswm(dst,src,off)          P_PMAXSWM(dst,src,off)
+#define p_pmulhuwm(dst,src,off)         P_PMULHUWM(dst,src,off)
+#define p_pavgbm(dst,src,off)           P_PAVGBM(dst,src,off)
+#define p_pavgwm(dst,src,off)           P_PAVGWM(dst,src,off)
+#define p_psadbwm(dst,src,off)          P_PSADBWM(dst,src,off)
+#define p_pmovmskbm(dst,src,off)        P_PMOVMSKBM(dst,src,off)
+#define p_pmaskmovqm(dst,src,off)       P_PMASKMOVQM(dst,src,off)
+#define p_pinsrwm(dst,src,off,msk)      P_PINSRWM(dst,src,off,msk)
+#define p_pshufwm(dst,src,off,msk)      P_PSHUFWM(dst,src,off,msk)
+#define p_movntqm(dst,src,off)          P_MOVNTQM(dst,src,off)
+
+#elif defined (_MSC_VER) && !defined (__MWERKS__)
+// The Microsoft Visual C++ version of the 3DNow! macros.
+
+// Stop the "no EMMS" warning, since it doesn't detect FEMMS properly
+#pragma warning(disable:4799)
+
+// Defines for operands.
+#define _K3D_MM0 0xc0
+#define _K3D_MM1 0xc1
+#define _K3D_MM2 0xc2
+#define _K3D_MM3 0xc3
+#define _K3D_MM4 0xc4
+#define _K3D_MM5 0xc5
+#define _K3D_MM6 0xc6
+#define _K3D_MM7 0xc7
+#define _K3D_mm0 0xc0
+#define _K3D_mm1 0xc1
+#define _K3D_mm2 0xc2
+#define _K3D_mm3 0xc3
+#define _K3D_mm4 0xc4
+#define _K3D_mm5 0xc5
+#define _K3D_mm6 0xc6
+#define _K3D_mm7 0xc7
+#define _K3D_EAX 0x00
+#define _K3D_ECX 0x01
+#define _K3D_EDX 0x02
+#define _K3D_EBX 0x03
+#define _K3D_ESI 0x06
+#define _K3D_EDI 0x07
+#define _K3D_eax 0x00
+#define _K3D_ecx 0x01
+#define _K3D_edx 0x02
+#define _K3D_ebx 0x03
+#define _K3D_esi 0x06
+#define _K3D_edi 0x07
+
+// These defines are for compatibility with the previous version of the header file.
+#define _K3D_M0   0xc0
+#define _K3D_M1   0xc1
+#define _K3D_M2   0xc2
+#define _K3D_M3   0xc3
+#define _K3D_M4   0xc4
+#define _K3D_M5   0xc5
+#define _K3D_M6   0xc6
+#define _K3D_M7   0xc7
+#define _K3D_m0   0xc0
+#define _K3D_m1   0xc1
+#define _K3D_m2   0xc2
+#define _K3D_m3   0xc3
+#define _K3D_m4   0xc4
+#define _K3D_m5   0xc5
+#define _K3D_m6   0xc6
+#define _K3D_m7   0xc7
+#define _K3D__EAX 0x00
+#define _K3D__ECX 0x01
+#define _K3D__EDX 0x02
+#define _K3D__EBX 0x03
+#define _K3D__ESI 0x06
+#define _K3D__EDI 0x07
+#define _K3D__eax 0x00
+#define _K3D__ecx 0x01
+#define _K3D__edx 0x02
+#define _K3D__ebx 0x03
+#define _K3D__esi 0x06
+#define _K3D__edi 0x07
+
+// General 3DNow! instruction format that is supported by these macros:
+// 3DNow! ops emit 0F 0F modrm [offset] suffix; MMX-style ops emit 0F opcode modrm [offset].
+// Note that only the most basic form of memory operands (base register plus one offset byte) is supported.
+
+#define InjK3DOps(dst,src,inst)                         \
+{                                                       \
+   _asm _emit 0x0f                                      \
+   _asm _emit 0x0f                                      \
+   _asm _emit ((_K3D_##dst & 0x3f) << 3) | _K3D_##src   \
+   _asm _emit _3DNowOpcode##inst                        \
+}
+
+#define InjK3DMOps(dst,src,off,inst)                    \
+{                                                       \
+   _asm _emit 0x0f                                      \
+   _asm _emit 0x0f                                      \
+   _asm _emit (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40) \
+   _asm _emit off                                       \
+   _asm _emit _3DNowOpcode##inst                        \
+}
+
+#define InjMMXOps(dst,src,inst)                         \
+{                                                       \
+   _asm _emit 0x0f                                      \
+   _asm _emit _3DNowOpcode##inst                        \
+   _asm _emit ((_K3D_##dst & 0x3f) << 3) | _K3D_##src   \
+}
+
+#define InjMMXMOps(dst,src,off,inst)                    \
+{                                                       \
+   _asm _emit 0x0f                                      \
+   _asm _emit _3DNowOpcode##inst                        \
+   _asm _emit (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40) \
+   _asm _emit off                                       \
+}
+
+#define _3DNowOpcodePF2ID    0x1d
+#define _3DNowOpcodePFACC    0xae
+#define _3DNowOpcodePFADD    0x9e
+#define _3DNowOpcodePFCMPEQ  0xb0
+#define _3DNowOpcodePFCMPGE  0x90
+#define _3DNowOpcodePFCMPGT  0xa0
+#define _3DNowOpcodePFMAX    0xa4
+#define _3DNowOpcodePFMIN    0x94
+#define _3DNowOpcodePFMUL    0xb4
+#define _3DNowOpcodePFRCP    0x96
+#define _3DNowOpcodePFRCPIT1 0xa6
+#define _3DNowOpcodePFRCPIT2 0xb6
+#define _3DNowOpcodePFRSQRT  0x97
+#define _3DNowOpcodePFRSQIT1 0xa7
+#define _3DNowOpcodePFSUB    0x9a
+#define _3DNowOpcodePFSUBR   0xaa
+#define _3DNowOpcodePI2FD    0x0d
+#define _3DNowOpcodePAVGUSB  0xbf
+#define _3DNowOpcodePMULHRW  0xb7
+#define _3DNowOpcodePFNACC   0x8a
+#define _3DNowOpcodeFPPNACC  0x8e
+#define _3DNowOpcodePSWAPD   0xbb
+#define _3DNowOpcodePMINUB   0xda
+#define _3DNowOpcodePMAXUB   0xde
+#define _3DNowOpcodePMINSW   0xea
+#define _3DNowOpcodePMAXSW   0xee
+#define _3DNowOpcodePMULHUW  0xe4
+#define _3DNowOpcodePAVGB    0xe0
+#define _3DNowOpcodePAVGW    0xe3
+#define _3DNowOpcodePSADBW   0xf6
+#define _3DNowOpcodePMOVMSKB 0xd7
+#define _3DNowOpcodePMASKMOVQ   0xf7
+#define _3DNowOpcodePINSRW   0xc4
+#define _3DNowOpcodePEXTRW   0xc5
+#define _3DNowOpcodePSHUFW   0x70
+#define _3DNowOpcodeMOVNTQ   0xe7
+#define _3DNowOpcodePREFETCHT 0x18
+
+
+#define PF2ID(dst,src)      InjK3DOps(dst, src, PF2ID)	// register forms: each wrapper expands to the InjK3DOps byte sequence
+#define PFACC(dst,src)      InjK3DOps(dst, src, PFACC)
+#define PFADD(dst,src)      InjK3DOps(dst, src, PFADD)
+#define PFCMPEQ(dst,src)    InjK3DOps(dst, src, PFCMPEQ)
+#define PFCMPGE(dst,src)    InjK3DOps(dst, src, PFCMPGE)
+#define PFCMPGT(dst,src)    InjK3DOps(dst, src, PFCMPGT)
+#define PFMAX(dst,src)      InjK3DOps(dst, src, PFMAX)
+#define PFMIN(dst,src)      InjK3DOps(dst, src, PFMIN)
+#define PFMUL(dst,src)      InjK3DOps(dst, src, PFMUL)
+#define PFRCP(dst,src)      InjK3DOps(dst, src, PFRCP)
+#define PFRCPIT1(dst,src)   InjK3DOps(dst, src, PFRCPIT1)
+#define PFRCPIT2(dst,src)   InjK3DOps(dst, src, PFRCPIT2)
+#define PFRSQRT(dst,src)    InjK3DOps(dst, src, PFRSQRT)
+#define PFRSQIT1(dst,src)   InjK3DOps(dst, src, PFRSQIT1)
+#define PFSUB(dst,src)      InjK3DOps(dst, src, PFSUB)
+#define PFSUBR(dst,src)     InjK3DOps(dst, src, PFSUBR)
+#define PI2FD(dst,src)      InjK3DOps(dst, src, PI2FD)
+#define PAVGUSB(dst,src)    InjK3DOps(dst, src, PAVGUSB)
+#define PMULHRW(dst,src)    InjK3DOps(dst, src, PMULHRW)
+
+#define FEMMS                                   \
+{                                               \
+   _asm _emit 0x0f                              \
+   _asm _emit 0x0e                              \
+}
+
+#define PREFETCH(src)                           \
+{                                               \
+   _asm _emit 0x0f                              \
+   _asm _emit 0x0d                              \
+   _asm _emit (_K3D_##src & 0x07)               \
+}
+
+/* Prefetch with a short offset, emitted as a single byte
+   (so it must fit in one signed byte). Careful! Doesn't
+   check for your offset being in range. */
+
+#define PREFETCHM(src,off)					    \
+{                                               \
+   _asm _emit 0x0f                              \
+   _asm _emit 0x0d								\
+   _asm _emit (0x40 | (_K3D_##src & 0x07))		\
+   _asm _emit off								\
+}
+
+/* Prefetch with a long offset, emitted as four bytes, lowest byte first */
+
+#define PREFETCHMLONG(src,off)					\
+{                                               \
+   _asm _emit 0x0f                              \
+   _asm _emit 0x0d								\
+   _asm _emit (0x80 | (_K3D_##src & 0x07))		\
+   _asm _emit (off & 0x000000ff)				\
+   _asm _emit (off & 0x0000ff00) >>	8			\
+   _asm _emit (off & 0x00ff0000) >>	16			\
+   _asm _emit (off & 0xff000000) >>	24			\
+}
+
+#define PREFETCHW(src)                          \
+{                                               \
+   _asm _emit 0x0f                              \
+   _asm _emit 0x0d                              \
+   _asm _emit (0x08 | (_K3D_##src & 0x07))      \
+}
+
+#define PREFETCHWM(src,off)                     \
+{                                               \
+   _asm _emit 0x0f                              \
+   _asm _emit 0x0d                              \
+   _asm _emit 0x48 | (_K3D_##src & 0x07)        \
+   _asm	_emit off								\
+}
+
+#define PREFETCHWMLONG(src,off)                 \
+{                                               \
+   _asm _emit 0x0f                              \
+   _asm _emit 0x0d                              \
+   _asm _emit 0x88 | (_K3D_##src & 0x07)        \
+   _asm _emit (off & 0x000000ff)				\
+   _asm _emit (off & 0x0000ff00) >>	8			\
+   _asm _emit (off & 0x00ff0000) >>	16			\
+   _asm _emit (off & 0xff000000) >>	24			\
+}
+
+#define CPUID                                   \
+{                                               \
+    _asm _emit 0x0f                             \
+    _asm _emit 0xa2                             \
+}
+
+
+/* Defines for new, K7 opcodes (hand-assembled through the Inj* macros) */
+#define SFENCE                                  \
+{                                               \
+    _asm _emit 0x0f                             \
+    _asm _emit 0xae                             \
+    _asm _emit 0xf8                             \
+}
+
+#define PFNACC(dst,src)         InjK3DOps(dst,src,PFNACC)
+#define PFPNACC(dst,src)        InjK3DOps(dst,src,PFPNACC)
+#define PSWAPD(dst,src)         InjK3DOps(dst,src,PSWAPD)
+#define PMINUB(dst,src)         InjMMXOps(dst,src,PMINUB)
+#define PMAXUB(dst,src)         InjMMXOps(dst,src,PMAXUB)
+#define PMINSW(dst,src)         InjMMXOps(dst,src,PMINSW)
+#define PMAXSW(dst,src)         InjMMXOps(dst,src,PMAXSW)
+#define PMULHUW(dst,src)        InjMMXOps(dst,src,PMULHUW)
+#define PAVGB(dst,src)          InjMMXOps(dst,src,PAVGB)
+#define PAVGW(dst,src)          InjMMXOps(dst,src,PAVGW)
+#define PSADBW(dst,src)         InjMMXOps(dst,src,PSADBW)
+#define PMOVMSKB(dst,src)       InjMMXOps(dst,src,PMOVMSKB)
+#define PMASKMOVQ(dst,src)      InjMMXOps(dst,src,PMASKMOVQ)
+#define PINSRW(dst,src,msk)     InjMMXOps(dst,src,PINSRW) _asm _emit msk	// msk is emitted as one extra byte after the instruction
+#define PEXTRW(dst,src,msk)     InjMMXOps(dst,src,PEXTRW) _asm _emit msk
+#define PSHUFW(dst,src,msk)     InjMMXOps(dst,src,PSHUFW) _asm _emit msk
+#define MOVNTQ(dst,src)         InjMMXOps(src,dst,MOVNTQ)	// arguments swapped: src lands in ModRM bits 3-5, dst in the low bits
+#define PREFETCHNTA(mem)        InjMMXOps(mm0,mem,PREFETCHT)	// mm0..mm3 only supply the value 0..3 for ModRM bits 3-5
+#define PREFETCHT0(mem)         InjMMXOps(mm1,mem,PREFETCHT)
+#define PREFETCHT1(mem)         InjMMXOps(mm2,mem,PREFETCHT)
+#define PREFETCHT2(mem)         InjMMXOps(mm3,mem,PREFETCHT)
+
+
+/* Memory/offset versions of the opcodes: src names the base register, off is emitted as one offset byte */
+#define PAVGUSBM(dst,src,off)   InjK3DMOps(dst,src,off,PAVGUSB)
+#define PF2IDM(dst,src,off)     InjK3DMOps(dst,src,off,PF2ID)
+#define PFACCM(dst,src,off)     InjK3DMOps(dst,src,off,PFACC)
+#define PFADDM(dst,src,off)     InjK3DMOps(dst,src,off,PFADD)
+#define PFCMPEQM(dst,src,off)   InjK3DMOps(dst,src,off,PFCMPEQ)
+#define PFCMPGEM(dst,src,off)   InjK3DMOps(dst,src,off,PFCMPGE)
+#define PFCMPGTM(dst,src,off)   InjK3DMOps(dst,src,off,PFCMPGT)
+#define PFMAXM(dst,src,off)     InjK3DMOps(dst,src,off,PFMAX)
+#define PFMINM(dst,src,off)     InjK3DMOps(dst,src,off,PFMIN)
+#define PFMULM(dst,src,off)     InjK3DMOps(dst,src,off,PFMUL)
+#define PFRCPM(dst,src,off)     InjK3DMOps(dst,src,off,PFRCP)
+#define PFRCPIT1M(dst,src,off)  InjK3DMOps(dst,src,off,PFRCPIT1)
+#define PFRCPIT2M(dst,src,off)  InjK3DMOps(dst,src,off,PFRCPIT2)
+#define PFRSQRTM(dst,src,off)   InjK3DMOps(dst,src,off,PFRSQRT)
+#define PFRSQIT1M(dst,src,off)  InjK3DMOps(dst,src,off,PFRSQIT1)
+#define PFSUBM(dst,src,off)     InjK3DMOps(dst,src,off,PFSUB)
+#define PFSUBRM(dst,src,off)    InjK3DMOps(dst,src,off,PFSUBR)
+#define PI2FDM(dst,src,off)     InjK3DMOps(dst,src,off,PI2FD)
+#define PMULHRWM(dst,src,off)   InjK3DMOps(dst,src,off,PMULHRW)
+
+
+/* Memory/offset versions of the K7 opcodes. NOTE(review): no PEXTRWM is defined in this branch - confirm that is intended */
+#define PFNACCM(dst,src,off)     InjK3DMOps(dst,src,off,PFNACC)
+#define PFPNACCM(dst,src,off)    InjK3DMOps(dst,src,off,PFPNACC)
+#define PSWAPDM(dst,src,off)     InjK3DMOps(dst,src,off,PSWAPD)
+#define PMINUBM(dst,src,off)     InjMMXMOps(dst,src,off,PMINUB)
+#define PMAXUBM(dst,src,off)     InjMMXMOps(dst,src,off,PMAXUB)
+#define PMINSWM(dst,src,off)     InjMMXMOps(dst,src,off,PMINSW)
+#define PMAXSWM(dst,src,off)     InjMMXMOps(dst,src,off,PMAXSW)
+#define PMULHUWM(dst,src,off)    InjMMXMOps(dst,src,off,PMULHUW)
+#define PAVGBM(dst,src,off)      InjMMXMOps(dst,src,off,PAVGB)
+#define PAVGWM(dst,src,off)      InjMMXMOps(dst,src,off,PAVGW)
+#define PSADBWM(dst,src,off)     InjMMXMOps(dst,src,off,PSADBW)
+#define PMOVMSKBM(dst,src,off)   InjMMXMOps(dst,src,off,PMOVMSKB)
+#define PMASKMOVQM(dst,src,off)  InjMMXMOps(dst,src,off,PMASKMOVQ)
+#define PINSRWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PINSRW) _asm _emit msk
+#define PSHUFWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PSHUFW) _asm _emit msk
+#define MOVNTQM(dst,src,off)     InjMMXMOps(src,dst,off,MOVNTQ)
+#define PREFETCHNTAM(mem,off)    InjMMXMOps(mm0,mem,off,PREFETCHT)
+#define PREFETCHT0M(mem,off)     InjMMXMOps(mm1,mem,off,PREFETCHT)
+#define PREFETCHT1M(mem,off)     InjMMXMOps(mm2,mem,off,PREFETCHT)
+#define PREFETCHT2M(mem,off)     InjMMXMOps(mm3,mem,off,PREFETCHT)
+
+
+#else
+
+/* Assume built-in support for 3DNow! opcodes, replace macros with opcodes (plain mnemonic text for the assembler) */
+#define PAVGUSB(dst,src)    pavgusb     dst,src
+#define PF2ID(dst,src)      pf2id       dst,src
+#define PFACC(dst,src)      pfacc       dst,src
+#define PFADD(dst,src)      pfadd       dst,src
+#define PFCMPEQ(dst,src)    pfcmpeq     dst,src
+#define PFCMPGE(dst,src)    pfcmpge     dst,src
+#define PFCMPGT(dst,src)    pfcmpgt     dst,src
+#define PFMAX(dst,src)      pfmax       dst,src
+#define PFMIN(dst,src)      pfmin       dst,src
+#define PFMUL(dst,src)      pfmul       dst,src
+#define PFRCP(dst,src)      pfrcp       dst,src
+#define PFRCPIT1(dst,src)   pfrcpit1    dst,src
+#define PFRCPIT2(dst,src)   pfrcpit2    dst,src
+#define PFRSQRT(dst,src)    pfrsqrt     dst,src
+#define PFRSQIT1(dst,src)   pfrsqit1    dst,src
+#define PFSUB(dst,src)      pfsub       dst,src
+#define PFSUBR(dst,src)     pfsubr      dst,src
+#define PI2FD(dst,src)      pi2fd       dst,src
+#define PMULHRW(dst,src)    pmulhrw     dst,src
+#define PREFETCH(src)       prefetch    src
+#define PREFETCHW(src)      prefetchw   src
+
+#define PAVGUSBM(dst,src,off)   pavgusb     dst,[src+off]
+#define PF2IDM(dst,src,off)     PF2ID       dst,[src+off]	// PF2ID is not followed by '(', so it is left as a bare mnemonic, not expanded
+#define PFACCM(dst,src,off)     PFACC       dst,[src+off]
+#define PFADDM(dst,src,off)     PFADD       dst,[src+off]
+#define PFCMPEQM(dst,src,off)   PFCMPEQ     dst,[src+off]
+#define PFCMPGEM(dst,src,off)   PFCMPGE     dst,[src+off]
+#define PFCMPGTM(dst,src,off)   PFCMPGT     dst,[src+off]
+#define PFMAXM(dst,src,off)     PFMAX       dst,[src+off]
+#define PFMINM(dst,src,off)     PFMIN       dst,[src+off]
+#define PFMULM(dst,src,off)     PFMUL       dst,[src+off]
+#define PFRCPM(dst,src,off)     PFRCP       dst,[src+off]
+#define PFRCPIT1M(dst,src,off)  PFRCPIT1    dst,[src+off]
+#define PFRCPIT2M(dst,src,off)  PFRCPIT2    dst,[src+off]
+#define PFRSQRTM(dst,src,off)   PFRSQRT     dst,[src+off]
+#define PFRSQIT1M(dst,src,off)  PFRSQIT1    dst,[src+off]
+#define PFSUBM(dst,src,off)     PFSUB       dst,[src+off]
+#define PFSUBRM(dst,src,off)    PFSUBR      dst,[src+off]
+#define PI2FDM(dst,src,off)     PI2FD       dst,[src+off]
+#define PMULHRWM(dst,src,off)   PMULHRW     dst,[src+off]
+
+
+#if defined (__MWERKS__)
+// At the moment, CodeWarrior does not support these opcodes, so hand-assemble them as "db" byte lists
+
+// Defines for operands (MMn = 0xc0+n, general registers = their register number).
+#define _K3D_MM0 0xc0
+#define _K3D_MM1 0xc1
+#define _K3D_MM2 0xc2
+#define _K3D_MM3 0xc3
+#define _K3D_MM4 0xc4
+#define _K3D_MM5 0xc5
+#define _K3D_MM6 0xc6
+#define _K3D_MM7 0xc7
+#define _K3D_mm0 0xc0
+#define _K3D_mm1 0xc1
+#define _K3D_mm2 0xc2
+#define _K3D_mm3 0xc3
+#define _K3D_mm4 0xc4
+#define _K3D_mm5 0xc5
+#define _K3D_mm6 0xc6
+#define _K3D_mm7 0xc7
+#define _K3D_EAX 0x00
+#define _K3D_ECX 0x01
+#define _K3D_EDX 0x02
+#define _K3D_EBX 0x03
+#define _K3D_ESI 0x06
+#define _K3D_EDI 0x07
+#define _K3D_eax 0x00
+#define _K3D_ecx 0x01
+#define _K3D_edx 0x02
+#define _K3D_ebx 0x03
+#define _K3D_esi 0x06
+#define _K3D_edi 0x07
+#define _K3D_EAX 0x00	// from here on, the twelve general-register defines above are repeated with identical values
+#define _K3D_ECX 0x01
+#define _K3D_EDX 0x02
+#define _K3D_EBX 0x03
+#define _K3D_ESI 0x06
+#define _K3D_EDI 0x07
+#define _K3D_eax 0x00
+#define _K3D_ecx 0x01
+#define _K3D_edx 0x02
+#define _K3D_ebx 0x03
+#define _K3D_esi 0x06
+#define _K3D_edi 0x07
+
+#define InjK3DOps(dst,src,inst) \
+    db 0x0f, 0x0f, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src), _3DNowOpcode##inst
+
+#define InjK3DMOps(dst,src,off,inst) \
+    db 0x0f, 0x0f, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40), off, _3DNowOpcode##inst
+
+#define InjMMXOps(dst,src,inst)                     \
+    db 0x0f, _3DNowOpcode##inst, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src)
+
+#define InjMMXMOps(dst,src,off,inst)                \
+    db 0x0f, _3DNowOpcode##inst, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40), off
+
+#define PFNACC(dst,src)         InjK3DOps(dst,src,PFNACC)	// NOTE(review): no _3DNowOpcode* define is visible inside this #if branch - confirm they are in scope
+#define PFPNACC(dst,src)        InjK3DOps(dst,src,PFPNACC)
+#define PSWAPD(dst,src)         InjK3DOps(dst,src,PSWAPD)
+#define PMINUB(dst,src)         InjMMXOps(dst,src,PMINUB)
+#define PMAXUB(dst,src)         InjMMXOps(dst,src,PMAXUB)
+#define PMINSW(dst,src)         InjMMXOps(dst,src,PMINSW)
+#define PMAXSW(dst,src)         InjMMXOps(dst,src,PMAXSW)
+#define PMULHUW(dst,src)        InjMMXOps(dst,src,PMULHUW)
+#define PAVGB(dst,src)          InjMMXOps(dst,src,PAVGB)
+#define PAVGW(dst,src)          InjMMXOps(dst,src,PAVGW)
+#define PSADBW(dst,src)         InjMMXOps(dst,src,PSADBW)
+#define PMOVMSKB(dst,src)       InjMMXOps(dst,src,PMOVMSKB)
+#define PMASKMOVQ(dst,src)      InjMMXOps(dst,src,PMASKMOVQ)
+#define PINSRW(dst,src,msk)     InjMMXOps(dst,src,PINSRW) db msk	// NOTE(review): appends a second "db" on the same line, while the *M forms below append ", msk" - confirm
+#define PEXTRW(dst,src,msk)     InjMMXOps(dst,src,PEXTRW) db msk
+#define PSHUFW(dst,src,msk)     InjMMXOps(dst,src,PSHUFW) db msk
+#define MOVNTQ(dst,src)         InjMMXOps(src,dst,MOVNTQ)
+#define PREFETCHNTA(mem)        InjMMXOps(mm0,mem,PREFETCHT)
+#define PREFETCHT0(mem)         InjMMXOps(mm1,mem,PREFETCHT)
+#define PREFETCHT1(mem)         InjMMXOps(mm2,mem,PREFETCHT)
+#define PREFETCHT2(mem)         InjMMXOps(mm3,mem,PREFETCHT)
+
+
+/* Memory/offset versions of the K7 opcodes (base register plus one offset byte) */
+#define PFNACCM(dst,src,off)     InjK3DMOps(dst,src,off,PFNACC)
+#define PFPNACCM(dst,src,off)    InjK3DMOps(dst,src,off,PFPNACC)
+#define PSWAPDM(dst,src,off)     InjK3DMOps(dst,src,off,PSWAPD)
+#define PMINUBM(dst,src,off)     InjMMXMOps(dst,src,off,PMINUB)
+#define PMAXUBM(dst,src,off)     InjMMXMOps(dst,src,off,PMAXUB)
+#define PMINSWM(dst,src,off)     InjMMXMOps(dst,src,off,PMINSW)
+#define PMAXSWM(dst,src,off)     InjMMXMOps(dst,src,off,PMAXSW)
+#define PMULHUWM(dst,src,off)    InjMMXMOps(dst,src,off,PMULHUW)
+#define PAVGBM(dst,src,off)      InjMMXMOps(dst,src,off,PAVGB)
+#define PAVGWM(dst,src,off)      InjMMXMOps(dst,src,off,PAVGW)
+#define PSADBWM(dst,src,off)     InjMMXMOps(dst,src,off,PSADBW)
+#define PMOVMSKBM(dst,src,off)   InjMMXMOps(dst,src,off,PMOVMSKB)
+#define PMASKMOVQM(dst,src,off)  InjMMXMOps(dst,src,off,PMASKMOVQ)
+#define PINSRWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PINSRW), msk
+#define PEXTRWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PEXTRW), msk
+#define PSHUFWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PSHUFW), msk
+#define MOVNTQM(dst,src,off)     InjMMXMOps(src,dst,off,MOVNTQ)
+#define PREFETCHNTAM(mem,off)    InjMMXMOps(mm0,mem,off,PREFETCHT)
+#define PREFETCHT0M(mem,off)     InjMMXMOps(mm1,mem,off,PREFETCHT)
+#define PREFETCHT1M(mem,off)     InjMMXMOps(mm2,mem,off,PREFETCHT)
+#define PREFETCHT2M(mem,off)     InjMMXMOps(mm3,mem,off,PREFETCHT)
+
+
+#else
+
+#define PFNACC(dst,src)         PFNACC      dst,src	// a macro's own name is not re-expanded, so the mnemonic is emitted as plain text
+#define PFPNACC(dst,src)        PFPNACC     dst,src
+#define PSWAPD(dst,src)         PSWAPD      dst,src
+#define PMINUB(dst,src)         PMINUB      dst,src
+#define PMAXUB(dst,src)         PMAXUB      dst,src
+#define PMINSW(dst,src)         PMINSW      dst,src
+#define PMAXSW(dst,src)         PMAXSW      dst,src
+#define PMULHUW(dst,src)        PMULHUW     dst,src
+#define PAVGB(dst,src)          PAVGB       dst,src
+#define PAVGW(dst,src)          PAVGW       dst,src
+#define PSADBW(dst,src)         PSADBW      dst,src
+#define PMOVMSKB(dst,src)       PMOVMSKB    dst,src
+#define PMASKMOVQ(dst,src)      PMASKMOVQ   dst,src
+#define PINSRW(dst,src,msk)     PINSRW      dst,src,msk
+#define PEXTRW(dst,src,msk)     PEXTRW      dst,src,msk
+#define PSHUFW(dst,src,msk)     PSHUFW      dst,src,msk
+#define MOVNTQ(dst,src)         MOVNTQ      dst,src
+
+#define PFNACCM(dst,src,off)    PFNACC      dst,[src+off]	// memory forms: operand is written as [src+off]
+#define PFPNACCM(dst,src,off)   PFPNACC     dst,[src+off]
+#define PSWAPDM(dst,src,off)    PSWAPD      dst,[src+off]
+#define PMINUBM(dst,src,off)    PMINUB      dst,[src+off]
+#define PMAXUBM(dst,src,off)    PMAXUB      dst,[src+off]
+#define PMINSWM(dst,src,off)    PMINSW      dst,[src+off]
+#define PMAXSWM(dst,src,off)    PMAXSW      dst,[src+off]
+#define PMULHUWM(dst,src,off)   PMULHUW     dst,[src+off]
+#define PAVGBM(dst,src,off)     PAVGB       dst,[src+off]
+#define PAVGWM(dst,src,off)     PAVGW       dst,[src+off]
+#define PSADBWM(dst,src,off)    PSADBW      dst,[src+off]
+#define PMOVMSKBM(dst,src,off)  PMOVMSKB    dst,[src+off]
+#define PMASKMOVQM(dst,src,off) PMASKMOVQ   dst,[src+off]
+#define PINSRWM(dst,src,off,msk) PINSRW     dst,[src+off],msk
+#define PEXTRWM(dst,src,off,msk) PEXTRW     dst,[src+off],msk
+#define PSHUFWM(dst,src,off,msk) PSHUFW     dst,[src+off],msk
+#define MOVNTQM(dst,src,off)    MOVNTQ      dst,[src+off]
+
+#endif
+
+#endif
+
+/* Just to deal with lower case: each lowercase name forwards to the uppercase macro of the same name. */
+#define pf2id(dst,src)          PF2ID(dst,src)
+#define pfacc(dst,src)          PFACC(dst,src)
+#define pfadd(dst,src)          PFADD(dst,src)
+#define pfcmpeq(dst,src)        PFCMPEQ(dst,src)
+#define pfcmpge(dst,src)        PFCMPGE(dst,src)
+#define pfcmpgt(dst,src)        PFCMPGT(dst,src)
+#define pfmax(dst,src)          PFMAX(dst,src)
+#define pfmin(dst,src)          PFMIN(dst,src)
+#define pfmul(dst,src)          PFMUL(dst,src)
+#define pfrcp(dst,src)          PFRCP(dst,src)
+#define pfrcpit1(dst,src)       PFRCPIT1(dst,src)
+#define pfrcpit2(dst,src)       PFRCPIT2(dst,src)
+#define pfrsqrt(dst,src)        PFRSQRT(dst,src)
+#define pfrsqit1(dst,src)       PFRSQIT1(dst,src)
+#define pfsub(dst,src)          PFSUB(dst,src)
+#define pfsubr(dst,src)         PFSUBR(dst,src)
+#define pi2fd(dst,src)          PI2FD(dst,src)
+#define femms                   FEMMS
+#define pavgusb(dst,src)        PAVGUSB(dst,src)
+#define pmulhrw(dst,src)        PMULHRW(dst,src)
+#define prefetch(src)           PREFETCH(src)
+#define prefetchw(src)          PREFETCHW(src)
+
+#define prefetchm(src,off)      PREFETCHM(src,off)
+#define prefetchmlong(src,off)	PREFETCHMLONG(src,off)
+#define prefetchwm(src,off)     PREFETCHWM(src,off)
+#define prefetchwmlong(src,off)	 PREFETCHWMLONG(src,off)
+
+#define pfnacc(dst,src)         PFNACC(dst,src)
+#define pfpnacc(dst,src)        PFPNACC(dst,src)
+#define pswapd(dst,src)         PSWAPD(dst,src)
+#define pminub(dst,src)         PMINUB(dst,src)
+#define pmaxub(dst,src)         PMAXUB(dst,src)
+#define pminsw(dst,src)         PMINSW(dst,src)
+#define pmaxsw(dst,src)         PMAXSW(dst,src)
+#define pmulhuw(dst,src)        PMULHUW(dst,src)
+#define pavgb(dst,src)          PAVGB(dst,src)
+#define pavgw(dst,src)          PAVGW(dst,src)
+#define psadbw(dst,src)         PSADBW(dst,src)
+#define pmovmskb(dst,src)       PMOVMSKB(dst,src)
+#define pmaskmovq(dst,src)      PMASKMOVQ(dst,src)
+#define pinsrw(dst,src,msk)     PINSRW(dst,src,msk)
+#define pextrw(dst,src,msk)     PEXTRW(dst,src,msk)
+#define pshufw(dst,src,msk)     PSHUFW(dst,src,msk)
+#define movntq(dst,src)         MOVNTQ(dst,src)
+#define prefetchnta(mem)        PREFETCHNTA(mem)
+#define prefetcht0(mem)         PREFETCHT0(mem)  
+#define prefetcht1(mem)         PREFETCHT1(mem)  
+#define prefetcht2(mem)         PREFETCHT2(mem)  
+
+
+#define pavgusbm(dst,src,off)   PAVGUSBM(dst,src,off)
+#define pf2idm(dst,src,off)     PF2IDM(dst,src,off)
+#define pfaccm(dst,src,off)     PFACCM(dst,src,off)
+#define pfaddm(dst,src,off)     PFADDM(dst,src,off)
+#define pfcmpeqm(dst,src,off)   PFCMPEQM(dst,src,off)
+#define pfcmpgem(dst,src,off)   PFCMPGEM(dst,src,off)
+#define pfcmpgtm(dst,src,off)   PFCMPGTM(dst,src,off)
+#define pfmaxm(dst,src,off)     PFMAXM(dst,src,off)
+#define pfminm(dst,src,off)     PFMINM(dst,src,off)
+#define pfmulm(dst,src,off)     PFMULM(dst,src,off)
+#define pfrcpm(dst,src,off)     PFRCPM(dst,src,off)
+#define pfrcpit1m(dst,src,off)  PFRCPIT1M(dst,src,off)
+#define pfrcpit2m(dst,src,off)  PFRCPIT2M(dst,src,off)
+#define pfrsqrtm(dst,src,off)   PFRSQRTM(dst,src,off)
+#define pfrsqit1m(dst,src,off)  PFRSQIT1M(dst,src,off)
+#define pfsubm(dst,src,off)     PFSUBM(dst,src,off)
+#define pfsubrm(dst,src,off)    PFSUBRM(dst,src,off)
+#define pi2fdm(dst,src,off)     PI2FDM(dst,src,off)
+#define pmulhrwm(dst,src,off)   PMULHRWM(dst,src,off)
+#define cpuid                   CPUID
+#define sfence                  SFENCE
+
+#define pfnaccm(dst,src,off)    PFNACCM(dst,src,off)	// lowercase aliases for the K7 memory/offset forms
+#define pfpnaccm(dst,src,off)   PFPNACCM(dst,src,off)
+#define pswapdm(dst,src,off)    PSWAPDM(dst,src,off)
+#define pminubm(dst,src,off)    PMINUBM(dst,src,off)
+#define pmaxubm(dst,src,off)    PMAXUBM(dst,src,off)
+#define pminswm(dst,src,off)    PMINSWM(dst,src,off)
+#define pmaxswm(dst,src,off)    PMAXSWM(dst,src,off)
+#define pmulhuwm(dst,src,off)   PMULHUWM(dst,src,off)
+#define pavgbm(dst,src,off)     PAVGBM(dst,src,off)
+#define pavgwm(dst,src,off)     PAVGWM(dst,src,off)
+#define psadbwm(dst,src,off)    PSADBWM(dst,src,off)
+#define pmovmskbm(dst,src,off)  PMOVMSKBM(dst,src,off)
+#define pmaskmovqm(dst,src,off) PMASKMOVQM(dst,src,off)
+#define pinsrwm(dst,src,off,msk)    PINSRWM(dst,src,off,msk)
+#define pextrwm(dst,src,off,msk)    PEXTRWM(dst,src,off,msk)
+#define pshufwm(dst,src,off,msk)    PSHUFWM(dst,src,off,msk)
+#define movntqm(dst,src,off)    MOVNTQM(dst,src,off)
+#define prefetchntam(mem,off)   PREFETCHNTAM(mem,off)	// forward to the two-argument ...M macros, not the one-argument register forms
+#define prefetcht0m(mem,off)    PREFETCHT0M(mem,off)
+#define prefetcht1m(mem,off)    PREFETCHT1M(mem,off)
+#define prefetcht2m(mem,off)    PREFETCHT2M(mem,off)
+
+#endif
diff --git a/public/mathlib/anorms.h b/public/mathlib/anorms.h
new file mode 100644
index 0000000..4f65383
--- /dev/null
+++ b/public/mathlib/anorms.h
@@ -0,0 +1,25 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=============================================================================//
+
+#ifndef ANORMS_H
+#define ANORMS_H
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+#include "mathlib/vector.h"
+
+
+#define NUMVERTEXNORMALS	162	// number of entries in g_anorms[]
+
+// the angle between consecutive g_anorms[] vectors is ~14.55 degrees; 7.275 below is half of that spacing
+#define VERTEXNORMAL_CONE_INNER_ANGLE	DEG2RAD(7.275)
+
+extern Vector g_anorms[NUMVERTEXNORMALS];	// declaration only; the table itself is defined outside this header
+
+
+#endif // ANORMS_H
diff --git a/public/mathlib/bumpvects.h b/public/mathlib/bumpvects.h
new file mode 100644
index 0000000..6939ca0
--- /dev/null
+++ b/public/mathlib/bumpvects.h
@@ -0,0 +1,37 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $Workfile:     $
+// $Date:         $
+// $NoKeywords: $
+//=============================================================================//
+
+#ifndef BUMPVECTS_H
+#define BUMPVECTS_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "mathlib/mathlib.h"
+
+#define OO_SQRT_2 0.70710676908493042f	// ~ 1 / sqrt( 2 )
+#define OO_SQRT_3 0.57735025882720947f	// ~ 1 / sqrt( 3 )
+#define OO_SQRT_6 0.40824821591377258f	// ~ 1 / sqrt( 6 )
+// sqrt( 2 / 3 )
+#define OO_SQRT_2_OVER_3 0.81649661064147949f
+
+#define NUM_BUMP_VECTS 3
+
+const TableVector g_localBumpBasis[NUM_BUMP_VECTS] = 	// three unit-length vectors that share the z component OO_SQRT_3
+{
+	{	OO_SQRT_2_OVER_3, 0.0f, OO_SQRT_3 },
+	{  -OO_SQRT_6, OO_SQRT_2, OO_SQRT_3 },
+	{  -OO_SQRT_6, -OO_SQRT_2, OO_SQRT_3 }
+};
+
+void GetBumpNormals( const Vector& sVect, const Vector& tVect, const Vector& flatNormal, 	// presumably fills bumpNormals[]; implementation is not in this header
+					 const Vector& phongNormal, Vector bumpNormals[NUM_BUMP_VECTS] );
+
+#endif // BUMPVECTS_H
diff --git a/public/mathlib/compressed_3d_unitvec.h b/public/mathlib/compressed_3d_unitvec.h
new file mode 100644
index 0000000..a92dba2
--- /dev/null
+++ b/public/mathlib/compressed_3d_unitvec.h
@@ -0,0 +1,284 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef _3D_UNITVEC_H
+#define _3D_UNITVEC_H
+
+
+#define UNITVEC_DECLARE_STATICS /* out-of-class definitions of cUnitVector's static members */ \
+   float cUnitVector::mUVAdjustment[0x2000]; \
+   Vector cUnitVector::mTmpVec;
+
+// upper 3 bits - one sign bit each for x, y and z
+#define SIGN_MASK  0xe000
+#define XSIGN_MASK 0x8000
+#define YSIGN_MASK 0x4000
+#define ZSIGN_MASK 0x2000
+
+// middle 6 bits - xbits (bits 7..12)
+#define TOP_MASK  0x1f80
+
+// lower 7 bits - ybits (bits 0..6)
+#define BOTTOM_MASK  0x007f
+
+// unitcomp.cpp : A Unit Vector to 16-bit word conversion
+// algorithm based on work of Rafael Baptista ([email protected])
+// Accuracy improved by O.D. ([email protected])
+// Used with Permission.
+
+// a compressed unit vector. reasonable fidelity for unit
+// vectors in a 16 bit package. Good enough for surface normals
+// we hope.
+class cUnitVector // : public c3dMathObject
+{
+public:
+   cUnitVector() { mVec = 0; }
+   cUnitVector( const Vector& vec )
+   {
+      packVector( vec );
+   }
+   cUnitVector( unsigned short val ) { mVec = val; }
+
+   cUnitVector& operator=( const Vector& vec )
+   { packVector( vec ); return *this; }
+
+   // Unpacks into a local (not the shared static mTmpVec): reentrant and const-callable.
+   operator Vector() const
+   { Vector vec; unpackVector( vec ); return vec; }
+
+   void packVector( const Vector& vec )   // quantizes the direction of vec into mVec
+   {
+      // convert from Vector to cUnitVector
+
+      Assert( vec.IsValid());
+      Vector tmp = vec;
+
+      // input vector does not have to be unit length
+      // Assert( tmp.length() <= 1.001f );
+
+      mVec = 0;
+      if ( tmp.x < 0 ) { mVec |= XSIGN_MASK; tmp.x = -tmp.x; }
+      if ( tmp.y < 0 ) { mVec |= YSIGN_MASK; tmp.y = -tmp.y; }
+      if ( tmp.z < 0 ) { mVec |= ZSIGN_MASK; tmp.z = -tmp.z; }
+
+      // project the normal onto the plane that goes through
+      // X0=(1,0,0),Y0=(0,1,0),Z0=(0,0,1).
+      // on that plane we choose an (projective!) coordinate system
+      // such that X0->(0,0), Y0->(126,0), Z0->(0,126),(0,0,0)->Infinity
+
+      // a little slower... old pack was 4 multiplies and 2 adds.
+      // This is 2 multiplies, 2 adds, and a divide.... A zero-length input
+      // has no direction: use w = 0 (packs as +Z) instead of dividing by zero.
+      float sum = tmp.x + tmp.y + tmp.z;
+      float w = ( sum > 0.0f ) ? 126.0f / sum : 0.0f;
+      long xbits = (long)( tmp.x * w );
+      long ybits = (long)( tmp.y * w );
+
+      Assert( xbits <  127 );
+      Assert( xbits >= 0   );
+      Assert( ybits <  127 );
+      Assert( ybits >= 0   );
+
+      // Now we can be sure that 0<=xp<=126, 0<=yp<=126, 0<=xp+yp<=126
+      // however for the sampling we want to transform this triangle
+      // into a rectangle.
+      if ( xbits >= 64 )
+      {
+         xbits = 127 - xbits;
+         ybits = 127 - ybits;
+      }
+
+      // now that we have xbits in the range [0,63] (6 bits) and ybits
+      // in the range [0,127] (7 bits), we can pack all the bits together
+      mVec |= ( xbits << 7 );
+      mVec |= ybits;
+   }
+
+   void unpackVector( Vector& vec ) const   // writes the decoded unit vector into vec
+   {
+      // if we do a straightforward backward transform
+      // we will get points on the plane X0,Y0,Z0
+      // however we need points on a sphere that goes through
+      // these points. Therefore we need to adjust x,y,z so
+      // that x^2+y^2+z^2=1 by normalizing the vector. We have
+      // already precalculated the amount by which we need to
+      // scale, so all we do is a table lookup and a
+      // multiplication
+
+      // get the x and y bits
+      long xbits = (( mVec & TOP_MASK ) >> 7 );
+      long ybits = ( mVec & BOTTOM_MASK );
+
+      // map the numbers back to the triangle (0,0)-(0,126)-(126,0)
+      if (( xbits + ybits ) >= 127 )
+      {
+         xbits = 127 - xbits;
+         ybits = 127 - ybits;
+      }
+
+      // do the inverse transform and normalization
+      // costs 3 extra multiplies and 2 subtracts. No big deal.
+      float uvadj = mUVAdjustment[mVec & ~SIGN_MASK];
+      vec.x = uvadj * (float) xbits;
+      vec.y = uvadj * (float) ybits;
+      vec.z = uvadj * (float)( 126 - xbits - ybits );
+
+      // set all the sign bits
+      if ( mVec & XSIGN_MASK ) vec.x = -vec.x;
+      if ( mVec & YSIGN_MASK ) vec.y = -vec.y;
+      if ( mVec & ZSIGN_MASK ) vec.z = -vec.z;
+
+      Assert( vec.IsValid());
+   }
+
+   static void initializeStatics()   // fills mUVAdjustment[]; call once before unpacking anything
+   {
+      for ( int idx = 0; idx < 0x2000; idx++ )
+      {
+         long xbits = idx >> 7;
+         long ybits = idx & BOTTOM_MASK;
+
+         // map the numbers back to the triangle (0,0)-(0,126)-(126,0)
+         if (( xbits + ybits ) >= 127 )
+         {
+            xbits = 127 - xbits;
+            ybits = 127 - ybits;
+         }
+
+         // convert to 3D vectors
+         float x = (float)xbits;
+         float y = (float)ybits;
+         float z = (float)( 126 - xbits - ybits );
+		
+         // calculate the amount of normalization required
+         mUVAdjustment[idx] = 1.0f / sqrtf( y*y + z*z + x*x );
+         Assert( _finite( mUVAdjustment[idx]));
+
+         //cerr << mUVAdjustment[idx] << "\t";
+         //if ( xbits == 0 ) cerr << "\n";
+      }
+   }
+
+#if 0
+   void test()
+   {
+      #define TEST_RANGE 4
+      #define TEST_RANDOM 100
+      #define TEST_ANGERROR 1.0
+
+      float maxError = 0;
+      float avgError = 0;
+      int numVecs = 0;
+
+      {for ( int x = -TEST_RANGE; x < TEST_RANGE; x++ )
+      {
+         for ( int y = -TEST_RANGE; y < TEST_RANGE; y++ )
+         {
+            for ( int z = -TEST_RANGE; z < TEST_RANGE; z++ )
+            {
+               if (( x + y + z ) == 0 ) continue;
+
+               Vector vec( (float)x, (float)y, (float)z );
+               Vector vec2;
+
+               vec.normalize();
+               packVector( vec );
+               unpackVector( vec2 );
+
+               float ang = vec.dot( vec2 );
+               ang = (( fabs( ang ) > 0.99999f ) ? 0 : (float)acos(ang));
+
+               if (( ang > TEST_ANGERROR ) | ( !_finite( ang )))
+               {
+                  cerr << "error: " << ang << endl;
+                  cerr << "orig vec:       " << vec.x << ",\t"
+                       << vec.y << ",\t" << vec.z << "\tmVec: "
+                       << mVec << endl;
+                  cerr << "quantized vec2: " << vec2.x
+                       << ",\t" << vec2.y << ",\t"
+                       << vec2.z << endl << endl;
+               }
+               avgError += ang;
+               numVecs++;
+               if ( maxError < ang ) maxError = ang;
+            }
+         }
+      }}
+
+      for ( int w = 0; w < TEST_RANDOM; w++ )
+      {
+         Vector vec( genRandom(), genRandom(), genRandom());
+         Vector vec2;
+         vec.normalize();
+
+         packVector( vec );
+         unpackVector( vec2 );
+
+         float ang =vec.dot( vec2 );
+         ang = (( ang > 0.999f ) ? 0 : (float)acos(ang));
+
+         if (( ang > TEST_ANGERROR ) | ( !_finite( ang )))
+         {
+            cerr << "error: " << ang << endl;
+            cerr << "orig vec:       " << vec.x << ",\t"
+                 << vec.y << ",\t" << vec.z << "\tmVec: "
+                 << mVec << endl;
+            cerr << "quantized vec2: " << vec2.x << ",\t"
+                 << vec2.y << ",\t"
+                 << vec2.z << endl << endl;
+         }
+         avgError += ang;
+         numVecs++;
+         if ( maxError < ang ) maxError = ang;
+      }
+
+      { for ( int x = 0; x < 50; x++ )
+      {
+         Vector vec( (float)x, 25.0f, 0.0f );
+         Vector vec2;
+
+         vec.normalize();
+         packVector( vec );
+         unpackVector( vec2 );
+
+         float ang = vec.dot( vec2 );
+         ang = (( fabs( ang ) > 0.999f ) ? 0 : (float)acos(ang));
+
+         if (( ang > TEST_ANGERROR ) | ( !_finite( ang )))
+         {
+            cerr << "error: " << ang << endl;
+            cerr << "orig vec:       " << vec.x << ",\t"
+                 << vec.y << ",\t" << vec.z << "\tmVec: "
+                 << mVec << endl;
+            cerr << "   quantized vec2: " << vec2.x << ",\t"
+                 << vec2.y << ",\t" << vec2.z << endl << endl;
+         }
+
+         avgError += ang;
+         numVecs++;
+         if ( maxError < ang ) maxError = ang;
+      }}
+
+      cerr << "max angle error: " << maxError
+           << ", average error: " << avgError / numVecs
+           << ", num tested vecs: " << numVecs << endl;
+   }
+
+   friend ostream& operator<< ( ostream& os, const cUnitVector& vec )
+   { os << vec.mVec; return os; }
+#endif
+
+//protected: // !!!!
+
+   unsigned short mVec;                  // 3 sign bits | 6 xbits | 7 ybits
+   static float mUVAdjustment[0x2000];   // normalization factor per unsigned code, filled by initializeStatics()
+   static Vector mTmpVec;                // no longer used by operator Vector(); kept so UNITVEC_DECLARE_STATICS still matches
+};
+
+#endif // _3D_UNITVEC_H
+
+
diff --git a/public/mathlib/compressed_light_cube.h b/public/mathlib/compressed_light_cube.h
new file mode 100644
index 0000000..207f92d
--- /dev/null
+++ b/public/mathlib/compressed_light_cube.h
@@ -0,0 +1,24 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=============================================================================//
+
+#ifndef COMPRESSED_LIGHT_CUBE_H
+#define COMPRESSED_LIGHT_CUBE_H
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+#include "mathlib/mathlib.h"
+
+
+struct CompressedLightCube	// fixed array of six ColorRGBExp32 values
+{
+	DECLARE_BYTESWAP_DATADESC();	// macro defined elsewhere; presumably declares byte-swap field metadata -- confirm
+	ColorRGBExp32 m_Color[6];		// NOTE(review): six entries, presumably one per cube face -- confirm the ordering with users
+};
+
+
+#endif // COMPRESSED_LIGHT_CUBE_H
diff --git a/public/mathlib/compressed_vector.h b/public/mathlib/compressed_vector.h
new file mode 100644
index 0000000..6a49522
--- /dev/null
+++ b/public/mathlib/compressed_vector.h
@@ -0,0 +1,608 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef COMPRESSED_VECTOR_H
+#define COMPRESSED_VECTOR_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// For vec_t, put this somewhere else?
+#include "basetypes.h"
+
+// For rand(). We really need a library!
+#include <stdlib.h>
+
+#include "tier0/dbg.h"
+#include "mathlib/vector.h"
+
+#include "mathlib/mathlib.h"
+
+#if defined( _X360 )
+#pragma bitfield_order( push, lsb_to_msb )
+#endif
+//=========================================================
+// fit a 3D vector into 32 bits
+//=========================================================
+
+class Vector32	// 3D vector quantized to three 10-bit components that share one 2-bit exponent
+{
+public:
+	// Construction/destruction:
+	Vector32(void); 
+	Vector32(vec_t X, vec_t Y, vec_t Z);
+
+	// assignment
+	Vector32& operator=(const Vector &vOther);	// quantizes vOther; each component is clamped to 0..1023
+	operator Vector ();							// expands the stored bits back into a Vector (lossy)
+
+private:	// 10+10+10+2 = 32 bits. NOTE(review): packing of unsigned short bit-fields is implementation-defined -- verify sizeof(Vector32)
+	unsigned short x:10;		// stored with a bias of 512 (see operator=)
+	unsigned short y:10;
+	unsigned short z:10;
+	unsigned short exp:2;		// index into the expScale table used by both conversions
+};
+
+inline Vector32& Vector32::operator=(const Vector &vOther)	
+{
+	CHECK_VALID(vOther);
+
+	// Exclusive upper bound of the value range for each 2-bit exponent code.
+	static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };
+
+	float fmax = Max( fabs( vOther.x ), fabs( vOther.y ) );	// largest absolute component picks the range
+	fmax = Max( fmax, (float)fabs( vOther.z ) );
+
+	// Take the first range that contains fmax; code 3 is the fallback.
+	int nExp = 0;
+	while ( nExp < 3 && !( fmax < expScale[nExp] ) )
+		nExp++;
+	exp = nExp;
+	Assert( fmax < expScale[exp] );	// flags inputs whose magnitude reaches 64
+
+	float fexp = 512.0f / expScale[exp];	// quantization steps per unit
+	x = Clamp( (int)(vOther.x * fexp) + 512, 0, 1023 );
+	y = Clamp( (int)(vOther.y * fexp) + 512, 0, 1023 );
+	z = Clamp( (int)(vOther.z * fexp) + 512, 0, 1023 );
+	return *this; 
+}
+
+
+inline Vector32::operator Vector ()
+{
+	// Range bound for each exponent code; must stay in sync with operator=.
+	static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };
+
+	// Size of one quantization step at the stored exponent.
+	float flStep = expScale[exp] / 512.0f;
+	Vector vecOut;
+	vecOut.x = (((int)x) - 512) * flStep;	// remove the 512 bias, then rescale
+	vecOut.y = (((int)y) - 512) * flStep;
+	vecOut.z = (((int)z) - 512) * flStep; 
+	return vecOut; 
+}
+
+
+//=========================================================
+// Fit a unit vector into 32 bits
+//=========================================================
+
+class Normal32	// unit vector stored as 15-bit x, 15-bit y and the sign of z
+{
+public:
+	// Construction/destruction:
+	Normal32(void); 
+	Normal32(vec_t X, vec_t Y, vec_t Z);
+
+	// assignment
+	Normal32& operator=(const Vector &vOther);	// quantizes x and y, keeps only the sign of z
+	operator Vector ();							// rebuilds z from x and y assuming unit length
+
+private:	// 15+15+1 = 31 bits. NOTE(review): packing of unsigned short bit-fields is implementation-defined -- verify sizeof(Normal32)
+	unsigned short x:15;		// stored with a bias of 16384
+	unsigned short y:15;
+	unsigned short zneg:1;		// 1 when the source z was negative
+};
+
+
+inline Normal32& Normal32::operator=(const Vector &vOther)	
+{
+	CHECK_VALID(vOther);
+
+	// x and y: scale by 2^14, bias by 2^14, clamp to the 15-bit range.
+	const int nBias = 16384;
+	const int nMax = 32767;
+	x = Clamp( (int)(vOther.x * nBias) + nBias, 0, nMax );
+	y = Clamp( (int)(vOther.y * nBias) + nBias, 0, nMax );
+	zneg = ( vOther.z < 0 ) ? 1 : 0;	// z keeps only its sign
+	return *this; 
+}
+
+
+inline Normal32::operator Vector ()
+{
+	Vector tmp;
+	tmp.x = ((int)x - 16384) * (1 / 16384.0);	// remove the bias, rescale to [-1, 1)
+	tmp.y = ((int)y - 16384) * (1 / 16384.0);
+	// z is not stored: rebuild |z| from unit length. Clamp at zero so x/y bits
+	// that overshoot unit length (e.g. x = y = 0) give z = 0 instead of NaN.
+	float flZSq = 1 - tmp.x * tmp.x - tmp.y * tmp.y;
+	tmp.z = ( flZSq > 0 ) ? sqrt( flZSq ) : 0;
+	if (zneg) tmp.z = -tmp.z;	// restore the stored sign
+	return tmp; 
+}
+
+
+//=========================================================
+// 64 bit Quaternion
+//=========================================================
+
+class Quaternion64	// quaternion stored as three 21-bit components plus the sign of w
+{
+public:
+	// Construction/destruction:
+	Quaternion64(void); 
+	Quaternion64(vec_t X, vec_t Y, vec_t Z);
+
+	// assignment
+	// Quaternion& operator=(const Quaternion64 &vOther);
+	Quaternion64& operator=(const Quaternion &vOther);	// quantizes x, y, z and keeps only the sign of w
+	operator Quaternion ();								// rebuilds w from x, y, z assuming unit length
+private:	// 21+21+21+1 = 64 bits
+	uint64 x:21;	// stored with a bias of 1048576 (2^20)
+	uint64 y:21;
+	uint64 z:21;
+	uint64 wneg:1;	// 1 when the source w was negative
+};
+
+
+inline Quaternion64::operator Quaternion ()	
+{
+	Quaternion tmp;
+	// shift to -1048576, + 1048575, then round down slightly to -1.0 < x < 1.0
+	tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
+	tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
+	tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);
+	// w is not stored: rebuild |w| from unit length, clamping so that bits which overshoot it give 0, not NaN
+	float flWSq = 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z;
+	tmp.w = ( flWSq > 0 ) ? sqrt( flWSq ) : 0;
+	if (wneg) tmp.w = -tmp.w;	// restore the stored sign
+	return tmp; 
+}
+
+inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther)	
+{
+	CHECK_VALID(vOther);
+	const int nBias = 1048576, nMax = 2097151;	// 2^20 scale/bias, 2^21 - 1 ceiling
+	x = Clamp( (int)(vOther.x * nBias) + nBias, 0, nMax );
+	y = Clamp( (int)(vOther.y * nBias) + nBias, 0, nMax );
+	z = Clamp( (int)(vOther.z * nBias) + nBias, 0, nMax );
+	wneg = ( vOther.w < 0 ) ? 1 : 0;	// only the sign of w is stored
+	return *this; 
+}
+
+//=========================================================
+// 48 bit Quaternion
+//=========================================================
+
+class Quaternion48	// quaternion stored as 16-bit x, 16-bit y, 15-bit z plus the sign of w
+{
+public:
+	// Construction/destruction:
+	Quaternion48(void); 
+	Quaternion48(vec_t X, vec_t Y, vec_t Z);
+
+	// assignment
+	// Quaternion& operator=(const Quaternion48 &vOther);
+	Quaternion48& operator=(const Quaternion &vOther);	// quantizes x, y, z and keeps only the sign of w
+	operator Quaternion ();								// rebuilds w from x, y, z assuming unit length
+private:	// 16+16+15+1 = 48 bits
+	unsigned short x:16;	// stored with a bias of 32768
+	unsigned short y:16;
+	unsigned short z:15;	// one bit less than x/y: bias of 16384
+	unsigned short wneg:1;	// 1 when the source w was negative
+};
+
+
+inline Quaternion48::operator Quaternion ()	
+{
+	Quaternion tmp;
+	tmp.x = ((int)x - 32768) * (1 / 32768.0);	// 16-bit components: remove the bias, rescale to [-1, 1)
+	tmp.y = ((int)y - 32768) * (1 / 32768.0);
+	tmp.z = ((int)z - 16384) * (1 / 16384.0);	// z has 15 bits
+	// w is not stored: rebuild |w| from unit length, clamping so that bits which overshoot it give 0, not NaN
+	float flWSq = 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z;
+	tmp.w = ( flWSq > 0 ) ? sqrt( flWSq ) : 0;
+	if (wneg) tmp.w = -tmp.w;	// restore the stored sign
+	return tmp; 
+}
+
+inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther)	
+{
+	CHECK_VALID(vOther);
+	const int nBiasXY = 32768, nBiasZ = 16384;	// x/y use 16 bits, z uses 15
+	x = Clamp( (int)(vOther.x * nBiasXY) + nBiasXY, 0, 2 * nBiasXY - 1 );
+	y = Clamp( (int)(vOther.y * nBiasXY) + nBiasXY, 0, 2 * nBiasXY - 1 );
+	z = Clamp( (int)(vOther.z * nBiasZ) + nBiasZ, 0, 2 * nBiasZ - 1 );
+	wneg = ( vOther.w < 0 ) ? 1 : 0;	// only the sign of w is stored
+	return *this; 
+}
+
+//=========================================================
+// 32 bit Quaternion
+//=========================================================
+
+class Quaternion32	// quaternion stored as 11-bit x, 10-bit y, 10-bit z plus the sign of w
+{
+public:
+	// Construction/destruction:
+	Quaternion32(void); 
+	Quaternion32(vec_t X, vec_t Y, vec_t Z);
+
+	// assignment
+	// Quaternion& operator=(const Quaternion32 &vOther);
+	Quaternion32& operator=(const Quaternion &vOther);	// quantizes x, y, z and keeps only the sign of w
+	operator Quaternion ();								// rebuilds w from x, y, z assuming unit length
+private:	// 11+10+10+1 = 32 bits
+	unsigned int x:11;		// stored with a bias of 1024
+	unsigned int y:10;		// stored with a bias of 512
+	unsigned int z:10;
+	unsigned int wneg:1;	// 1 when the source w was negative
+};
+
+
+inline Quaternion32::operator Quaternion ()	
+{
+	Quaternion tmp;
+	tmp.x = ((int)x - 1024) * (1 / 1024.0);	// 11-bit x: remove the bias, rescale to [-1, 1)
+	tmp.y = ((int)y - 512) * (1 / 512.0);	// 10-bit y and z
+	tmp.z = ((int)z - 512) * (1 / 512.0);
+	// w is not stored: rebuild |w| from unit length, clamping so that bits which overshoot it give 0, not NaN
+	float flWSq = 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z;
+	tmp.w = ( flWSq > 0 ) ? sqrt( flWSq ) : 0;
+	if (wneg) tmp.w = -tmp.w;	// restore the stored sign
+	return tmp; 
+}
+
+inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther)	
+{
+	CHECK_VALID(vOther);
+	const int nBiasX = 1024, nBiasYZ = 512;	// x uses 11 bits, y and z use 10
+	x = Clamp( (int)(vOther.x * nBiasX) + nBiasX, 0, 2 * nBiasX - 1 );
+	y = Clamp( (int)(vOther.y * nBiasYZ) + nBiasYZ, 0, 2 * nBiasYZ - 1 );
+	z = Clamp( (int)(vOther.z * nBiasYZ) + nBiasYZ, 0, 2 * nBiasYZ - 1 );
+	wneg = ( vOther.w < 0 ) ? 1 : 0;	// only the sign of w is stored
+	return *this; 
+}
+
+//=========================================================
+// 16 bit float
+//=========================================================
+
+
+const int float32bias = 127;	// exponent bias of the 32-bit float layout
+const int float16bias = 15;		// exponent bias of the 16-bit float layout
+
+const float maxfloat16bits = 65504.0f;	// largest finite value float16 stores; conversions saturate to +/- this
+
+class float16	// 16-bit float: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits
+{
+public:
+	//float16() {}
+	//float16( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
+
+	void Init() { m_storage.rawWord = 0; }	// clears every bit, which encodes +0
+//	float16& operator=(const float16 &other) { m_storage.rawWord = other.m_storage.rawWord; return *this; }
+//	float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
+//	operator unsigned short () { return m_storage.rawWord; }
+//	operator float () { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
+	unsigned short GetBits() const	// raw 16-bit encoding
+	{ 
+		return m_storage.rawWord; 
+	}
+	float GetFloat() const	// decoded value; see Convert16bitFloatTo32bits
+	{ 
+		return Convert16bitFloatTo32bits( m_storage.rawWord ); 
+	}
+	void SetFloat( float in )	// encodes 'in'; see ConvertFloatTo16bits for the saturation rules
+	{ 
+		m_storage.rawWord = ConvertFloatTo16bits( in ); 
+	}
+
+	bool IsInfinity() const	// exponent all ones, mantissa zero
+	{
+		return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa == 0;
+	}
+	bool IsNaN() const	// exponent all ones, mantissa non-zero
+	{
+		return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa != 0;
+	}
+
+	// Equality compares the raw encodings, not the decoded values.
+	bool operator==(const float16 other) const { return m_storage.rawWord == other.m_storage.rawWord; }
+	bool operator!=(const float16 other) const { return m_storage.rawWord != other.m_storage.rawWord; }
+	
+//	bool operator< (const float other) const	   { return GetFloat() < other; }
+//	bool operator> (const float other) const	   { return GetFloat() > other; }
+
+protected:
+	union float32bits	// field view of a 32-bit float; assumes bit-fields are allocated from the low bits up
+	{
+		float rawFloat;
+		struct 
+		{
+			unsigned int mantissa : 23;
+			unsigned int biased_exponent : 8;
+			unsigned int sign : 1;
+		} bits;
+	};
+
+	union float16bits	// field view of the 16-bit encoding, same allocation assumption
+	{
+		unsigned short rawWord;
+		struct
+		{
+			unsigned short mantissa : 10;
+			unsigned short biased_exponent : 5;
+			unsigned short sign : 1;
+		} bits;
+	};
+
+	static bool IsNaN( float16bits in )
+	{
+		return in.bits.biased_exponent == 31 && in.bits.mantissa != 0;
+	}
+	static bool IsInfinity( float16bits in )
+	{
+		return in.bits.biased_exponent == 31 && in.bits.mantissa == 0;
+	}
+
+	// Encodes a 32-bit float. Values beyond +/-maxfloat16bits saturate, NaN and float denormals become zero, the mantissa is truncated.
+	static unsigned short ConvertFloatTo16bits( float input )
+	{
+		if ( input > maxfloat16bits )
+			input = maxfloat16bits;
+		else if ( input < -maxfloat16bits )
+			input = -maxfloat16bits;
+
+		float16bits output;
+		float32bits inFloat;
+
+		inFloat.rawFloat = input;
+
+		output.bits.sign = inFloat.bits.sign;	// the sign bit is carried over in every case below
+
+		if ( (inFloat.bits.biased_exponent==0) && (inFloat.bits.mantissa==0) ) 
+		{ 
+			// zero
+			output.bits.mantissa = 0;
+			output.bits.biased_exponent = 0;
+		}
+		else if ( (inFloat.bits.biased_exponent==0) && (inFloat.bits.mantissa!=0) ) 
+		{  
+			// denorm -- denorm float maps to 0 half
+			output.bits.mantissa = 0;
+			output.bits.biased_exponent = 0;
+		}
+		else if ( (inFloat.bits.biased_exponent==0xff) && (inFloat.bits.mantissa==0) ) 
+		{ 
+#if 0
+			// infinity
+			output.bits.mantissa = 0;
+			output.bits.biased_exponent = 31;
+#else
+			// infinity maps to maxfloat
+			output.bits.mantissa = 0x3ff;
+			output.bits.biased_exponent = 0x1e;
+#endif
+		}
+		else if ( (inFloat.bits.biased_exponent==0xff) && (inFloat.bits.mantissa!=0) ) 
+		{ 
+#if 0
+			// NaN
+			output.bits.mantissa = 1;
+			output.bits.biased_exponent = 31;
+#else
+			// NaN maps to zero
+			output.bits.mantissa = 0;
+			output.bits.biased_exponent = 0;
+#endif
+		}
+		else 
+		{ 
+			// regular number
+			int new_exp = inFloat.bits.biased_exponent-127;	// unbiased exponent
+
+			if (new_exp<-24) 
+			{ 
+				// this maps to 0
+				output.bits.mantissa = 0;
+				output.bits.biased_exponent = 0;
+			}
+			// else-if: the flush-to-zero case above must not fall through into the denorm case
+			else if (new_exp<-14) 
+			{
+				// this maps to a denorm (half denormals occupy raw bits 0x0001 - 0x03ff)
+				output.bits.biased_exponent = 0;
+				unsigned int exp_val = ( unsigned int )( -14 - ( inFloat.bits.biased_exponent - float32bias ) );
+				if( exp_val > 0 && exp_val < 11 )	// always true here: new_exp is in -24..-15, so exp_val is 1..10
+				{
+					output.bits.mantissa = ( 1 << ( 10 - exp_val ) ) + ( inFloat.bits.mantissa >> ( 13 + exp_val ) );
+				}
+			}
+			else if (new_exp>15) 
+			{ 
+#if 0
+				// map this value to infinity
+				output.bits.mantissa = 0;
+				output.bits.biased_exponent = 31;
+#else
+				// too big. . . maps to maxfloat
+				output.bits.mantissa = 0x3ff;
+				output.bits.biased_exponent = 0x1e;
+#endif
+			}
+			else 
+			{
+				output.bits.biased_exponent = new_exp+15;
+				output.bits.mantissa = (inFloat.bits.mantissa >> 13);	// keep the top 10 mantissa bits (truncation)
+			}
+		}
+		return output.rawWord;
+	}
+
+	// Decodes a 16-bit encoding. Infinity decodes to +/-maxfloat16bits and NaN to 0, mirroring the encoder.
+	static float Convert16bitFloatTo32bits( unsigned short input )
+	{
+		float32bits output;
+		const float16bits &inFloat = *((float16bits *)&input);
+
+		if( IsInfinity( inFloat ) )
+		{
+			return maxfloat16bits * ( ( inFloat.bits.sign == 1 ) ? -1.0f : 1.0f );
+		}
+		if( IsNaN( inFloat ) )
+		{
+			return 0.0;
+		}
+		if( inFloat.bits.biased_exponent == 0 && inFloat.bits.mantissa != 0 )
+		{
+			// denorm
+			const float half_denorm = (1.0f/16384.0f); // 2^-14
+			float mantissa = ((float)(inFloat.bits.mantissa)) / 1024.0f;
+			float sgn = (inFloat.bits.sign)? -1.0f :1.0f;
+			output.rawFloat = sgn*mantissa*half_denorm;
+		}
+		else
+		{
+			// regular number
+			unsigned mantissa = inFloat.bits.mantissa;
+			unsigned biased_exponent = inFloat.bits.biased_exponent;
+			unsigned sign = ((unsigned)inFloat.bits.sign) << 31;
+			// re-bias the exponent; the (biased_exponent != 0) factor keeps an all-zero exponent (i.e. +/-0) at zero
+			biased_exponent = ( (biased_exponent - float16bias + float32bias) * (biased_exponent != 0) ) << 23;
+			mantissa <<= (23-10);
+
+			*((unsigned *)&output) = ( mantissa | biased_exponent | sign );
+		}
+		
+		return output.rawFloat;
+	}
+
+
+	float16bits m_storage;	// the encoded value
+};
+
+class float16_with_assign : public float16	// float16 plus construction/assignment from float and implicit conversion to float
+{
+public:
+	float16_with_assign() {}	// leaves the stored bits uninitialized
+	float16_with_assign( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
+	// Copy the raw bits through the public accessor instead of casting 'other' to the derived type to reach m_storage.
+	float16& operator=(const float16 &other) { m_storage.rawWord = other.GetBits(); return *this; }
+	float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
+//	operator unsigned short () const { return m_storage.rawWord; }
+	operator float () const { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
+};
+
+//=========================================================
+// Fit a 3D vector in 48 bits
+//=========================================================
+
+class Vector48	// 3D vector held as three float16 components
+{
+public:
+	// Construction/destruction:
+	Vector48(void) {}	// components are left uninitialized
+	Vector48(vec_t X, vec_t Y, vec_t Z) { x.SetFloat( X ); y.SetFloat( Y ); z.SetFloat( Z ); }
+
+	// assignment
+	Vector48& operator=(const Vector &vOther);
+	operator Vector ();
+
+	// Indexes x, y, z as an array starting at the first member; i is not range-checked.
+	const float operator[]( int i ) const { return (&x)[i].GetFloat(); }
+	float16 x;
+	float16 y;
+	float16 z;
+};
+
+inline Vector48& Vector48::operator=(const Vector &vOther)	
+{
+	CHECK_VALID(vOther);
+	// Each component is converted to 16 bits independently by float16::SetFloat.
+	z.SetFloat( vOther.z );
+	y.SetFloat( vOther.y );
+	x.SetFloat( vOther.x );
+	return *this; 
+}
+
+
+inline Vector48::operator Vector ()
+{
+	// Widen each 16-bit component back to a 32-bit float.
+	const float flX = x.GetFloat();
+	const float flY = y.GetFloat();
+	const float flZ = z.GetFloat();
+
+	// Build the result in one step from the three decoded values.
+	return Vector( flX, flY, flZ );
+}
+
+//=========================================================
+// Fit a 2D vector in 32 bits
+//=========================================================
+
+class Vector2d32	// 2D vector held as two 16-bit float components
+{
+public:
+	// Construction/destruction:
+	Vector2d32(void) {}	// components are left uninitialized
+	Vector2d32(vec_t X, vec_t Y) : x( X ), y( Y ) {}	// float16_with_assign's float constructor performs the conversion
+
+	// assignment
+	Vector2d32& operator=(const Vector &vOther);
+	Vector2d32& operator=(const Vector2D &vOther);
+
+	operator Vector2D ();
+
+	void Init( vec_t ix = 0.f, vec_t iy = 0.f);	// sets both components; defaults to (0, 0)
+
+	float16_with_assign x;
+	float16_with_assign y;
+};
+
+inline Vector2d32& Vector2d32::operator=(const Vector2D &vOther)	
+{
+	// Init() stores both components through SetFloat, exactly as needed here.
+	Init( vOther.x, vOther.y );
+	return *this; 
+}
+
+inline Vector2d32::operator Vector2D ()
+{
+	// Each member widens to float through float16_with_assign::operator float,
+	// which runs the same 16-to-32-bit conversion as GetFloat().
+	Vector2D vecOut;
+	vecOut.x = x;
+	vecOut.y = y;
+	return vecOut;
+}
+
+inline void Vector2d32::Init( vec_t ix, vec_t iy )
+{
+	x = ix;	// float16_with_assign::operator=(const float&) converts to 16 bits
+	y = iy;
+}
+
+#if defined( _X360 )
+#pragma bitfield_order( pop )
+#endif
+
+#endif
+
diff --git a/public/mathlib/halton.h b/public/mathlib/halton.h
new file mode 100644
index 0000000..44df68f
--- /dev/null
+++ b/public/mathlib/halton.h
@@ -0,0 +1,71 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+// $Id$
+
+// halton.h - classes, etc for generating numbers using the Halton pseudo-random sequence.  See
+// http://halton-sequences.wikiverse.org/.
+//
+// what this function is useful for is any sort of sampling/integration problem where
+// you want to solve it by random sampling. Each call to NextValue() generates
+// a random number between 0 and 1, in an unclumped manner, so that the space can be more
+// or less evenly sampled with a minimum number of samples.
+//
+// It is NOT useful for generating random numbers dynamically, since the outputs aren't
+// particularly random.
+//
+// To generate multidimensional sample values (points in a plane, etc), use two
+// HaltonSequenceGenerator_t's, with different (primes) bases.
+
+#ifndef HALTON_H
+#define HALTON_H
+
+#include <tier0/platform.h>
+#include <mathlib/vector.h>
+
+class HaltonSequenceGenerator_t
+{
+	int seed;												//< index of the element NextValue() returns next
+	int base;
+	float fbase;											//< base as a float
+
+public:
+	HaltonSequenceGenerator_t(int base);					//< base MUST be prime, >=2
+
+	float GetElement(int element);							//< defined out of line
+
+	inline float NextValue(void)
+	{
+		const int nCurrent = seed++;						// advance first, then look up the old index
+		return GetElement( nCurrent );
+	}
+};
+
+
+class DirectionalSampler_t									//< pseudo-random sphere sampling
+{
+	HaltonSequenceGenerator_t zdot;							//< base-2 sequence, drives the z coordinate
+	HaltonSequenceGenerator_t vrot;							//< base-3 sequence, drives the rotation in x/y
+public:
+	DirectionalSampler_t(void)
+		: zdot(2),vrot(3)
+	{
+	}
+
+	Vector NextValue(void)
+	{
+		// next z sample, mapped from 0..1 to -1..1
+		float flZ=2*zdot.NextValue()-1.0;
+		float flPolar=acos(flZ);							// angle whose cosine is z
+		// now, generate a random rotation angle for x/y
+		float flAzimuth=2.0*M_PI*vrot.NextValue();
+		float flRadius=sin(flPolar);						// x/y radius at this z
+		return Vector(cos(flAzimuth)*flRadius,
+					  sin(flAzimuth)*flRadius,
+					  flZ);
+
+	}
+};
+
+
+
+
+#endif // halton_h
diff --git a/public/mathlib/lightdesc.h b/public/mathlib/lightdesc.h
new file mode 100644
index 0000000..1096d62
--- /dev/null
+++ b/public/mathlib/lightdesc.h
@@ -0,0 +1,173 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//===========================================================================//
+
+// light structure definitions.
+#ifndef LIGHTDESC_H
+#define LIGHTDESC_H
+
+#include <mathlib/ssemath.h>
+#include <mathlib/vector.h>
+
+//-----------------------------------------------------------------------------
+// Light structure
+//-----------------------------------------------------------------------------
+
+enum LightType_t	// stored in LightDesc_t::m_Type
+{
+	MATERIAL_LIGHT_DISABLE = 0,
+	MATERIAL_LIGHT_POINT,			// 1; set by LightDesc_t::InitPoint
+	MATERIAL_LIGHT_DIRECTIONAL,		// 2; set by LightDesc_t::InitDirectional
+	MATERIAL_LIGHT_SPOT,			// 3; set by LightDesc_t::InitSpot, the only type that is cone-tested
+};
+
+enum LightType_OptimizationFlags_t	// single-bit flags; presumably OR-ed into LightDesc_t::m_Flags -- confirm in RecalculateDerivedValues
+{
+	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 = 1,	// NOTE(review): looks like "m_Attenuation0 is non-zero" -- confirm
+	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 = 2,
+	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 = 4,
+	LIGHTTYPE_OPTIMIZATIONFLAGS_DERIVED_VALUES_CALCED = 8,
+};
+
+struct LightDesc_t 	// description of one point, directional or spot light
+{
+    LightType_t m_Type;										//< MATERIAL_LIGHT_xxx
+	Vector m_Color;											//< color+intensity 
+    Vector m_Position;										//< light source center position
+    Vector m_Direction;										//< for SPOT and DIRECTIONAL, direction it is pointing
+    float  m_Range;											//< distance range for light.0=infinite
+    float m_Falloff;										//< angular falloff exponent for spot lights
+    float m_Attenuation0;									//< constant distance falloff term
+    float m_Attenuation1;									//< linear term of falloff
+    float m_Attenuation2;									//< quadratic term of falloff
+    float m_Theta;											//< inner cone angle. no angular falloff 
+															//< within this cone
+    float m_Phi;											//< outer cone angle
+
+	// the values below are derived from the above settings for optimizations
+	// These aren't used by DX8. . used for software lighting.
+	float m_ThetaDot;										//< presumably the dot-product threshold for m_Theta -- confirm
+	float m_PhiDot;											//< compared against rdir.Dot(m_Direction) in the cone test below
+	unsigned int m_Flags;
+protected:
+	float OneOver_ThetaDot_Minus_PhiDot;					//< read through OneOverThetaDotMinusPhiDot()
+	float m_RangeSquared;
+public:
+
+	void RecalculateDerivedValues(void);			 // calculate m_xxDot, m_Type for changed parms
+
+	LightDesc_t(void)										// leaves every field uninitialized
+	{
+	}
+
+	// constructors for various useful subtypes
+
+	// a point light with infinite range
+	LightDesc_t( const Vector &pos, const Vector &color )
+	{
+		InitPoint( pos, color );
+	}
+	
+	/// a simple light. cone boundaries in radians. you pass a look_at point and the
+	/// direction is derived from that.
+	LightDesc_t( const Vector &pos, const Vector &color, const Vector &point_at,
+				float inner_cone_boundary, float outer_cone_boundary )
+	{
+		InitSpot( pos, color, point_at, inner_cone_boundary, outer_cone_boundary );
+	}
+
+	void InitPoint( const Vector &pos, const Vector &color );
+	void InitDirectional( const Vector &dir, const Vector &color );
+	void InitSpot(const Vector &pos, const Vector &color, const Vector &point_at,
+		float inner_cone_boundary, float outer_cone_boundary );
+
+	/// Given 4 points and 4 normals, ADD lighting from this light into "color".
+	void ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
+							   FourVectors &color, bool DoHalfLambert=false ) const;
+	void ComputeNonincidenceLightAtPoints( const FourVectors &pos, FourVectors &color ) const;
+	void ComputeLightAtPointsForDirectional( const FourVectors &pos,
+											 const FourVectors &normal,
+											 FourVectors &color, bool DoHalfLambert=false ) const;
+
+	// warning - modifies color!!! set color first!!
+	void SetupOldStyleAttenuation( float fQuadatricAttn, float fLinearAttn, float fConstantAttn );
+
+	void SetupNewStyleAttenuation( float fFiftyPercentDistance, float fZeroPercentDistance );
+
+
+/// given a direction relative to the light source position, is this ray within the
+	/// light cone (for spotlights..non spots consider all rays to be within their cone)
+	bool IsDirectionWithinLightCone(const Vector &rdir) const
+	{
+		return ((m_Type!=MATERIAL_LIGHT_SPOT) || (rdir.Dot(m_Direction)>=m_PhiDot));
+	}
+
+	float OneOverThetaDotMinusPhiDot() const				// read-only accessor for the protected derived value
+	{
+		return OneOver_ThetaDot_Minus_PhiDot;
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// a point light with infinite range
+//-----------------------------------------------------------------------------
+inline void LightDesc_t::InitPoint( const Vector &pos, const Vector &color )
+{
+	m_Type = MATERIAL_LIGHT_POINT;
+	m_Position = pos;
+	m_Color = color;
+	m_Range = 0.0;									// infinite
+	// Constant attenuation term only: the linear and quadratic terms are zero.
+	m_Attenuation0 = 1.0;
+	m_Attenuation1 = m_Attenuation2 = 0;
+	RecalculateDerivedValues();						// must run last, after every input field is set
+}
+
+
+//-----------------------------------------------------------------------------
+// a directional light with infinite range
+//-----------------------------------------------------------------------------
+inline void LightDesc_t::InitDirectional( const Vector &dir, const Vector &color )
+{
+	m_Type = MATERIAL_LIGHT_DIRECTIONAL;
+	m_Direction = dir;								// stored as given; not normalized here
+	m_Color = color;
+	m_Range = 0.0;									// infinite
+	// Constant attenuation term only: the linear and quadratic terms are zero.
+	m_Attenuation0 = 1.0;
+	m_Attenuation1 = m_Attenuation2 = 0;
+	RecalculateDerivedValues();						// must run last, after every input field is set
+}
+
+
+//-----------------------------------------------------------------------------
+// a simple light. cone boundaries in radians. you pass a look_at point and the
+// direction is derived from that.
+//-----------------------------------------------------------------------------
+inline void LightDesc_t::InitSpot(const Vector &pos, const Vector &color, const Vector &point_at,
+	float inner_cone_boundary, float outer_cone_boundary)
+{
+	m_Type = MATERIAL_LIGHT_SPOT;
+	m_Position = pos;
+	m_Color = color;
+
+	// Aim from the light position toward the look-at point.
+	m_Direction = point_at;
+	m_Direction -= pos;
+	VectorNormalizeFast( m_Direction );
+
+	m_Theta = inner_cone_boundary;
+	m_Phi = outer_cone_boundary;
+	m_Falloff = 5.0;									// linear angle falloff
+	m_Range = 0.0;										// infinite
+	m_Attenuation0 = 1.0;								// constant term only
+	m_Attenuation1 = m_Attenuation2 = 0;
+	RecalculateDerivedValues();							// must run last, after every input field is set
+}
+
+
+#endif
+
diff --git a/public/mathlib/math_pfns.h b/public/mathlib/math_pfns.h
new file mode 100644
index 0000000..d43411c
--- /dev/null
+++ b/public/mathlib/math_pfns.h
@@ -0,0 +1,80 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#ifndef _MATH_PFNS_H_
+#define _MATH_PFNS_H_
+
+#if defined( _X360 )
+#include <xboxmath.h>
+#endif
+
+#if !defined( _X360 )
+
+// These globals are initialized by mathlib and redirected based on available fpu features
+extern float (*pfSqrt)(float x);
+extern float (*pfRSqrt)(float x);
+extern float (*pfRSqrtFast)(float x);
+extern void  (*pfFastSinCos)(float x, float *s, float *c);
+extern float (*pfFastCos)(float x);
+
+// The following are declared as macros rather than inline functions because they are often
+// used in limiting situations, and sometimes the compiler simply refuses to inline them for some reason
+#define FastSqrt(x)			(*pfSqrt)(x)
+#define	FastRSqrt(x)		(*pfRSqrt)(x)
+#define FastRSqrtFast(x)    (*pfRSqrtFast)(x)
+#define FastSinCos(x,s,c)   (*pfFastSinCos)(x,s,c)
+#define FastCos(x)			(*pfFastCos)(x)
+
+#if defined(__i386__) || defined(_M_IX86)
+// On x86, the inline FPU or SSE sqrt instruction is faster than
+// the overhead of setting up a function call and saving/restoring
+// the FPU or SSE register state and can be scheduled better, too.
+#undef FastSqrt
+#define FastSqrt(x)			::sqrtf(x)
+#endif
+
+#endif // !_X360
+
+#if defined( _X360 )
+
+FORCEINLINE float _VMX_Sqrt( float x )	// Xbox 360 backend for FastSqrt
+{
+	return __fsqrts( x );	// compiler intrinsic; presumably the single-precision hardware square root -- confirm
+}
+
+FORCEINLINE float _VMX_RSqrt( float x )
+{
+	// Start from the intrinsic's reciprocal square root estimate.
+	float flEstimate = __frsqrte( x );
+	// Single iteration NewtonRaphson on reciprocal square root estimate
+	return (0.5f * flEstimate) * (3.0f - (x * flEstimate) * flEstimate);
+}
+
+FORCEINLINE float _VMX_RSqrtFast( float x )	// Xbox 360 backend for FastRSqrtFast
+{
+	return __frsqrte( x );	// raw estimate only, without the refinement step _VMX_RSqrt applies
+}
+
+FORCEINLINE void _VMX_SinCos( float a, float *pS, float *pC )	// Xbox 360 backend for FastSinCos
+{
+	XMScalarSinCos( pS, pC, a );	// note the argument order: outputs first, angle last
+}
+
+FORCEINLINE float _VMX_Cos( float a )	// Xbox 360 backend for FastCos
+{
+	return XMScalarCos( a );	// forwards straight to the xboxmath routine
+}
+
+// the 360 has fixed hw and calls directly
+#define FastSqrt(x)			_VMX_Sqrt(x)
+#define	FastRSqrt(x)		_VMX_RSqrt(x)
+#define FastRSqrtFast(x)	_VMX_RSqrtFast(x)
+#define FastSinCos(x,s,c)	_VMX_SinCos(x,s,c)
+#define FastCos(x)			_VMX_Cos(x)
+
+#endif // _X360
+
+#endif // _MATH_PFNS_H_
diff --git a/public/mathlib/mathlib.h b/public/mathlib/mathlib.h
new file mode 100644
index 0000000..a6d302f
--- /dev/null
+++ b/public/mathlib/mathlib.h
@@ -0,0 +1,2187 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//===========================================================================//
+
+#ifndef MATH_LIB_H
+#define MATH_LIB_H
+
+#include <math.h>
+#include "minmax.h"
+#include "tier0/basetypes.h"
+#include "tier0/commonmacros.h"
+#include "mathlib/vector.h"
+#include "mathlib/vector2d.h"
+#include "tier0/dbg.h"
+
+#include "mathlib/math_pfns.h"
+
+#if defined(__i386__) || defined(_M_IX86)
+// For SSE intrinsics (_mm_load_ss, _mm_min_ss, ...)
+#include <xmmintrin.h>
+#endif
+
+// XXX remove me
+#undef clamp
+
+// Uncomment this to enable FP exceptions in parts of the code.
+// This can help track down FP bugs. However the code is not
+// FP exception clean so this is not a turnkey operation.
+//#define FP_EXCEPTIONS_ENABLED
+
+
+#ifdef FP_EXCEPTIONS_ENABLED
+#include <float.h> // For _clearfp and _controlfp_s
+#endif
+
+// FPExceptionDisabler and FPExceptionEnabler taken from my blog post
+// at http://www.altdevblogaday.com/2012/04/20/exceptional-floating-point/
+
+// Declare an object of this type in a scope in order to suppress
+// all floating-point exceptions temporarily. The old exception
+// state will be reset at the end.
+class FPExceptionDisabler	// scoped guard; an empty no-op unless FP_EXCEPTIONS_ENABLED is defined
+{
+public:
+#ifdef FP_EXCEPTIONS_ENABLED
+	FPExceptionDisabler();		// defined out of line
+	~FPExceptionDisabler();
+
+private:
+	unsigned int mOldValues;	// presumably the saved FP control state the destructor restores -- confirm in the implementation
+#else
+	FPExceptionDisabler() {}	// no-op build: FP exception state is never touched
+	~FPExceptionDisabler() {}
+#endif
+
+private:
+	// Make the copy constructor and assignment operator private
+	// and unimplemented to prohibit copying.
+	FPExceptionDisabler(const FPExceptionDisabler&);
+	FPExceptionDisabler& operator=(const FPExceptionDisabler&);
+};
+
+// Declare an object of this type in a scope in order to enable a
+// specified set of floating-point exceptions temporarily. The old
+// exception state will be reset at the end.
+// This class can be nested.
+class FPExceptionEnabler	// scoped guard; an empty no-op unless FP_EXCEPTIONS_ENABLED is defined
+{
+public:
+	// Overflow, divide-by-zero, and invalid-operation are the FP
+	// exceptions most frequently associated with bugs.
+#ifdef FP_EXCEPTIONS_ENABLED
+	FPExceptionEnabler(unsigned int enableBits = _EM_OVERFLOW | _EM_ZERODIVIDE | _EM_INVALID);
+	~FPExceptionEnabler();
+
+private:
+	unsigned int mOldValues;	// presumably the saved FP control state the destructor restores -- confirm in the implementation
+#else
+	FPExceptionEnabler(unsigned int enableBits = 0)	// no-op build: the argument is accepted and ignored
+	{
+	}
+	~FPExceptionEnabler()
+	{
+	}
+#endif
+
+private:
+	// Make the copy constructor and assignment operator private
+	// and unimplemented to prohibit copying.
+	FPExceptionEnabler(const FPExceptionEnabler&);
+	FPExceptionEnabler& operator=(const FPExceptionEnabler&);
+};
+
+
+
+#ifdef DEBUG  // stop crashing edit-and-continue
+FORCEINLINE float clamp( float val, float minVal, float maxVal )
+{
+	// Plain-comparison version, used when DEBUG is defined.
+	if ( maxVal < minVal )
+		return maxVal;			// inverted range: the upper bound wins
+	if ( val < minVal )
+		return minVal;
+	if ( val > maxVal )
+		return maxVal;
+	return val;
+}
+#else // DEBUG
+FORCEINLINE float clamp( float val, float minVal, float maxVal )
+{
+#if defined(__i386__) || defined(_M_IX86)
+	// Scalar SSE version: raise the value to the lower bound first,
+	// then cap the result at the upper bound, and write it back to val.
+	__m128 vClamped = _mm_max_ss( _mm_load_ss(&val), _mm_load_ss(&minVal) );
+	vClamped = _mm_min_ss( vClamped, _mm_load_ss(&maxVal) );
+	_mm_store_ss( &val, vClamped );
+	// With an inverted range this also ends up at maxVal.
+#else
+	// Same order of operations through the fpmax/fpmin helpers.
+	val = fpmin( maxVal, fpmax( minVal, val ) );
+#endif
+	return val;
+}
+#endif // DEBUG
+
+//
+// Returns a clamped value in the range [min, max].
+//
+template< class T >
+inline T clamp( T const &val, T const &minVal, T const &maxVal )
+{
+	// An inverted range (maxVal < minVal) always yields maxVal.
+	if ( maxVal < minVal )
+		return maxVal;
+	if ( val < minVal )
+		return minVal;
+	if ( val > maxVal )
+		return maxVal;
+	return val;
+}
+
+
+// plane_t structure
+// !!! if this is changed, it must be changed in asm code too !!!
+// FIXME: does the asm code even exist anymore?
+// FIXME: this should move to a different file
+struct cplane_t	// plane as normal + distance, with two cached bytes for fast side tests; layout is mirrored by the CPLANE_* offsets below
+{
+	Vector	normal;
+	float	dist;
+	byte	type;			// for fast side tests
+	byte	signbits;		// signx + (signy<<1) + (signz<<2) -- NOTE(review): this comment previously read (signz<<1); confirm against SignbitsForPlane()
+	byte	pad[2];			// occupies offsets 18 and 19 (CPLANE_PAD0 / CPLANE_PAD1)
+
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+	cplane_t() {}
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	cplane_t(const cplane_t& vOther);
+#endif
+};
+
+// structure offset for asm code
+#define CPLANE_NORMAL_X			0
+#define CPLANE_NORMAL_Y			4
+#define CPLANE_NORMAL_Z			8
+#define CPLANE_DIST				12
+#define CPLANE_TYPE				16
+#define CPLANE_SIGNBITS			17
+#define CPLANE_PAD0				18
+#define CPLANE_PAD1				19
+
+// 0-2 are axial planes
+#define	PLANE_X			0
+#define	PLANE_Y			1
+#define	PLANE_Z			2
+
+// 3-5 are non-axial planes snapped to the nearest
+#define	PLANE_ANYX		3
+#define	PLANE_ANYY		4
+#define	PLANE_ANYZ		5
+
+
+//-----------------------------------------------------------------------------
+// Frustum plane indices.
+// WARNING: there is code that depends on these values
+//-----------------------------------------------------------------------------
+
+enum	// indices into the per-plane arrays of Frustum_t
+{
+	FRUSTUM_RIGHT		= 0,
+	FRUSTUM_LEFT		= 1,
+	FRUSTUM_TOP			= 2,
+	FRUSTUM_BOTTOM		= 3,
+	FRUSTUM_NEARZ		= 4,
+	FRUSTUM_FARZ		= 5,
+	FRUSTUM_NUMPLANES	= 6		// count of planes; sizes Frustum_t::m_Plane and m_AbsNormal
+};
+
+extern int SignbitsForPlane( cplane_t *out );
+
+class Frustum_t
+{
+public:
+	// Fills plane slot i and caches the component-wise absolute value of its normal.
+	void SetPlane( int i, int nType, const Vector &vecNormal, float dist )
+	{
+		cplane_t &plane = m_Plane[i];
+		plane.normal = vecNormal;
+		plane.dist = dist;
+		plane.type = nType;
+		plane.signbits = SignbitsForPlane( &plane );	// called after the normal has been stored
+		m_AbsNormal[i].Init( fabs(vecNormal.x), fabs(vecNormal.y), fabs(vecNormal.z) );
+	}
+	inline const cplane_t *GetPlane( int i ) const { return &m_Plane[i]; }
+	inline const Vector &GetAbsNormal( int i ) const { return m_AbsNormal[i]; }
+private:
+	cplane_t	m_Plane[FRUSTUM_NUMPLANES];
+	Vector		m_AbsNormal[FRUSTUM_NUMPLANES];
+};
+
+// Computes Y fov from an X fov and a screen aspect ratio + X from Y
+// NOTE(review): angle units are not stated for these two; the frustum helpers
+// below take degrees, so presumably these do too -- TODO confirm.
+float CalcFovY( float flFovX, float flScreenAspect );
+float CalcFovX( float flFovY, float flScreenAspect );
+
+// Generate a frustum based on perspective view parameters
+// NOTE: FOV is specified in degrees, as the *full* view angle (not half-angle)
+// The first overload takes view angles and an aspect ratio; the second takes an
+// explicit forward/right/up basis and both fovs. Both write the result to 'frustum'.
+void GeneratePerspectiveFrustum( const Vector& origin, const QAngle &angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t &frustum );
+void GeneratePerspectiveFrustum( const Vector& origin, const Vector &forward, const Vector &right, const Vector &up, float flZNear, float flZFar, float flFovX, float flFovY, Frustum_t &frustum );
+
+// Cull the world-space bounding box to the specified frustum.
+// NOTE(review): the meaning of the bool result (true == culled?) and what
+// "SkipNear" skips (presumably the FRUSTUM_NEARZ plane) are not visible in this
+// header -- confirm against the implementation.
+bool R_CullBox( const Vector& mins, const Vector& maxs, const Frustum_t &frustum );
+bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t &frustum );
+
+// 3-row by 4-column float matrix, stored as m_flMatVal[row][column].
+// The Init() helper places three axis vectors in columns 0-2 and an origin in column 3.
+struct matrix3x4_t
+{
+	// Default construction leaves all twelve elements uninitialized.
+	matrix3x4_t() {}
+
+	// Element-wise construction; argument mRC lands in row R, column C.
+	matrix3x4_t( 
+		float m00, float m01, float m02, float m03,
+		float m10, float m11, float m12, float m13,
+		float m20, float m21, float m22, float m23 )
+	{
+		float *pRow0 = m_flMatVal[0];
+		pRow0[0] = m00;
+		pRow0[1] = m01;
+		pRow0[2] = m02;
+		pRow0[3] = m03;
+
+		float *pRow1 = m_flMatVal[1];
+		pRow1[0] = m10;
+		pRow1[1] = m11;
+		pRow1[2] = m12;
+		pRow1[3] = m13;
+
+		float *pRow2 = m_flMatVal[2];
+		pRow2[0] = m20;
+		pRow2[1] = m21;
+		pRow2[2] = m22;
+		pRow2[3] = m23;
+	}
+
+	//-----------------------------------------------------------------------------
+	// Builds the matrix from column vectors: xAxis (forward), yAxis (left),
+	// zAxis (up) fill columns 0-2 and vecOrigin fills column 3.
+	//-----------------------------------------------------------------------------
+	void Init( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
+	{
+		const Vector *pColumns[4] = { &xAxis, &yAxis, &zAxis, &vecOrigin };
+		for ( int row = 0; row < 3; ++row )
+		{
+			for ( int col = 0; col < 4; ++col )
+			{
+				// Component 'row' of each source vector goes into matrix row 'row'.
+				m_flMatVal[row][col] = (*pColumns[col])[row];
+			}
+		}
+	}
+
+	//-----------------------------------------------------------------------------
+	// Constructing from axes + origin is the same as calling Init() with them.
+	//-----------------------------------------------------------------------------
+	matrix3x4_t( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin )
+	{
+		Init( xAxis, yAxis, zAxis, vecOrigin );
+	}
+
+	// Overwrites every element with VEC_T_NAN.
+	inline void Invalidate( void )
+	{
+		for ( int row = 0; row < 3; ++row )
+		{
+			float *pRow = m_flMatVal[row];
+			for ( int col = 0; col < 4; ++col )
+			{
+				pRow[col] = VEC_T_NAN;
+			}
+		}
+	}
+
+	// Row access: returns a pointer to the four floats of row i (asserted to be 0..2).
+	float *operator[]( int i )
+	{
+		Assert(( i >= 0 ) && ( i < 3 ));
+		return m_flMatVal[i];
+	}
+	const float *operator[]( int i ) const
+	{
+		Assert(( i >= 0 ) && ( i < 3 ));
+		return m_flMatVal[i];
+	}
+
+	// Pointer to element [0][0].
+	float *Base()							{ return &m_flMatVal[0][0]; }
+	const float *Base() const				{ return &m_flMatVal[0][0]; }
+
+	float m_flMatVal[3][4];
+};
+
+
+// Provide pi only when the toolchain's headers have not already defined it.
+#ifndef M_PI
+	#define M_PI		3.14159265358979323846	// matches value in gcc v2 math.h
+#endif
+
+// Single-precision pi.
+#define M_PI_F		((float)(M_PI))	// Shouldn't collide with anything.
+
+// NJS: Inlined to prevent floats from being autopromoted to doubles, as with the old system.
+// Both conversions cast the argument and the scale factor to float, so the result is a float.
+#ifndef RAD2DEG
+	#define RAD2DEG( x  )  ( (float)(x) * (float)(180.f / M_PI_F) )
+#endif
+
+#ifndef DEG2RAD
+	#define DEG2RAD( x  )  ( (float)(x) * (float)(M_PI_F / 180.f) )
+#endif
+
+// Used to represent sides of things like planes.
+#define	SIDE_FRONT	0
+#define	SIDE_BACK	1
+#define	SIDE_ON		2
+#define SIDE_CROSS  -2      // necessary for polylib.c
+
+// Tolerances used by the tools; note both literals are doubles, not floats.
+#define ON_VIS_EPSILON  0.01    // necessary for vvis (flow.c) -- again look into moving later!
+#define	EQUAL_EPSILON	0.001   // necessary for vbsp (faces.c) -- should look into moving it there?
+
+// Globals defined in the mathlib implementation file.
+extern bool s_bMathlibInitialized;
+
+extern  const Vector vec3_origin;
+extern  const QAngle vec3_angle;
+extern	const Quaternion quat_identity;
+extern const Vector vec3_invalid;
+extern	const int nanmask;
+
+// True when all bits of 'nanmask' are set in the raw bit pattern of x.
+// x must be an lvalue whose storage can be read as an int (the macro takes its
+// address). The argument is parenthesized so that compound expressions such as
+// IS_NAN( cond ? a : b ) take the address of the whole expression rather than
+// of its first operand.
+#define	IS_NAN(x) (((*(int *)&(x))&nanmask)==nanmask)
+
+// Dot product of two 3-element arrays.
+FORCEINLINE vec_t DotProduct(const vec_t *v1, const vec_t *v2)
+{
+	// Summed left to right: x term, then y term, then z term.
+	return v1[0]*v2[0]
+		 + v1[1]*v2[1]
+		 + v1[2]*v2[2];
+}
+// c = a - b, component by component over three elements. c may alias a or b.
+FORCEINLINE void VectorSubtract(const vec_t *a, const vec_t *b, vec_t *c)
+{
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		c[axis] = a[axis] - b[axis];
+	}
+}
+// c = a + b, component by component over three elements. c may alias a or b.
+FORCEINLINE void VectorAdd(const vec_t *a, const vec_t *b, vec_t *c)
+{
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		c[axis] = a[axis] + b[axis];
+	}
+}
+// Copies the three elements of a into b.
+FORCEINLINE void VectorCopy(const vec_t *a, vec_t *b)
+{
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		b[axis] = a[axis];
+	}
+}
+// Sets the three elements of a to zero.
+FORCEINLINE void VectorClear(vec_t *a)
+{
+	// Same store order as a chained assignment: last element first.
+	a[2] = 0;
+	a[1] = 0;
+	a[0] = 0;
+}
+
+// Largest of the three elements of a float array.
+FORCEINLINE float VectorMaximum(const vec_t *v)
+{
+	const vec_t largerOfYZ = max( v[1], v[2] );
+	return max( v[0], largerOfYZ );
+}
+
+// Largest of the x, y and z components of a Vector.
+FORCEINLINE float VectorMaximum(const Vector& v)
+{
+	const vec_t largerOfYZ = max( v.y, v.z );
+	return max( v.x, largerOfYZ );
+}
+
+// out = in * scale, component by component over three elements. out may alias in.
+FORCEINLINE void VectorScale (const float* in, vec_t scale, float* out)
+{
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		out[axis] = in[axis]*scale;
+	}
+}
+
+
+// Cannot be forceinline as they have overloads:
+// Sets all three elements of a to b.
+inline void VectorFill(vec_t *a, float b)
+{
+	// Stores run from the last element to the first, as in a chained assignment.
+	for ( int axis = 2; axis >= 0; --axis )
+	{
+		a[axis] = b;
+	}
+}
+
+// Flips the sign of each of the three elements of a, in place.
+inline void VectorNegate(vec_t *a)
+{
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		a[axis] = -a[axis];
+	}
+}
+
+
+//#define VectorMaximum(a)		( max( (a)[0], max( (a)[1], (a)[2] ) ) )
+// Two-element vector helpers. Each expands to a bare brace block (not
+// do { } while(0)), so `if (c) Vector2Clear(v); else ...` does not parse, and
+// each argument is evaluated more than once.
+#define Vector2Clear(x)			{(x)[0]=(x)[1]=0;}
+#define Vector2Negate(x)		{(x)[0]=-((x)[0]);(x)[1]=-((x)[1]);}
+#define Vector2Copy(a,b)		{(b)[0]=(a)[0];(b)[1]=(a)[1];}
+#define Vector2Subtract(a,b,c)	{(c)[0]=(a)[0]-(b)[0];(c)[1]=(a)[1]-(b)[1];}
+#define Vector2Add(a,b,c)		{(c)[0]=(a)[0]+(b)[0];(c)[1]=(a)[1]+(b)[1];}
+#define Vector2Scale(a,b,c)		{(c)[0]=(b)*(a)[0];(c)[1]=(b)*(a)[1];}
+
+// NJS: Some functions in VBSP still need to use these for dealing with mixing vec4's and shorts with vec_t's.
+// remove when no longer needed.
+// Type-agnostic three-element copy (A into B) and dot product; arguments are evaluated repeatedly.
+#define VECTOR_COPY( A, B ) do { (B)[0] = (A)[0]; (B)[1] = (A)[1]; (B)[2]=(A)[2]; } while(0)
+#define DOT_PRODUCT( A, B ) ( (A)[0]*(B)[0] + (A)[1]*(B)[1] + (A)[2]*(B)[2] )
+
+// Multiply-add on raw three-element arrays: dest = start + direction * scale.
+FORCEINLINE void VectorMAInline( const float* start, float scale, const float* direction, float* dest )
+{
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		dest[axis] = start[axis] + direction[axis]*scale;
+	}
+}
+
+// Multiply-add on Vectors: dest = start + direction * scale.
+FORCEINLINE void VectorMAInline( const Vector& start, float scale, const Vector& direction, Vector& dest )
+{
+	const float scaledX = direction.x*scale;
+	dest.x = start.x + scaledX;
+
+	const float scaledY = direction.y*scale;
+	dest.y = start.y + scaledY;
+
+	const float scaledZ = direction.z*scale;
+	dest.z = start.z + scaledZ;
+}
+
+// dest = start + direction * scale (Vector form); forwards to VectorMAInline.
+FORCEINLINE void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest )
+{
+	VectorMAInline( start, scale, direction, dest );
+}
+
+// dest = start + direction * scale (float-array form); forwards to VectorMAInline.
+FORCEINLINE void VectorMA( const float * start, float scale, const float *direction, float *dest )
+{
+	VectorMAInline( start, scale, direction, dest );
+}
+
+
+// Float-array comparison, implemented elsewhere.
+// NOTE(review): presumably non-zero means "equal", matching the Vector overload
+// later in this header that returns (v1 == v2) -- confirm.
+int VectorCompare (const float *v1, const float *v2);
+
+// Length of a three-element vector via FastSqrt. FLT_EPSILON is added to the
+// squared length before the root is taken, so a zero vector does not yield exactly 0.
+inline float VectorLength(const float *v)
+{
+	const float flLengthSqr = v[0]*v[0] + v[1]*v[1] + v[2]*v[2] + FLT_EPSILON;
+	return FastSqrt( flLengthSqr );
+}
+
+// Cross product of two three-element arrays, written to 'cross'. Implemented elsewhere.
+// NOTE(review): whether 'cross' may alias v1 or v2 is not visible here -- confirm before relying on it.
+void CrossProduct (const float *v1, const float *v2, float *cross);
+
+// Float-array equality test, implemented elsewhere.
+qboolean VectorsEqual( const float *v1, const float *v2 );
+
+// Rounds to the nearest whole number (halves go up) by flooring in + 0.5.
+// The result is still a vec_t, not an int.
+inline vec_t RoundInt (vec_t in)
+{
+	const vec_t shifted = in + 0.5f;
+	return floor( shifted );
+}
+
+// Integer base-2 logarithm, implemented elsewhere.
+// NOTE(review): rounding direction and behaviour for val <= 0 are not visible here -- confirm.
+int Q_log2(int val);
+
+// Math routines done in optimized assembly math package routines
+//
+// Computes the sine and cosine of 'radians' in one call and writes them through
+// 'sine' and 'cosine'. Both pointers must be valid; neither is checked.
+//
+// The implementation is chosen per platform:
+//   _X360                  - XMScalarSinCos
+//   PLATFORM_WINDOWS_PC32  - x87 fsincos via MSVC inline asm
+//   PLATFORM_WINDOWS_PC64  - C library sin()/cos()
+//   POSIX                  - x87 fsincos via GCC inline asm
+//   anything else          - C library sinf()/cosf()
+// The final fallback exists so that a build in which none of the platform
+// macros is defined still writes both outputs rather than silently returning
+// with *sine and *cosine untouched.
+void inline SinCos( float radians, float *sine, float *cosine )
+{
+#if defined( _X360 )
+	XMScalarSinCos( sine, cosine, radians );
+#elif defined( PLATFORM_WINDOWS_PC32 )
+	_asm
+	{
+		fld		DWORD PTR [radians]
+		fsincos
+
+		mov edx, DWORD PTR [cosine]
+		mov eax, DWORD PTR [sine]
+
+		fstp DWORD PTR [edx]
+		fstp DWORD PTR [eax]
+	}
+#elif defined( PLATFORM_WINDOWS_PC64 )
+	*sine = sin( radians );
+	*cosine = cos( radians );
+#elif defined( POSIX )
+	// fsincos leaves the cosine in st(0) ("=t") and the sine in st(1) ("=u").
+	double __cosr, __sinr;
+	__asm ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians));
+
+  	*sine = __sinr;
+  	*cosine = __cosr;
+#else
+	// Unknown platform: portable fallback.
+	*sine = sinf( radians );
+	*cosine = cosf( radians );
+#endif
+}
+
+// Lookup-table support for TableSin/TableCos.
+// SIN_TABLE_SIZE must stay a power of two: the lookups mask the index with (SIN_TABLE_SIZE - 1).
+#define SIN_TABLE_SIZE	256
+// 12582912 == 1.5 * 2^23. Added to a float so that the integer part of the sum
+// appears in the low bits of its bit pattern (read back through a union below).
+#define FTOIBIAS		12582912.f
+// Table contents are filled in elsewhere.
+extern float SinCosTable[SIN_TABLE_SIZE];
+
+// Table-driven cosine approximation: theta (radians) is scaled to table units,
+// offset by a quarter of the table, and used to index SinCosTable.
+inline float TableCos( float theta )
+{
+	union
+	{
+		int i;
+		float f;
+	} bits;
+
+	// ideally, the following should compile down to: theta * constant + constant, changing any of these constants from defines sometimes fubars this.
+	bits.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + ( FTOIBIAS + ( SIN_TABLE_SIZE / 4 ) );
+
+	// The low bits of the biased float's bit pattern are the wrapped table index.
+	const int tableIndex = bits.i & ( SIN_TABLE_SIZE - 1 );
+	return SinCosTable[ tableIndex ];
+}
+
+// Table-driven sine approximation: theta (radians) is scaled to table units and
+// used to index SinCosTable.
+inline float TableSin( float theta )
+{
+	union
+	{
+		int i;
+		float f;
+	} bits;
+
+	// ideally, the following should compile down to: theta * constant + constant
+	bits.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + FTOIBIAS;
+
+	// The low bits of the biased float's bit pattern are the wrapped table index.
+	const int tableIndex = bits.i & ( SIN_TABLE_SIZE - 1 );
+	return SinCosTable[ tableIndex ];
+}
+
+// Returns a multiplied by itself, for any T that supports operator*.
+template<class T>
+FORCEINLINE T Square( T const &a )
+{
+	return ( a * a );
+}
+
+
+// return the smallest power of two >= x.
+// returns 0 if x == 0 or x > 0x80000000 (ie numbers that would be negative if x was signed)
+// NOTE: the old code took an int, and if you pass in an int of 0x80000000 casted to a uint,
+//       you'll get 0x80000000, which is correct for uints, instead of 0, which was correct for ints
+FORCEINLINE uint SmallestPowerOfTwoGreaterOrEqual( uint x )
+{
+	// Step down by one so exact powers of two map to themselves.
+	x -= 1;
+
+	// Smear the highest set bit into every lower bit position (shifts of 1, 2, 4, 8, 16).
+	for ( uint shift = 1; shift <= 16; shift <<= 1 )
+	{
+		x |= x >> shift;
+	}
+
+	// All-ones below the top bit, plus one, is the next power of two.
+	return x + 1;
+}
+
+// return the largest power of two <= x. Will return 0 if passed 0
+FORCEINLINE uint LargestPowerOfTwoLessThanOrEqual( uint x )
+{
+	// Below the top bit: the smallest power of two >= x + 1 is exactly twice the answer.
+	if ( x < 0x80000000 )
+	{
+		return SmallestPowerOfTwoGreaterOrEqual( x + 1 ) >> 1;
+	}
+
+	// Top bit set: the answer is the top bit itself.
+	return 0x80000000;
+}
+
+
+// Math routines for optimizing division
+// Both are implemented elsewhere. FloorDivMod returns its two results through
+// 'quotient' and 'rem'.
+void FloorDivMod (double numer, double denom, int *quotient, int *rem);
+int GreatestCommonDivisor (int i1, int i2);
+
+// Test for FPU denormal mode
+// NOTE(review): the signature takes a value, so this presumably reports whether
+// 'val' itself is a denormal number rather than querying an FPU mode -- confirm.
+bool IsDenormal( const float &val );
+
+// MOVEMENT INFO
+// Names for the three Euler angle slots; values are 0, 1, 2 in this order.
+enum
+{
+	PITCH = 0,	// up / down
+	YAW,		// left / right
+	ROLL		// fall over
+};
+
+// Matrix/vector transform helpers operating on raw float arrays; all are
+// implemented elsewhere. The float-pointer arguments are used as three-element
+// arrays by the Vector wrappers later in this header.
+void MatrixAngles( const matrix3x4_t & matrix, float *angles ); // !!!!
+void MatrixVectors( const matrix3x4_t &matrix, Vector* pForward, Vector *pRight, Vector *pUp );
+void VectorTransform (const float *in1, const matrix3x4_t & in2, float *out);
+void VectorITransform (const float *in1, const matrix3x4_t & in2, float *out);
+void VectorRotate( const float *in1, const matrix3x4_t & in2, float *out);
+void VectorRotate( const Vector &in1, const QAngle &in2, Vector &out );
+void VectorRotate( const Vector &in1, const Quaternion &in2, Vector &out );
+void VectorIRotate( const float *in1, const matrix3x4_t & in2, float *out);
+
+// These return QAngle by value, so they are compiled out when
+// VECTOR_NO_SLOW_OPERATIONS is defined.
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+QAngle TransformAnglesToLocalSpace( const QAngle &angles, const matrix3x4_t &parentMatrix );
+QAngle TransformAnglesToWorldSpace( const QAngle &angles, const matrix3x4_t &parentMatrix );
+
+#endif
+
+// NOTE: MatrixInitialize takes the origin first, unlike matrix3x4_t::Init which takes it last.
+void MatrixInitialize( matrix3x4_t &mat, const Vector &vecOrigin, const Vector &vecXAxis, const Vector &vecYAxis, const Vector &vecZAxis );
+void MatrixCopy( const matrix3x4_t &in, matrix3x4_t &out );
+void MatrixInvert( const matrix3x4_t &in, matrix3x4_t &out );
+
+// Matrix equality test
+// flTolerance defaults to 1e-5.
+bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float flTolerance = 1e-5 );
+
+// Column access; column 3 is the translation (see MatrixGetTranslation / MatrixSetTranslation).
+void MatrixGetColumn( const matrix3x4_t &in, int column, Vector &out );
+void MatrixSetColumn( const Vector &in, int column, matrix3x4_t &out );
+
+// Reads the translation of 'in', i.e. its column 3, into 'out'.
+inline void MatrixGetTranslation( const matrix3x4_t &in, Vector &out )
+{
+	const int nTranslationColumn = 3;
+	MatrixGetColumn( in, nTranslationColumn, out );
+}
+
+// Writes 'in' as the translation of 'out', i.e. into its column 3.
+inline void MatrixSetTranslation( const Vector &in, matrix3x4_t &out )
+{
+	const int nTranslationColumn = 3;
+	MatrixSetColumn( in, nTranslationColumn, out );
+}
+
+// In-place scaling helpers, implemented elsewhere.
+void MatrixScaleBy ( const float flScale, matrix3x4_t &out );
+void MatrixScaleByZero ( matrix3x4_t &out );
+
+//void DecomposeRotation( const matrix3x4_t &mat, float *out );
+// Matrix concatenation, implemented elsewhere.
+// NOTE(review): presumably out = in1 * in2, with ConcatRotations ignoring the
+// translation column -- confirm against the implementation.
+void ConcatRotations (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out);
+void ConcatTransforms (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out);
+
+// For identical interface w/ VMatrix
+// Alias for ConcatTransforms with the same argument order.
+inline void MatrixMultiply ( const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out )
+{
+	ConcatTransforms( in1, in2, out );
+}
+
+// Quaternion helpers, all implemented elsewhere. Results are returned through
+// the trailing non-const reference parameter unless the function returns float.
+// NOTE(review): the "NoAlign" variants presumably skip the QuaternionAlign step
+// the plain variants perform -- confirm against the implementation.
+void QuaternionSlerp( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt );
+void QuaternionSlerpNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt );
+void QuaternionBlend( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt );
+void QuaternionBlendNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt );
+void QuaternionIdentityBlend( const Quaternion &p, float t, Quaternion &qt );
+float QuaternionAngleDiff( const Quaternion &p, const Quaternion &q );
+void QuaternionScale( const Quaternion &p, float t, Quaternion &q );
+void QuaternionAlign( const Quaternion &p, const Quaternion &q, Quaternion &qt );
+float QuaternionDotProduct( const Quaternion &p, const Quaternion &q );
+void QuaternionConjugate( const Quaternion &p, Quaternion &q );
+void QuaternionInvert( const Quaternion &p, Quaternion &q );
+float QuaternionNormalize( Quaternion &q );
+void QuaternionAdd( const Quaternion &p, const Quaternion &q, Quaternion &qt );
+void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt );
+// Conversions between quaternions, matrices, Euler angles (QAngle / RadianEuler),
+// axis-angle and basis vectors.
+void QuaternionMatrix( const Quaternion &q, matrix3x4_t &matrix );
+void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t &matrix );
+void QuaternionAngles( const Quaternion &q, QAngle &angles );
+void AngleQuaternion( const QAngle& angles, Quaternion &qt );
+void QuaternionAngles( const Quaternion &q, RadianEuler &angles );
+void AngleQuaternion( RadianEuler const &angles, Quaternion &qt );
+void QuaternionAxisAngle( const Quaternion &q, Vector &axis, float &angle );
+void AxisAngleQuaternion( const Vector &axis, float angle, Quaternion &q );
+void BasisToQuaternion( const Vector &vecForward, const Vector &vecRight, const Vector &vecUp, Quaternion &q );
+void MatrixQuaternion( const matrix3x4_t &mat, Quaternion &q );
+
+// A couple methods to find the dot product of a vector with a matrix row or column...
+
+// Dot product of in2 with the first three elements of row 'row' (asserted 0..2);
+// the row's fourth element is not used.
+inline float MatrixRowDotProduct( const matrix3x4_t &in1, int row, const Vector& in2 )
+{
+	Assert( (row >= 0) && (row < 3) );
+	const float *pRow = in1[row];
+	return DotProduct( pRow, in2.Base() ); 
+}
+
+// Dot product of in2 with column 'col' (asserted 0..3) of the matrix.
+inline float MatrixColumnDotProduct( const matrix3x4_t &in1, int col, const Vector& in2 )
+{
+	Assert( (col >= 0) && (col < 4) );
+	return in1[0][col] * in2[0]
+		 + in1[1][col] * in2[1]
+		 + in1[2][col] * in2[2]; 
+}
+
+// General box-versus-plane classification, implemented elsewhere. It is the
+// fallback used by BOX_ON_PLANE_SIDE for planes with type >= 3.
+// NOTE(review): presumably returns the same 1 / 2 / 3 codes as that macro -- confirm.
+int __cdecl BoxOnPlaneSide (const float *emins, const float *emaxs, const cplane_t *plane);
+
+// Wraps an angle in degrees into [0, 360) after quantizing it to 65536 steps
+// per full turn.
+inline float anglemod(float a)
+{
+	// Convert to 1/65536ths of a turn, truncate, and keep the low 16 bits.
+	const int nSteps = (int)(a*(65536.f/360.0f)) & 65535;
+
+	// Convert the wrapped step count back to degrees.
+	a = (360.f/65536) * nSteps;
+	return a;
+}
+
+// Remap a value in the range [A,B] to [C,D].
+// The mapping is linear and unclamped, so values outside [A,B] extrapolate.
+// When A == B the input range is empty: returns D if val >= B, otherwise C.
+inline float RemapVal( float val, float A, float B, float C, float D)
+{
+	if ( A == B )
+	{
+		if ( val >= B )
+			return D;
+		return C;
+	}
+
+	return C + (D - C) * (val - A) / (B - A);
+}
+
+// Like RemapVal, but the interpolation fraction is clamped to [0,1], so the
+// result never leaves the [C,D] range.
+// When A == B the input range is empty: returns D if val >= B, otherwise C.
+inline float RemapValClamped( float val, float A, float B, float C, float D)
+{
+	if ( A == B )
+	{
+		if ( val >= B )
+			return D;
+		return C;
+	}
+
+	float flFraction = (val - A) / (B - A);
+	flFraction = clamp( flFraction, 0.0f, 1.0f );
+	return C + (D - C) * flFraction;
+}
+
+// Returns A + (B-A)*flPercent.
+// float Lerp( float flPercent, float A, float B );
+// Generic over any T that supports subtraction, scaling by a float and addition.
+// flPercent is not clamped.
+template <class T>
+FORCEINLINE T Lerp( float flPercent, T const &A, T const &B )
+{
+	return A + ( (B - A) * flPercent );
+}
+
+// Returns f squared.
+FORCEINLINE float Sqr( float f )
+{
+	return ( f * f );
+}
+
+// 5-argument floating point linear interpolation.
+// FLerp(f1,f2,i1,i2,x)=
+//    f1 at x=i1
+//    f2 at x=i2
+//   smooth lerp between f1 and f2 at x>i1 and x<i2
+//   extrapolation for x<i1 or x>i2
+//
+//   If you know a function f(x)'s value (f1) at position i1, and its value (f2) at position i2,
+//   the function can be linearly interpolated with FLerp(f1,f2,i1,i2,x)
+//    i2=i1 will cause a divide by zero.
+static inline float FLerp(float f1, float f2, float i1, float i2, float x)
+{
+	// Value span times distance travelled, divided by the width of the interval.
+	return f1 + (f2 - f1) * (x - i1) / (i2 - i1);
+}
+
+
+// Exactly one of the two specializations below is compiled, selected by
+// VECTOR_NO_SLOW_OPERATIONS.
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// YWB:  Specialization for interpolating euler angles via quaternions...
+// Both angle sets are converted to quaternions, blended with QuaternionSlerp,
+// and the result converted back, rather than lerping each angle independently.
+template<> FORCEINLINE QAngle Lerp<QAngle>( float flPercent, const QAngle& q1, const QAngle& q2 )
+{
+	// Avoid precision errors
+	// (identical endpoints are returned untouched, skipping the quaternion round trip)
+	if ( q1 == q2 )
+		return q1;
+
+	Quaternion src, dest;
+
+	// Convert to quaternions
+	AngleQuaternion( q1, src );
+	AngleQuaternion( q2, dest );
+
+	Quaternion result;
+
+	// Slerp
+	QuaternionSlerp( src, dest, flPercent, result );
+
+	// Convert to euler
+	QAngle output;
+	QuaternionAngles( result, output );
+	return output;
+}
+
+#else
+
+// NOTE(review): "#pragma error" is not a standard directive; presumably it is
+// meant to flag this untested branch at build time -- confirm how the supported
+// compilers treat it.
+#pragma error
+
+// NOTE NOTE: I haven't tested this!! It may not work! Check out interpolatedvar.cpp in the client dll to try it
+// Same algorithm as the QAngle specialization above, for QAngleByValue.
+template<> FORCEINLINE QAngleByValue Lerp<QAngleByValue>( float flPercent, const QAngleByValue& q1, const QAngleByValue& q2 )
+{
+	// Avoid precision errors
+	if ( q1 == q2 )
+		return q1;
+
+	Quaternion src, dest;
+
+	// Convert to quaternions
+	AngleQuaternion( q1, src );
+	AngleQuaternion( q2, dest );
+
+	Quaternion result;
+
+	// Slerp
+	QuaternionSlerp( src, dest, flPercent, result );
+
+	// Convert to euler
+	QAngleByValue output;
+	QuaternionAngles( result, output );
+	return output;
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+
+/// Same as swap(), but won't cause problems with std::swap
+/// Exchanges x and y through one temporary copy and two assignments.
+template <class T> 
+FORCEINLINE void V_swap( T& x, T& y )
+{
+	T held = x;
+
+	x = y;
+	y = held;
+}
+
+// Mean of a and b, computed as (a+b)/2 in type T. For integral T the division
+// truncates and the intermediate sum can overflow.
+template <class T>
+FORCEINLINE T AVG(T a, T b)
+{
+	return ( a + b ) / 2;
+}
+
+// number of elements in an array of static size
+// (forwards to the ARRAYSIZE macro, which is defined elsewhere)
+#define NELEMS(x) ARRAYSIZE(x)
+
+// XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myvector));
+// Expands to three comma-separated expressions, so v is evaluated three times.
+#define XYZ(v) (v).x,(v).y,(v).z
+
+
+// Returns -1 for values strictly below zero and +1 for everything else
+// (including zero and negative zero).
+inline float Sign( float x )
+{
+	if ( x < 0.0f )
+	{
+		return -1.0f;
+	}
+	return 1.0f;
+}
+
+//
+// Clamps the input integer to the given array bounds.
+// Equivalent to the following, but without using any branches:
+//
+// if( n < 0 ) return 0;
+// else if ( n > maxindex ) return maxindex;
+// else return n;
+//
+// This is not always a clear performance win, but when you have situations where a clamped 
+// value is thrashing against a boundary this is a big win. (ie, valid, invalid, valid, invalid, ...)
+//
+// Note: This code has been run against all possible integers.
+//
+inline int ClampArrayBounds( int n, unsigned maxindex )
+{
+	const unsigned int kAllBits = 0xFFFFFFFF;
+
+	// All ones when 0 <= n <= maxindex, zero otherwise. A negative n reinterpreted
+	// as unsigned is a huge value, so it also counts as out of range. Adding 1 to
+	// all-ones wraps to zero.
+	const unsigned int inRangeMask = kAllBits + ( ((unsigned) n) > maxindex );
+
+	// All ones when n < 0, zero when n >= 0.
+	const unsigned int negativeMask = kAllBits + ( n >= 0 );
+
+	// All ones only when n is out of range on the high side.
+	const unsigned int tooHighMask = (~inRangeMask) & (~negativeMask);
+
+	// In range: n passes through. Too high: maxindex. Negative: both terms are zero.
+	int result = (inRangeMask & n);
+	result |= tooHighMask & maxindex;
+
+	return result;
+}
+
+
+// Classifies the box [emins, emaxs] against plane p.
+// For axial planes (type < 3) only the box extent along that axis is compared
+// with the plane distance:
+//   1 - plane dist <= emins[type]
+//   2 - plane dist >= emaxs[type]
+//   3 - otherwise (the plane lies strictly between the two extents)
+// Any other plane type is handed to BoxOnPlaneSide().
+// All three arguments are evaluated more than once.
+#define BOX_ON_PLANE_SIDE(emins, emaxs, p)	\
+	(((p)->type < 3)?						\
+	(										\
+		((p)->dist <= (emins)[(p)->type])?	\
+			1								\
+		:									\
+		(									\
+			((p)->dist >= (emaxs)[(p)->type])?\
+				2							\
+			:								\
+				3							\
+		)									\
+	)										\
+	:										\
+		BoxOnPlaneSide( (emins), (emaxs), (p)))
+
+//-----------------------------------------------------------------------------
+// FIXME: Vector versions.... the float versions will go away hopefully soon!
+//-----------------------------------------------------------------------------
+
+// Euler angles -> basis vectors, implemented elsewhere.
+void AngleVectors (const QAngle& angles, Vector *forward);
+void AngleVectors (const QAngle& angles, Vector *forward, Vector *right, Vector *up);
+void AngleVectorsTranspose (const QAngle& angles, Vector *forward, Vector *right, Vector *up);
+// Euler angles (degrees as QAngle, or RadianEuler) -> matrix, optionally with a position.
+void AngleMatrix (const QAngle &angles, matrix3x4_t &mat );
+void AngleMatrix( const QAngle &angles, const Vector &position, matrix3x4_t &mat );
+void AngleMatrix (const RadianEuler &angles, matrix3x4_t &mat );
+void AngleMatrix( RadianEuler const &angles, const Vector &position, matrix3x4_t &mat );
+// NOTE(review): "IMatrix" presumably builds the inverse of the matching AngleMatrix -- confirm.
+void AngleIMatrix (const QAngle &angles, matrix3x4_t &mat );
+void AngleIMatrix (const QAngle &angles, const Vector &position, matrix3x4_t &mat );
+void AngleIMatrix (const RadianEuler &angles, matrix3x4_t &mat );
+// Direction vector(s) -> angles / matrix / perpendicular vectors.
+void VectorAngles( const Vector &forward, QAngle &angles );
+void VectorAngles( const Vector &forward, const Vector &pseudoup, QAngle &angles );
+void VectorMatrix( const Vector &forward, matrix3x4_t &mat );
+void VectorVectors( const Vector &forward, Vector &right, Vector &up );
+// Matrix builders.
+void SetIdentityMatrix( matrix3x4_t &mat );
+void SetScaleMatrix( float x, float y, float z, matrix3x4_t &dst );
+void MatrixBuildRotationAboutAxis( const Vector &vAxisOfRot, float angleDegrees, matrix3x4_t &dst );
+
+// Uniform scale: the same factor is passed for x, y and z.
+inline void SetScaleMatrix( float flScale, matrix3x4_t &dst )
+{
+	SetScaleMatrix( flScale, flScale, flScale, dst );
+}
+
+// Per-axis scale taken from the components of a Vector.
+inline void SetScaleMatrix( const Vector& scale, matrix3x4_t &dst )
+{
+	SetScaleMatrix( scale.x, scale.y, scale.z, dst );
+}
+
+// Transpose helpers, implemented elsewhere: in place, or from src into dst.
+void MatrixTranspose( matrix3x4_t& mat );
+void MatrixTranspose( const matrix3x4_t& src, matrix3x4_t& dst );
+// Computes the inverse transpose
+void MatrixInverseTranspose( const matrix3x4_t& src, matrix3x4_t& dst );
+
+// Stores 'position' as the translation (column 3) of 'mat'; other columns are untouched.
+inline void PositionMatrix( const Vector &position, matrix3x4_t &mat )
+{
+	const int nTranslationColumn = 3;
+	MatrixSetColumn( position, nTranslationColumn, mat );
+}
+
+// Reads the translation (column 3) of 'matrix' into 'position'.
+inline void MatrixPosition( const matrix3x4_t &matrix, Vector &position )
+{
+	const int nTranslationColumn = 3;
+	MatrixGetColumn( matrix, nTranslationColumn, position );
+}
+
+// Vector wrapper over the float-array VectorRotate: passes the address of each
+// vector's x member as the start of its three components.
+inline void VectorRotate( const Vector& in1, const matrix3x4_t &in2, Vector &out)
+{
+	const float *pIn = &in1.x;
+	float *pOut = &out.x;
+	VectorRotate( pIn, in2, pOut );
+}
+
+// Vector wrapper over the float-array VectorIRotate.
+inline void VectorIRotate( const Vector& in1, const matrix3x4_t &in2, Vector &out)
+{
+	const float *pIn = &in1.x;
+	float *pOut = &out.x;
+	VectorIRotate( pIn, in2, pOut );
+}
+
+// QAngle wrapper over the float-array MatrixAngles: the three angles are
+// written starting at angles.x.
+inline void MatrixAngles( const matrix3x4_t &matrix, QAngle &angles )
+{
+	float *pAngles = &angles.x;
+	MatrixAngles( matrix, pAngles );
+}
+
+// Extracts both the angles and the translation (column 3) of the matrix.
+inline void MatrixAngles( const matrix3x4_t &matrix, QAngle &angles, Vector &position )
+{
+	MatrixAngles( matrix, angles );
+	MatrixPosition( matrix, position );
+}
+
+// RadianEuler variant: the float-array MatrixAngles first writes three values
+// starting at angles.x; those are then converted with DEG2RAD and reordered so
+// that the new (x, y, z) are the old (z, x, y).
+inline void MatrixAngles( const matrix3x4_t &matrix, RadianEuler &angles )
+{
+	float *pAngles = &angles.x;
+	MatrixAngles( matrix, pAngles );
+
+	angles.Init(
+		DEG2RAD( angles.z ),
+		DEG2RAD( angles.x ),
+		DEG2RAD( angles.y ) );
+}
+
+// Decompose a matrix into an orientation (RadianEuler or Quaternion) plus a
+// position. Implemented elsewhere.
+void MatrixAngles( const matrix3x4_t &mat, RadianEuler &angles, Vector &position );
+
+void MatrixAngles( const matrix3x4_t &mat, Quaternion &q, Vector &position );
+
+// Returns the result of Vector's operator== converted to int
+// (1 when it reports equality, 0 otherwise).
+inline int VectorCompare (const Vector& v1, const Vector& v2)
+{
+	return ( v1 == v2 );
+}
+
+// Vector wrapper over the float-array VectorTransform: passes the address of
+// each vector's x member as the start of its three components.
+inline void VectorTransform (const Vector& in1, const matrix3x4_t &in2, Vector &out)
+{
+	const float *pIn = &in1.x;
+	float *pOut = &out.x;
+	VectorTransform( pIn, in2, pOut );
+}
+
+// Vector wrapper over the float-array VectorITransform.
+inline void VectorITransform (const Vector& in1, const matrix3x4_t &in2, Vector &out)
+{
+	const float *pIn = &in1.x;
+	float *pOut = &out.x;
+	VectorITransform( pIn, in2, pOut );
+}
+
+/*
+inline void DecomposeRotation( const matrix3x4_t &mat, Vector &out )
+{
+	DecomposeRotation( mat, &out.x );
+}
+*/
+
+// Vector wrapper over the float-array BoxOnPlaneSide; returns its result unchanged.
+inline int BoxOnPlaneSide (const Vector& emins, const Vector& emaxs, const cplane_t *plane )
+{
+	const float *pMins = &emins.x;
+	const float *pMaxs = &emaxs.x;
+	return BoxOnPlaneSide( pMins, pMaxs, plane );
+}
+
+// Sets all three components of a to b.
+inline void VectorFill(Vector& a, float b)
+{
+	// Stores run from the last component to the first, as in a chained assignment.
+	for ( int axis = 2; axis >= 0; --axis )
+	{
+		a[axis] = b;
+	}
+}
+
+// Flips the sign of each component of a, in place.
+inline void VectorNegate(Vector& a)
+{
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		a[axis] = -a[axis];
+	}
+}
+
+// Arithmetic mean of the three components of a.
+// The vector is only read, so it is taken by const reference; this lets the
+// function be called on const Vectors and temporaries while existing callers
+// passing non-const Vectors keep working.
+inline vec_t VectorAvg(const Vector& a)
+{
+	return ( a[0] + a[1] + a[2] ) / 3;
+}
+
+//-----------------------------------------------------------------------------
+// Box/plane test (slow version)
+//
+// Returns a bit mask: bit 0 (value 1) is set when the box corner furthest along
+// the plane normal is at least 'tolerance' in front of the plane; bit 1
+// (value 2) is set when the opposite corner is more than 'tolerance' behind it.
+//-----------------------------------------------------------------------------
+inline int FASTCALL BoxOnPlaneSide2 (const Vector& emins, const Vector& emaxs, const cplane_t *p, float tolerance = 0.f )
+{
+	// corners[0]: per axis, the extent that maximizes the dot product with the normal.
+	// corners[1]: the diagonally opposite corner.
+	Vector	corners[2];
+
+	for ( int axis = 0; axis < 3; ++axis )
+	{
+		if ( p->normal[axis] < 0 )
+		{
+			corners[0][axis] = emins[axis];
+			corners[1][axis] = emaxs[axis];
+		}
+		else
+		{
+			corners[1][axis] = emins[axis];
+			corners[0][axis] = emaxs[axis];
+		}
+	}
+
+	int sides = 0;
+
+	// Signed distance of the furthest-forward corner.
+	float flFrontDist = DotProduct (p->normal, corners[0]) - p->dist;
+	if ( flFrontDist >= tolerance )
+	{
+		sides = 1;
+	}
+
+	// Signed distance of the furthest-back corner.
+	float flBackDist = DotProduct (p->normal, corners[1]) - p->dist;
+	if ( flBackDist < -tolerance )
+	{
+		sides |= 2;
+	}
+
+	return sides;
+}
+
+//-----------------------------------------------------------------------------
+// Helpers for bounding box construction
+//-----------------------------------------------------------------------------
+
+// Both are implemented elsewhere.
+// NOTE(review): presumably ClearBounds resets mins/maxs to an "empty" box that
+// AddPointToBounds then grows to include v -- confirm.
+void ClearBounds (Vector& mins, Vector& maxs);
+void AddPointToBounds (const Vector& v, Vector& mins, Vector& maxs);
+
+//
+// COLORSPACE/GAMMA CONVERSION STUFF
+//
+// Implemented elsewhere.
+void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright );
+
+// convert texture to linear 0..1 value
+// Multiplies c by the power2_n table entry for 'exponent'. The exponent is
+// asserted to lie in [-128, 127] and is offset by 128 to form the table index.
+inline float TexLightToLinear( int c, int exponent )
+{
+	extern float power2_n[256]; 
+	Assert( exponent >= -128 && exponent <= 127 );
+
+	const int nTableIndex = exponent+128;
+	return ( float )c * power2_n[nTableIndex];
+}
+
+
+// convert a linear value to texture space (the inverse direction of TextureToLinear below)
+// NOTE(review): input/output ranges are not visible here; presumably 0..1 in and 0..255 out -- confirm.
+int LinearToTexture( float f );
+// converts 0..1 linear value to screen gamma (0..255)
+int LinearToScreenGamma( float f );
+// convert a texture-space value to linear
+float TextureToLinear( int c );
+
+// compressed color format 
+// Three unsigned byte channels followed by a signed 8-bit exponent.
+// NOTE(review): presumably the exponent is shared by all three channels and is
+// the value TexLightToLinear takes as 'exponent' -- confirm.
+struct ColorRGBExp32
+{
+	byte r, g, b;
+	signed char exponent;
+};
+
+// Conversions between the compressed format and a float Vector; implemented elsewhere.
+void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector& out );
+void VectorToColorRGBExp32( const Vector& v, ColorRGBExp32 &c );
+
+// solve for "x" where "a x^2 + b x + c = 0", return true if solution exists
+// The roots are returned through root1 and root2.
+bool SolveQuadratic( float a, float b, float c, float &root1, float &root2 );
+
+// solves for "a, b, c" where "a x^2 + b x + c = y", return true if solution exists
+// The curve is fitted through the three points (x1,y1), (x2,y2), (x3,y3).
+bool SolveInverseQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c );
+
+// solves for a,b,c specified as above, except that it always creates a monotonically increasing or
+// decreasing curve if the data is monotonically increasing or decreasing. In order to enforce the
+// monotonicity condition, it is possible that the resulting quadratic will only approximate the data
+// instead of interpolating it. This code is not especially fast.
+bool SolveInverseQuadraticMonotonic( float x1, float y1, float x2, float y2, 
+									 float x3, float y3, float &a, float &b, float &c );
+
+
+
+
+// solves for "a, b, c" where "1/(a x^2 + b x + c ) = y", return true if solution exists
+bool SolveInverseReciprocalQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c );
+
+// rotate a vector around the Z axis (YAW)
+// NOTE(review): the unit of flYaw is not stated here; presumably degrees, like QAngle -- confirm.
+void VectorYawRotate( const Vector& in, float flYaw, Vector &out);
+
+
+// Bias takes an X value between 0 and 1 and returns another value between 0 and 1
+// The curve is biased towards 0 or 1 based on biasAmt, which is between 0 and 1.
+// Lower values of biasAmt bias the curve towards 0 and higher values bias it towards 1.
+//
+// For example, with biasAmt = 0.2, the curve looks like this:
+//
+// 1
+// |				  *
+// |				  *
+// |			     *
+// |			   **
+// |			 **
+// |	  	 ****
+// |*********
+// |___________________
+// 0                   1
+//
+//
+// With biasAmt = 0.8, the curve looks like this:
+//
+// 1
+// | 	**************
+// |  **
+// | * 
+// | *
+// |* 
+// |* 
+// |*  
+// |___________________
+// 0                   1
+//
+// With a biasAmt of 0.5, Bias returns X.
+// Implemented elsewhere. Behaviour for x or biasAmt outside [0,1] is not specified in this header.
+float Bias( float x, float biasAmt );
+
+
+// Gain is similar to Bias, but biasAmt biases towards or away from 0.5.
+// Lower bias values bias towards 0.5 and higher bias values bias away from it.
+//
+// For example, with biasAmt = 0.2, the curve looks like this:
+//
+// 1
+// | 				  *
+// | 				 *
+// | 				**
+// |  ***************
+// | **
+// | *
+// |*
+// |___________________
+// 0                   1
+//
+//
+// With biasAmt = 0.8, the curve looks like this:
+//
+// 1
+// |  		    *****
+// |  		 ***
+// |  		*
+// | 		*
+// | 		*
+// |   	 ***
+// |*****
+// |___________________
+// 0                   1
+// Implemented elsewhere. Behaviour for x or biasAmt outside [0,1] is not specified in this header.
+float Gain( float x, float biasAmt );
+
+
+// SmoothCurve maps a 0-1 value into another 0-1 value based on a cosine wave
+// where the derivatives of the function at 0 and 1 (and 0.5) are 0. This is useful for
+// any fadein/fadeout effect where it should start and end smoothly.
+//
+// The curve looks like this:
+//
+// 1
+// |  		**
+// | 	   *  *
+// | 	  *	   *
+// | 	  *	   *
+// | 	 *		*
+// |   **		 **
+// |***			   ***
+// |___________________
+// 0                   1
+//
+// Implemented elsewhere. Behaviour for x outside [0,1] is not specified in this header.
+float SmoothCurve( float x );
+
+
+// This works like SmoothCurve, with two changes:
+//
+// 1. Instead of the curve peaking at 0.5, it will peak at flPeakPos.
+//    (So if you specify flPeakPos=0.2, then the peak will slide to the left).
+//
+// 2. flPeakSharpness is a 0-1 value controlling the sharpness of the peak.
+//    Low values blunt the peak and high values sharpen the peak.
+// Implemented elsewhere. Both tuning parameters default to 0.5.
+float SmoothCurve_Tweak( float x, float flPeakPos=0.5, float flPeakSharpness=0.5 );
+
+
+//float ExponentialDecay( float halflife, float dt );
+//float ExponentialDecay( float decayTo, float decayTime, float dt );
+
+// halflife is time for value to reach 50%
+// Returns the fraction of a value that remains after dt: 0.5 ^ (dt / halflife).
+inline float ExponentialDecay( float halflife, float dt )
+{
+	// log(0.5) == -0.69314718055994530941723212145818
+	const float flExponent = -0.69314718f / halflife * dt;
+	return expf( flExponent );
+}
+
+// decayTo is factor the value should decay to in decayTime
+// Returns the fraction that remains after dt: decayTo ^ (dt / decayTime).
+inline float ExponentialDecay( float decayTo, float decayTime, float dt )
+{
+	const float flExponent = logf( decayTo ) / decayTime * dt;
+	return expf( flExponent );
+}
+
+// Get the integrated distance traveled
+// decayTo is factor the value should decay to in decayTime
+// dt is the time relative to the last velocity update
+// Evaluates decayTime * (decayTo^(dt/decayTime) - 1) / ln(decayTo).
+inline float ExponentialDecayIntegral( float decayTo, float decayTime, float dt  )
+{
+	const float flNumerator = powf( decayTo, dt / decayTime) * decayTime - decayTime;
+	return flNumerator / logf( decayTo );
+}
+
+// hermite basis function for smooth interpolation
+// Similar to Gain() above, but very cheap to call
+// value should be between 0 & 1 inclusive
+// Evaluates 3*v^2 - 2*v^3.
+inline float SimpleSpline( float value )
+{
+	const float flSquared = value * value;
+
+	// Nice little ease-in, ease-out spline-like curve
+	return ( (3 * flSquared) - (2 * flSquared * value) );
+}
+
+// Remaps val from [A,B] to [C,D], passing the linear fraction through
+// SimpleSpline first. The fraction is not clamped.
+// When A == B the input range is empty: returns D if val >= B, otherwise C.
+inline float SimpleSplineRemapVal( float val, float A, float B, float C, float D)
+{
+	if ( A == B )
+	{
+		if ( val >= B )
+			return D;
+		return C;
+	}
+
+	float flFraction = (val - A) / (B - A);
+	return C + (D - C) * SimpleSpline( flFraction );
+}
+
+// Remaps val from [A,B] to [C,D], clamping the linear fraction to [0,1] and
+// then passing it through SimpleSpline.
+// When A == B the input range is empty: returns D if val >= B, otherwise C.
+inline float SimpleSplineRemapValClamped( float val, float A, float B, float C, float D )
+{
+	if ( A == B )
+	{
+		if ( val >= B )
+			return D;
+		return C;
+	}
+
+	float flFraction = (val - A) / (B - A);
+	flFraction = clamp( flFraction, 0.0f, 1.0f );
+	return C + (D - C) * SimpleSpline( flFraction );
+}
+
+// Converts a float to an int using the hardware's float-to-int conversion.
+// NOTE(review): on x86 _mm_cvtss_si32 rounds according to the current MXCSR
+// rounding mode (round-to-nearest-even unless changed) -- confirm callers do
+// not expect round-half-up.
+FORCEINLINE int RoundFloatToInt(float f)
+{
+#if defined(__i386__) || defined(_M_IX86) || defined( PLATFORM_WINDOWS_PC64 ) || defined(__x86_64__)
+	// Load f into an SSE register and convert it with a single instruction.
+	return _mm_cvtss_si32(_mm_load_ss(&f));
+#elif defined( _X360 )
+#ifdef Assert
+	Assert( IsFPUControlWordSet() );
+#endif
+	// __fctiw leaves the integer in a double-sized slot; the result is read
+	// back from the second int of the union.
+	union
+	{
+		double flResult;
+		int pResult[2];
+	};
+	flResult = __fctiw( f );
+	return pResult[1];
+#else
+#error Unknown architecture
+#endif
+}
+
+// Rounds f with RoundFloatToInt and narrows the result to an unsigned char.
+// When Assert is available, the rounded value is asserted to fit in 0..255;
+// otherwise out-of-range values are silently truncated by the cast.
+FORCEINLINE unsigned char RoundFloatToByte(float f)
+{
+	const int nRounded = RoundFloatToInt(f);
+#ifdef Assert
+	Assert( (nRounded & ~0xFF) == 0 );
+#endif
+	return (unsigned char) nRounded;
+}
+
+// Rounds f to an integer and returns it as an unsigned long. The implementation
+// is selected per platform; see the notes on each branch.
+FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f)
+{
+#if defined( _X360 )
+#ifdef Assert
+	Assert( IsFPUControlWordSet() );
+#endif
+	// __fctiw leaves the integer in a double-sized slot; the union lets the
+	// second word be read both as signed (for the assert) and unsigned.
+	union
+	{
+		double flResult;
+		int pIntResult[2];
+		unsigned long pResult[2];
+	};
+	flResult = __fctiw( f );
+	Assert( pIntResult[1] >= 0 );
+	return pResult[1];
+#else  // !X360
+	
+#if defined( PLATFORM_WINDOWS_PC64 )
+	// Truncate, then round half to even by hand: an odd truncated value rounds
+	// up on a fraction >= 0.5, an even one only on a fraction > 0.5.
+	uint nRet = ( uint ) f;
+	if ( nRet & 1 )
+	{
+		if ( ( f - floor( f ) >= 0.5 ) )
+		{
+			nRet++;
+		}
+	}
+	else
+	{
+		if ( ( f - floor( f ) > 0.5 ) )
+		{
+			nRet++;
+		}
+	}
+	return nRet;
+#else // PLATFORM_WINDOWS_PC64
+	// x87 path: store the rounded integer into a byte buffer and reinterpret it.
+	unsigned char nResult[8];
+
+	#if defined( _WIN32 )
+		// Stores a full 64-bit integer (qword) into nResult.
+		__asm
+		{
+			fld f
+			fistp       qword ptr nResult
+		}
+	#elif POSIX
+		// NOTE(review): "fistpl" stores a 32-bit integer, so only the first four
+		// bytes of nResult are written here. If unsigned long is 8 bytes on the
+		// target, the return below also reads four bytes that were never
+		// written -- confirm, and consider a 64-bit store to match the _WIN32 path.
+		__asm __volatile__ (
+			"fistpl %0;": "=m" (nResult): "t" (f) : "st"
+		);
+	#endif
+	// NOTE(review): if neither _WIN32 nor POSIX is defined, nResult is returned uninitialized.
+
+		return *((unsigned long*)nResult);
+#endif // PLATFORM_WINDOWS_PC64
+#endif // !X360
+}
+
+// True when flValue lies within flTolerance (default 0.001) of the integer
+// that RoundFloatToInt produces for it.
+FORCEINLINE bool IsIntegralValue( float flValue, float flTolerance = 0.001f )
+{
+	const int nNearest = RoundFloatToInt( flValue );
+	return fabs( nNearest - flValue ) < flTolerance;
+}
+
+// Fast, accurate ftol:
+// Converts a float to an int. On non-X360 builds this is a plain C cast, which
+// truncates toward zero.
+FORCEINLINE int Float2Int( float a )
+{
+#if defined( _X360 )
+	// __fctiwz leaves the integer in a double-sized slot; the result is read
+	// back from the second int of the union.
+	union
+	{
+		double flResult;
+		int pResult[2];
+	};
+	flResult = __fctiwz( a );
+	return pResult[1];
+#else  // !X360
+	// Rely on compiler to generate CVTTSS2SI on x86
+	return (int) a;
+#endif
+}
+
+// Fast stand-in for (int)floor(value); the SSE path avoids the library call.
+inline int Floor2Int( float a )
+{
+#if defined( __i386__ )
+	// Round to int, convert back to float, and step down by one if rounding overshot the input.
+	const __m128 vInput = _mm_set_ss(a);
+	const int nRounded = _mm_cvtss_si32(vInput);
+	const __m128 vRoundTrip = _mm_cvt_si2ss(_mm_setzero_ps(), nRounded);
+	return nRounded - _mm_comigt_ss( vRoundTrip, vInput );
+#else
+	return static_cast<int>( floor(a) );
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Fast color conversion from float to unsigned char
+// Scales c by 255 and returns the integer result.
+// NOTE: the two paths are not bit-identical. The i386 path masks the result to
+// the low 8 bits and gets its integer from a float add (so it is rounded by the
+// FPU), while the fallback path truncates via Float2Int() and does not mask.
+//-----------------------------------------------------------------------------
+FORCEINLINE unsigned int FastFToC( float c )
+{
+#if defined( __i386__ )
+	// IEEE float bit manipulation works for values between [0, 1<<23)
+	// Adding 2^23 leaves the integer part of (c*255) in the low mantissa bits,
+	// which are then read back through the union's int member.
+	union { float f; int i; } convert = { c*255.0f + (float)(1<<23) };
+	return convert.i & 255;
+#else
+	// consoles CPUs suffer from load-hit-store penalty
+	return Float2Int( c * 255.0f );
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Fast conversion from float to integer with magnitude less than 2**22
+// NOTE: the i386 path gets its integer from a float add (rounded by the FPU),
+// whereas the fallback path truncates via Float2Int().
+//-----------------------------------------------------------------------------
+FORCEINLINE int FastFloatToSmallInt( float c )
+{
+#if defined( __i386__ )
+	// IEEE float bit manipulation works for values between [-1<<22, 1<<22)
+	// Adding 3<<22 biases the value so that the 23 mantissa bits hold
+	// (integer + (1<<22)); masking the mantissa and subtracting 1<<22 removes the bias.
+	union { float f; int i; } convert = { c + (float)(3<<22) };
+	return (convert.i & ((1<<23)-1)) - (1<<22);
+#else
+	// consoles CPUs suffer from load-hit-store penalty
+	return Float2Int( c );
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Purpose: Snap the input float to the nearest .001 (millisecond) boundary
+// Input  : in - value to quantize
+// Output : the value rebuilt from a whole number of milliseconds
+//-----------------------------------------------------------------------------
+inline float ClampToMsec( float in )
+{
+	// Adding half a millisecond before flooring rounds to the nearest whole msec.
+	const int nWholeMsec = Floor2Int( in * 1000.0f + 0.5f );
+	return nWholeMsec * 0.001f;
+}
+
+// Fast stand-in for (int)ceil(value); the SSE path avoids the library call.
+inline int Ceil2Int( float a )
+{
+#if defined( __i386__ )
+	// Round to int, convert back to float, and step up by one if rounding fell short of the input.
+	const __m128 vInput = _mm_load_ss(&a);
+	const int nRounded = _mm_cvtss_si32(vInput);
+	const __m128 vRoundTrip = _mm_cvt_si2ss(_mm_setzero_ps(), nRounded);
+	return nRounded + _mm_comilt_ss( vRoundTrip, vInput );
+#else
+	return static_cast<int>( ceil(a) );
+#endif
+}
+
+
+// Regular signed area of triangle [A,B,C]. A, B and C may be any expressions
+// yielding an object with .x and .y members; each argument is parenthesized so
+// that expressions such as *pPoint or verts[i] expand correctly.
+// NOTE: arguments are evaluated more than once, so avoid side effects.
+#define TriArea2D( A, B, C ) \
+	( 0.5f * ( ( (B).x - (A).x ) * ( (C).y - (A).y ) - ( (B).y - (A).y ) * ( (C).x - (A).x ) ) )
+
+// This version doesn't premultiply by 0.5f, so it's the signed area of the
+// parallelogram spanned by (B-A) and (C-A) instead.
+#define TriArea2DTimesTwo( A, B, C ) \
+	( ( ( (B).x - (A).x ) * ( (C).y - (A).y ) - ( (B).y - (A).y ) * ( (C).x - (A).x ) ) )
+
+
+// Computes the barycentric coordinates of "pt" with respect to triangle [A,B,C]
+// and stores them in bcCoords[0], bcCoords[1] and bcCoords[2].
+inline void GetBarycentricCoords2D( 
+	Vector2D const &A,
+	Vector2D const &B,
+	Vector2D const &C,
+	Vector2D const &pt,
+	float bcCoords[3] )
+{
+	// Both the numerators and the denominator are doubled areas, so the factor
+	// of two cancels out of every ratio below.
+	const float flInvDoubledArea = 1.0f / TriArea2DTimesTwo( A, B, C );
+
+	// NOTE: The lightmap coordinate vertices are assumed to wind counterclockwise.
+	// With the opposite winding every doubled area changes sign, which cancels too.
+	bcCoords[0] = TriArea2DTimesTwo( B, C, pt ) * flInvDoubledArea;
+	bcCoords[1] = TriArea2DTimesTwo( C, A, pt ) * flInvDoubledArea;
+	bcCoords[2] = TriArea2DTimesTwo( A, B, pt ) * flInvDoubledArea;
+}
+
+
+// Returns true if the sphere might touch the box. The sphere is treated as its
+// own bounding cube, so this can return true when only a corner of that cube
+// overlaps the box while the sphere itself does not.
+inline bool QuickBoxSphereTest( 
+	const Vector& vOrigin,
+	float flRadius,
+	const Vector& bbMin,
+	const Vector& bbMax )
+{
+	// Reject as soon as the two extents fail to overlap on any one axis.
+	if ( !( vOrigin.x - flRadius < bbMax.x && vOrigin.x + flRadius > bbMin.x ) )
+		return false;
+
+	if ( !( vOrigin.y - flRadius < bbMax.y && vOrigin.y + flRadius > bbMin.y ) )
+		return false;
+
+	return vOrigin.z - flRadius < bbMax.z && vOrigin.z + flRadius > bbMin.z;
+}
+
+
+// Returns true if the two boxes overlap. Boxes that merely touch along a face,
+// edge or corner do not count, because every comparison is strict.
+inline bool QuickBoxIntersectTest( 
+	const Vector& vBox1Min,
+	const Vector& vBox1Max,
+	const Vector& vBox2Min,
+	const Vector& vBox2Max )
+{
+	// Reject as soon as the two extents fail to overlap on any one axis.
+	if ( !( vBox1Min.x < vBox2Max.x && vBox1Max.x > vBox2Min.x ) )
+		return false;
+
+	if ( !( vBox1Min.y < vBox2Max.y && vBox1Max.y > vBox2Min.y ) )
+		return false;
+
+	return vBox1Min.z < vBox2Max.z && vBox1Max.z > vBox2Min.z;
+}
+
+
+// Gamma <-> linear color-space conversions; the definitions live outside this header.
+// NOTE(review): valid input ranges and which gamma value each variant uses are
+// not visible from these declarations -- confirm against the implementations.
+extern float GammaToLinearFullRange( float gamma );
+extern float LinearToGammaFullRange( float linear );
+extern float GammaToLinear( float gamma );
+extern float LinearToGamma( float linear );
+
+// Conversions between the sRGB curve, the Xbox 360 gamma curve and linear space
+// (as suggested by the function names).
+extern float SrgbGammaToLinear( float flSrgbGammaValue );
+extern float SrgbLinearToGamma( float flLinearValue );
+extern float X360GammaToLinear( float fl360GammaValue );
+extern float X360LinearToGamma( float flLinearValue );
+extern float SrgbGammaTo360Gamma( float flSrgbGammaValue );
+
+// linear (0..4) to screen corrected vertex space (0..1?)
+// Looks the value up in the 4096-entry lineartovertex table, clamping the index.
+FORCEINLINE float LinearToVertexLight( float f )
+{
+	extern float lineartovertex[4096];
+
+	// Scale the assumed 0..4 input range onto the table's 4096 entries.
+	// (The clamp below happens after this multiply and the float-to-int conversion.)
+	int nIndex = RoundFloatToInt( f * 1024.f );
+
+	// The common case needs no clamping, and one unsigned compare catches both
+	// negative and too-large indices.
+	if ( (unsigned)nIndex > 4095 )
+	{
+		// Testing against zero rather than 4095 gives a shorter instruction encoding.
+		nIndex = ( nIndex < 0 ) ? 0 : 4095;
+	}
+
+	return lineartovertex[nIndex];
+}
+
+
+// Looks a linear value (0..4 range assumed) up in the 4096-entry lineartolightmap
+// table, clamping the index to the table bounds.
+FORCEINLINE unsigned char LinearToLightmap( float f )
+{
+	extern unsigned char lineartolightmap[4096];
+
+	// Scale the assumed 0..4 input range onto the table's 4096 entries.
+	// (The clamp below happens after this multiply and the float-to-int conversion.)
+	int nIndex = RoundFloatToInt( f * 1024.f );
+
+	// The common case needs no clamping, and one unsigned compare catches both
+	// negative and too-large indices.
+	if ( (unsigned)nIndex > 4095 )
+	{
+		// Testing against zero rather than 4095 gives a shorter instruction encoding.
+		nIndex = ( nIndex < 0 ) ? 0 : 4095;
+	}
+
+	return lineartolightmap[nIndex];
+}
+
+// Brings a color into [0,1] per channel. If the largest channel exceeds 1, all
+// three channels are scaled down by it (preserving their ratios); afterwards any
+// negative channel is raised to 0.
+FORCEINLINE void ColorClamp( Vector& color )
+{
+	const float flLargest = max( color.x, max( color.y, color.z ) );
+	if ( flLargest > 1.0f )
+	{
+		const float flScale = 1.0f / flLargest;
+		color.x *= flScale;
+		color.y *= flScale;
+		color.z *= flScale;
+	}
+
+	for ( int i = 0; i < 3; ++i )
+	{
+		if ( color[i] < 0.f )
+			color[i] = 0.f;
+	}
+}
+
+// Clamps each channel of the color to [0,1] independently (no rescaling).
+inline void ColorClampTruncate( Vector& color )
+{
+	for ( int i = 0; i < 3; ++i )
+	{
+		if ( color[i] > 1.0f )
+		{
+			color[i] = 1.0f;
+		}
+		else if ( color[i] < 0.0f )
+		{
+			color[i] = 0.0f;
+		}
+	}
+}
+
+// Interpolate a Catmull-Rom spline.
+// t is a [0,1] value and interpolates a curve between p2 and p3.
+// The result is returned through the "output" reference.
+void Catmull_Rom_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector &output );
+
+// Interpolate a Catmull-Rom spline.
+// Returns the tangent of the point at t of the spline
+// (through the "output" reference).
+void Catmull_Rom_Spline_Tangent( 
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector &output );
+
+// area under the curve [0..t]
+void Catmull_Rom_Spline_Integral( 
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// area under the curve [0..1]
+// (overload without a t parameter)
+void Catmull_Rom_Spline_Integral( 
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	Vector& output );
+
+// Interpolate a Catmull-Rom spline.
+// Normalize p2->p1 and p3->p4 to be the same length as p2->p3
+void Catmull_Rom_Spline_Normalize(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector &output );
+
+// area under the curve [0..t]
+// Normalize p2->p1 and p3->p4 to be the same length as p2->p3
+void Catmull_Rom_Spline_Integral_Normalize(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// Interpolate a Catmull-Rom spline.
+// Normalize p2.x->p1.x and p3.x->p4.x to be the same length as p2.x->p3.x
+void Catmull_Rom_Spline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector &output );
+
+// area under the curve [0..t]
+// NOTE(review): this redeclares exactly the same signature as the
+// Catmull_Rom_Spline_NormalizeX declaration directly above. The comment suggests
+// an integral variant was intended -- confirm against the implementation file.
+void Catmull_Rom_Spline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// Interpolate a Hermite spline.
+// t is a [0,1] value and interpolates a curve between p1 and p2 with the deltas d1 and d2.
+// The result is returned through the "output" reference.
+void Hermite_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &d1,
+	const Vector &d2,
+	float t, 
+	Vector& output );
+
+// Scalar overload of the above: returns the interpolated value directly.
+float Hermite_Spline(
+	float p1,
+	float p2,
+	float d1,
+	float d2,
+	float t );
+
+// t is a [0,1] value and interpolates a curve between p1 and p2 with the slopes p0->p1 and p1->p2
+void Hermite_Spline(
+	const Vector &p0,
+	const Vector &p1,
+	const Vector &p2,
+	float t, 
+	Vector& output );
+
+// Scalar overload of the three-point form above.
+float Hermite_Spline(
+	float p0,
+	float p1,
+	float p2,
+	float t );
+
+
+// Fills basis[] with the Hermite basis values for parameter t.
+// NOTE(review): the required length of basis[] is not visible here -- confirm
+// against the implementation before sizing the caller's array.
+void Hermite_SplineBasis( float t, float basis[] );
+
+// Quaternion overload of the three-point form.
+void Hermite_Spline( 
+	const Quaternion &q0, 
+	const Quaternion &q1, 
+	const Quaternion &q2, 
+	float t, 
+	Quaternion &output );
+
+
+// See http://en.wikipedia.org/wiki/Kochanek-Bartels_curves
+// 
+// Tension:  -1 = Round -> 1 = Tight
+// Bias:     -1 = Pre-shoot (bias left) -> 1 = Post-shoot (bias right)
+// Continuity: -1 = Box corners -> 1 = Inverted corners
+//
+// If T=B=C=0 it's the same matrix as Catmull-Rom.
+// If T=1 & B=C=0 it's the same as Cubic.
+// If T=B=0 & C=-1 it's just linear interpolation
+// 
+// See http://news.povray.org/povray.binaries.tutorials/attachment/%[email protected]%3E/Splines.bas.txt
+// for example code and descriptions of various spline types...
+// 
+// The result is returned through the "output" reference.
+void Kochanek_Bartels_Spline(
+	float tension, 
+	float bias, 
+	float continuity,
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// NormalizeX variant of Kochanek_Bartels_Spline.
+// NOTE(review): presumably normalizes the x spacing of the control points like
+// Catmull_Rom_Spline_NormalizeX does -- confirm against the implementation.
+void Kochanek_Bartels_Spline_NormalizeX(
+	float tension, 
+	float bias, 
+	float continuity,
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// See link at Kochanek_Bartels_Spline for info on the basis matrix used
+void Cubic_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// NormalizeX variant of Cubic_Spline (same parameters).
+void Cubic_Spline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// See link at Kochanek_Bartels_Spline for info on the basis matrix used
+void BSpline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// NormalizeX variant of BSpline (same parameters).
+void BSpline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// See link at Kochanek_Bartels_Spline for info on the basis matrix used
+void Parabolic_Spline(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// NormalizeX variant of Parabolic_Spline (same parameters).
+void Parabolic_Spline_NormalizeX(
+	const Vector &p1,
+	const Vector &p2,
+	const Vector &p3,
+	const Vector &p4,
+	float t, 
+	Vector& output );
+
+// Perlin's quintic interpolating polynomial, 6t^5 - 15t^4 + 10t^3.
+// Maps 0->0 and 1->1 and eases smoothly in between with smooth tangents.
+FORCEINLINE float QuinticInterpolatingPolynomial(float t)
+{
+	// t^3 is formed in single precision; the remaining factor 6t^2 - 15t + 10 is
+	// evaluated in Horner form with double-precision constants.
+	const float flCubed = t * t * t;
+	return flCubed * ( t * ( t * 6.0 - 15.0 ) + 10.0 );
+}
+
+// given a table of sorted tabulated positions, return the two indices and blendfactor to linear
+// interpolate. Does a search. Can be used to find the blend value to interpolate between
+// keyframes.
+// Results are returned through pValueA, pValueB and pInterpolationValue.
+// NOTE(review): the exact meaning of nInterpolationRange and bWrap is not
+// visible from this declaration -- confirm against the implementation.
+void GetInterpolationData( float const *pKnotPositions, 
+						   float const *pKnotValues,
+						   int nNumValuesinList,
+						   int nInterpolationRange,
+						   float flPositionToInterpolateAt,
+						   bool bWrap,
+						   float *pValueA, 
+						   float *pValueB,
+						   float *pInterpolationValue);
+
+// NOTE(review): presumably compresses flValue relative to the [flMin,flMax]
+// range using flBase -- the behavior is not visible here, confirm before use.
+float RangeCompressor( float flValue, float flMin, float flMax, float flBase );
+
+// Get the minimum (squared) distance from "point" to the bounding box defined by [mins,maxs]
+// using voronoi regions.
+// 0 is returned if the point is inside the box.
+float CalcSqrDistanceToAABB( const Vector &mins, const Vector &maxs, const Vector &point );
+// Writes the point on the box closest to "point" into closestOut.
+void CalcClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut );
+// Combined form: returns both the closest point and the squared distance through the out parameters.
+void CalcSqrDistAndClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut, float &distSqrOut );
+
+// Distance from "point" to the box [mins,maxs]: the square root of the value
+// reported by CalcSqrDistanceToAABB().
+inline float CalcDistanceToAABB( const Vector &mins, const Vector &maxs, const Vector &point )
+{
+	return sqrt( CalcSqrDistanceToAABB( mins, maxs, point ) );
+}
+
+// Get the closest point from P to the (infinite) line through vLineA and vLineB and
+// calculate the shortest distance from P to the line.
+// If you pass in a value for t, it will tell you the t for (A + (B-A)t) to get the closest point.
+// If the closest point lies on the segment between A and B, then 0 <= t <= 1.
+// t is optional in all of these functions (it defaults to a null pointer).
+void  CalcClosestPointOnLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *t=0 );
+float CalcDistanceToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 );
+float CalcDistanceSqrToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 );
+
+// The same three functions as above, except now the line is closed between A and B.
+void  CalcClosestPointOnLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *t=0 );
+float CalcDistanceToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 );
+float CalcDistanceSqrToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 );
+
+// A function to compute the closest line segment connecting two lines (or false if the lines are parallel, etc.)
+// NOTE(review): presumably p1-p2 and p3-p4 define the two lines, s1/s2 receive the
+// segment endpoints and t1/t2 the parameters along each line -- confirm.
+bool CalcLineToLineIntersectionSegment(
+   const Vector& p1,const Vector& p2,const Vector& p3,const Vector& p4,Vector *s1,Vector *s2,
+   float *t1, float *t2 );
+
+// The above functions in 2D
+void  CalcClosestPointOnLine2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, Vector2D &vClosest, float *t=0 );
+float CalcDistanceToLine2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 );
+float CalcDistanceSqrToLine2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 );
+void  CalcClosestPointOnLineSegment2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, Vector2D &vClosest, float *t=0 );
+float CalcDistanceToLineSegment2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 );
+float CalcDistanceSqrToLineSegment2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 );
+
+// Init the mathlib
+// All parameters are optional. overbright is an int, so its default is written
+// as the integer literal 2 (it was previously the float literal 2.0f, which was
+// implicitly converted to the same value).
+void MathLib_Init( float gamma = 2.2f, float texGamma = 2.2f, float brightness = 0.0f, int overbright = 2, bool bAllow3DNow = true, bool bAllowSSE = true, bool bAllowSSE2 = true, bool bAllowMMX = true );
+// Queries for which optional instruction-set paths are enabled.
+// NOTE(review): presumably these reflect the bAllow* flags given to MathLib_Init
+// combined with CPU support -- confirm against the implementation.
+bool MathLib_3DNowEnabled( void );
+bool MathLib_MMXEnabled( void );
+bool MathLib_SSEEnabled( void );
+bool MathLib_SSE2Enabled( void );
+
+// Scalar and angular helpers (definitions live outside this header).
+// NOTE(review): presumably Approach/ApproachAngle step "value" toward "target"
+// by at most "speed" -- confirm against the implementation.
+float Approach( float target, float value, float speed );
+float ApproachAngle( float target, float value, float speed );
+float AngleDiff( float destAngle, float srcAngle );
+float AngleDistance( float next, float cur );
+float AngleNormalize( float angle );
+
+// ensure that 0 <= angle <= 360
+float AngleNormalizePositive( float angle );
+
+// Compares two angles; the default tolerance is 0.
+bool AnglesAreEqual( float a, float b, float tolerance = 0.0f );
+
+
+// Rotation between two orientations, returned either as an axis plus angle or as a QAngle.
+void RotationDeltaAxisAngle( const QAngle &srcAngles, const QAngle &destAngles, Vector &deltaAxis, float &deltaAngle );
+void RotationDelta( const QAngle &srcAngles, const QAngle &destAngles, QAngle *out );
+
+// Plane and polygon helpers. The int-returning functions presumably return the
+// number of vertices written to outVerts -- NOTE(review): confirm.
+void ComputeTrianglePlane( const Vector& v1, const Vector& v2, const Vector& v3, Vector& normal, float& intercept );
+int PolyFromPlane( Vector *outVerts, const Vector& normal, float dist, float fHalfScale = 9000.0f );
+int ClipPolyToPlane( Vector *inVerts, int vertCount, Vector *outVerts, const Vector& normal, float dist, float fOnPlaneEpsilon = 0.1f );
+int ClipPolyToPlane_Precise( double *inVerts, int vertCount, double *outVerts, const double *normal, double dist, double fOnPlaneEpsilon = 0.1 );
+
+//-----------------------------------------------------------------------------
+// Computes a reasonable tangent space for a triangle
+// p0..p2 are the vertex positions and t0..t2 the matching texture coordinates;
+// the resulting basis vectors are returned through sVect and tVect.
+//-----------------------------------------------------------------------------
+void CalcTriangleTangentSpace( const Vector &p0, const Vector &p1, const Vector &p2,
+							  const Vector2D &t0, const Vector2D &t1, const Vector2D& t2,
+							  Vector &sVect, Vector &tVect );
+
+//-----------------------------------------------------------------------------
+// Transforms an AABB into another space; which will inherently grow the box.
+//-----------------------------------------------------------------------------
+void TransformAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut );
+
+//-----------------------------------------------------------------------------
+// Uses the inverse transform of in1
+//-----------------------------------------------------------------------------
+void ITransformAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut );
+
+//-----------------------------------------------------------------------------
+// Rotates an AABB into another space; which will inherently grow the box. 
+// (same as TransformAABB, but doesn't take the translation into account)
+//-----------------------------------------------------------------------------
+void RotateAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut );
+
+//-----------------------------------------------------------------------------
+// Uses the inverse transform of in1
+//-----------------------------------------------------------------------------
+void IRotateAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut );
+
+//-----------------------------------------------------------------------------
+// Transform a plane
+// Rotates inPlane's normal by src and recomputes the plane distance so that it
+// accounts for src's translation column. The result is written to outPlane.
+//-----------------------------------------------------------------------------
+inline void MatrixTransformPlane( const matrix3x4_t &src, const cplane_t &inPlane, cplane_t &outPlane )
+{
+	// What we want to do is the following:
+	// 1) transform the normal into the new space.
+	// 2) Determine a point on the old plane given by plane dist * plane normal
+	// 3) Transform that point into the new space
+	// 4) Plane dist = DotProduct( new normal, new point )
+
+	// An optimized version, which works if the plane is orthogonal.
+	// (NOTE(review): presumably this means "if the matrix is orthogonal" -- confirm.)
+	// 1) Transform the normal into the new space
+	// 2) Realize that transforming the old plane point into the new space
+	// is given by [ d * n'x + Tx, d * n'y + Ty, d * n'z + Tz ]
+	// where d = old plane dist, n' = transformed normal, Tn = translational component of transform
+	// 3) Compute the new plane dist using the dot product of the normal result of #2
+
+	// For a correct result, this should be an inverse-transpose matrix
+	// but that only matters if there are nonuniform scale or skew factors in this matrix.
+	VectorRotate( inPlane.normal, src, outPlane.normal );
+	// d * (n' . n'): the rotated normal dotted with the rotated plane point d * n'.
+	outPlane.dist = inPlane.dist * DotProduct( outPlane.normal, outPlane.normal );
+	// n' . T: contribution of the translation column (column 3) of src.
+	outPlane.dist += outPlane.normal.x * src[0][3] + outPlane.normal.y * src[1][3] + outPlane.normal.z * src[2][3];
+}
+
+// Transforms a plane by the inverse of src: the normal is inverse-rotated and the
+// plane distance is adjusted by the inverse-rotated translation of src.
+// The result is written to outPlane.
+inline void MatrixITransformPlane( const matrix3x4_t &src, const cplane_t &inPlane, cplane_t &outPlane )
+{
+	// The trick here is that Tn = translational component of transform,
+	// but for an inverse transform, Tn = - R^-1 * T
+	Vector vecTranslation;
+	MatrixGetColumn( src, 3, vecTranslation );
+
+	// R^-1 * T; the minus sign is applied below by subtracting instead of adding.
+	Vector vecInvTranslation;
+	VectorIRotate( vecTranslation, src, vecInvTranslation );
+
+	VectorIRotate( inPlane.normal, src, outPlane.normal );
+	outPlane.dist = inPlane.dist * DotProduct( outPlane.normal, outPlane.normal );
+	outPlane.dist -= outPlane.normal.x * vecInvTranslation[0] + outPlane.normal.y * vecInvTranslation[1] + outPlane.normal.z * vecInvTranslation[2];
+}
+
+// Power-of-two rounding helpers (definitions live outside this header).
+// NOTE(review): presumably round "in" up / down to a power of two; behavior for
+// zero or negative input is not visible here -- confirm.
+int CeilPow2( int in );
+int FloorPow2( int in );
+
+// Unpacks a normal stored in the layout written by PackNormal_HEND3N:
+// x in bits 0..10 (11 bits), y in bits 11..21 (11 bits), z in bits 22..31
+// (10 bits), each a signed two's-complement field. x and y are scaled by
+// 1/1023 and z by 1/511. Writes pNormal[0..2] and returns pNormal.
+FORCEINLINE float * UnpackNormal_HEND3N( const unsigned int *pPackedNormal, float *pNormal )
+{
+	int temp[3];
+	temp[0] = ((*pPackedNormal >> 0L) & 0x7ff);
+	if ( temp[0] & 0x400 )
+	{
+		// Sign bit set: sign-extend the 11-bit field. (This used to compute
+		// 2048 - value, which yields the magnitude and loses the sign.)
+		temp[0] -= 2048;
+	}
+	temp[1] = ((*pPackedNormal >> 11L) & 0x7ff);
+	if ( temp[1] & 0x400 )
+	{
+		temp[1] -= 2048;
+	}
+	temp[2] = ((*pPackedNormal >> 22L) & 0x3ff);
+	if ( temp[2] & 0x200 )
+	{
+		// Same for the 10-bit z field.
+		temp[2] -= 1024;
+	}
+	pNormal[0] = (float)temp[0] * 1.0f/1023.0f;
+	pNormal[1] = (float)temp[1] * 1.0f/1023.0f;
+	pNormal[2] = (float)temp[2] * 1.0f/511.0f;
+	return pNormal;
+}
+
+// Packs a normal into 32 bits: x in bits 0..10 (11 bits), y in bits 11..21
+// (11 bits) and z in bits 22..31 (10 bits), each as a signed two's-complement
+// field. x and y are scaled by 1023 and z by 511 before being truncated to int.
+// Components are expected to lie in [-1,1]; this is asserted, not clamped.
+// Writes *pPackedNormal and returns pPackedNormal.
+FORCEINLINE unsigned int * PackNormal_HEND3N( float nx, float ny, float nz, unsigned int *pPackedNormal )
+{
+	int temp[3];
+
+	temp[0] = Float2Int( nx * 1023.0f );
+	temp[1] = Float2Int( ny * 1023.0f );
+	temp[2] = Float2Int( nz * 511.0f );
+
+	// the normal is out of bounds, determine the source and fix
+	// clamping would be even more of a slowdown here
+	Assert( temp[0] >= -1023 && temp[0] <= 1023 );
+	Assert( temp[1] >= -1023 && temp[1] <= 1023 );
+	Assert( temp[2] >= -511 && temp[2] <= 511 );
+
+	// Mask and shift in unsigned arithmetic: shifting the 10-bit z field up to
+	// bits 22..31 would overflow a signed int whenever its top bit is set.
+	*pPackedNormal = ( ( (unsigned int)temp[2] & 0x3ff ) << 22L ) |
+                     ( ( (unsigned int)temp[1] & 0x7ff ) << 11L ) |
+                     ( ( (unsigned int)temp[0] & 0x7ff ) << 0L );
+	return pPackedNormal;
+}
+
+// Array form: packs pNormal[0..2] by forwarding to the component-wise overload,
+// matching how the SHORT2 and UBYTE4 array overloads are written.
+FORCEINLINE unsigned int * PackNormal_HEND3N( const float *pNormal, unsigned int *pPackedNormal )
+{
+	return PackNormal_HEND3N( pNormal[0], pNormal[1], pNormal[2], pPackedNormal );
+}
+
+// Unpacks a normal from the 2-short format written by PackNormal_SHORT2.
+// The low short carries x (its sign bit holds the sign of z) and the high short
+// carries y (its sign bit holds the binormal sign). z is rebuilt from x and y.
+// Writes pNormal[0..2], plus pNormal[3] (+1/-1 binormal sign) when bIsTangent is
+// set, so the caller must supply four floats in that case. Returns pNormal.
+FORCEINLINE float * UnpackNormal_SHORT2( const unsigned int *pPackedNormal, float *pNormal, bool bIsTangent = FALSE )
+{
+	// Unpacks from Jason's 2-short format (fills in a 4th binormal-sign (+1/-1) value, if this is a tangent vector)
+
+	// FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits)
+	short iX = (*pPackedNormal & 0x0000FFFF);
+	short iY = (*pPackedNormal & 0xFFFF0000) >> 16;
+
+	float zSign = +1;
+	if ( iX < 0 )
+	{
+		zSign = -1;
+		iX    = -iX;
+	}
+	float tSign = +1;
+	if ( iY < 0 )
+	{
+		tSign = -1;
+		iY    = -iY;
+	}
+
+	pNormal[0] = ( iX - 16384.0f ) / 16384.0f;
+	pNormal[1] = ( iY - 16384.0f ) / 16384.0f;
+
+	// Quantization can leave x*x + y*y marginally above 1 when z is close to 0.
+	// Clamp at zero so the square root cannot produce a NaN.
+	float flZSquared = 1.0f - ( pNormal[0]*pNormal[0] + pNormal[1]*pNormal[1] );
+	if ( flZSquared < 0.0f )
+	{
+		flZSquared = 0.0f;
+	}
+	pNormal[2] = zSign*sqrtf( flZSquared );
+	if ( bIsTangent )
+	{
+		pNormal[3] = tSign;
+	}
+
+	return pNormal;
+}
+
+// Packs a unit vector into two shorts: x in the low 16 bits, y in the high 16.
+// binormalSign is expected to be +1 or -1. Writes *pPackedNormal and returns
+// pPackedNormal.
+FORCEINLINE unsigned int * PackNormal_SHORT2( float nx, float ny, float nz, unsigned int *pPackedNormal, float binormalSign = +1.0f )
+{
+	// Pack a vector (ASSUMED TO BE NORMALIZED) into Jason's 4-byte (SHORT2) format.
+	// This simply reconstructs Z from X & Y. It uses the sign bits of the X & Y coords
+	// to reconstruct the sign of Z and, if this is a tangent vector, the sign of the
+	// binormal (this is needed because tangent/binormal vectors are supposed to follow
+	// UV gradients, but shaders reconstruct the binormal from the tangent and normal
+	// assuming that they form a right-handed basis).
+
+	nx += 1;					// [-1,+1] -> [0,2]
+	ny += 1;
+	nx *= 16384.0f;				// [ 0, 2] -> [0,32768]
+	ny *= 16384.0f;
+
+	// '0' and '32768' values are invalid encodings
+	nx = max( nx, 1.0f );		// Make sure there are no zero values
+	ny = max( ny, 1.0f );
+	nx = min( nx, 32767.0f );	// Make sure there are no 32768 values
+	ny = min( ny, 32767.0f );
+
+	if ( nz < 0.0f )
+		nx = -nx;				// Set the sign bit for z
+
+	ny *= binormalSign;			// Set the sign bit for the binormal (use when encoding a tangent vector)
+
+	// FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits), also use Float2Int()
+	short sX = (short)nx;		// signed short [1,32767]
+	short sY = (short)ny;
+
+	// NOTE: The mask on sX is necessary (if sX is negative and cast to an int...).
+	// sY is reinterpreted as an unsigned 16-bit value before shifting, because
+	// left-shifting a negative signed value is not well defined.
+	*pPackedNormal = ( sX & 0x0000FFFF ) | ( (unsigned int)(unsigned short)sY << 16 );
+
+	return pPackedNormal;
+}
+
+// Array form of PackNormal_SHORT2: packs pNormal[0..2] through the component-wise overload.
+FORCEINLINE unsigned int * PackNormal_SHORT2( const float *pNormal, unsigned int *pPackedNormal, float binormalSign = +1.0f )
+{
+	const float flX = pNormal[0];
+	const float flY = pNormal[1];
+	const float flZ = pNormal[2];
+	return PackNormal_SHORT2( flX, flY, flZ, pPackedNormal, binormalSign );
+}
+
+// Unpacks a UBYTE4 normal (for a tangent, the result's fourth component receives the binormal 'sign')
+// A normal is read from the low two bytes of *pPackedNormal and a tangent from
+// the high two bytes. Writes pNormal[0..2], plus pNormal[3] when bIsTangent is
+// set, so the caller must supply four floats in that case. Returns pNormal.
+FORCEINLINE float * UnpackNormal_UBYTE4( const unsigned int *pPackedNormal, float *pNormal, bool bIsTangent = FALSE )
+{
+	unsigned char cX, cY;
+	if ( bIsTangent )
+	{
+		cX = *pPackedNormal >> 16;					// Unpack Z
+		cY = *pPackedNormal >> 24;					// Unpack W
+	}
+	else
+	{
+		cX = *pPackedNormal >>  0;					// Unpack X
+		cY = *pPackedNormal >>  8;					// Unpack Y
+	}
+
+	// Undo the +128 bias: -128..127
+	float x = cX - 128.0f;
+	float y = cY - 128.0f;
+	float z;
+
+	float zSignBit = x < 0 ? 1.0f : 0.0f;			// z and t negative bits (like slt asm instruction)
+	float tSignBit = y < 0 ? 1.0f : 0.0f;
+	float zSign    = -( 2*zSignBit - 1 );			// z and t signs
+	float tSign    = -( 2*tSignBit - 1 );
+
+	x = x*zSign - zSignBit;							// 0..127
+	y = y*tSign - tSignBit;
+	x = x - 64;										// -64..63
+	y = y - 64;
+
+	float xSignBit = x < 0 ? 1.0f : 0.0f;	// x and y negative bits (like slt asm instruction)
+	float ySignBit = y < 0 ? 1.0f : 0.0f;
+	float xSign    = -( 2*xSignBit - 1 );			// x and y signs
+	float ySign    = -( 2*ySignBit - 1 );
+
+	x = ( x*xSign - xSignBit ) / 63.0f;				// 0..1 range
+	y = ( y*ySign - ySignBit ) / 63.0f;
+	// The encoded point lies on the plane x+y+z=1 (per octant), so z follows from x and y
+	z = 1.0f - x - y;
+
+	float oolen	 = 1.0f / sqrt( x*x + y*y + z*z );	// Normalize and
+	x			*= oolen * xSign;					// Recover signs
+	y			*= oolen * ySign;
+	z			*= oolen * zSign;
+
+	pNormal[0] = x;
+	pNormal[1] = y;
+	pNormal[2] = z;
+	if ( bIsTangent )
+	{
+		pNormal[3] = tSign;
+	}
+
+	return pNormal;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// See: http://www.oroboro.com/rafael/docserv.php/index/programming/article/unitv2
+//
+// UBYTE4 encoding, using per-octant projection onto x+y+z=1
+// Assume input vector is already unit length
+//
+// binormalSign specifies 'sign' of binormal, stored in t sign bit of tangent
+// (lets the shader know whether norm/tan/bin form a right-handed basis)
+//
+// bIsTangent is used to specify which WORD of the output to store the data
+// The expected usage is to call once with the normal and once with
+// the tangent and binormal sign flag, bitwise OR'ing the returned DWORDs
+// Packs (nx,ny,nz) into two bytes of *pPackedNormal and returns pPackedNormal.
+// Note that *pPackedNormal is overwritten (not OR'ed into): the two bytes not
+// selected by bIsTangent are written as zero.
+// The all-zero vector is not supported: the projection below divides by
+// |nx|+|ny|+|nz|.
+FORCEINLINE unsigned int * PackNormal_UBYTE4( float nx, float ny, float nz, unsigned int *pPackedNormal, bool bIsTangent = false, float binormalSign = +1.0f )
+{
+	float xSign = nx < 0.0f ? -1.0f : 1.0f;			// -1 or 1 sign
+	float ySign = ny < 0.0f ? -1.0f : 1.0f;
+	float zSign = nz < 0.0f ? -1.0f : 1.0f;
+	float tSign = binormalSign;
+	Assert( ( binormalSign == +1.0f ) || ( binormalSign == -1.0f ) );
+
+	float xSignBit = 0.5f*( 1 - xSign );			// [-1,+1] -> [1,0]
+	float ySignBit = 0.5f*( 1 - ySign );			// 1 is negative bit (like slt instruction)
+	float zSignBit = 0.5f*( 1 - zSign );
+	float tSignBit = 0.5f*( 1 - binormalSign );		
+
+	float absX = xSign*nx;							// 0..1 range (abs)
+	float absY = ySign*ny;
+	float absZ = zSign*nz;
+
+	float xbits = absX / ( absX + absY + absZ );	// Project onto x+y+z=1 plane
+	float ybits = absY / ( absX + absY + absZ );
+
+	xbits *= 63;									// 0..63
+	ybits *= 63;
+
+	xbits  = xbits * xSign - xSignBit;				// -64..63 range
+	ybits  = ybits * ySign - ySignBit;
+	xbits += 64.0f;									// 0..127 range
+	ybits += 64.0f;
+
+	xbits  = xbits * zSign - zSignBit;				// Negate based on z and t
+	ybits  = ybits * tSign - tSignBit;				// -128..127 range
+
+	xbits += 128.0f;								// 0..255 range
+	ybits += 128.0f;
+
+	// The float-to-byte casts discard the fractional part
+	unsigned char cX = (unsigned char) xbits;
+	unsigned char cY = (unsigned char) ybits;
+
+	if ( !bIsTangent )
+		*pPackedNormal = (cX <<  0) | (cY <<  8);	// xy for normal
+	else						   
+		*pPackedNormal = (cX << 16) | (cY << 24);	// zw for tangent
+
+	return pPackedNormal;
+}
+
+// Array form of PackNormal_UBYTE4: packs pNormal[0..2] through the component-wise overload.
+FORCEINLINE unsigned int * PackNormal_UBYTE4( const float *pNormal, unsigned int *pPackedNormal, bool bIsTangent = false, float binormalSign = +1.0f )
+{
+	const float flX = pNormal[0];
+	const float flY = pNormal[1];
+	const float flZ = pNormal[2];
+	return PackNormal_UBYTE4( flX, flY, flZ, pPackedNormal, bIsTangent, binormalSign );
+}
+
+
+//-----------------------------------------------------------------------------
+// Convert RGB to HSV
+// The result is returned through hsv.
+// NOTE(review): component ranges (e.g. hue in degrees vs. 0..1) are not visible
+// from this declaration -- confirm against the implementation.
+//-----------------------------------------------------------------------------
+void RGBtoHSV( const Vector &rgb, Vector &hsv );
+
+
+//-----------------------------------------------------------------------------
+// Convert HSV to RGB
+// The result is returned through rgb.
+//-----------------------------------------------------------------------------
+void HSVtoRGB( const Vector &hsv, Vector &rgb );
+
+
+//-----------------------------------------------------------------------------
+// Fast version of pow and log
+// NOTE(review): presumably approximations that trade accuracy for speed -- the
+// error bounds are not visible here.
+//-----------------------------------------------------------------------------
+
+float FastLog2(float i);			// log2( i )
+float FastPow2(float i);			// 2^i
+float FastPow(float a, float b);	// a^b
+float FastPow10( float i );			// 10^i
+
+//-----------------------------------------------------------------------------
+// For testing float equality
+// True when a and b differ by no more than epsilon (an absolute tolerance).
+//-----------------------------------------------------------------------------
+
+inline bool CloseEnough( float a, float b, float epsilon = EQUAL_EPSILON )
+{
+	const float flDelta = a - b;
+	return fabs( flDelta ) <= epsilon;
+}
+
+// Vector form: true only when every component pair differs by no more than epsilon.
+inline bool CloseEnough( const Vector &a, const Vector &b, float epsilon = EQUAL_EPSILON )
+{
+	if ( !( fabs( a.x - b.x ) <= epsilon ) )
+		return false;
+
+	if ( !( fabs( a.y - b.y ) <= epsilon ) )
+		return false;
+
+	return fabs( a.z - b.z ) <= epsilon;
+}
+
+// Fast compare
+// maxUlps is the maximum error in terms of Units in the Last Place. This 
+// specifies how big an error we are willing to accept in terms of the value
+// of the least significant digit of the floating point number's 
+// representation. maxUlps can also be interpreted in terms of how many 
+// representable floats we are willing to accept between A and B. 
+// This function will allow maxUlps-1 floats between A and B.
+bool AlmostEqual(float a, float b, int maxUlps = 10);
+
+// Vector form: true only when all three component pairs pass the scalar
+// AlmostEqual() test with the same maxUlps. Stops at the first failing component.
+inline bool AlmostEqual( const Vector &a, const Vector &b, int maxUlps = 10)
+{
+	if ( !AlmostEqual( a.x, b.x, maxUlps ) )
+		return false;
+
+	if ( !AlmostEqual( a.y, b.y, maxUlps ) )
+		return false;
+
+	return AlmostEqual( a.z, b.z, maxUlps );
+}
+
+
+#endif	// MATH_BASE_H
+
diff --git a/public/mathlib/matrixmath.h b/public/mathlib/matrixmath.h
new file mode 100644
index 0000000..9c7f207
--- /dev/null
+++ b/public/mathlib/matrixmath.h
@@ -0,0 +1,385 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//  A set of generic, template-based matrix functions.
+//===========================================================================//
+
+#ifndef MATRIXMATH_H
+#define MATRIXMATH_H
+
+#include <stdarg.h>
+
+// The operations in this file can perform basic matrix operations on matrices represented
+// using any class that supports the necessary operations:
+//
+//  .Element( row, col )  - return the element at a given matrix position
+//  .SetElement( row, col, val ) - modify an element
+//  .Width(), .Height() - get dimensions
+//  .SetDimensions( nrows, ncols) - set a matrix to be un-initted and the appropriate size
+//
+// Generally, vectors can be used with these functions by using N x 1 matrices to represent them.
+//  Matrices are addressed as row, column, and indices are 0-based
+//
+//
+// Note that the template versions of these routines are defined for generality - it is expected
+// that template specialization is used for common high performance cases.
+
+namespace MatrixMath
+{
+	/// M *= flScaleValue. Multiplies every element of the matrix, in place, by a scalar.
+	template<class MATRIXCLASS>
+	void ScaleMatrix( MATRIXCLASS &matrix, float flScaleValue )
+	{
+		for( int nRow = 0; nRow < matrix.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrix.Width(); ++nCol )
+			{
+				matrix.SetElement( nRow, nCol, flScaleValue * matrix.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+	/// AppendElement - same effect as setting the element, except it only works when all calls
+	/// happen in top to bottom, left to right order, and you have to call FinishedAppending when
+	/// done. For normal matrix classes this is no different than SetElement, but for
+	/// CSparseMatrix it is an accelerated way to fill a matrix from scratch.
+	template<class MATRIXCLASS>
+	FORCEINLINE void AppendElement( MATRIXCLASS &matrix, int nRow, int nCol, float flValue )
+	{
+		// default implementation: a plain element store
+		matrix.SetElement( nRow, nCol, flValue );
+	}
+
+	/// Companion to AppendElement, to be called once after the last append.
+	/// The default implementation has nothing to do.
+	template<class MATRIXCLASS>
+	FORCEINLINE void FinishedAppending( MATRIXCLASS &matrix )
+	{
+	}
+
+	/// M += fl. Adds the scalar flAddend to every element of the matrix, in place.
+	template<class MATRIXCLASS>
+	void AddToMatrix( MATRIXCLASS &matrix, float flAddend )
+	{
+		for( int nRow = 0; nRow < matrix.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrix.Width(); ++nCol )
+			{
+				matrix.SetElement( nRow, nCol, flAddend + matrix.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+	/// transpose: resizes *pMatrixOut to ( input width ) x ( input height ) and fills it with
+	/// the transposed elements of matrixIn.
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void TransposeMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		MATRIXCLASSOUT &matOut = *pMatrixOut;
+		matOut.SetDimensions( matrixIn.Width(), matrixIn.Height() );
+		for( int nOutRow = 0; nOutRow < matOut.Height(); ++nOutRow )
+		{
+			for( int nOutCol = 0; nOutCol < matOut.Width(); ++nOutCol )
+			{
+				// output ( row, col ) is input ( col, row )
+				AppendElement( matOut, nOutRow, nOutCol, matrixIn.Element( nOutCol, nOutRow ) );
+			}
+		}
+		FinishedAppending( matOut );
+	}
+
+	/// copy: resizes *pMatrixOut to the dimensions of matrixIn and copies every element across.
+	/// The input and output matrix classes may differ.
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void CopyMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		MATRIXCLASSOUT &matOut = *pMatrixOut;
+		matOut.SetDimensions( matrixIn.Height(), matrixIn.Width() );
+		for( int nRow = 0; nRow < matrixIn.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrixIn.Width(); ++nCol )
+			{
+				AppendElement( matOut, nRow, nCol, matrixIn.Element( nRow, nCol ) );
+			}
+		}
+		FinishedAppending( matOut );
+	}
+
+
+
+	/// M+=M. Adds matrixIn element by element into *pMatrixOut. The loops run over the
+	/// dimensions of matrixIn; the output is not resized here.
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void AddMatrixToMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		MATRIXCLASSOUT &matOut = *pMatrixOut;
+		for( int nRow = 0; nRow < matrixIn.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrixIn.Width(); ++nCol )
+			{
+				matOut.SetElement( nRow, nCol, matOut.Element( nRow, nCol ) + matrixIn.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+	/// M += scale * M. Adds flScale times each element of matrixIn into *pMatrixOut. The loops
+	/// run over the dimensions of matrixIn; the output is not resized here.
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void AddScaledMatrixToMatrix( float flScale, MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		MATRIXCLASSOUT &matOut = *pMatrixOut;
+		for( int nRow = 0; nRow < matrixIn.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matrixIn.Width(); ++nCol )
+			{
+				matOut.SetElement( nRow, nCol, matOut.Element( nRow, nCol ) + flScale * matrixIn.Element( nRow, nCol ) );
+			}
+		}
+	}
+
+
+	/// Overwrite every element of *pMatrixOut: flDiagonalValue where row == column, zero
+	/// everywhere else. The matrix keeps its current dimensions (SetDimensions is not called),
+	/// so it must already be sized by the caller.
+	template<class MATRIXCLASSOUT> 
+	void SetMatrixToIdentity( MATRIXCLASSOUT *pMatrixOut, float flDiagonalValue = 1.0 )
+	{
+		MATRIXCLASSOUT &matOut = *pMatrixOut;
+		for( int nRow = 0; nRow < matOut.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matOut.Width(); ++nCol )
+			{
+				const float flValue = ( nRow == nCol ) ? flDiagonalValue : 0;
+				AppendElement( matOut, nRow, nCol, flValue );
+			}
+		}
+		FinishedAppending( matOut );
+	}
+
+	/// simple way to initialize a matrix with constants from code. *pMatrix is resized to
+	/// nRows x nCols and filled row by row from the variable arguments. Every value is read
+	/// with va_arg( ..., double ), so the constants must be floating point (write 1.0, not 1).
+	template<class MATRIXCLASSOUT> 
+	void SetMatrixValues( MATRIXCLASSOUT *pMatrix, int nRows, int nCols, ... )
+	{
+		va_list args;
+		va_start( args, nCols );
+
+		pMatrix->SetDimensions( nRows, nCols );
+		for( int r = 0; r < nRows; ++r )
+		{
+			for( int c = 0; c < nCols; ++c )
+			{
+				pMatrix->SetElement( r, c, va_arg( args, double ) );
+			}
+		}
+		va_end( args );
+	}
+
+
+	/// row and column accessors. Each one presents a single row or a single column of a matrix
+	/// as a read-only column vector ( N x 1 ). The accessor stores a pointer to the matrix it
+	/// was built from.
+	template<class MATRIXTYPE> class MatrixRowAccessor
+	{
+	public:
+		FORCEINLINE MatrixRowAccessor( MATRIXTYPE const &matrix, int nRow )
+			: m_pMatrix( &matrix ), m_nRow( nRow )
+		{
+		}
+
+		/// entry nRow of the vector is element ( m_nRow, nRow ) of the underlying matrix
+		FORCEINLINE float Element( int nRow, int nCol ) const
+		{
+			Assert( nCol == 0 );
+			return m_pMatrix->Element( m_nRow, nRow );
+		}
+
+		FORCEINLINE int Width( void ) const
+		{
+			return 1;
+		}
+
+		/// one entry per column of the underlying matrix
+		FORCEINLINE int Height( void ) const
+		{
+			return m_pMatrix->Width();
+		}
+
+	private:
+		MATRIXTYPE const *m_pMatrix;
+		int m_nRow;
+	};
+
+	/// Presents one column of a matrix as a read-only column vector ( N x 1 ). The accessor
+	/// stores a pointer to the matrix it was built from.
+	template<class MATRIXTYPE> class MatrixColumnAccessor
+	{
+	public:
+		FORCEINLINE MatrixColumnAccessor( MATRIXTYPE const &matrix, int nColumn )
+			: m_pMatrix( &matrix ), m_nColumn( nColumn )
+		{
+		}
+
+		/// entry nRow of the vector is element ( nRow, m_nColumn ) of the underlying matrix
+		FORCEINLINE float Element( int nRow, int nColumn ) const
+		{
+			Assert( nColumn == 0 );
+			return m_pMatrix->Element( nRow, m_nColumn );
+		}
+
+		FORCEINLINE int Width( void ) const
+		{
+			return 1;
+		}
+
+		/// one entry per row of the underlying matrix
+		FORCEINLINE int Height( void ) const
+		{
+			return m_pMatrix->Height();
+		}
+
+	private:
+		MATRIXTYPE const *m_pMatrix;
+		int m_nColumn;
+	};
+
+	/// this translator acts as a proxy for the transposed matrix: rows and columns are swapped
+	/// on every read, and no elements are copied.
+	template<class MATRIXTYPE> class MatrixTransposeAccessor
+	{
+	public:
+		FORCEINLINE MatrixTransposeAccessor( MATRIXTYPE const & matrix )
+			: m_pMatrix( &matrix )
+		{
+		}
+
+		FORCEINLINE float Element( int nRow, int nColumn ) const
+		{
+			// swapped indices give the transposed view
+			return m_pMatrix->Element( nColumn, nRow );
+		}
+
+		FORCEINLINE int Width( void ) const
+		{
+			return m_pMatrix->Height();
+		}
+
+		FORCEINLINE int Height( void ) const
+		{
+			return m_pMatrix->Width();
+		}
+
+	private:
+		MATRIXTYPE const *m_pMatrix;
+	};
+
+	/// this transpose returns a wrapper around its argument, allowing things like
+	/// AddMatrixToMatrix( TransposeMatrix( matA ), &matB ) without an extra copy
+	template<class MATRIXCLASSIN>
+	MatrixTransposeAccessor<MATRIXCLASSIN> TransposeMatrix( MATRIXCLASSIN const &matrixIn )
+	{
+		MatrixTransposeAccessor<MATRIXCLASSIN> transposedView( matrixIn );
+		return transposedView;
+	}
+
+
+	/// retrieve rows and columns: MatrixColumn wraps column nColumn of the matrix in a
+	/// MatrixColumnAccessor
+	template<class MATRIXTYPE>
+	FORCEINLINE MatrixColumnAccessor<MATRIXTYPE> MatrixColumn( MATRIXTYPE const &matrix, int nColumn )
+	{
+		MatrixColumnAccessor<MATRIXTYPE> columnView( matrix, nColumn );
+		return columnView;
+	}
+
+	/// MatrixRow wraps row nRow of the matrix in a MatrixRowAccessor
+	template<class MATRIXTYPE>
+	FORCEINLINE MatrixRowAccessor<MATRIXTYPE> MatrixRow( MATRIXTYPE const &matrix, int nRow )
+	{
+		MatrixRowAccessor<MATRIXTYPE> rowView( matrix, nRow );
+		return rowView;
+	}
+
+	//// dot product between vectors (or rows and/or columns via accessors). Both arguments
+	//// must be N x 1 with the same N. The sum is accumulated in a double and returned as float.
+	template<class MATRIXACCESSORATYPE, class MATRIXACCESSORBTYPE >
+	float InnerProduct( MATRIXACCESSORATYPE const &vecA, MATRIXACCESSORBTYPE const &vecB )
+	{
+		Assert( vecA.Width() == 1 );
+		Assert( vecB.Width() == 1 );
+		Assert( vecA.Height() == vecB.Height() );
+
+		double flSum = 0;
+		for( int nEntry = 0; nEntry < vecA.Height(); ++nEntry )
+		{
+			flSum += vecA.Element( nEntry, 0 ) * vecB.Element( nEntry, 0 );
+		}
+		return flSum;
+	}
+
+
+
+	/// matrix x matrix multiplication: *pMatrixOut = matA * matB. The output is resized to
+	/// ( rows of A ) x ( columns of B ); each entry is the inner product of a row of A with a
+	/// column of B.
+	template<class MATRIXATYPE, class MATRIXBTYPE, class MATRIXOUTTYPE>
+	void MatrixMultiply( MATRIXATYPE const &matA, MATRIXBTYPE const &matB, MATRIXOUTTYPE *pMatrixOut )
+	{
+		Assert( matA.Width() == matB.Height() );
+		pMatrixOut->SetDimensions( matA.Height(), matB.Width() );
+		for( int nRow = 0; nRow < matA.Height(); ++nRow )
+		{
+			for( int nCol = 0; nCol < matB.Width(); ++nCol )
+			{
+				float flDot = InnerProduct( MatrixRow( matA, nRow ), MatrixColumn( matB, nCol ) );
+				pMatrixOut->SetElement( nRow, nCol, flDot );
+			}
+		}
+	}
+
+	/// solve Ax=B via the conjugate gradient method. Code and naming conventions based on the
+	/// wikipedia article.
+	///   matA        - the system matrix A
+	///   vecB        - right hand side, an N x 1 matrix
+	///   vecX        - in: initial guess for x. out: the refined solution
+	///   flTolerance - iteration stops once the squared residual norm ( r . r ) drops below this
+	/// At most 100 iterations are run.
+	/// NOTE(review): textbook conjugate gradient assumes A is symmetric positive-definite;
+	/// nothing here checks that.
+	template<class ATYPE, class XTYPE, class BTYPE>
+	void ConjugateGradient( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 )
+	{
+		// r = B - A x : residual of the initial guess
+		XTYPE vecR;
+		vecR.SetDimensions( vecX.Height(), 1 );
+		MatrixMultiply( matA, vecX, &vecR );
+		ScaleMatrix( vecR, -1 );
+		AddMatrixToMatrix( vecB, &vecR );
+
+		// p = r : first search direction
+		XTYPE vecP;
+		CopyMatrix( vecR, &vecP );
+		float flRsOld = InnerProduct( vecR, vecR );
+		for( int nIter = 0; nIter < 100; nIter++ )
+		{
+			XTYPE vecAp;
+			MatrixMultiply( matA, vecP, &vecAp );
+			float flDivisor = InnerProduct( vecAp, vecP );
+			if ( flDivisor == 0.0f )
+			{
+				// p . ( A p ) is zero, e.g. because the initial guess was already exact and the
+				// search direction is all zeros. Dividing would write NaN/Inf into every
+				// element of vecX, so stop with the current estimate instead.
+				break;
+			}
+			float flAlpha = flRsOld / flDivisor;
+			AddScaledMatrixToMatrix( flAlpha, vecP, &vecX );		// x += alpha * p
+			AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR );	// r -= alpha * A p
+			float flRsNew = InnerProduct( vecR, vecR );
+			if ( flRsNew < flTolerance )
+			{
+				break;
+			}
+			// p = r + ( rsnew / rsold ) * p
+			ScaleMatrix( vecP, flRsNew / flRsOld );
+			AddMatrixToMatrix( vecR, &vecP );
+			flRsOld = flRsNew;
+		}
+	}
+
+	/// solve (A'*A) x=B via the conjugate gradient method. Code and naming conventions based on
+	/// the wikipedia article. Same as Conjugate gradient but allows passing in two matrices whose
+	/// product is used as the A matrix (in order to preserve sparsity). Every product with the
+	/// combined matrix is evaluated as matAPrime * ( matA * v ).
+	///   vecX        - in: initial guess for x. out: the refined solution
+	///   flTolerance - iteration stops once the squared residual norm ( r . r ) drops below this
+	/// At most 100 iterations are run.
+	template<class ATYPE, class APRIMETYPE, class XTYPE, class BTYPE>
+	void ConjugateGradient( ATYPE const &matA, APRIMETYPE const &matAPrime, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 )
+	{
+		// r = B - A' ( A x ) : residual of the initial guess
+		XTYPE vecR1;
+		vecR1.SetDimensions( vecX.Height(), 1 );
+		MatrixMultiply( matA, vecX, &vecR1 );
+		XTYPE vecR;
+		vecR.SetDimensions( vecR1.Height(), 1 );
+		MatrixMultiply( matAPrime, vecR1, &vecR );
+		ScaleMatrix( vecR, -1 );
+		AddMatrixToMatrix( vecB, &vecR );
+
+		// p = r : first search direction
+		XTYPE vecP;
+		CopyMatrix( vecR, &vecP );
+		float flRsOld = InnerProduct( vecR, vecR );
+		for( int nIter = 0; nIter < 100; nIter++ )
+		{
+			XTYPE vecAp1;
+			MatrixMultiply( matA, vecP, &vecAp1 );
+			XTYPE vecAp;
+			MatrixMultiply( matAPrime, vecAp1, &vecAp );
+			float flDivisor = InnerProduct( vecAp, vecP );
+			if ( flDivisor == 0.0f )
+			{
+				// p . ( A' A p ) is zero, e.g. because the initial guess was already exact and
+				// the search direction is all zeros. Dividing would write NaN/Inf into every
+				// element of vecX, so stop with the current estimate instead.
+				break;
+			}
+			float flAlpha = flRsOld / flDivisor;
+			AddScaledMatrixToMatrix( flAlpha, vecP, &vecX );		// x += alpha * p
+			AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR );	// r -= alpha * A' A p
+			float flRsNew = InnerProduct( vecR, vecR );
+			if ( flRsNew < flTolerance )
+			{
+				break;
+			}
+			// p = r + ( rsnew / rsold ) * p
+			ScaleMatrix( vecP, flRsNew / flRsOld );
+			AddMatrixToMatrix( vecR, &vecP );
+			flRsOld = flRsNew;
+		}
+	}
+
+	
+	/// Least squares fit of A x = B through the normal equations: builds A' * B as the right
+	/// hand side, then hands matA and an explicit transposed copy of it to the two-matrix
+	/// ConjugateGradient. vecX is resized to ( width of A ) x 1 and seeded with
+	/// SetMatrixToIdentity before the solve.
+	template<class ATYPE,  class XTYPE, class BTYPE>
+	void LeastSquaresFit( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX )
+	{
+		// right hand side of the normal equations: A' * B
+		BTYPE vecNormalRhs;
+		MatrixMath::MatrixMultiply( MatrixMath::TransposeMatrix( matA ), vecB, &vecNormalRhs );
+
+		// starting guess for the solver
+		vecX.SetDimensions( matA.Width(), 1 );
+		MatrixMath::SetMatrixToIdentity( &vecX );
+
+		// explicit copy of A' for the solver
+		ATYPE matAPrime;
+		TransposeMatrix( matA, &matAPrime );
+		ConjugateGradient( matA, matAPrime, vecNormalRhs, vecX, 1.0e-20 );
+	}
+
+};
+
+/// a simple fixed-size matrix class. Storage is a NUMROWS x NUMCOLS float array, and the
+/// member functions match what the MatrixMath templates call on a matrix.
+template<int NUMROWS, int NUMCOLS> class CFixedMatrix
+{
+public:
+	FORCEINLINE int Width( void ) const
+	{
+		return NUMCOLS;
+	}
+
+	FORCEINLINE int Height( void ) const
+	{
+		return NUMROWS;
+	}
+
+	FORCEINLINE float Element( int nRow, int nCol ) const
+	{
+		return m_flValues[nRow][nCol];
+	}
+
+	FORCEINLINE void SetElement( int nRow, int nCol, float flValue )
+	{
+		m_flValues[nRow][nCol] = flValue;
+	}
+
+	/// the size is fixed at compile time, so this only checks that the request matches it
+	FORCEINLINE void SetDimensions( int nNumRows, int nNumCols )
+	{
+		Assert( ( nNumRows == NUMROWS ) && ( nNumCols == NUMCOLS ) );
+	}
+
+private:
+	float m_flValues[NUMROWS][NUMCOLS];
+};
+
+
+
+#endif //matrixmath_h
diff --git a/public/mathlib/noise.h b/public/mathlib/noise.h
new file mode 100644
index 0000000..19d3f72
--- /dev/null
+++ b/public/mathlib/noise.h
@@ -0,0 +1,35 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//=====================================================================================//
+
+#ifndef NOISE_H
+#define NOISE_H
+
+#include <math.h>
+#include "basetypes.h"
+#include "mathlib/vector.h"
+#include "tier0/dbg.h"
+
+
+// The following code is the c-ification of Ken Perlin's new noise algorithm
+// "JAVA REFERENCE IMPLEMENTATION OF IMPROVED NOISE - COPYRIGHT 2002 KEN PERLIN"
+// as available here: http://mrl.nyu.edu/~perlin/noise/
+// it generates a single octave of noise in the -1..1 range
+// this should at some point probably replace SparseConvolutionNoise - jd
+float ImprovedPerlinNoise( Vector const &pnt );
+
+// get the noise value at a point. Output range is 0..1.
+float SparseConvolutionNoise( Vector const &pnt );
+
+// get the noise value at a point, passing a custom noise shaping function. The noise shaping
+// function should map the domain 0..1 to 0..1.
+float SparseConvolutionNoise(Vector const &pnt, float (*pNoiseShapeFunction)(float) );
+
+// returns a 1/f noise. more octaves take longer
+float FractalNoise( Vector const &pnt, int n_octaves );
+
+// returns a abs(f)*1/f noise i.e. turbulence
+float Turbulence( Vector const &pnt, int n_octaves );
+#endif // NOISE_H
diff --git a/public/mathlib/polyhedron.h b/public/mathlib/polyhedron.h
new file mode 100644
index 0000000..38b465c
--- /dev/null
+++ b/public/mathlib/polyhedron.h
@@ -0,0 +1,73 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef POLYHEDRON_H_
+#define	POLYHEDRON_H_
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "mathlib/mathlib.h"
+
+
+
+struct Polyhedron_IndexedLine_t // one line (edge), stored as the indices of its two end points
+{
+	unsigned short iPointIndices[2]; // NOTE(review): presumably indices into CPolyhedron::pVertices - confirm
+};
+
+struct Polyhedron_IndexedLineReference_t // a polygon's reference to a line, including the direction it is traversed in
+{
+	unsigned short iLineIndex; // NOTE(review): presumably an index into CPolyhedron::pLines - confirm
+	unsigned char iEndPointIndex; //since two polygons reference any one line, one needs to traverse the line backwards, this flags that behavior
+};
+
+struct Polyhedron_IndexedPolygon_t // one polygon (face), stored as a run of line references plus a normal
+{
+	unsigned short iFirstIndex; // NOTE(review): presumably the first entry of this polygon in CPolyhedron::pIndices - confirm
+	unsigned short iIndexCount; // NOTE(review): presumably the number of consecutive entries starting at iFirstIndex - confirm
+	Vector polyNormal; // normal of the polygon
+};
+
+class CPolyhedron //made into a class because it's going virtual to support distinctions between temp and permanent versions
+{
+public:
+	Vector *pVertices; // vertex array (NOTE(review): presumably iVertexCount entries - confirm against the allocator)
+	Polyhedron_IndexedLine_t *pLines; // line array (presumably iLineCount entries)
+	Polyhedron_IndexedLineReference_t *pIndices; // line reference array (presumably iIndexCount entries)
+	Polyhedron_IndexedPolygon_t *pPolygons; // polygon array (presumably iPolygonCount entries)
+	
+	unsigned short iVertexCount;
+	unsigned short iLineCount;
+	unsigned short iIndexCount;
+	unsigned short iPolygonCount;
+
+	virtual ~CPolyhedron( void ) {};
+	virtual void Release( void ) = 0; // pure virtual: each derived class supplies its own way of giving the polyhedron back
+	Vector Center( void ); // declared here, defined outside this header
+};
+
+class CPolyhedron_AllocByNew : public CPolyhedron // the CPolyhedron variant whose instances come from Allocate()
+{
+public:
+	virtual void Release( void ); // overrides CPolyhedron::Release; defined outside this header
+	static CPolyhedron_AllocByNew *Allocate( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ); //creates the polyhedron along with enough memory to hold all its data in a single allocation
+
+private:
+	CPolyhedron_AllocByNew( void ) { }; //CPolyhedron_AllocByNew::Allocate() is the only way to create one of these.
+};
+
+CPolyhedron *GeneratePolyhedronFromPlanes( const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory = false ); //be sure to polyhedron->Release()
+CPolyhedron *ClipPolyhedron( const CPolyhedron *pExistingPolyhedron, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory = false ); //this does NOT modify/delete the existing polyhedron
+
+CPolyhedron *GetTempPolyhedron( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ); //grab the temporary polyhedron. Avoids new/delete for quick work. Can only be in use by one chunk of code at a time
+
+
+#endif //#ifndef POLYHEDRON_H_
+
diff --git a/public/mathlib/quantize.h b/public/mathlib/quantize.h
new file mode 100644
index 0000000..5e5b742
--- /dev/null
+++ b/public/mathlib/quantize.h
@@ -0,0 +1,141 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+#ifndef QUANTIZE_H
+#define QUANTIZE_H
+
+#ifndef STRING_H
+#include <string.h>
+#endif
+
+#define MAXDIMS 768
+#define MAXQUANT 16000
+
+
+#include <tier0/platform.h>
+
+struct Sample;
+
+struct QuantizedValue {										// one node of the quantization tree (see Children)
+	double MinError;											// minimum possible error. used
+	// for neighbor searches.
+	struct QuantizedValue *Children[2];						// splits
+	int32 value;											// only exists for leaf nodes
+	struct Sample *Samples;									// every sample quantized into this
+	// entry
+	int32 NSamples;											// how many were quantized to this.
+	int32 TotSamples;										// NOTE(review): not documented here; presumably a total sample count - confirm
+	double *ErrorMeasure;									// variance measure for each dimension
+	double TotalError;										// sum of errors
+	uint8 *Mean;											// average value of each dimension
+	uint8 *Mins;											// min box for children and this
+	uint8 *Maxs;											// max box for children and this
+	int NQuant;												// the number of samples which were
+															// quantized to this node since the
+															// last time OptimizeQuantizer()
+															// was called.
+	int *Sums;												// sum used by OptimizeQuantizer
+	int sortdim;											// dimension currently sorted along.
+};
+
+struct Sample {												// header of one variable-length sample record
+	int32 ID;												// identifier of this sample. can
+															// be used for any purpose.
+	int32 Count;											// number of samples this sample
+															// represents
+	int32 QNum;										   // what value this sample ended up quantized
+															// to.
+	struct QuantizedValue *qptr;							// ptr to what this was quantized to.
+	uint8 Value[1];										   // array of values for multi-dimensional
+	// variables. Declared with one element; NthSample()/AllocSamples() step by sizeof(Sample)+(nd-1) bytes, so each record holds nd value bytes.
+};
+
+void FreeQuantization(struct QuantizedValue *t);
+
+struct QuantizedValue *Quantize(struct Sample *s, int nsamples, int ndims,
+								int nvalues, uint8 *weights, int value0=0);
+
+int CompressSamples(struct Sample *s, int nsamples, int ndims);
+
+struct QuantizedValue *FindMatch(uint8 const *sample,
+								 int ndims,uint8 *weights,
+								 struct QuantizedValue *QTable);
+void PrintSamples(struct Sample const *s, int nsamples, int ndims);
+
+struct QuantizedValue *FindQNode(struct QuantizedValue const *q, int32 code);
+
+// Address of the i'th record in a packed sample array whose records each carry nd value
+// bytes. Records are sizeof(Sample)+(nd-1) bytes apart, because the first value byte is
+// already part of the struct.
+inline struct Sample *NthSample(struct Sample *s, int i, int nd)
+{
+	uint8 *pBase=(uint8 *) s;
+	return (struct Sample *) ( pBase + i*(sizeof(*s)+(nd-1)) );
+}
+
+// Allocates a zero-filled packed array of ns samples with nd value bytes each, and sets
+// every sample's Count to 1. The memory comes from new uint8[].
+inline struct Sample *AllocSamples(int ns, int nd)
+{
+	size_t nTotalBytes=(sizeof(struct Sample)+(nd-1))*ns;
+	void *pBlock=new uint8[nTotalBytes];
+	memset(pBlock,0,nTotalBytes);
+
+	struct Sample *pSamples=(struct Sample *) pBlock;
+	for(int nSample=0;nSample<ns;nSample++)
+	{
+		NthSample(pSamples,nSample,nd)->Count=1;
+	}
+	return pSamples;
+}
+
+
+// MinimumError: what is the min error which will occur if quantizing
+// a sample to the given qnode? This is just the error if the qnode
+// is a leaf.
+double MinimumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights);
+double MaximumError(struct QuantizedValue const *q, uint8 const *sample,
+					int ndims, uint8 const *weights);
+
+void PrintQTree(struct QuantizedValue const *p,int idlevel=0);
+void OptimizeQuantizer(struct QuantizedValue *q, int ndims);
+
+// RecalculateValues: update the means in a sample tree, based upon
+// the samples. can be used to reoptimize when samples are deleted,
+// for instance.
+
+void RecalculateValues(struct QuantizedValue *q, int ndims);
+
+extern double SquaredError;	// may be reset and examined. updated by
+															// FindMatch()
+
+
+
+
+// the routines below can be used for uniform quantization via dart-throwing.
+typedef void (*GENERATOR)(void *);    // generate a random sample
+typedef double (*COMPARER)(void const *a, void const *b);
+
+void *DartThrow(int NResults, int NTries, size_t itemsize, GENERATOR gen,
+				COMPARER cmp);
+void *FindClosestDart(void *items,int NResults, size_t itemsize,
+					  COMPARER cmp, void *lookfor, int *idx);
+
+
+
+
+// color quantization of 24 bit images
+#define QUANTFLAGS_NODITHER 1	// don't do Floyd-steinberg dither
+
+extern void ColorQuantize(
+uint8 const	*pImage,			// 4 byte pixels ARGB
+int			nWidth,
+int			nHeight,
+int			nFlags, 			// QUANTFLAGS_xxx
+int			nColors,			// # of colors to fill in in palette
+uint8		*pOutPixels,		// where to store resulting 8 bit pixels
+uint8		*pOutPalette,		// where to store resulting 768-byte palette
+int			nFirstColor);		// first color to use in mapping
+
+
+
+
+
+#endif
diff --git a/public/mathlib/simdvectormatrix.h b/public/mathlib/simdvectormatrix.h
new file mode 100644
index 0000000..f88cd32
--- /dev/null
+++ b/public/mathlib/simdvectormatrix.h
@@ -0,0 +1,142 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Provide a class (SSE/SIMD only) holding a 2d matrix of class FourVectors,
+// for high speed processing in tools.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef SIMDVECTORMATRIX_H
+#define SIMDVECTORMATRIX_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+#include <string.h>
+#include "tier0/platform.h"
+#include "tier0/dbg.h"
+#include "tier1/utlsoacontainer.h"
+#include "mathlib/ssemath.h"
+
+// A 2d matrix of vectors, stored as rows of FourVectors (four vectors per element). The
+// object owns m_pData: it is allocated in SetSize() and released in the destructor, and
+// both copy construction and assignment make a deep copy.
+class CSIMDVectorMatrix
+{
+public:
+	int m_nWidth;											// in actual vectors
+	int m_nHeight;
+
+	int m_nPaddedWidth;										// # of 4x wide elements
+
+	FourVectors *m_pData;
+
+protected:
+	// put the object into the empty ( 0 x 0, no storage ) state
+	void Init( void )
+	{
+		m_pData = NULL;
+		m_nWidth = 0;
+		m_nHeight = 0;
+		m_nPaddedWidth = 0;
+	}
+
+	// number of FourVectors elements in the allocation
+	int NVectors( void ) const
+	{
+		return m_nHeight * m_nPaddedWidth;
+	}
+
+public:
+	// constructors and destructors
+	CSIMDVectorMatrix( void )
+	{
+		Init();
+	}
+
+	~CSIMDVectorMatrix( void )
+	{
+		if ( m_pData )
+			delete[] m_pData;
+	}
+
+	// set up storage and fields for m x n matrix. destroys old data
+	void SetSize( int width, int height )
+	{
+		if ( ( ! m_pData ) || ( width != m_nWidth ) || ( height != m_nHeight ) )
+		{
+			if ( m_pData )
+				delete[] m_pData;
+			
+			m_nWidth = width;
+			m_nHeight = height;
+			
+			// round the width up to a whole number of 4-wide elements
+			m_nPaddedWidth = ( m_nWidth + 3) >> 2;
+			m_pData = NULL;
+			if ( width && height )
+				m_pData = new FourVectors[ m_nPaddedWidth * m_nHeight ];
+		}
+	}
+
+	CSIMDVectorMatrix( int width, int height )
+	{
+		Init();
+		SetSize( width, height );
+	}
+
+	// copy constructor. Without it the compiler-generated one would copy the m_pData pointer
+	// and both objects would delete[] the same array in their destructors.
+	CSIMDVectorMatrix( CSIMDVectorMatrix const &src )
+	{
+		Init();
+		*this = src;
+	}
+
+	CSIMDVectorMatrix &operator=( CSIMDVectorMatrix const &src )
+	{
+		// nothing to do for self-assignment (and memcpy must not be given identical ranges)
+		if ( this == &src )
+			return *this;
+
+		SetSize( src.m_nWidth, src.m_nHeight );
+		if ( m_pData )
+			memcpy( m_pData, src.m_pData, m_nHeight*m_nPaddedWidth*sizeof(m_pData[0]) ); 
+		return *this;
+	}
+
+	CSIMDVectorMatrix &operator+=( CSIMDVectorMatrix const &src );
+
+	CSIMDVectorMatrix &operator*=( Vector const &src );
+
+	// create from an RGBA float bitmap. alpha ignored.
+	void CreateFromRGBA_FloatImageData(int srcwidth, int srcheight, float const *srcdata );
+
+	// create from 3 fields in a csoa
+	void CreateFromCSOAAttributes( CSOAContainer const *pSrc,
+								   int nAttrIdx0, int nAttrIdx1, int nAttrIdx2 );
+
+	// Element access. If you are calling this a lot, you don't want to use this class, because
+	// you're not getting the sse advantage
+	Vector Element(int x, int y) const
+	{
+		Assert( m_pData );
+		Assert( x < m_nWidth );
+		Assert( y < m_nHeight );
+		Vector ret;
+		// x >> 2 selects the 4-wide element, x & 3 the vector inside it
+		FourVectors const *pData=m_pData+y*m_nPaddedWidth+(x >> 2);
+
+		int xo=(x & 3);
+		ret.x=pData->X( xo );
+		ret.y=pData->Y( xo );
+		ret.z=pData->Z( xo );
+		return ret;
+	}
+
+	//addressing the individual fourvectors elements
+	FourVectors &CompoundElement(int x, int y)
+	{
+		Assert( m_pData );
+		Assert( y < m_nHeight );
+		Assert( x < m_nPaddedWidth );
+		return m_pData[x + m_nPaddedWidth*y ];
+	}
+
+	// math operations on the whole image
+	void Clear( void )
+	{
+		Assert( m_pData );
+		memset( m_pData, 0, m_nHeight*m_nPaddedWidth*sizeof(m_pData[0]) );
+	}
+
+	void RaiseToPower( float power );
+};
+
+
+
+#endif
diff --git a/public/mathlib/spherical_geometry.h b/public/mathlib/spherical_geometry.h
new file mode 100644
index 0000000..04310f4
--- /dev/null
+++ b/public/mathlib/spherical_geometry.h
@@ -0,0 +1,73 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Functions for spherical geometry.
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef SPHERICAL_GEOMETRY_H
+#define SPHERICAL_GEOMETRY_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// see http://mathworld.wolfram.com/SphericalTrigonometry.html
+
+// return the spherical distance, in radians, between 2 points on the unit sphere.
+// Both inputs must be (approximately) unit length. The dot product is clamped to [-1,1]
+// before acos(): the unit-length check above tolerates a small error, so the dot product of
+// two nearly identical or nearly opposite points can land just outside that range, where
+// acos() returns NaN.
+FORCEINLINE float UnitSphereLineSegmentLength( Vector const &a, Vector const &b )
+{
+	// check unit length
+	Assert( fabs( VectorLength( a ) - 1.0 ) < 1.0e-3 );
+	Assert( fabs( VectorLength( b ) - 1.0 ) < 1.0e-3 );
+
+	float flCosAngle = DotProduct( a, b );
+	if ( flCosAngle > 1.0f )
+	{
+		flCosAngle = 1.0f;
+	}
+	else if ( flCosAngle < -1.0f )
+	{
+		flCosAngle = -1.0f;
+	}
+	return acos( flCosAngle );
+}
+
+
+// given 3 points on the unit sphere, return the spherical area (in steradians) of the
+// triangle they form. valid for "small" triangles.
+FORCEINLINE float UnitSphereTriangleArea( Vector const &a, Vector const &b , Vector const &c )
+{
+	// arc length of each side; side A is opposite point a, and so on
+	float flSideA = UnitSphereLineSegmentLength( b, c );
+	float flSideB = UnitSphereLineSegmentLength( c, a );
+	float flSideC = UnitSphereLineSegmentLength( a, b );
+
+	// zero area triangle
+	if ( flSideA == 0. )
+		return 0.;
+	if ( flSideB == 0. )
+		return 0.;
+	if ( flSideC == 0. )
+		return 0.;
+
+	// now, find the 3 inscribed angles for the triangle, using the half-angle formulas
+	float flSemiPerimeter = 0.5 * ( flSideA + flSideB + flSideC );
+	float flSinS = sin( flSemiPerimeter );
+	float flSinSLessA = sin( flSemiPerimeter - flSideA );
+	float flSinSLessB = sin( flSemiPerimeter - flSideB );
+	float flSinSLessC = sin( flSemiPerimeter - flSideC );
+
+	float flTanHalfA = sqrt ( ( flSinSLessB * flSinSLessC ) / ( flSinS * flSinSLessA ) );
+	float flTanHalfB = sqrt ( ( flSinSLessA * flSinSLessC ) / ( flSinS * flSinSLessB ) );
+	float flTanHalfC = sqrt ( ( flSinSLessA * flSinSLessB ) / ( flSinS * flSinSLessC ) );
+
+	// Girard's formula : area = sum of angles - pi.
+	return 2.0 * ( atan( flTanHalfA ) + atan( flTanHalfB ) + atan( flTanHalfC ) ) - M_PI;
+}
+
+// spherical harmonics-related functions. Best explanation at http://www.research.scea.com/gdc2003/spherical-harmonic-lighting.pdf
+
+// Evaluate associated legendre polynomial P( l, m ) at flX, using recurrence relation
+float AssociatedLegendrePolynomial( int nL, int nM, float flX );
+
+// Evaluate order N spherical harmonic with spherical coordinates
+// nL = band, 0..N
+// nM = -nL .. nL
+// theta = 0..M_PI
+// phi = 0.. 2 * M_PI
+float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi );
+
+// evaluate spherical harmonic with normalized vector direction
+float SphericalHarmonic( int nL, int nM, Vector const &vecDirection );
+
+
+#endif // SPHERICAL_GEOMETRY_H
diff --git a/public/mathlib/ssemath.h b/public/mathlib/ssemath.h
new file mode 100644
index 0000000..c2ff48d
--- /dev/null
+++ b/public/mathlib/ssemath.h
@@ -0,0 +1,3107 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: - defines SIMD "structure of arrays" classes and functions.
+//
+//===========================================================================//
+#ifndef SSEMATH_H
+#define SSEMATH_H
+
+#if defined( _X360 )
+#include <xboxmath.h>
+#else
+#include <xmmintrin.h>
+#endif
+
+#include <mathlib/vector.h>
+#include <mathlib/mathlib.h>
+
+#if defined(GNUC)
+#define USE_STDC_FOR_SIMD 0
+#else
+#define USE_STDC_FOR_SIMD 0
+#endif
+
+#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0))
+#define _SSE1 1
+#endif
+
+// I thought about defining a class/union for the SIMD packed floats instead of using fltx4,
+// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur
+// the relationship between packed floats and packed integer types and (b) not sure that the
+// compiler would handle generating good code for the intrinsics.
+
+#if USE_STDC_FOR_SIMD
+
+typedef union
+{
+	float  m128_f32[4];
+	uint32 m128_u32[4];
+} fltx4;
+
+typedef fltx4 i32x4;
+typedef fltx4 u32x4;
+
+#elif ( defined( _X360 ) )
+
+typedef union
+{
+	// This union allows float/int access (which generally shouldn't be done in inner loops)
+	__vector4	vmx;
+	float		m128_f32[4];
+	uint32		m128_u32[4];
+} fltx4_union;
+
+typedef __vector4 fltx4;
+typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops.
+typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops.
+
+#else
+
+typedef __m128 fltx4;
+typedef __m128 i32x4;
+typedef __m128 u32x4;
+
+#endif
+
+// The FLTX4 type is a fltx4 used as a parameter to a function.
+// On the 360, the best way to do this is pass-by-copy on the registers.
+// On the PC, the best way is to pass by const reference. 
+// The compiler will sometimes, but not always, replace a pass-by-const-ref
+// with a pass-in-reg on the 360; to avoid this confusion, you can
+// explicitly use a FLTX4 as the parameter type.
+#ifdef _X360
+typedef __vector4 FLTX4;
+#else
+typedef const fltx4 & FLTX4;
+#endif
+
+// A 16-byte aligned int32 datastructure
+// (for use when writing out fltx4's as SIGNED
+// ints).
+struct ALIGN16 intx4
+{
+	int32 m_i32[4];		// the four signed 32-bit lanes
+
+	// Writable access to lane 'which' (no bounds checking).
+	inline int & operator[](int which) 
+	{
+		return m_i32[which];
+	}
+
+	// Read-only access to lane 'which' (no bounds checking).
+	inline const int & operator[](int which) const
+	{
+		return m_i32[which];
+	}
+
+	// Pointer to the first lane.
+	inline int32 *Base() {
+		return m_i32;
+	}
+
+	inline const int32 *Base() const
+	{
+		return m_i32;
+	}
+
+	// True only when all four lanes compare equal.
+	// Returns plain bool: a const qualifier on a by-value return type has no
+	// effect and only triggers "qualifier ignored" compiler warnings.
+	inline bool operator==(const intx4 &other) const
+	{
+		return m_i32[0] == other.m_i32[0] &&
+			m_i32[1] == other.m_i32[1] &&
+			m_i32[2] == other.m_i32[2] &&
+			m_i32[3] == other.m_i32[3] 	;
+	}
+} ALIGN16_POST;
+
+
+#if defined( _DEBUG ) && defined( _X360 )
+// Debug-only sanity check for Xbox 360 builds: reads the register fetched by
+// mfvscr and asserts that its word at index 3 is zero.
+FORCEINLINE void TestVPUFlags()
+{
+	// Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com)
+	__vector4 a;
+	__asm
+	{
+		mfvscr	a;
+	}
+	// reinterpret the 128-bit register as four 32-bit words; word 3 is the one checked
+	unsigned int * flags		= (unsigned int *)&a;
+	unsigned int   controlWord	= flags[3];
+	Assert(controlWord == 0);
+}
+#else  // not ( _DEBUG && _X360 )
+// Empty stub for every other configuration, so callers need no #ifdefs.
+FORCEINLINE void TestVPUFlags() {}
+#endif // _DEBUG && _X360
+
+
+// useful constants in SIMD packed float format:
+// (note: some of these aren't stored on the 360, 
+// but are manufactured directly in one or two 
+// instructions, saving a load and possible L2
+// miss.)
+#ifndef _X360
+extern const fltx4 Four_Zeros;									// 0 0 0 0
+extern const fltx4 Four_Ones;									// 1 1 1 1
+extern const fltx4 Four_Twos;									// 2 2 2 2
+extern const fltx4 Four_Threes;									// 3 3 3 3
+extern const fltx4 Four_Fours;									// guess.
+extern const fltx4 Four_Point225s;								// .225 .225 .225 .225
+extern const fltx4 Four_PointFives;								// .5 .5 .5 .5
+extern const fltx4 Four_Epsilons;								// FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
+extern const fltx4 Four_2ToThe21s;								// (1<<21)..
+extern const fltx4 Four_2ToThe22s;								// (1<<22)..
+extern const fltx4 Four_2ToThe23s;								// (1<<23)..
+extern const fltx4 Four_2ToThe24s;								// (1<<24)..
+extern const fltx4 Four_Origin;									// 0 0 0 1 (origin point, like vr0 on the PS2)
+extern const fltx4 Four_NegativeOnes;							// -1 -1 -1 -1 
+#else
+#define			   Four_Zeros XMVectorZero()					// 0 0 0 0
+#define			   Four_Ones XMVectorSplatOne()					// 1 1 1 1
+extern const fltx4 Four_Twos;									// 2 2 2 2
+extern const fltx4 Four_Threes;									// 3 3 3 3
+extern const fltx4 Four_Fours;									// guess.
+extern const fltx4 Four_Point225s;								// .225 .225 .225 .225
+extern const fltx4 Four_PointFives;								// .5 .5 .5 .5
+extern const fltx4 Four_Epsilons;								// FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
+extern const fltx4 Four_2ToThe21s;								// (1<<21)..
+extern const fltx4 Four_2ToThe22s;								// (1<<22)..
+extern const fltx4 Four_2ToThe23s;								// (1<<23)..
+extern const fltx4 Four_2ToThe24s;								// (1<<24)..
+extern const fltx4 Four_Origin;									// 0 0 0 1 (origin point, like vr0 on the PS2)
+extern const fltx4 Four_NegativeOnes;							// -1 -1 -1 -1 
+#endif
+extern const fltx4 Four_FLT_MAX;								// FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
+extern const fltx4 Four_Negative_FLT_MAX;						// -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
+extern const fltx4 g_SIMD_0123;									// 0 1 2 3 as float
+
+// external aligned integer constants
+extern const ALIGN16 uint32 g_SIMD_clear_signmask[] ALIGN16_POST;			// 0x7fffffff x 4
+extern const ALIGN16 uint32 g_SIMD_signmask[] ALIGN16_POST;				// 0x80000000 x 4
+extern const ALIGN16 uint32 g_SIMD_lsbmask[] ALIGN16_POST;				// 0xfffffffe x 4
+extern const ALIGN16 uint32 g_SIMD_clear_wmask[] ALIGN16_POST;			// -1 -1 -1 0
+extern const ALIGN16 uint32 g_SIMD_ComponentMask[4][4] ALIGN16_POST;		// [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
+extern const ALIGN16 uint32 g_SIMD_AllOnesMask[] ALIGN16_POST;			// ~0,~0,~0,~0
+extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST;			// 0xffff x 4
+
+// this mask is used for skipping the tail of things. If you have N elements in an array, and wish
+// to mask out the tail, g_SIMD_SkipTailMask[N & 3] is what you want to use for the last iteration.
+extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST;
+
+// Define prefetch macros.
+// The characteristics of cache and prefetch are completely 
+// different between the different platforms, so you DO NOT
+// want to just define one macro that maps to every platform
+// intrinsic under the hood -- you need to prefetch at different
+// intervals between x86 and PPC, for example, and that is
+// a higher level code change. 
+// On the other hand, I'm tired of typing #ifdef _X360
+// all over the place, so this is just a nop on Intel, PS3.
+#ifdef _X360
+#define PREFETCH360(address, offset) __dcbt(offset,address)
+#else
+#define PREFETCH360(x,y) // nothing
+#endif
+
+#if USE_STDC_FOR_SIMD
+
+//---------------------------------------------------------------------
+// Standard C (fallback/Linux) implementation (only there for compat - slow)
+//---------------------------------------------------------------------
+
+// Value of float lane 'idx' of a (read-only overload).
+FORCEINLINE float SubFloat( const fltx4 & a, int idx )
+{
+	const float *pLanes = a.m128_f32;
+	return pLanes[ idx ];
+}
+
+// Writable reference to float lane 'idx' of a.
+FORCEINLINE float & SubFloat( fltx4 & a, int idx )
+{
+	float *pLanes = a.m128_f32;
+	return pLanes[ idx ];
+}
+
+// Raw 32-bit contents of lane 'idx' of a (read-only overload).
+FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
+{
+	const uint32 *pLanes = a.m128_u32;
+	return pLanes[ idx ];
+}
+
+// Writable reference to the raw 32-bit contents of lane 'idx' of a.
+FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
+{
+	uint32 *pLanes = a.m128_u32;
+	return pLanes[ idx ];
+}
+
+// Return zero (0 0 0 0) in the fastest way -- on the x360, faster even than loading.
+FORCEINLINE fltx4 LoadZeroSIMD( void )
+{
+	return Four_Zeros;
+}
+
+// Return one (1 1 1 1) in the fastest way -- on the x360, faster even than loading.
+// This standard-C version simply returns the Four_Ones constant.
+FORCEINLINE fltx4 LoadOneSIMD( void )
+{
+	return Four_Ones;
+}
+
+// Broadcast the x lane: x y z w -> x x x x
+FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
+{
+	const float flX = SubFloat( a, 0 );
+	fltx4 splat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( splat, i ) = flX;
+	}
+	return splat;
+}
+
+// Broadcast the y lane: x y z w -> y y y y
+FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
+{
+	const float flY = SubFloat( a, 1 );
+	fltx4 splat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( splat, i ) = flY;
+	}
+	return splat;
+}
+
+// Broadcast the z lane: x y z w -> z z z z
+FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
+{
+	const float flZ = SubFloat( a, 2 );
+	fltx4 splat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( splat, i ) = flZ;
+	}
+	return splat;
+}
+
+// Broadcast the w lane: x y z w -> w w w w
+FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
+{
+	const float flW = SubFloat( a, 3 );
+	fltx4 splat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( splat, i ) = flW;
+	}
+	return splat;
+}
+
+// Copy of a whose x lane is taken from the x lane of 'x'.
+FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
+{
+	const float flNewX = SubFloat( x, 0 );
+	fltx4 patched = a;
+	SubFloat( patched, 0 ) = flNewX;
+	return patched;
+}
+
+// Copy of a whose y lane is taken from the y lane of 'y'.
+FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
+{
+	const float flNewY = SubFloat( y, 1 );
+	fltx4 patched = a;
+	SubFloat( patched, 1 ) = flNewY;
+	return patched;
+}
+
+// Copy of a whose z lane is taken from the z lane of 'z'.
+FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
+{
+	const float flNewZ = SubFloat( z, 2 );
+	fltx4 patched = a;
+	SubFloat( patched, 2 ) = flNewZ;
+	return patched;
+}
+
+// Copy of a whose w lane is taken from the w lane of 'w'.
+FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
+{
+	const float flNewW = SubFloat( w, 3 );
+	fltx4 patched = a;
+	SubFloat( patched, 3 ) = flNewW;
+	return patched;
+}
+
+// Copy of a with lane nComponent overwritten by flValue (index is not range checked).
+FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
+{
+	fltx4 patched = a;
+	float &flLane = SubFloat( patched, nComponent );
+	flLane = flValue;
+	return patched;
+}
+
+// Rotate the lanes left by one: a b c d -> b c d a
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
+{
+	fltx4 rotated;
+	for ( int i = 0; i < 4; ++i )
+	{
+		// destination lane i takes source lane i+1, wrapping 3 -> 0
+		SubFloat( rotated, i ) = SubFloat( a, ( i + 1 ) & 3 );
+	}
+	return rotated;
+}
+
+// Rotate the lanes left by two: a b c d -> c d a b
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
+{
+	fltx4 rotated;
+	for ( int i = 0; i < 4; ++i )
+	{
+		// destination lane i takes source lane i+2, wrapping around
+		SubFloat( rotated, i ) = SubFloat( a, ( i + 2 ) & 3 );
+	}
+	return rotated;
+}
+
+// BINOP(op): complete function body applying 'op' lane by lane to the FLOAT
+// values of two fltx4 operands. The enclosing function must name its
+// operands 'a' and 'b'; the macro declares retVal and returns it.
+#define BINOP(op) 														\
+	fltx4 retVal;                                          				\
+	SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) );	\
+	SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) );	\
+	SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) );	\
+	SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) );	\
+    return retVal;
+
+// IBINOP(op): same as BINOP, but 'op' is applied to the raw 32-bit INTEGER
+// contents of each lane (used for the bitwise operations).
+#define IBINOP(op) 														\
+	fltx4 retVal;														\
+	SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) );		\
+	SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) );		\
+	SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) );		\
+	SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) );		\
+    return retVal;
+
+// Lane-wise float addition: a + b. The body is generated by BINOP.
+FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
+{
+	BINOP(+);
+}
+
+// Lane-wise float subtraction: a - b. The body is generated by BINOP.
+FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )				// a-b
+{
+	BINOP(-);
+}
+
+// Lane-wise float multiplication: a * b. The body is generated by BINOP.
+FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )				// a*b
+{
+	BINOP(*);
+}
+
+// Lane-wise float division: a / b. The body is generated by BINOP; no check for zero divisors.
+FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )				// a/b
+{
+	BINOP(/);
+}
+
+
+// Multiply-add per lane: a*b + c (computed as a MulSIMD followed by an AddSIMD).
+FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// a*b + c
+{
+	const fltx4 product = MulSIMD( a, b );
+	return AddSIMD( product, c );
+}
+
+// Negated multiply-subtract per lane: c - a*b (a MulSIMD followed by a SubSIMD).
+FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// c - a*b
+{
+	return SubSIMD( c, MulSIMD(a,b) );
+}
+
+
+// Sine of each lane; input in radians.
+FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
+{
+	fltx4 sines;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( sines, i ) = sin( SubFloat( radians, i ) );
+	}
+	return sines;
+}
+
+// Sine and cosine of the x, y and z lanes only; the w lanes of 'sine' and
+// 'cosine' are left untouched.
+FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	for ( int i = 0; i < 3; ++i )
+	{
+		SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
+	}
+}
+
+// Sine and cosine of all four lanes, written to 'sine' and 'cosine'.
+FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
+	}
+}
+
+// Arc sine of each lane.
+FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
+{
+	fltx4 angles;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( angles, i ) = asin( SubFloat( sine, i ) );
+	}
+	return angles;
+}
+
+// Arc cosine of each lane.
+FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
+{
+	fltx4 angles;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( angles, i ) = acos( SubFloat( cs, i ) );
+	}
+	return angles;
+}
+
+// Lane-wise atan2( a, b ), i.e. tan^-1(a/b): pass the sine in as a and the cosine in as b.
+FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
+{
+	fltx4 angles;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( angles, i ) = atan2( SubFloat( a, i ), SubFloat( b, i ) );
+	}
+	return angles;
+}
+
+// Lane-wise maximum of a and b.
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
+{
+	fltx4 larger;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( larger, i ) = max( SubFloat( a, i ), SubFloat( b, i ) );
+	}
+	return larger;
+}
+
+// Lane-wise minimum of a and b.
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
+{
+	fltx4 smaller;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( smaller, i ) = min( SubFloat( a, i ), SubFloat( b, i ) );
+	}
+	return smaller;
+}
+
+// Bitwise AND of the raw lane contents: a & b. The body is generated by IBINOP.
+FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b
+{
+	IBINOP(&);
+}
+
+// Bitwise and-not on the raw lane contents. Note that it is the FIRST operand that is complemented.
+FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b
+{
+	fltx4 bits;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( bits, i ) = ~SubInt( a, i ) & SubInt( b, i );
+	}
+	return bits;
+}
+
+// Bitwise XOR of the raw lane contents: a ^ b. The body is generated by IBINOP.
+FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )				// a ^ b
+{
+	IBINOP(^);
+}
+
+// Bitwise OR of the raw lane contents: a | b. The body is generated by IBINOP.
+FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )				// a | b
+{
+	IBINOP(|);
+}
+
+// Lane-wise negation: -a
+FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
+{
+	fltx4 negated;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( negated, i ) = -SubFloat( a, i );
+	}
+	return negated;
+}
+
+// True when every float lane of a compares equal to zero.
+FORCEINLINE bool IsAllZeros( const fltx4 & a )								// all floats of a zero?
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		if ( !( SubFloat( a, i ) == 0.0 ) )
+			return false;
+	}
+	return true;
+}
+
+
+// For branching: true only when a > b holds in all four lanes.
+FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		// negated form (rather than <=) so that NaN lanes still fail the test
+		if ( !( SubFloat(a,i) > SubFloat(b,i) ) )
+			return false;
+	}
+	return true;
+}
+
+// For branching: true only when a >= b holds in all four lanes.
+FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		// negated form (rather than <) so that NaN lanes still fail the test
+		if ( !( SubFloat(a,i) >= SubFloat(b,i) ) )
+			return false;
+	}
+	return true;
+}
+
+// For branching: true only when a == b holds in all four lanes (float comparison).
+FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		if ( !( SubFloat(a,i) == SubFloat(b,i) ) )
+			return false;
+	}
+	return true;
+}
+
+// Packs the sign bits of the four lanes into bits 0..3 of the result:
+// sign(x) -> bit 0, sign(y) -> bit 1, sign(z) -> bit 2, sign(w) -> bit 3.
+FORCEINLINE int TestSignSIMD( const fltx4 & a )								// mask of which floats have the high bit set
+{
+	int nSignBits = 0;
+	for ( int i = 0; i < 4; ++i )
+	{
+		// isolate bit 31 of lane i and move it down to bit i
+		nSignBits |= ( SubInt( a, i ) & 0x80000000 ) >> ( 31 - i );
+	}
+	return nSignBits;
+}
+
+// True when at least one lane has its sign bit set (tested via TestSignSIMD).
+FORCEINLINE bool IsAnyNegative( const fltx4 & a )							// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
+{
+	const int nSignBits = TestSignSIMD( a );
+	return nSignBits != 0;
+}
+
+// Lane-wise a == b: a lane of the result is all ones where the test holds, all zeros otherwise.
+FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? ~0:0
+{
+	fltx4 mask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( mask, i ) = ( SubFloat( a, i ) == SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return mask;
+}
+
+// Lane-wise a > b: a lane of the result is all ones where the test holds, all zeros otherwise.
+FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? ~0:0
+{
+	fltx4 mask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( mask, i ) = ( SubFloat( a, i ) > SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return mask;
+}
+
+// Lane-wise a >= b: a lane of the result is all ones where the test holds, all zeros otherwise.
+FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? ~0:0
+{
+	fltx4 mask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( mask, i ) = ( SubFloat( a, i ) >= SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return mask;
+}
+
+// Lane-wise a < b: a lane of the result is all ones where the test holds, all zeros otherwise.
+FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? ~0:0
+{
+	fltx4 mask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( mask, i ) = ( SubFloat( a, i ) < SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return mask;
+}
+
+// Lane-wise a <= b: a lane of the result is all ones where the test holds, all zeros otherwise.
+FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )				// (a<=b) ? ~0:0
+{
+	fltx4 mask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( mask, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) ) ? ~0 : 0;
+	}
+	return mask;
+}
+
+// Lane-wise symmetric range test: all ones where -b <= a <= b, all zeros otherwise.
+FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? ~0 : 0
+{
+	fltx4 mask;
+	for ( int i = 0; i < 4; ++i )
+	{
+		const float flValue = SubFloat( a, i );
+		const float flBound = SubFloat( b, i );
+		SubInt( mask, i ) = ( flValue <= flBound && flValue >= -flBound ) ? ~0 : 0;
+	}
+	return mask;
+}
+
+
+// Bitwise select: takes the bits of NewValue where ReplacementMask is set and
+// the bits of OldValue where it is clear.
+FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
+{
+	const fltx4 takenFromNew = AndSIMD( ReplacementMask, NewValue );
+	const fltx4 keptFromOld = AndNotSIMD( ReplacementMask, OldValue );
+	return OrSIMD( takenFromNew, keptFromOld );
+}
+
+// Build a vector with flValue in all four float lanes.
+FORCEINLINE fltx4 ReplicateX4( float flValue )					//  a,a,a,a
+{
+	fltx4 splat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( splat, i ) = flValue;
+	}
+	return splat;
+}
+
+/// Build a vector with the 32 bit integer nValue stored (as raw bits) in all 4 lanes.
+FORCEINLINE fltx4 ReplicateIX4( int nValue )
+{
+	fltx4 splat;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubInt( splat, i ) = nValue;
+	}
+	return splat;
+}
+
+// Lane-wise ceil(): round each lane towards positive infinity.
+FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
+{
+	fltx4 rounded;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( rounded, i ) = ceil( SubFloat( a, i ) );
+	}
+	return rounded;
+}
+
+// Lane-wise floor(): round each lane towards negative infinity.
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
+{
+	fltx4 rounded;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( rounded, i ) = floor( SubFloat( a, i ) );
+	}
+	return rounded;
+}
+
+// "Estimate" square root per lane. This fallback simply calls sqrt(), exactly like SqrtSIMD.
+FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )				// sqrt(a), more or less
+{
+	fltx4 roots;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( roots, i ) = sqrt( SubFloat( a, i ) );
+	}
+	return roots;
+}
+
+// Square root of each lane.
+FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )					// sqrt(a)
+{
+	fltx4 roots;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( roots, i ) = sqrt( SubFloat( a, i ) );
+	}
+	return roots;
+}
+
+// "Estimate" reciprocal square root per lane. This fallback computes the full 1/sqrt().
+FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )		// 1/sqrt(a), more or less
+{
+	fltx4 invRoots;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( invRoots, i ) = 1.0 / sqrt( SubFloat( a, i ) );
+	}
+	return invRoots;
+}
+
+// 1/sqrt(a) per lane, with lanes equal to zero replaced by FLT_EPSILON first so
+// that the result for a zero input is large but finite.
+FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
+{
+	fltx4 invRoots;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( invRoots, i ) = 1.0 / sqrt( SubFloat( a, i ) != 0.0f ? SubFloat( a, i ) : FLT_EPSILON );
+	}
+	return invRoots;
+}
+
+// Reciprocal square root of each lane.
+FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )			// 1/sqrt(a)
+{
+	fltx4 invRoots;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( invRoots, i ) = 1.0 / sqrt( SubFloat( a, i ) );
+	}
+	return invRoots;
+}
+
+// "Estimate" reciprocal per lane. This fallback performs the full division, exactly like ReciprocalSIMD.
+FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )			// 1/a, more or less
+{
+	fltx4 inverses;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( inverses, i ) = 1.0 / SubFloat( a, i );
+	}
+	return inverses;
+}
+
+// Reciprocal of each lane (no protection against zero lanes).
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )				// 1/a
+{
+	fltx4 inverses;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( inverses, i ) = 1.0 / SubFloat( a, i );
+	}
+	return inverses;
+}
+
+/// 1/x for all 4 values.
+/// A zero lane is replaced by FLT_EPSILON before dividing, so 1/0 gives a big
+/// but NOT infinite result.
+FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
+{
+	fltx4 inverses;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( inverses, i ) = 1.0 / (SubFloat( a, i ) == 0.0f ? FLT_EPSILON : SubFloat( a, i ));
+	}
+	return inverses;
+}
+
+// 1/x for all 4 values; a zero lane is replaced by FLT_EPSILON before dividing,
+// so the result stays finite.
+FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
+{
+	fltx4 inverses;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( inverses, i ) = 1.0 / (SubFloat( a, i ) == 0.0f ? FLT_EPSILON : SubFloat( a, i ));
+	}
+	return inverses;
+}
+
+// Base-2 exponential (the antilog): 2^x for every lane.
+FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
+{
+	fltx4 powers;
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( powers, i ) = powf( 2, SubFloat(toPower, i) );
+	}
+	return powers;
+}
+
+// Dot product of the x, y and z lanes of a and b (w is ignored), replicated
+// into all four lanes of the result.
+FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
+{
+	float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
+		SubFloat( a, 1 ) * SubFloat( b, 1 ) + 
+		SubFloat( a, 2 ) * SubFloat( b, 2 );
+	return ReplicateX4( flDot );
+}
+
+// Dot product over all four lanes of a and b, replicated into all four lanes
+// of the result.
+FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
+{
+	float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
+		SubFloat( a, 1 ) * SubFloat( b, 1 ) + 
+		SubFloat( a, 2 ) * SubFloat( b, 2 ) +
+		SubFloat( a, 3 ) * SubFloat( b, 3 );
+	return ReplicateX4( flDot );
+}
+
+// Clamps every lane of 'in' to the range [min, max]: first capped from above
+// by max, then raised to at least min.
+FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
+{
+	const fltx4 capped = MinSIMD( max, in );
+	return MaxSIMD( min, capped );
+}
+
+// Squelch the w component of a vector to +0.0.
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
+// Returns a copy of a whose x, y and z lanes are unchanged.
+FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
+{
+	fltx4 retval;
+	retval = a;
+	// w is lane 3 (lane 0 is x)
+	SubFloat( retval, 3 ) = 0;
+	return retval;
+}
+
+// Load four floats from pSIMD. In this standard-C version the load is a plain
+// dereference of the pointer reinterpreted as a fltx4.
+FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
+{
+	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
+	return *pSource;
+}
+
+// Load a 3-float vector from pSIMD. Note that this version still reads a whole
+// fltx4 (four floats) from the address, so the w lane is whatever follows in memory.
+FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
+{
+	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
+	return *pSource;
+}
+
+// Load four floats from pSIMD. In this standard-C version it is identical to
+// LoadUnalignedSIMD.
+FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
+{
+	const fltx4 *pSource = reinterpret_cast< const fltx4 *> ( pSIMD );
+	return *pSource;
+}
+
+// Transitional overload: loads a VectorAligned (a 3-component vector) and
+// clears the w lane of the result.
+FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
+{
+	fltx4 loaded = LoadAlignedSIMD( pSIMD.Base() );
+	SubInt( loaded, 3 ) = 0;	// squelch w by writing zero bits
+	return loaded;
+}
+
+// Store all four lanes of a to pSIMD (a plain fltx4 assignment through the pointer).
+FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
+{
+	fltx4 *pDest = reinterpret_cast< fltx4 *> ( pSIMD );
+	*pDest = a;
+}
+
+// Store all four lanes of a to pSIMD. In this standard-C version it is
+// identical to StoreAlignedSIMD.
+FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
+{
+	fltx4 *pDest = reinterpret_cast< fltx4 *> ( pSIMD );
+	*pDest = a;
+}
+
+// Store only the x, y and z lanes of a to pSIMD[0..2]; nothing is written past the third float.
+FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
+{
+	for ( int i = 0; i < 3; ++i )
+	{
+		pSIMD[i] = SubFloat(a, i);
+	}
+}
+
+// Strongly typed store into a VectorAligned -- syntactic castor oil used for
+// typechecking as we transition to SIMD. Forwards to StoreAlignedSIMD, which
+// in this version writes all four lanes.
+FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
+{
+	float *pDest = pSIMD->Base();
+	StoreAlignedSIMD( pDest, a );
+}
+
+// In-place transpose of the 4x4 matrix whose rows are x, y, z and w: every
+// element above the diagonal is swapped with its mirror below the diagonal.
+FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
+{
+// SWAP_FLOATS exchanges lane _ia_ of _a_ with lane _ib_ of _b_.
+// Note: the macro is not #undef'd, so it stays defined after this function.
+#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; }
+	SWAP_FLOATS( x, 1, y, 0 );
+	SWAP_FLOATS( x, 2, z, 0 );
+	SWAP_FLOATS( x, 3, w, 0 );
+	SWAP_FLOATS( y, 2, z, 1 );
+	SWAP_FLOATS( y, 3, w, 1 );
+	SWAP_FLOATS( z, 3, w, 2 );
+}
+
+// Smallest of a.x, a.y and a.z (w is ignored), replicated into all four lanes.
+FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
+{
+	const float flLowestXY = min( SubFloat(a, 0), SubFloat(a, 1) );
+	const float flLowest = min( flLowestXY, SubFloat(a, 2) );
+	return ReplicateX4( flLowest );
+}
+
+// Largest of a.x, a.y and a.z (w is ignored), replicated into all four lanes.
+FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
+{
+	const float flHighestXY = max( SubFloat(a, 0), SubFloat(a, 1) );
+	const float flHighest = max( flHighestXY, SubFloat(a, 2) );
+	return ReplicateX4( flHighest );
+}
+
+// Convert each float lane of vSrc to a SIGNED int and store it in pDest:
+// pDest->x = Int (vSrc.x), and so on for y, z and w.
+// The conversion here is the language's implicit float-to-int conversion.
+// note: some architectures can do fixed point conversion when the fix depth
+// is given as an immediate.. but there is no way to guarantee an immediate
+// as a parameter to a function like this.
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		(*pDest)[i] = SubFloat(vSrc, i);
+	}
+}
+
+// ------------------------------------
+// INTEGER SIMD OPERATIONS.
+// ------------------------------------
+// Splat the signed int nValue into the raw integer contents of all four lanes.
+FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
+{
+	fltx4 splat;
+	for ( int i = 3; i >= 0; --i )
+	{
+		SubInt( splat, i ) = nValue;
+	}
+	return splat;
+}
+
+// Load 4 aligned 32-bit words from pSIMD into a SIMD value (plain dereference here).
+FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD)
+{
+	const i32x4 *pSource = reinterpret_cast< const i32x4 *> ( pSIMD );
+	return *pSource;
+}
+
+// Load 4 unaligned 32-bit words from pSIMD into a SIMD value. In this
+// standard-C version it is identical to LoadAlignedIntSIMD.
+FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	const i32x4 *pSource = reinterpret_cast< const i32x4 *> ( pSIMD );
+	return *pSource;
+}
+
+// Save the four lanes of a into four 32-bit words at pSIMD (16-byte aligned).
+FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
+{
+	i32x4 *pDest = reinterpret_cast< i32x4 *> ( pSIMD );
+	*pDest = a;
+}
+
+// Overload that saves the four lanes of a into an intx4.
+FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
+{
+	i32x4 *pDest = reinterpret_cast< i32x4 *> ( pSIMD.Base() );
+	*pDest = a;
+}
+
+// Save the four lanes of a into four 32-bit words at pSIMD. In this
+// standard-C version it is identical to the int32* StoreAlignedIntSIMD.
+FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
+{
+	i32x4 *pDest = reinterpret_cast< i32x4 *> ( pSIMD );
+	*pDest = a;
+}
+
+// Take a fltx4 containing fixed-point uints and 
+// return them as single precision floats. No
+// fixed point conversion is done.
+// Each lane of the result is the numeric value of the matching unsigned
+// lane of vSrcA converted to float.
+FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
+{
+	Assert(0);			/* pc has no such operation */
+	fltx4 retval;
+	// The lanes must be read from vSrcA: retval is still uninitialized here.
+	SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
+	SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
+	SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
+	SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
+	return retval;
+}
+
+
+#if 0				/* pc has no such op */
+// NOTE: everything from here down to the matching #endif is compiled out.
+// NOTE(review): SignedIntConvertToFltSIMD casts the ADDRESS of each lane to
+// float rather than the lane's value, and it refers to an m128_s32 member that
+// the standard-C fltx4 union does not declare. Fix both before enabling this.
+
+// Take a fltx4 containing fixed-point sints and 
+// return them as single precision floats. No 
+// fixed point conversion is done.
+FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	fltx4 retval;
+	SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[0])) );
+	SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[1])) );
+	SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[2])) );
+	SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[3])) );
+	return retval;
+}
+
+
+/*
+  works on fltx4's as if they are four uints.
+  the first parameter contains the words to be shifted,
+  the second contains the amount to shift by AS INTS
+
+  for i = 0 to 3
+  shift = vSrcB_i*32:(i*32)+4
+  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
+*/
+FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
+{
+	i32x4 retval;
+	// each lane of vSrcA is shifted left by the amount held in the same lane of vSrcB
+	SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
+	SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
+	SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
+	SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
+
+
+	return retval;
+}
+#endif
+
+#elif ( defined( _X360 ) )
+
+//---------------------------------------------------------------------
+// X360 implementation
+//---------------------------------------------------------------------
+
+FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )	// writable reference to float component idx of a
+{
+	fltx4_union & a_union = (fltx4_union &)a;	// view the vector through fltx4_union so it can be indexed
+	return a_union.m128_f32[idx];
+}
+
+FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )	// writable reference to component idx of a, seen as a 32-bit uint
+{
+	fltx4_union & a_union = (fltx4_union &)a;	// view the vector through fltx4_union so it can be indexed
+	return a_union.m128_u32[idx];
+}
+
+FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )	// a+b (VMX __vaddfp)
+{
+	return __vaddfp( a, b );
+}
+
+FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )				// a-b (VMX __vsubfp)
+{
+	return __vsubfp( a, b );
+}
+
+FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )				// a*b (VMX __vmulfp)
+{
+	return __vmulfp( a, b );
+}
+
+FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )				// a*b + c (VMX __vmaddfp)
+{
+	return __vmaddfp( a, b, c );
+}
+
+// c - a*b, computed with the VMX __vnmsubfp intrinsic.
+FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
+{
+	const fltx4 difference = __vnmsubfp( a, b, c );
+	return difference;
+}
+
+FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )	// 3-component dot product of a and b (VMX __vmsum3fp)
+{
+	return __vmsum3fp( a, b );
+}
+
+FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )	// 4-component dot product of a and b (VMX __vmsum4fp)
+{
+	return __vmsum4fp( a, b );
+}
+
+FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )	// sine of radians, via XMVectorSin
+{
+	return XMVectorSin( radians );
+}
+
+FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )	// writes sin/cos of radians; on X360 this is the same call as SinCosSIMD
+{
+	XMVectorSinCos( &sine, &cosine, radians ); 	
+}
+
+FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )			// writes sin and cos of radians into the two outputs
+{
+	XMVectorSinCos( &sine, &cosine, radians ); 	
+}
+
+FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )				// cosine = cos(radians); result goes to the out-parameter, not a return value
+{
+	cosine = XMVectorCos( radians ); 	
+}
+
+FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )	// arcsine of sine, via XMVectorASin
+{
+	return XMVectorASin( sine );
+}
+
+FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )	// arccosine of cs, via XMVectorACos
+{
+	return XMVectorACos( cs );
+}
+
+// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
+FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
+{
+	return XMVectorATan2( a, b );
+}
+
+// DivSIMD defined further down, since it uses ReciprocalSIMD
+
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b) (VMX __vmaxfp)
+{
+	return __vmaxfp( a, b );
+}
+
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b) (VMX __vminfp)
+{
+	return __vminfp( a, b );
+}
+
+FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b (bitwise, VMX __vand)
+{
+    return __vand( a, b );
+}
+
+FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b (bitwise; same operand meaning as the SSE version)
+{
+	// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
+    return __vandc( b, a );
+}
+
+FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )				// a ^ b (bitwise, VMX __vxor)
+{
+    return __vxor( a, b );
+}
+
+FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )				// a | b (bitwise, VMX __vor)
+{
+    return __vor( a, b );
+}
+
+FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a (via XMVectorNegate)
+{
+	return XMVectorNegate(a);
+}
+
+FORCEINLINE bool IsAllZeros( const fltx4 & a )								// all floats of a zero?
+{
+	unsigned int equalFlags = 0;	// receives the flags produced by the recording compare below
+    __vcmpeqfpR( a, Four_Zeros, &equalFlags );	// float-equality compare of a against Four_Zeros
+    return XMComparisonAllTrue( equalFlags );	// true only when every component compared equal
+}
+
+FORCEINLINE bool IsAnyZeros( const fltx4 & a )								// any floats are zero?
+{
+	unsigned int conditionregister;	// filled in by XMVectorEqualR
+	XMVectorEqualR(&conditionregister, a, XMVectorZero());	// compare a against the zero vector
+	return XMComparisonAnyTrue(conditionregister);	// true when at least one component compared equal
+}
+
+FORCEINLINE bool IsAnyXYZZero( const fltx4 &a )								// are any of x,y,z zero? (w is ignored)
+{
+	// copy a's x component into w, in case w was zero.
+	// (w then just duplicates x, so it cannot add a false positive)
+	fltx4 temp = __vrlimi(a, a, 1, 1);
+	unsigned int conditionregister;	// filled in by XMVectorEqualR
+	XMVectorEqualR(&conditionregister, temp, XMVectorZero());
+	return XMComparisonAnyTrue(conditionregister);
+}
+
+// for branching when a.xyzw > b.xyzw
+// (true only if the comparison holds for all four components)
+FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
+{
+	unsigned int cr;	// comparison flags written by XMVectorGreaterR
+	XMVectorGreaterR(&cr,a,b);
+	return XMComparisonAllTrue(cr);
+}
+
+// for branching when a.xyzw >= b.xyzw
+// (true only if the comparison holds for all four components)
+FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
+{
+	unsigned int cr;	// comparison flags written by XMVectorGreaterOrEqualR
+	XMVectorGreaterOrEqualR(&cr,a,b);
+	return XMComparisonAllTrue(cr);
+}
+
+// For branching if all a.xyzw == b.xyzw
+FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
+{
+	unsigned int cr;	// comparison flags written by XMVectorEqualR
+	XMVectorEqualR(&cr,a,b);
+	return XMComparisonAllTrue(cr);
+}
+
+
+// Builds a 4-bit mask of which floats have the high (sign) bit set:
+// sign(x) -> bit 0, sign(y) -> bit 1, sign(z) -> bit 2, sign(w) -> bit 3.
+FORCEINLINE int TestSignSIMD( const fltx4 & a )
+{
+	// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
+	const fltx4_union & lanes = (const fltx4_union &)a;
+
+	int nSignMask = 0;
+	for ( int iLane = 0; iLane < 4; ++iLane )
+	{
+		// isolate the top bit of this lane, then slide it down to bit position iLane
+		nSignMask |= ( lanes.m128_u32[iLane] & 0x80000000 ) >> ( 31 - iLane );
+	}
+	return nSignMask;
+}
+
+// Squelch the w component of a vector to +0.0; x, y and z pass through unchanged.
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
+FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
+{
+	return __vrlimi( a, __vzero(), 1, 0 );	// mask 1 = w slot only, taken from the zero vector
+}
+
+FORCEINLINE bool IsAnyNegative( const fltx4 & a )							// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
+{
+	// NOTE: this tests the top bits of each vector element using integer math
+	//       (so it ignores NaNs - it will return true for "-NaN")
+	unsigned int equalFlags = 0;
+    fltx4 signMask = __vspltisw( -1 );             // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
+    signMask       = __vslw( signMask, signMask ); // 0x80000000 0x80000000 0x80000000 0x80000000 
+	__vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );	// integer compare: are the isolated sign bits all zero?
+	return !XMComparisonAllTrue( equalFlags );	// not all zero -> at least one sign bit is set
+}
+
+FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? ~0:0 -- a bit mask, not a bool
+{
+    return __vcmpeqfp( a, b );
+}
+
+
+FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? ~0:0 -- a bit mask, not a bool
+{
+    return __vcmpgtfp( a, b );
+}
+
+FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? ~0:0 -- a bit mask, not a bool
+{
+    return __vcmpgefp( a, b );
+}
+
+FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? ~0:0
+{
+    return __vcmpgtfp( b, a );	// a<b expressed as b>a: the operands are swapped on purpose
+}
+
+FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )				// (a<=b) ? ~0:0
+{
+    return __vcmpgefp( b, a );	// a<=b expressed as b>=a: the operands are swapped on purpose
+}
+
+FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? ~0 : 0, via XMVectorInBounds
+{
+	return XMVectorInBounds( a, b );
+}
+
+// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
+FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
+{
+    return __vsel( OldValue, NewValue, ReplacementMask );	// note the argument order: old value first, mask last
+}
+
+// AKA "Broadcast", "Splat": returns a vector with flValue in all four components
+FORCEINLINE fltx4 ReplicateX4( float flValue )					//  a,a,a,a
+{
+	// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
+	float * pValue = &flValue;
+	Assert( pValue );
+    Assert( ((unsigned int)pValue & 3) == 0);	// the address must be 4-byte aligned
+	return __vspltw( __lvlx( pValue, 0 ), 0 );	// load from the address, then splat word 0 to every slot
+}
+
+FORCEINLINE fltx4 ReplicateX4( const float *pValue )					//  a,a,a,a  where a = *pValue
+{
+	Assert( pValue );
+	return __vspltw( __lvlx( pValue, 0 ), 0 );	// load from the address, then splat word 0 to every slot
+}
+
+/// replicate a single 32 bit integer value to all 4 components of an m128
+/// (the integer's bits are copied as-is; nothing is converted to float)
+FORCEINLINE fltx4 ReplicateIX4( int nValue )
+{
+	// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
+	int * pValue = &nValue;
+	Assert( pValue );
+    Assert( ((unsigned int)pValue & 3) == 0);	// the address must be 4-byte aligned
+	return __vspltw( __lvlx( pValue, 0 ), 0 );	// load from the address, then splat word 0 to every slot
+}
+
+// Round towards positive infinity (VMX __vrfip)
+FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
+{
+	return __vrfip(a);
+}
+
+// Round towards nearest integer (VMX __vrfin)
+FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
+{
+	return __vrfin(a);
+}
+
+// Round towards negative infinity (VMX __vrfim)
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
+{
+	return __vrfim(a);
+}
+
+FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )				// sqrt(a), more or less (an estimate)
+{
+	// This is emulated from rsqrt
+	return XMVectorSqrtEst( a );
+}
+
+FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )					// sqrt(a), via XMVectorSqrt
+{
+	// This is emulated from rsqrt
+	return XMVectorSqrt( a );
+}
+
+FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )		// 1/sqrt(a), more or less (raw __vrsqrtefp estimate)
+{
+    return __vrsqrtefp( a );
+}
+
+// Estimate of 1/sqrt(a) in which components equal to zero are first
+// replaced by epsilon, so the estimate is never taken of an exact zero.
+FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
+{
+	// all bits set in each component where a == 0
+	const fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
+	// epsilon's bits in those components, zero bits everywhere else
+	const fltx4 epsilonWhereZero = AndSIMD( Four_Epsilons, isZero );
+	return ReciprocalSqrtEstSIMD( OrSIMD( a, epsilonWhereZero ) );
+}
+
+FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )			// 1/sqrt(a), refined (compare ReciprocalSqrtEstSIMD)
+{
+	// This uses Newton-Raphson to improve the HW result
+ 	return XMVectorReciprocalSqrt( a );
+}
+
+FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )			// 1/a, more or less (raw __vrefp estimate)
+{
+    return __vrefp( a );
+}
+
+/// 1/x for all 4 values. Uses the reciprocal approximation instruction plus Newton iteration.
+/// No error checking! (zero inputs are not special-cased; see ReciprocalSaturateSIMD)
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )				// 1/a
+{
+	// This uses Newton-Raphson to improve the HW result
+	return XMVectorReciprocal( a );
+}
+
+// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
+FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )	// a/b, computed as (1/b) * a
+{
+	return MulSIMD( ReciprocalSIMD( b ), a );
+}
+
+/// Estimated 1/x for all 4 values.
+/// 1/0 will result in a big but NOT infinite result, because zeros
+/// are swapped for epsilon before the estimate is taken.
+FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
+{
+	// all bits set in each component where a == 0
+	const fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
+	// epsilon's bits in those components, zero bits everywhere else
+	const fltx4 epsilonWhereZero = AndSIMD( Four_Epsilons, isZero );
+	return ReciprocalEstSIMD( OrSIMD( a, epsilonWhereZero ) );
+}
+
+// Refined 1/a in which components equal to zero are first replaced by
+// epsilon, so the result is large rather than a division by zero.
+FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
+{
+	// FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the code below does)
+	// fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
+	// fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
+	// return ReciprocalSIMD( a_safe );
+
+	// all bits set in each component where a == 0
+	const fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
+	// OR-ing epsilon in (rather than selecting it) keeps a's own bits, sign included
+	const fltx4 epsilonWhereZero = AndSIMD( Four_Epsilons, isZero );
+	return ReciprocalSIMD( OrSIMD( a, epsilonWhereZero ) );
+}
+
+// CHRISG: is it worth doing integer bitfiddling for this?
+// 2^x for all values (the antilog); forwards to XMVectorExp
+FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
+{
+	return XMVectorExp(toPower);
+}
+
+// Clamps the components of a vector to a specified minimum and maximum range
+// (forwards to XMVectorClamp).
+FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
+{
+	return XMVectorClamp(in, min, max);
+}
+
+FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )	// load a 4-vector from pSIMD (via XMLoadVector4); no 16-byte alignment needed
+{
+	return XMLoadVector4( pSIMD );
+}
+
+// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec); forwards to XMLoadVector3. 
+FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
+{
+	return XMLoadVector3( pSIMD );
+}
+
+FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )	// load a 4-vector by reading pSIMD directly as a fltx4 (caller guarantees alignment)
+{
+	return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
+}
+
+// for the transitional class -- load a 3-by VectorAligned and squash its w component
+// (reference overload)
+FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
+{
+	fltx4 out = XMLoadVector3A(pSIMD.Base());
+	// squelch w
+	return __vrlimi( out, __vzero(), 1, 0 );	// mask 1 = w slot only, taken from the zero vector
+}
+
+// for the transitional class -- load a 3-by VectorAligned and squash its w component
+// (pointer overload)
+FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD )
+{
+	fltx4 out = XMLoadVector3A(pSIMD);
+	// squelch w
+	return __vrlimi( out, __vzero(), 1, 0 );	// mask 1 = w slot only, taken from the zero vector
+}
+
+FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )	// write all four components of a by treating pSIMD as a fltx4 (caller guarantees alignment)
+{
+	*( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
+}
+
+FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )	// write a 4-vector to pSIMD via XMStoreVector4
+{
+	XMStoreVector4( pSIMD, a );
+}
+
+FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )	// write a 3-vector to pSIMD via XMStoreVector3
+{
+	XMStoreVector3( pSIMD, a );
+}
+
+
+// strongly typed -- for typechecking as we transition to SIMD.
+// Stores a into *pSIMD via XMStoreVector3A.
+FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
+{
+	XMStoreVector3A(pSIMD->Base(),a);
+}
+
+
+// Fixed-point conversion and save as SIGNED INTS.
+// pDest->x = Int (vSrc.x)
+// note: some architectures have means of doing 
+// fixed point conversion when the fix depth is
+// specified as an immediate.. but there is no way 
+// to guarantee an immediate as a parameter to a function
+// like this.
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
+{
+	fltx4 asInt = __vctsxs( vSrc, 0 );	// convert to signed ints; the 0 is the immediate the note above refers to
+	XMStoreVector4A(pDest->Base(), asInt);	// aligned store of the four converted words
+}
+
+// Treats x, y, z, w as the four rows of a 4x4 matrix and transposes
+// it in place: on return each argument holds one row of the transpose.
+FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
+{
+	XMMATRIX rows = _XMMATRIX( x, y, z, w );
+	const XMMATRIX transposed = XMMatrixTranspose( rows );
+
+	x = transposed.r[0];
+	y = transposed.r[1];
+	z = transposed.r[2];
+	w = transposed.r[3];
+}
+
+// Return zero in the fastest way -- faster even than loading.
+FORCEINLINE fltx4 LoadZeroSIMD( void )
+{
+	return XMVectorZero();
+}
+
+// Return one (in every component) in the fastest way -- faster even than loading.
+FORCEINLINE fltx4 LoadOneSIMD( void )
+{
+	return XMVectorSplatOne();
+}
+
+FORCEINLINE fltx4 SplatXSIMD( fltx4 a )	// replicate a.x to all four components
+{
+	return XMVectorSplatX( a );
+}
+
+FORCEINLINE fltx4 SplatYSIMD( fltx4 a )	// replicate a.y to all four components
+{
+	return XMVectorSplatY( a );
+}
+
+FORCEINLINE fltx4 SplatZSIMD( fltx4 a )	// replicate a.z to all four components
+{
+	return XMVectorSplatZ( a );
+}
+
+FORCEINLINE fltx4 SplatWSIMD( fltx4 a )	// replicate a.w to all four components
+{
+	return XMVectorSplatW( a );
+}
+
+// Returns a copy of a whose x component is replaced by the x component of x.
+FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
+{
+	// mask 8 picks the x slot; rotate count 0 leaves the source unrotated
+	return __vrlimi( a, x, 8, 0 );
+}
+
+// Returns a copy of a whose y component is replaced by the y component of y.
+FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
+{
+	// mask 4 picks the y slot; rotate count 0 leaves the source unrotated
+	return __vrlimi( a, y, 4, 0 );
+}
+
+// Returns a copy of a whose z component is replaced by the z component of z.
+FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
+{
+	// mask 2 picks the z slot; rotate count 0 leaves the source unrotated
+	return __vrlimi( a, z, 2, 0 );
+}
+
+// Returns a copy of a whose w component is replaced by the w component of w.
+FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
+{
+	// mask 1 picks the w slot; rotate count 0 leaves the source unrotated
+	return __vrlimi( a, w, 1, 0 );
+}
+
+FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )	// copy of a with component nComponent (0..3 = x..w) set to flValue
+{
+	static int s_nVrlimiMask[4] = { 8, 4, 2, 1 };	// __vrlimi masks for x, y, z, w (same values SetXSIMD..SetWSIMD use)
+	fltx4 val = ReplicateX4( flValue );
+	fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0);	// NOTE(review): mask is a runtime value here, unlike every other __vrlimi call in this file -- confirm the intrinsic accepts that
+	return result;
+}
+
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a )	// a b c d -> b c d a (per the SSE version's comment)
+{
+	fltx4 compareOne = a;
+	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );	// all four slots replaced from a rotated left by one
+}
+
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )	// a b c d -> c d a b (per the SSE version's comment)
+{
+	fltx4 compareOne = a;
+	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );	// all four slots replaced from a rotated left by two
+}
+
+
+
+// find the lowest component of a.x, a.y, a.z,
+// and replicate it to the whole return value.
+// ignores a.w.
+// Though this is only five instructions long,
+// they are all dependent, making this stall city.
+// Forcing this inline should hopefully help with scheduling.
+FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	// rotate left by one 
+	fltx4 compareOne = a ;
+	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
+	// compareOne is [y,z,G,G]
+	fltx4 retval = MinSIMD( a, compareOne );
+	// retVal is [min(x,y), min(y,z), G, G]
+	compareOne = __vrlimi( compareOne, a, 8 , 2);
+	// compareOne is [z, G, G, G] (only the x slot was rewritten; the rest no longer matters)
+	retval = MinSIMD( retval, compareOne );
+	// retVal = [ min(min(x,y),z), G, G, G ]
+	
+	// splat the x component out to the whole vector and return
+	return SplatXSIMD( retval );
+}
+
+// find the highest component of a.x, a.y, a.z,
+// and replicate it to the whole return value.
+// ignores a.w.
+// Though this is only five instructions long,
+// they are all dependent, making this stall city.
+// Forcing this inline should hopefully help with scheduling.
+FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	// rotate left by one 
+	fltx4 compareOne = a ;
+	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
+	// compareOne is [y,z,G,G]
+	fltx4 retval = MaxSIMD( a, compareOne );
+	// retVal is [max(x,y), max(y,z), G, G]
+	compareOne = __vrlimi( compareOne, a, 8 , 2);
+	// compareOne is [z, G, G, G] (only the x slot was rewritten; the rest no longer matters)
+	retval = MaxSIMD( retval, compareOne );
+	// retVal = [ max(max(x,y),z), G, G, G ]
+
+	// splat the x component out to the whole vector and return
+	return SplatXSIMD( retval );
+}
+
+
+// Transform many (horizontal) points in-place by a 3x4 matrix,
+// here already loaded into three fltx4 registers (one per matrix row). 
+// The points must be stored 16-byte aligned. They are points
+// and not vectors because we assume the w-component to be 1. 
+// To spare yourself the annoyance of loading the matrix yourself,
+// use one of the overloads below.
+void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3);
+
+// Transform many (horizontal) points in-place by a 3x4 matrix.
+// The points must be stored 16-byte aligned. They are points
+// and not vectors because we assume the w-component to be 1. 
+// The matrix itself may be unaligned: its rows are fetched with
+// unaligned loads before forwarding to the three-row overload.
+FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
+{
+	const fltx4 row0 = LoadUnalignedSIMD( pMatrix[0] );
+	const fltx4 row1 = LoadUnalignedSIMD( pMatrix[1] );
+	const fltx4 row2 = LoadUnalignedSIMD( pMatrix[2] );
+	TransformManyPointsBy( pVectors, numVectors, row0, row1, row2 );
+}
+
+// Transform many (horizontal) points in-place by a 3x4 matrix.
+// The points must be stored 16-byte aligned. They are points
+// and not vectors because we assume the w-component to be 1. 
+// Unlike TransformManyPointsBy, the matrix must itself sit on a
+// 16-byte boundary, because its rows are fetched with aligned loads.
+FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
+{
+	const fltx4 row0 = LoadAlignedSIMD( pMatrix[0] );
+	const fltx4 row1 = LoadAlignedSIMD( pMatrix[1] );
+	const fltx4 row2 = LoadAlignedSIMD( pMatrix[2] );
+	TransformManyPointsBy( pVectors, numVectors, row0, row1, row2 );
+}
+
+// ------------------------------------
+// INTEGER SIMD OPERATIONS.
+// ------------------------------------
+
+// Load 4 aligned words into a SIMD register (via XMLoadVector4A)
+FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	return XMLoadVector4A(pSIMD);
+}
+
+// Load 4 unaligned words into a SIMD register (via XMLoadVector4)
+FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD)
+{
+	return XMLoadVector4( pSIMD );
+}
+
+// save into four words, 16-byte aligned (pSIMD is written through as an i32x4)
+FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
+{
+	*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
+}
+
+FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )	// same store as the int32* overload, targeting pSIMD.Base()
+{
+	*( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
+}
+
+FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )	// save into four words via XMStoreVector4 (no alignment requirement)
+{
+	XMStoreVector4(pSIMD, a);
+}
+
+
+// Take a fltx4 containing fixed-point uints and 
+// return them as single precision floats. No
+// fixed point conversion is done (the immediate passed to __vcfux is 0).
+FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	return __vcfux( vSrcA, 0 );
+}
+
+
+// Take a fltx4 containing fixed-point sints and 
+// return them as single precision floats. No 
+// fixed point conversion is done (the immediate passed to __vcfsx is 0).
+FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	return __vcfsx( vSrcA, 0 );
+}
+
+// Take a fltx4 containing fixed-point uints and 
+// return them as single precision floats. Each uint
+// will be divided by 2^immed after conversion
+// (eg, this is fixed point math). 
+// This is a macro rather than a function so that uImmed reaches __vcfux unchanged.
+/* as if:
+   FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
+   {
+   return __vcfux( vSrcA, uImmed );
+   }
+*/
+#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))
+
+// Take a fltx4 containing fixed-point sints and 
+// return them as single precision floats. Each int
+// will be divided by 2^immed (eg, this is fixed point
+// math). 
+// This is a macro rather than a function so that uImmed reaches __vcfsx unchanged.
+/* as if:
+   FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
+   {
+   return __vcfsx( vSrcA, uImmed );
+   }
+*/
+#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
+
+// set all components of a vector to a signed immediate int number.
+// This is a macro rather than a function so that x reaches __vspltisw unchanged.
+/* as if:
+   FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)
+   {
+   return __vspltisw( toImmediate );
+   }
+*/
+#define IntSetImmediateSIMD(x) (__vspltisw(x))
+
+/*
+  works on fltx4's as if they are four uints.
+  the first parameter contains the words to be shifted,
+  the second contains the amount to shift by AS INTS
+  (implemented with the VMX __vslw intrinsic)
+
+  for i = 0 to 3
+  shift = vSrcB_i*32:(i*32)+4
+  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
+*/
+FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB)
+{
+	return __vslw(vSrcA, vSrcB);
+}
+
+FORCEINLINE float SubFloat( const fltx4 & a, int idx )	// read-only copy of float component idx of a
+{
+	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
+	const fltx4_union & a_union = (const fltx4_union &)a;	// view the vector through fltx4_union so it can be indexed
+	return a_union.m128_f32[ idx ];
+}
+
+FORCEINLINE float & SubFloat( fltx4 & a, int idx )	// writable reference to float component idx of a
+{
+	fltx4_union & a_union = (fltx4_union &)a;	// view the vector through fltx4_union so it can be indexed
+	return a_union.m128_f32[idx];
+}
+
+FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )	// float component idx of a, converted to an unsigned int
+{
+	fltx4 t = __vctuxs( a, 0 );	// converts the whole vector, although only one word is returned
+	const fltx4_union & a_union = (const fltx4_union &)t;
+	return a_union.m128_u32[idx];
+}
+
+
+FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )	// raw 32-bit word idx of a (bits reinterpreted, nothing converted)
+{
+	const fltx4_union & a_union = (const fltx4_union &)a;	// view the vector through fltx4_union so it can be indexed
+	return a_union.m128_u32[idx];
+}
+
+FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )	// writable reference to raw 32-bit word idx of a
+{
+	fltx4_union & a_union = (fltx4_union &)a;	// view the vector through fltx4_union so it can be indexed
+	return a_union.m128_u32[idx];
+}
+
+#else
+
+//---------------------------------------------------------------------
+// Intel/SSE implementation
+//---------------------------------------------------------------------
+
+FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )	// write all four floats of a to pSIMD using the aligned SSE store
+{
+	_mm_store_ps( pSIMD, a );
+}
+
+FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )	// write all four floats of a to pSIMD using the unaligned SSE store
+{
+	_mm_storeu_ps( pSIMD, a );
+}
+
+
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a );	// forward declaration: StoreUnaligned3SIMD below uses it before its definition
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a );	// forward declaration, same reason
+
+// Writes a's x, y and z to pSIMD[0], pSIMD[1] and pSIMD[2] with three
+// single-float stores; pSIMD[3] is never written.
+FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
+{
+	float *pOut = pSIMD;
+	_mm_store_ss( pOut++, a );					// x
+	_mm_store_ss( pOut++, RotateLeft( a ) );	// y, rotated into the low slot
+	_mm_store_ss( pOut, RotateLeft2( a ) );		// z, rotated into the low slot
+}
+
+// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD.
+// NOTE: forwards to StoreAlignedSIMD, so all four floats of a are written to pSIMD->Base().
+FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
+{
+	StoreAlignedSIMD( pSIMD->Base(),a );
+}
+
+FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )	// load four floats from pSIMD using the aligned SSE load
+{
+	return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) );
+}
+
+FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b (bitwise)
+{
+	return _mm_and_ps( a, b );
+}
+
+FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b (bitwise; it is the FIRST argument that is complemented)
+{
+	return _mm_andnot_ps( a, b );
+}
+
+FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )				// a ^ b (bitwise)
+{
+	return _mm_xor_ps( a, b );
+}
+
+FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )				// a | b (bitwise)
+{
+	return _mm_or_ps( a, b );
+}
+
+// Squelch the w component of a vector to +0.0 by AND-ing with the g_SIMD_clear_wmask constant.
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
+FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
+{
+	return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) );
+}
+
+// for the transitional class -- load a 3-by VectorAligned and squash its w component
+// (an aligned 4-float load followed by SetWToZeroSIMD)
+FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
+{
+	return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) );
+}
+
+FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )	// load four floats from pSIMD using the unaligned SSE load
+{
+	return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
+}
+
+FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )	// NOTE: same 4-float load as LoadUnalignedSIMD -- a fourth float is read from memory and w is NOT cleared
+{
+	return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
+}
+
+/// replicate a single 32 bit integer value to all 4 components of an m128
+/// (the integer's bit pattern is copied as-is; nothing is converted to float)
+FORCEINLINE fltx4 ReplicateIX4( int i )
+{
+	// read the int's storage as a float so its bits land unchanged in component 0
+	const float *pBitsAsFloat = ( float * ) &i;
+	fltx4 lowOnly = _mm_set_ss( *pBitsAsFloat );
+	// shuffle selector 0 copies component 0 into every slot
+	return _mm_shuffle_ps( lowOnly, lowOnly, 0 );
+}
+
+
+// AKA "Broadcast", "Splat": returns flValue in all four components.
+FORCEINLINE fltx4 ReplicateX4( float flValue )
+{
+	// place the scalar in component 0 ...
+	fltx4 lowOnly = _mm_set_ss( flValue );
+	// ... then shuffle selector 0 copies component 0 into every slot
+	return _mm_shuffle_ps( lowOnly, lowOnly, 0 );
+}
+
+
+// Returns a copy of float component idx of a.
+FORCEINLINE float SubFloat( const fltx4 & a, int idx )
+{
+	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
+#ifdef POSIX
+	// POSIX builds index the vector's storage through a float pointer
+	float const *pComponents = reinterpret_cast<float const *>( &a );
+	return pComponents[idx];
+#else
+	// elsewhere the vector type exposes its floats as m128_f32
+	return a.m128_f32[ idx ];
+#endif
+}
+
+// Returns a writable reference to float component idx of a.
+FORCEINLINE float & SubFloat( fltx4 & a, int idx )
+{
+#ifdef POSIX
+	// POSIX builds index the vector's storage through a float pointer
+	float *pComponents = reinterpret_cast<float *>( &a );
+	return pComponents[idx];
+#else
+	// elsewhere the vector type exposes its floats as m128_f32
+	return a.m128_f32[ idx ];
+#endif
+}
+
+FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )	// float component idx of a, converted with a plain C cast to uint32
+{
+	return (uint32)SubFloat(a,idx);
+}
+
+// Returns raw 32-bit word idx of a (bits reinterpreted, nothing converted).
+FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
+{
+#ifdef POSIX
+	// POSIX builds index the vector's storage through a uint32 pointer
+	uint32 const *pWords = reinterpret_cast<uint32 const *>( &a );
+	return pWords[idx];
+#else
+	// elsewhere the vector type exposes its words as m128_u32
+	return a.m128_u32[idx];
+#endif
+}
+
+// Returns a writable reference to raw 32-bit word idx of a.
+FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
+{
+#ifdef POSIX
+	// POSIX builds index the vector's storage through a uint32 pointer
+	uint32 *pWords = reinterpret_cast<uint32 *>( &a );
+	return pWords[idx];
+#else
+	// elsewhere the vector type exposes its words as m128_u32
+	return a.m128_u32[idx];
+#endif
+}
+
+// Return zero in the fastest way -- on the x360, faster even than loading.
+FORCEINLINE fltx4 LoadZeroSIMD( void )
+{
+	return Four_Zeros;
+}
+
+// Return one (in every component) in the fastest way -- on the x360, faster even than loading.
+FORCEINLINE fltx4 LoadOneSIMD( void )
+{
+	return Four_Ones;
+}
+
+// Bitwise select: bits set in ReplacementMask take their value from
+// NewValue, bits clear in ReplacementMask keep their value from OldValue.
+FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
+{
+	const fltx4 takenFromNew = AndSIMD( ReplacementMask, NewValue );
+	const fltx4 keptFromOld = AndNotSIMD( ReplacementMask, OldValue );
+	return OrSIMD( takenFromNew, keptFromOld );
+}
+
+// Remember, SSE numbers its words 3 2 1 0.
+// The way we want to specify shuffles is backwards from the default:
+// MM_SHUFFLE_REV takes its arguments in array index order (the default _MM_SHUFFLE is reversed).
+#define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a)
+
+FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a )	// replicate component 0 (x) of a to all four components
+{
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
+}
+
+FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a )	// replicate component 1 (y) of a to all four components
+{
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) );
+}
+
+FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a )	// replicate component 2 (z) of a to all four components
+{
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) );
+}
+
+// Replicates component 3 (w) of a to all four components.
+FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a )
+{
+	// written with MM_SHUFFLE_REV like the other splats; the selector is the
+	// same in either order because all four indices are equal
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 3, 3, 3 ) );
+}
+
+// Returns a copy of a whose x component is replaced by the x component of x.
+FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
+{
+	// component mask 0 selects the x slot
+	return MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[0] ), x, a );
+}
+
+// Returns a copy of a whose y component is replaced by the y component of y.
+FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
+{
+	// component mask 1 selects the y slot
+	return MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[1] ), y, a );
+}
+
+// Returns a copy of a whose z component is replaced by the z component of z.
+FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
+{
+	// component mask 2 selects the z slot
+	return MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[2] ), z, a );
+}
+
+// Returns a copy of a whose w component is replaced by the w component of w.
+FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
+{
+	// component mask 3 selects the w slot
+	return MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), w, a );
+}
+
+// Returns a copy of a with component nComponent (index into
+// g_SIMD_ComponentMask) replaced by flValue.
+FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
+{
+	const fltx4 splatValue = ReplicateX4( flValue );
+	const fltx4 slotMask = LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] );
+	return MaskedAssign( slotMask, splatValue, a );
+}
+
+// a b c d -> b c d a
+FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
+{
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) );	// result[i] = a[(i+1) % 4]
+}
+
+// a b c d -> c d a b
+FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
+{
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );	// result[i] = a[(i+2) % 4]
+}
+
+// a b c d -> d a b c
+FORCEINLINE fltx4 RotateRight( const fltx4 & a )
+{
+	// result[i] = a[(i+3) % 4].
+	// The previous selector, _MM_SHUFFLE( 0, 3, 2, 1 ), is the same immediate
+	// as RotateLeft's MM_SHUFFLE_REV( 1, 2, 3, 0 ), so this function rotated
+	// LEFT (b c d a) instead of producing the documented d a b c.
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 0, 1, 2 ) );
+}
+
+// a b c d -> c d a b  (rotating by two gives the same result in either direction)
+FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
+{
+	// MM_SHUFFLE_REV( 2, 3, 0, 1 ) expands to _MM_SHUFFLE( 1, 0, 3, 2 )
+	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
+}
+
+
+// a+b
+FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 sum = _mm_add_ps( a, b );
+	return sum;
+}
+
+// a-b
+FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 difference = _mm_sub_ps( a, b );
+	return difference;
+}
+
+// a*b
+FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 product = _mm_mul_ps( a, b );
+	return product;
+}
+
+// a/b (a true SSE divide, unlike the X360 reciprocal-multiply version)
+FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 quotient = _mm_div_ps( a, b );
+	return quotient;
+}
+
+// a*b + c, done as a separate multiply and add.
+FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
+{
+	const fltx4 product = MulSIMD( a, b );
+	return AddSIMD( product, c );
+}
+
+// c - a*b, done as a separate multiply and subtract.
+FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
+{
+	const fltx4 product = MulSIMD( a, b );
+	return SubSIMD( c, product );
+}
+
+// Dot product of a and b over components 0..2, replicated to all
+// four components of the result. Component 3 is ignored.
+FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
+{
+	const fltx4 products = MulSIMD( a, b );
+	// horizontal sum on the scalar side, in x, y, z order
+	float flSum = SubFloat( products, 0 ) + SubFloat( products, 1 );
+	flSum = flSum + SubFloat( products, 2 );
+	return ReplicateX4( flSum );
+}
+
+// Dot product of a and b over all four components, replicated to
+// all four components of the result.
+FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
+{
+	const fltx4 products = MulSIMD( a, b );
+	// horizontal sum on the scalar side, in x, y, z, w order
+	float flSum = SubFloat( products, 0 ) + SubFloat( products, 1 );
+	flSum = flSum + SubFloat( products, 2 );
+	flSum = flSum + SubFloat( products, 3 );
+	return ReplicateX4( flSum );
+}
+
+//TODO: implement as four-way Taylor series (see xbox implementation)
+// Sine of each of the four components, one scalar sin() call per component.
+FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
+{
+	fltx4 sines;
+	for ( int iComp = 0; iComp < 4; ++iComp )
+	{
+		SubFloat( sines, iComp ) = sin( SubFloat( radians, iComp ) );
+	}
+	return sines;
+}
+
+// Writes sin and cos of components 0..2 of radians into sine and cosine.
+// Component 3 of both outputs is left untouched.
+FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	// FIXME: Make a fast SSE version
+	for ( int iComp = 0; iComp < 3; ++iComp )
+	{
+		SinCos( SubFloat( radians, iComp ), &SubFloat( sine, iComp ), &SubFloat( cosine, iComp ) );
+	}
+}
+
+// Writes sin and cos of all four components of radians into sine and cosine.
+FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
+{
+	// FIXME: Make a fast SSE version
+	for ( int iComp = 0; iComp < 4; ++iComp )
+	{
+		SinCos( SubFloat( radians, iComp ), &SubFloat( sine, iComp ), &SubFloat( cosine, iComp ) );
+	}
+}
+
+//TODO: implement as four-way Taylor series (see xbox implementation)
+// Arcsine of each of the four components, one scalar asin() call per component.
+FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
+{
+	// FIXME: Make a fast SSE version
+	fltx4 angles;
+	for ( int iComp = 0; iComp < 4; ++iComp )
+	{
+		SubFloat( angles, iComp ) = asin( SubFloat( sine, iComp ) );
+	}
+	return angles;
+}
+
+// Arccosine of each of the four components, one scalar acos() call per component.
+FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
+{
+	fltx4 angles;
+	for ( int iComp = 0; iComp < 4; ++iComp )
+	{
+		SubFloat( angles, iComp ) = acos( SubFloat( cs, iComp ) );
+	}
+	return angles;
+}
+
+// tan^-1(a/b) .. ie, pass sin in as a and cos in as b.
+// One scalar atan2() call per component.
+FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
+{
+	fltx4 angles;
+	for ( int iComp = 0; iComp < 4; ++iComp )
+	{
+		SubFloat( angles, iComp ) = atan2( SubFloat( a, iComp ), SubFloat( b, iComp ) );
+	}
+	return angles;
+}
+
+FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a, computed as (0 - a)
+{
+	return SubSIMD(LoadZeroSIMD(),a);
+}
+
+FORCEINLINE int TestSignSIMD( const fltx4 & a )								// mask of which floats have the high bit set (one bit per component)
+{
+	return _mm_movemask_ps( a );
+}
+
+FORCEINLINE bool IsAnyNegative( const fltx4 & a )							// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
+{
+	return (0 != TestSignSIMD( a ));	// decided purely from the sign bits gathered by TestSignSIMD
+}
+
+FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? ~0:0 -- a bit mask, not a bool
+{
+	return _mm_cmpeq_ps( a, b );
+}
+
+FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? ~0:0 -- a bit mask, not a bool
+{
+	return _mm_cmpgt_ps( a, b );
+}
+
+FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? ~0:0 -- a bit mask, not a bool
+{
+	return _mm_cmpge_ps( a, b );
+}
+
+FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? ~0:0 -- a bit mask, not a bool
+{
+	return _mm_cmplt_ps( a, b );
+}
+
+// Per-lane less-or-equal mask: (a<=b) ? ~0 : 0
+FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Mask = _mm_cmple_ps( a, b );
+	return fl4Mask;
+}
+
+// Branch helper for a.xyzw > b.xyzw.
+// Implemented as "no lane reports a <= b".
+FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
+{
+	const int nLanesLessOrEqual = TestSignSIMD( CmpLeSIMD( a, b ) );
+	return nLanesLessOrEqual == 0;
+}
+
+// Branch helper for a.xyzw >= b.xyzw.
+// Implemented as "no lane reports a < b".
+FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
+{
+	const int nLanesLess = TestSignSIMD( CmpLtSIMD( a, b ) );
+	return nLanesLess == 0;
+}
+
+// Branch helper: true only when all four lanes compare equal (a.xyzw == b.xyzw).
+FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
+{
+	// every lane that matched contributes one bit; all four set == 0xf
+	const int nLanesEqual = TestSignSIMD( CmpEqSIMD( a, b ) );
+	return nLanesEqual == 0xf;
+}
+
+// Per-lane symmetric range test: (a <= b && a >= -b) ? ~0 : 0
+FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4BelowUpper = CmpLeSIMD( a, b );
+	const fltx4 fl4AboveLower = CmpGeSIMD( a, NegSIMD( b ) );
+	return AndSIMD( fl4BelowUpper, fl4AboveLower );
+}
+
+// Per-lane minimum: min(a,b)
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Smaller = _mm_min_ps( a, b );
+	return fl4Smaller;
+}
+
+// Per-lane maximum: max(a,b)
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
+{
+	const fltx4 fl4Larger = _mm_max_ps( a, b );
+	return fl4Larger;
+}
+
+
+
+// SSE lacks rounding operations. 
+// Really.
+// You can emulate them by setting the rounding mode for the 
+// whole processor and then converting to int, and then back again.
+// But every time you set the rounding mode, you clear out the
+// entire pipeline. So, I can't do them per operation. You
+// have to do it once, before the loop that would call these.
+// Round towards positive infinity
+FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
+{
+	// Scalar fallback: each lane is rounded up individually with the C runtime ceil().
+	fltx4 retVal;
+	for ( int nLane = 0; nLane < 4; ++nLane )
+	{
+		SubFloat( retVal, nLane ) = ceil( SubFloat( a, nLane ) );
+	}
+	return retVal;
+}
+
+// Forward declaration: FloorSIMD below needs the fltx4 overload of fabs() before its definition appears.
+fltx4 fabs( const fltx4 & x );
+// Round towards negative infinity
+// This is the implementation that was here before; it assumes
+// you are in round-to-floor mode, which I guess is usually the
+// case for us vis-a-vis SSE. It's totally unnecessary on 
+// VMX, which has a native floor op.
+//
+// Steps: take |val|, snap it to an integer by adding and subtracting Four_2ToThe23s,
+// step back by one wherever that snapped value ended up above |val|, then copy the
+// original sign bits back onto the result.
+// NOTE(review): because the sign is re-applied to floor(|val|), a negative non-integer
+// input appears to come out rounded toward zero rather than toward negative infinity --
+// confirm that callers expect this.
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &val )
+{
+	fltx4 fl4Abs = fabs( val );
+	// presumably Four_2ToThe23s is 2^23, the magnitude at which a float has no fractional bits left -- TODO confirm
+	fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
+	// where the snap rounded upwards (ival > |val|), subtract one to get floor(|val|)
+	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
+	// val ^ |val| isolates the bits in which val and |val| differ (the sign bits)
+	return XorSIMD( ival, XorSIMD( val, fl4Abs ) );			// restore sign bits
+}
+
+
+
+// True only when every one of the four lanes compares equal to zero.
+inline bool IsAllZeros( const fltx4 & var )
+{
+	const int nZeroLanes = TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) );
+	return nZeroLanes == 0xF;
+}
+
+// sqrt(a), more or less. This build forwards to the same _mm_sqrt_ps instruction as SqrtSIMD.
+FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Root = _mm_sqrt_ps( a );
+	return fl4Root;
+}
+
+// Per-lane square root: sqrt(a)
+FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Root = _mm_sqrt_ps( a );
+	return fl4Root;
+}
+
+// 1/sqrt(a), more or less (hardware reciprocal-square-root estimate).
+FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Estimate = _mm_rsqrt_ps( a );
+	return fl4Estimate;
+}
+
+// Estimated 1/sqrt(a) in which lanes equal to zero are first replaced by
+// Four_Epsilons, so the estimate is not taken of an exact zero.
+FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
+{
+	const fltx4 fl4IsZero = CmpEqSIMD( a, Four_Zeros );
+	// epsilon bits survive only in the zero lanes; OR-ing them in leaves the other lanes untouched
+	const fltx4 fl4EpsWhereZero = AndSIMD( Four_Epsilons, fl4IsZero );
+	const fltx4 fl4Patched = OrSIMD( a, fl4EpsWhereZero );
+	return ReciprocalSqrtEstSIMD( fl4Patched );
+}
+
+/// 1/sqrt(a): starts from ReciprocalSqrtEstSIMD and applies one newton iteration
+/// for higher precision than the raw estimate.
+FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
+{
+	// newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
+	const fltx4 fl4Est = ReciprocalSqrtEstSIMD( a );
+	const fltx4 fl4EstSquared = MulSIMD( fl4Est, fl4Est );
+	const fltx4 fl4ThreeMinusTerm = SubSIMD( Four_Threes, MulSIMD( a, fl4EstSquared ) );
+	const fltx4 fl4Unhalved = MulSIMD( fl4Est, fl4ThreeMinusTerm );
+	return MulSIMD( Four_PointFives, fl4Unhalved );
+}
+
+// 1/a, more or less (hardware reciprocal estimate).
+FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
+{
+	const fltx4 fl4Estimate = _mm_rcp_ps( a );
+	return fl4Estimate;
+}
+
+/// Estimated 1/x for all 4 values.
+/// Lanes equal to zero are swapped for Four_Epsilons first, so 1/0 will result
+/// in a big but NOT infinite result.
+FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
+{
+	const fltx4 fl4IsZero = CmpEqSIMD( a, Four_Zeros );
+	const fltx4 fl4EpsWhereZero = AndSIMD( Four_Epsilons, fl4IsZero );
+	const fltx4 fl4Patched = OrSIMD( a, fl4EpsWhereZero );
+	return ReciprocalEstSIMD( fl4Patched );
+}
+
+/// 1/x for all 4 values: reciprocal approximation instruction refined by one newton iteration.
+/// No error checking!
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )
+{
+	// newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
+	const fltx4 fl4Est = ReciprocalEstSIMD( a );
+	const fltx4 fl4TwiceEst = AddSIMD( fl4Est, fl4Est );
+	const fltx4 fl4ATimesEstSq = MulSIMD( a, MulSIMD( fl4Est, fl4Est ) );
+	return SubSIMD( fl4TwiceEst, fl4ATimesEstSq );
+}
+
+/// 1/x for all 4 values, using the newton-refined ReciprocalSIMD.
+/// Lanes equal to zero are swapped for Four_Epsilons first, so 1/0 will result
+/// in a big but NOT infinite result.
+FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
+{
+	const fltx4 fl4IsZero = CmpEqSIMD( a, Four_Zeros );
+	const fltx4 fl4EpsWhereZero = AndSIMD( Four_Epsilons, fl4IsZero );
+	const fltx4 fl4Patched = OrSIMD( a, fl4EpsWhereZero );
+	return ReciprocalSIMD( fl4Patched );
+}
+
+// CHRISG: is it worth doing integer bitfiddling for this?
+// 2^x for all values (the antilog): result[i] = powf( 2, toPower[i] )
+FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
+{
+	// Scalar fallback, one powf() call per lane.
+	fltx4 retval;
+	for ( int nLane = 0; nLane < 4; ++nLane )
+	{
+		SubFloat( retval, nLane ) = powf( 2, SubFloat(toPower, nLane) );
+	}
+
+	return retval;
+}
+
+// Clamps each component of in to the per-lane range [min, max]:
+// the upper bound is applied first, then the lower bound.
+FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
+{
+	const fltx4 fl4CappedAbove = MinSIMD( max, in );
+	return MaxSIMD( min, fl4CappedAbove );
+}
+
+// Treats x, y, z, w as the four rows of a 4x4 float matrix and transposes it in place,
+// so that afterwards each register holds what used to be one column.
+FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w)
+{
+	// _MM_TRANSPOSE4_PS is the SSE header macro; it overwrites all four arguments
+	_MM_TRANSPOSE4_PS( x, y, z, w );
+}
+
+// Returns min(x,y,z) of the input's first three lanes, splatted across all four
+// lanes of the result. The input's w lane is treated as garbage and never
+// influences the returned value.
+FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	const fltx4 fl4RotatedOnce = RotateLeft( a );		// [y,z,G,x]
+	const fltx4 fl4RotatedTwice = RotateLeft2( a );		// [z,G,x,y]
+
+	// lane 0 of the first min is min(x,y); folding in the second rotation makes it min(min(x,y),z)
+	const fltx4 fl4MinXY = MinSIMD( a, fl4RotatedOnce );
+	const fltx4 fl4MinXYZ = MinSIMD( fl4MinXY, fl4RotatedTwice );
+
+	// only lane 0 is meaningful, so broadcast it to the whole vector
+	return SplatXSIMD( fl4MinXYZ );
+}
+
+// Returns max(x,y,z) of the input's first three lanes, splatted across all four
+// lanes of the result. The input's w lane is treated as garbage and never
+// influences the returned value.
+FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
+{
+	// a is [x,y,z,G] (where G is garbage)
+	const fltx4 fl4RotatedOnce = RotateLeft( a );		// [y,z,G,x]
+	const fltx4 fl4RotatedTwice = RotateLeft2( a );		// [z,G,x,y]
+
+	// lane 0 of the first max is max(x,y); folding in the second rotation makes it max(max(x,y),z)
+	const fltx4 fl4MaxXY = MaxSIMD( a, fl4RotatedOnce );
+	const fltx4 fl4MaxXYZ = MaxSIMD( fl4MaxXY, fl4RotatedTwice );
+
+	// only lane 0 is meaningful, so broadcast it to the whole vector
+	return SplatXSIMD( fl4MaxXYZ );
+}
+
+// ------------------------------------
+// INTEGER SIMD OPERATIONS.
+// ------------------------------------
+
+
+// NOTE: everything inside this #if 0 is compiled out; it is kept for reference only.
+#if 0				/* pc does not have these ops */
+// splat all components of a vector to a signed immediate int number.
+FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
+{
+	//CHRISG: SSE2 has this, but not SSE1. What to do?
+	// lane-by-lane fallback: write the same integer into each of the four 32-bit slots
+	fltx4 retval;
+	SubInt( retval, 0 ) = to;
+	SubInt( retval, 1 ) = to;
+	SubInt( retval, 2 ) = to;
+	SubInt( retval, 3 ) = to;
+	return retval;
+}
+#endif
+
+// Load 4 aligned words into a SIMD register.
+// The bits are moved as-is through the float load; no int/float conversion happens.
+FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	const float *pAsFloats = reinterpret_cast<const float *>(pSIMD);
+	return _mm_load_ps( pAsFloats );
+}
+
+// Load 4 unaligned words into a SIMD register.
+// The bits are moved as-is through the float load; no int/float conversion happens.
+FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
+{
+	const float *pAsFloats = reinterpret_cast<const float *>(pSIMD);
+	return _mm_loadu_ps( pAsFloats );
+}
+
+// save into four words, 16-byte aligned.
+// The register contents are written bit-for-bit through the float store.
+FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
+{
+	float *pAsFloats = reinterpret_cast<float *>(pSIMD);
+	_mm_store_ps( pAsFloats, a );
+}
+
+// Overload that writes the register bit-for-bit into the storage returned by pSIMD.Base(),
+// using the aligned store.
+FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
+{
+	float *pAsFloats = reinterpret_cast<float *>(pSIMD.Base());
+	_mm_store_ps( pAsFloats, a );
+}
+
+// save into four words with no alignment requirement on the destination.
+// The register contents are written bit-for-bit through the float store.
+FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
+{
+	float *pAsFloats = reinterpret_cast<float *>(pSIMD);
+	_mm_storeu_ps( pAsFloats, a );
+}
+
+
+// CHRISG: the conversion functions all seem to operate on m64's only...
+// how do we make them work here?
+
+// Take a fltx4 containing fixed-point uints and 
+// return them as single precision floats. No
+// fixed point conversion is done.
+//
+// vSrcA   - four 32-bit integer lanes to convert
+// returns - each lane of vSrcA converted to float
+//
+// Each lane is read from the SOURCE register. The previous code read SubInt( retval, i ),
+// i.e. the still-uninitialized output, so the input was ignored and the result was garbage.
+FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
+{
+	fltx4 retval;
+	SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
+	SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
+	SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
+	SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
+	return retval;
+}
+
+
+// Take a fltx4 containing fixed-point sints and 
+// return them as single precision floats. No 
+// fixed point conversion is done.
+FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
+{
+	// view the register's storage as four signed 32-bit integers
+	const int32 *pLanes = reinterpret_cast<const int32 *>(&vSrcA);
+
+	fltx4 retval;
+	for ( int nLane = 0; nLane < 4; ++nLane )
+	{
+		SubFloat( retval, nLane ) = ( (float) (pLanes[nLane]) );
+	}
+	return retval;
+}
+
+/*
+  works on fltx4's as if they are four uints.
+  the first parameter contains the words to be shifted,
+  the second contains the amount to shift by AS INTS
+
+  for i = 0 to 3
+  shift = vSrcB_i*32:(i*32)+4
+  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
+*/
+FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
+{
+	// Scalar fallback: lane i of vSrcA is shifted left by the integer held in lane i of vSrcB.
+	i32x4 retval;
+	for ( int nLane = 0; nLane < 4; ++nLane )
+	{
+		SubInt(retval, nLane) = SubInt(vSrcA, nLane) << SubInt(vSrcB, nLane);
+	}
+
+	return retval;
+}
+
+
+// Fixed-point conversion and save as SIGNED INTS.
+// pDest->x = Int (vSrc.x)
+// note: some architectures have means of doing 
+// fixed point conversion when the fix depth is
+// specified as an immediate.. but there is no way 
+// to guarantee an immediate as a parameter to function
+// like this.
+//
+// pDest - receives the four converted integers
+// vSrc  - four floats to convert
+FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
+{
+#if defined( COMPILER_MSVC64 )
+
+	// this configuration converts lane by lane through plain float -> int assignment
+	// instead of using the MMX-register intrinsics in the other branch
+	(*pDest)[0] = SubFloat( vSrc, 0 );
+	(*pDest)[1] = SubFloat( vSrc, 1 );
+	(*pDest)[2] = SubFloat( vSrc, 2 );
+	(*pDest)[3] = SubFloat( vSrc, 3 );
+
+#else
+	// convert two lanes at a time: the low pair directly, the high pair after
+	// _mm_movehl_ps has moved it down into the low half
+	__m64 bottom = _mm_cvttps_pi32( vSrc );
+	__m64 top    = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) );
+
+	// each __m64 holds two 32-bit ints; store them over elements 0-1 and 2-3
+	*reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom;
+	*reinterpret_cast<__m64 *>(&(*pDest)[2]) = top;
+
+	// leave MMX state so that later x87 floating point code works
+	_mm_empty();
+#endif
+}
+
+
+
+#endif
+
+
+
+/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are
+/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated.
+class ALIGN16 FourVectors
+{
+public:
+	fltx4 x, y, z;												//< x = the four X components, y = the four Ys, z = the four Zs
+
+	FORCEINLINE void DuplicateVector(Vector const &v)			//< set all 4 vectors to the same vector value
+	{
+		x=ReplicateX4(v.x);
+		y=ReplicateX4(v.y);
+		z=ReplicateX4(v.z);
+	}
+
+	/// Component-row access: index 0, 1, 2 yields x, y, z respectively.
+	/// Relies on x, y and z being laid out one after another; no range check is done.
+	FORCEINLINE fltx4 const & operator[](int idx) const
+	{
+		return *((&x)+idx);
+	}
+
+	FORCEINLINE fltx4 & operator[](int idx)
+	{
+		return *((&x)+idx);
+	}
+
+	FORCEINLINE void operator+=(FourVectors const &b)			//< add 4 vectors to another 4 vectors
+	{
+		x=AddSIMD(x,b.x);
+		y=AddSIMD(y,b.y);
+		z=AddSIMD(z,b.z);
+	}
+
+	FORCEINLINE void operator-=(FourVectors const &b)			//< subtract 4 vectors from another 4
+	{
+		x=SubSIMD(x,b.x);
+		y=SubSIMD(y,b.y);
+		z=SubSIMD(z,b.z);
+	}
+
+	FORCEINLINE void operator*=(FourVectors const &b)			//< scale all four vectors per component scale
+	{
+		x=MulSIMD(x,b.x);
+		y=MulSIMD(y,b.y);
+		z=MulSIMD(z,b.z);
+	}
+
+	FORCEINLINE void operator*=(const fltx4 & scale)			//< scale vector i by lane i of scale
+	{
+		x=MulSIMD(x,scale);
+		y=MulSIMD(y,scale);
+		z=MulSIMD(z,scale);
+	}
+
+	FORCEINLINE void operator*=(float scale)					//< uniformly scale all 4 vectors
+	{
+		fltx4 scalepacked = ReplicateX4(scale);
+		*this *= scalepacked;
+	}
+
+	FORCEINLINE fltx4 operator*(FourVectors const &b) const		//< 4 dot products
+	{
+		fltx4 dot=MulSIMD(x,b.x);
+		dot=MaddSIMD(y,b.y,dot);
+		dot=MaddSIMD(z,b.z,dot);
+		return dot;
+	}
+
+	FORCEINLINE fltx4 operator*(Vector const &b) const			//< dot product all 4 vectors with 1 vector
+	{
+		fltx4 dot=MulSIMD(x,ReplicateX4(b.x));
+		dot=MaddSIMD(y,ReplicateX4(b.y), dot);
+		dot=MaddSIMD(z,ReplicateX4(b.z), dot);
+		return dot;
+	}
+
+	FORCEINLINE void VProduct(FourVectors const &b)				//< component by component mul
+	{
+		x=MulSIMD(x,b.x);
+		y=MulSIMD(y,b.y);
+		z=MulSIMD(z,b.z);
+	}
+	FORCEINLINE void MakeReciprocal(void)						//< (x,y,z)=(1/x,1/y,1/z)
+	{
+		x=ReciprocalSIMD(x);
+		y=ReciprocalSIMD(y);
+		z=ReciprocalSIMD(z);
+	}
+
+	FORCEINLINE void MakeReciprocalSaturate(void)				//< (x,y,z)=(1/x,1/y,1/z), 1/0 gives a big but finite value
+	{
+		x=ReciprocalSaturateSIMD(x);
+		y=ReciprocalSaturateSIMD(y);
+		z=ReciprocalSaturateSIMD(z);
+	}
+
+	// Assume the given matrix is a rotation, and rotate these vectors by it.
+	// If you have a long list of FourVectors structures that you all want 
+	// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
+	inline void RotateBy(const matrix3x4_t& matrix);
+
+	/// You can use this to rotate a long array of FourVectors all by the same
+	/// matrix. The first parameter is the head of the array. The second is the
+	/// number of vectors to rotate. The third is the matrix.
+	static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
+
+	/// Assume the vectors are points, and transform them in place by the matrix.
+	inline void TransformBy(const matrix3x4_t& matrix);
+
+	/// You can use this to Transform a long array of FourVectors all by the same
+	/// matrix. The first parameter is the head of the array. The second is the
+	/// number of vectors to transform. The third is the matrix. The fourth is the 
+	/// output buffer, which must not overlap the pVectors buffer. This is not 
+	/// an in-place transformation.
+	static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut );
+
+	/// You can use this to Transform a long array of FourVectors all by the same
+	/// matrix. The first parameter is the head of the array. The second is the
+	/// number of vectors to transform. The third is the matrix. There is no 
+	/// separate output buffer: 
+	/// This is an in-place transformation.
+	static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
+
+	// X(),Y(),Z() - get at the desired component of the i'th (0..3) vector.
+	FORCEINLINE const float & X(int idx) const
+	{
+		// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
+		// presumably the cast drops const so that the reference-returning SubFloat overload is picked
+		return SubFloat( (fltx4 &)x, idx );
+	}
+
+	FORCEINLINE const float & Y(int idx) const
+	{
+		return SubFloat( (fltx4 &)y, idx );
+	}
+
+	FORCEINLINE const float & Z(int idx) const
+	{
+		return SubFloat( (fltx4 &)z, idx );
+	}
+
+	FORCEINLINE float & X(int idx)
+	{
+		return SubFloat( x, idx );
+	}
+
+	FORCEINLINE float & Y(int idx)
+	{
+		return SubFloat( y, idx );
+	}
+
+	FORCEINLINE float & Z(int idx)
+	{
+		return SubFloat( z, idx );
+	}
+
+	FORCEINLINE Vector Vec(int idx) const						//< unpack one of the vectors
+	{
+		return Vector( X(idx), Y(idx), Z(idx) );
+	}
+	
+	/// Default constructor: leaves x, y and z uninitialized.
+	FourVectors(void)
+	{
+	}
+
+	FourVectors( FourVectors const &src )
+	{
+		x=src.x;
+		y=src.y;
+		z=src.z;
+	}
+
+	FORCEINLINE void operator=( FourVectors const &src )
+	{
+		x=src.x;
+		y=src.y;
+		z=src.z;
+	}
+
+	/// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op
+	/// NOTE(review): each load reads four floats starting at the Vector's .x, i.e. one float
+	/// beyond the three components; that value is discarded, but confirm the read is safe
+	/// for the callers' storage.
+	FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
+	{
+		// TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
+		// use an unfolded implementation here
+#if _X360
+		fltx4 tx = LoadUnalignedSIMD( &a.x );
+		fltx4 ty = LoadUnalignedSIMD( &b.x );
+		fltx4 tz = LoadUnalignedSIMD( &c.x );
+		fltx4 tw = LoadUnalignedSIMD( &d.x );
+		fltx4 r0 = __vmrghw(tx, tz);
+		fltx4 r1 = __vmrghw(ty, tw);
+		fltx4 r2 = __vmrglw(tx, tz);
+		fltx4 r3 = __vmrglw(ty, tw);
+
+		x = __vmrghw(r0, r1);
+		y = __vmrglw(r0, r1);
+		z = __vmrghw(r2, r3);
+#else
+		x		= LoadUnalignedSIMD( &( a.x ));
+		y		= LoadUnalignedSIMD( &( b.x ));
+		z		= LoadUnalignedSIMD( &( c.x ));
+		fltx4 w = LoadUnalignedSIMD( &( d.x ));
+		// now, matrix is:
+		// x y z ?
+		// x y z ?
+		// x y z ?
+		// x y z ?
+		// after the transpose x, y, z hold the gathered components; w (the '?' column) is dropped
+		TransposeSIMD(x, y, z, w);
+#endif
+	}
+
+	/// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op.
+	/// all 4 vectors must be 128 bit boundary
+	FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)
+	{
+#if _X360
+		fltx4 tx = LoadAlignedSIMD(a);
+		fltx4 ty = LoadAlignedSIMD(b);
+		fltx4 tz = LoadAlignedSIMD(c);
+		fltx4 tw = LoadAlignedSIMD(d);
+		fltx4 r0 = __vmrghw(tx, tz);
+		fltx4 r1 = __vmrghw(ty, tw);
+		fltx4 r2 = __vmrglw(tx, tz);
+		fltx4 r3 = __vmrglw(ty, tw);
+
+		x = __vmrghw(r0, r1);
+		y = __vmrglw(r0, r1);
+		z = __vmrghw(r2, r3);
+#else
+		x		= LoadAlignedSIMD( a );
+		y		= LoadAlignedSIMD( b );
+		z		= LoadAlignedSIMD( c );
+		fltx4 w = LoadAlignedSIMD( d );
+		// now, matrix is:
+		// x y z ?
+		// x y z ?
+		// x y z ?
+		// x y z ?
+		// after the transpose x, y, z hold the gathered components; w (the '?' column) is dropped
+		TransposeSIMD( x, y, z, w );
+#endif
+	}
+
+	FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
+	{
+		LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x );
+	}
+
+	/// return the squared length of all 4 vectors
+	FORCEINLINE fltx4 length2(void) const
+	{
+		return (*this)*(*this);
+	}
+
+	/// return the approximate length of all 4 vectors. uses the sqrt approximation instruction
+	FORCEINLINE fltx4 length(void) const
+	{
+		return SqrtEstSIMD(length2());
+	}
+
+	/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
+	FORCEINLINE void VectorNormalizeFast(void)
+	{
+		fltx4 mag_sq=(*this)*(*this);						// length^2
+		(*this) *= ReciprocalSqrtEstSIMD(mag_sq);			// *(1.0/sqrt(length^2))
+	}
+
+	/// normalize all 4 vectors in place.
+	FORCEINLINE void VectorNormalize(void)
+	{
+		fltx4 mag_sq=(*this)*(*this);						// length^2
+		(*this) *= ReciprocalSqrtSIMD(mag_sq);				// *(1.0/sqrt(length^2))
+	}
+
+	/// construct a FourVectors from 4 separate Vectors
+	FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
+	{
+		LoadAndSwizzle(a,b,c,d);
+	}
+
+	/// construct a FourVectors from 4 separate aligned Vectors (uses the aligned loads)
+	FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d)
+	{
+		LoadAndSwizzleAligned(a,b,c,d);
+	}
+
+	/// squared distance between each of our 4 points and the matching point in pnt.
+	/// Does not modify this object, hence const.
+	FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt ) const
+	{
+		fltx4 fl4dX = SubSIMD( pnt.x, x );
+		fltx4 fl4dY = SubSIMD( pnt.y, y );
+		fltx4 fl4dZ = SubSIMD( pnt.z, z );
+		return AddSIMD( MulSIMD( fl4dX, fl4dX), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) );
+
+	}
+	
+	/// For each of the 4 points, return the parameter t for which p0 + t*(p1-p0) is the
+	/// point on the (infinite) line through p0 and p1 closest to our point:
+	///     t = ( (this - p0) . (p1 - p0) ) / ( (p1 - p0) . (p1 - p0) )
+	/// No clamping is done, and a zero-length line is not checked for.
+	FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const
+	{
+		FourVectors lineDelta = p1;
+		lineDelta -= p0;
+		// The divisor is the squared length of the line direction. (It used to be p1 * p1,
+		// which is only the same thing when p0 is the origin.)
+		fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );
+		FourVectors v4OurPnt = *this;
+		v4OurPnt -= p0;
+		return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
+	}
+
+	/// For each of the 4 points, return the squared distance to the line SEGMENT p0..p1.
+	/// Same projection as TValueOfClosestPointOnLine, with t clamped to [0,1].
+	FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const
+	{
+		// work relative to p0 so the segment starts at the origin
+		FourVectors lineDelta = p1;
+		FourVectors v4OurPnt = *this;
+		v4OurPnt -= p0;
+		lineDelta -= p0;
+
+		fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );
+
+		fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
+
+		fl4T = MinSIMD( fl4T, Four_Ones );
+		fl4T = MaxSIMD( fl4T, Four_Zeros );
+		// lineDelta now becomes the closest point on the segment (still relative to p0)
+		lineDelta *= fl4T;
+		return v4OurPnt.DistToSqr( lineDelta );
+	}
+
+};
+
+/// form 4 cross products: vector i of the result is a[i] x b[i]
+inline FourVectors operator ^(const FourVectors &a, const FourVectors &b)
+{
+	// each component is (first product) - (second product), following the usual cross product rule
+	const fltx4 fl4CrossX = SubSIMD( MulSIMD( a.y, b.z ), MulSIMD( a.z, b.y ) );
+	const fltx4 fl4CrossY = SubSIMD( MulSIMD( a.z, b.x ), MulSIMD( a.x, b.z ) );
+	const fltx4 fl4CrossZ = SubSIMD( MulSIMD( a.x, b.y ), MulSIMD( a.y, b.x ) );
+
+	FourVectors ret;
+	ret.x = fl4CrossX;
+	ret.y = fl4CrossY;
+	ret.z = fl4CrossZ;
+	return ret;
+}
+
+/// component-by-component MAX operator: every x, y and z lane is the larger of the two inputs
+inline FourVectors maximum(const FourVectors &a, const FourVectors &b)
+{
+	FourVectors larger;
+	larger.x = MaxSIMD( a.x, b.x );
+	larger.y = MaxSIMD( a.y, b.y );
+	larger.z = MaxSIMD( a.z, b.z );
+	return larger;
+}
+
+/// component-by-component MIN operator: every x, y and z lane is the smaller of the two inputs
+inline FourVectors minimum(const FourVectors &a, const FourVectors &b)
+{
+	FourVectors smaller;
+	smaller.x = MinSIMD( a.x, b.x );
+	smaller.y = MinSIMD( a.y, b.y );
+	smaller.z = MinSIMD( a.z, b.z );
+	return smaller;
+}
+
+/// calculate reflection vector: i - 2(n.i)n for each of the 4 vector pairs.
+/// incident and normal dir assumed normalized
+FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal )
+{
+	// twice the projection of the incident direction onto the normal
+	const fltx4 fl4Dot = incident * normal;
+	const fltx4 fl4TwiceDot = AddSIMD( fl4Dot, fl4Dot );
+
+	// 2(n.i)n
+	FourVectors alongNormal = normal;
+	alongNormal *= fl4TwiceDot;
+
+	FourVectors reflected = incident;
+	reflected -= alongNormal;
+	return reflected;
+}
+
+/// calculate slide vector: i - (n.i)n for each of the 4 vector pairs, i.e. the incident
+/// vector with its projection onto the normal subtracted out.
+FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal )
+{
+	// (n.i)n
+	const fltx4 fl4Dot = incident * normal;
+	FourVectors alongNormal = normal;
+	alongNormal *= fl4Dot;
+
+	FourVectors slid = incident;
+	slid -= alongNormal;
+	return slid;
+}
+
+
+// Assume the given matrix is a rotation, and rotate these vectors by it (in place).
+// Only the first three entries of each matrix row take part; the fourth is ignored.
+// If you have a long list of FourVectors structures that you all want 
+// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
+void FourVectors::RotateBy(const matrix3x4_t& matrix)
+{
+	// Pull in the three matrix rows. The loads are unaligned because matrix3x4_ts
+	// often are. Lane w of each row is loaded as well but never used below.
+	const fltx4 fl4Row0 = LoadUnalignedSIMD( matrix[0] );
+	const fltx4 fl4Row1 = LoadUnalignedSIMD( matrix[1] );
+	const fltx4 fl4Row2 = LoadUnalignedSIMD( matrix[2] );
+
+	// Broadcast each of the nine entries across its own register, in the order
+	// they are consumed. They are kept as separate locals (not an array) so that
+	// they can stay in registers.
+	const fltx4 fl4M00 = SplatXSIMD( fl4Row0 );
+	const fltx4 fl4M01 = SplatYSIMD( fl4Row0 );
+	const fltx4 fl4M02 = SplatZSIMD( fl4Row0 );
+
+	const fltx4 fl4M10 = SplatXSIMD( fl4Row1 );
+	const fltx4 fl4M11 = SplatYSIMD( fl4Row1 );
+	const fltx4 fl4M12 = SplatZSIMD( fl4Row1 );
+
+	const fltx4 fl4M20 = SplatXSIMD( fl4Row2 );
+	const fltx4 fl4M21 = SplatYSIMD( fl4Row2 );
+	const fltx4 fl4M22 = SplatZSIMD( fl4Row2 );
+
+	// out = M * (x,y,z). All three results are computed before any member is
+	// overwritten, because every row needs the original x, y and z.
+	const fltx4 fl4NewX = AddSIMD( AddSIMD( MulSIMD( x, fl4M00 ), MulSIMD( y, fl4M01 ) ), MulSIMD( z, fl4M02 ) );
+	const fltx4 fl4NewY = AddSIMD( AddSIMD( MulSIMD( x, fl4M10 ), MulSIMD( y, fl4M11 ) ), MulSIMD( z, fl4M12 ) );
+	const fltx4 fl4NewZ = AddSIMD( AddSIMD( MulSIMD( x, fl4M20 ), MulSIMD( y, fl4M21 ) ), MulSIMD( z, fl4M22 ) );
+
+	x = fl4NewX;
+	y = fl4NewY;
+	z = fl4NewZ;
+}
+
+// Assume the vectors are points, and transform them in place by the matrix:
+// the 3x3 part is applied first, then the fourth entry of each matrix row is
+// added as the translation.
+void FourVectors::TransformBy(const matrix3x4_t& matrix)
+{
+	// Pull in the three matrix rows. The loads are unaligned because matrix3x4_ts
+	// often are. Lane w of each row is not used from these registers; the
+	// translation is fetched separately below.
+	const fltx4 fl4Row0 = LoadUnalignedSIMD( matrix[0] );
+	const fltx4 fl4Row1 = LoadUnalignedSIMD( matrix[1] );
+	const fltx4 fl4Row2 = LoadUnalignedSIMD( matrix[2] );
+
+	// Broadcast each of the nine 3x3 entries across its own register, in the order
+	// they are consumed. They are kept as separate locals (not an array) so that
+	// they can stay in registers.
+	const fltx4 fl4M00 = SplatXSIMD( fl4Row0 );
+	const fltx4 fl4M01 = SplatYSIMD( fl4Row0 );
+	const fltx4 fl4M02 = SplatZSIMD( fl4Row0 );
+
+	const fltx4 fl4M10 = SplatXSIMD( fl4Row1 );
+	const fltx4 fl4M11 = SplatYSIMD( fl4Row1 );
+	const fltx4 fl4M12 = SplatZSIMD( fl4Row1 );
+
+	const fltx4 fl4M20 = SplatXSIMD( fl4Row2 );
+	const fltx4 fl4M21 = SplatYSIMD( fl4Row2 );
+	const fltx4 fl4M22 = SplatZSIMD( fl4Row2 );
+
+	// Rotation part. All three results are computed before any member is
+	// overwritten, because every row needs the original x, y and z.
+	const fltx4 fl4RotX = MaddSIMD( z, fl4M02, AddSIMD( MulSIMD( x, fl4M00 ), MulSIMD( y, fl4M01 ) ) );
+	const fltx4 fl4RotY = MaddSIMD( z, fl4M12, AddSIMD( MulSIMD( x, fl4M10 ), MulSIMD( y, fl4M11 ) ) );
+	const fltx4 fl4RotZ = MaddSIMD( z, fl4M22, AddSIMD( MulSIMD( x, fl4M20 ), MulSIMD( y, fl4M21 ) ) );
+
+	// Translation part: matrix[row][3], replicated to all four points.
+	x = AddSIMD( fl4RotX, ReplicateX4( matrix[0][3] ));
+	y = AddSIMD( fl4RotY, ReplicateX4( matrix[1][3] ));
+	z = AddSIMD( fl4RotZ, ReplicateX4( matrix[2][3] ));
+}
+
+
+
+/// quick, low quality perlin-style noise() function suitable for real time use.
+/// return value is -1..1. Only reliable around +/- 1 million or so.
+fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z );
+fltx4 NoiseSIMD( FourVectors const &v );
+
+// vector valued noise direction
+FourVectors DNoiseSIMD( FourVectors const &v );
+
+// vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html
+FourVectors CurlNoiseSIMD( FourVectors const &v );
+
+
+/// calculate the absolute value of a packed single by masking each lane
+/// with g_SIMD_clear_signmask
+inline fltx4 fabs( const fltx4 & x )
+{
+	const fltx4 fl4ClearSignMask = LoadAlignedSIMD( g_SIMD_clear_signmask );
+	return AndSIMD( x, fl4ClearSignMask );
+}
+
+/// negate all four components of a SIMD packed single by XOR-ing each lane
+/// with g_SIMD_signmask
+inline fltx4 fnegate( const fltx4 & x )
+{
+	const fltx4 fl4SignMask = LoadAlignedSIMD( g_SIMD_signmask );
+	return XorSIMD( x, fl4SignMask );
+}
+
+
+fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent);
+
+// PowSIMD - raise a SIMD register to a power.  This is analogous to the C pow() function, with some
+// restrictions: fractional exponents are only handled with 2 bits of precision. Basically,
+// fractions of 0,.25,.5, and .75 are handled. PowSIMD(x,.30) will be the same as PowSIMD(x,.25).
+// negative and fractional powers are handled by the SIMD reciprocal and square root approximation
+// instructions and so are not especially accurate ----Note that this routine does not raise
+// numeric exceptions because it uses SIMD--- This routine is O(log2(exponent)).
+inline fltx4 PowSIMD( const fltx4 & x, float exponent )
+{
+	// the exponent is handed on as an integer count of quarters (truncated by the cast)
+	const int nExponentInQuarters = (int) (4.0*exponent);
+	return Pow_FixedPoint_Exponent_SIMD( x, nExponentInQuarters );
+}
+
+
+
+// random number generation - generate 4 random numbers quickly.
+
+void SeedRandSIMD(uint32 seed);								// seed the random # generator
+fltx4 RandSIMD( int nContext = 0 );							// return 4 numbers in the 0..1 range
+
+// for multithreaded, you need to use these and use the argument form of RandSIMD:
+int GetSIMDRandContext( void );
+void ReleaseSIMDRandContext( int nContext );
+
+// Four random numbers in -1..1: RandSIMD()'s 0..1 output remapped as 2*r - 1.
+FORCEINLINE fltx4 RandSignedSIMD( void )
+{
+	const fltx4 fl4Doubled = MulSIMD( Four_Twos, RandSIMD() );
+	return SubSIMD( fl4Doubled, Four_Ones );
+}
+
+
+// SIMD versions of mathlib simplespline functions
+// hermite basis function for smooth interpolation: 3*v^2 - 2*v^3 per lane
+// Similar to Gain() above, but very cheap to call
+// value should be between 0 & 1 inclusive
+inline fltx4 SimpleSpline( const fltx4 & value )
+{
+	// The two products below do not depend on each other, which avoids a data
+	// dependency between the MULs.
+	const fltx4 fl4TwoV = MulSIMD( value, Four_Twos );
+	const fltx4 fl4VSquared = MulSIMD( value, value );
+
+	// Nice little ease-in, ease-out spline-like curve
+	const fltx4 fl4ThreeVSquared = MulSIMD( Four_Threes, fl4VSquared );
+	const fltx4 fl4TwoVCubed = MulSIMD( fl4TwoV, fl4VSquared );
+	return SubSIMD( fl4ThreeVSquared, fl4TwoVCubed );
+}
+
+// remaps a value in [startInterval, startInterval+rangeInterval] from linear to
+// spline using SimpleSpline: C + DMinusC * SimpleSpline( (val - A) * OneOverBMinusA ).
+// BMinusA is accepted but not read here. The A == B case is not handled.
+inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val,
+											 const fltx4 & A, const fltx4 & BMinusA,
+											 const fltx4 & OneOverBMinusA, const fltx4 & C, 
+											 const fltx4 & DMinusC )
+{
+	// fraction of the way from A to B
+	const fltx4 fl4Fraction = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
+	const fltx4 fl4Eased = SimpleSpline( fl4Fraction );
+	return AddSIMD( C, MulSIMD( DMinusC, fl4Eased ) );
+}
+
+// Same remap as SimpleSplineRemapValWithDeltas, except the fraction is clamped to 0..1
+// before it goes through SimpleSpline. BMinusA is accepted but not read here.
+// The A == B case is not handled.
+inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val,
+													const fltx4 & A, const fltx4 & BMinusA,
+													const fltx4 & OneOverBMinusA, const fltx4 & C, 
+													const fltx4 & DMinusC )
+{
+	// fraction of the way from A to B
+	const fltx4 fl4Fraction = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
+	// lower bound first, then upper bound
+	const fltx4 fl4Clamped = MinSIMD( Four_Ones, MaxSIMD( Four_Zeros, fl4Fraction ) );
+	const fltx4 fl4Eased = SimpleSpline( fl4Clamped );
+	return AddSIMD( C, MulSIMD( DMinusC, fl4Eased ) );
+}
+
+// Fractional part of each lane: |val| minus the integer part of |val|, with val's
+// sign bits copied back onto the result.
+FORCEINLINE fltx4 FracSIMD( const fltx4 &val )
+{
+	const fltx4 fl4Magnitude = fabs( val );
+
+	// snap |val| to an integer by adding and subtracting Four_2ToThe23s ...
+	const fltx4 fl4Snapped = SubSIMD( AddSIMD( fl4Magnitude, Four_2ToThe23s ), Four_2ToThe23s );
+	// ... and step back by one wherever the snap went above |val|
+	const fltx4 fl4WentUp = CmpGtSIMD( fl4Snapped, fl4Magnitude );
+	const fltx4 fl4Whole = MaskedAssign( fl4WentUp, SubSIMD( fl4Snapped, Four_Ones ), fl4Snapped );
+
+	// val ^ |val| isolates the bits in which the two differ (the sign bits)
+	const fltx4 fl4SignBits = XorSIMD( val, fl4Magnitude );
+	return XorSIMD( SubSIMD( fl4Magnitude, fl4Whole ), fl4SignBits );
+}
+
+// Per-lane remainder of val with respect to 2, computed on |val| and then given val's sign bits back.
+FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val )
+{
+	fltx4 fl4Abs = fabs( val );
+	// same add/subtract Four_2ToThe23s snap as FloorSIMD, with g_SIMD_lsbmask applied in between;
+	// presumably the mask clears the lowest mantissa bit so that ival is an even integer -- TODO confirm
+	fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( (float *) g_SIMD_lsbmask ), AddSIMD( fl4Abs, Four_2ToThe23s ) ), Four_2ToThe23s );
+	// where the snapped value ended up above |val|, step back by two
+	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Twos ), ival );
+	// remainder = |val| - ival; val ^ |val| isolates the sign bits
+	return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) );			// restore sign bits
+}
+
+// Variant of Mod2SIMD without the fabs() and sign-restore steps; as the name says,
+// it is meant for input that is already non-negative.
+FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val )
+{
+	// snap val by adding/subtracting Four_2ToThe23s with g_SIMD_lsbmask applied in between;
+	// presumably the mask clears the lowest mantissa bit so that ival is an even integer -- TODO confirm
+	fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), AddSIMD( val, Four_2ToThe23s ) ), Four_2ToThe23s );
+	// where the snapped value ended up above val, step back by two
+	ival = MaskedAssign( CmpGtSIMD( ival, val ), SubSIMD( ival, Four_Twos ), ival );
+	return SubSIMD( val, ival );
+}
+
+
+// approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi.
+// no range reduction is done - for values outside of 0..1 you won't like the results
+FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val )
+{
+	// really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between.
+	// sufficient for simple oscillation.
+	const fltx4 fl4FourX = MulSIMD( val, Four_Fours );
+	const fltx4 fl4FourMinusFourX = SubSIMD( Four_Fours, fl4FourX );
+	return MulSIMD( val, fl4FourMinusFourX );
+}
+
+// Better sin approximation on 0..1 (no range reduction): blends the parabola
+// estimate with its own square.
+FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val )
+{
+	// not a bad approximation : parabola always over-estimates. Squared parabola always
+	// underestimates. So lets blend between them:  goodsin = badsin + .225*( badsin^2-badsin)
+	const fltx4 fl4Parabola = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
+	const fltx4 fl4ParabolaSq = MulSIMD( fl4Parabola, fl4Parabola );
+	const fltx4 fl4Blend = MulSIMD( Four_Point225s, SubSIMD( fl4ParabolaSq, fl4Parabola ) );
+	return AddSIMD( fl4Blend, fl4Parabola );
+}
+
+// full range useable implementations
+// Full-range version of _SinEst01SIMD: reduces |val| into 0..1, evaluates the
+// rough parabola there and then fixes up the sign.
+FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val )
+{
+	// reduce |val| modulo 2
+	const fltx4 fl4Magnitude = fabs( val );
+	const fltx4 fl4Mod2 = Mod2SIMDPositiveInput( fl4Magnitude );
+
+	// lanes that landed in the upper half (>= 1) are shifted down by one
+	const fltx4 fl4UpperHalfMask = CmpGeSIMD( fl4Mod2, Four_Ones );
+	const fltx4 fl4Phase = SubSIMD( fl4Mod2, AndSIMD( Four_Ones, fl4UpperHalfMask ) );
+
+	const fltx4 fl4Unsigned = _SinEst01SIMD( fl4Phase );
+
+	// flip the sign where exactly one of "val has its sign bit set" / "upper half" holds
+	const fltx4 fl4SignFlip = AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4UpperHalfMask ) );
+	return XorSIMD( fl4Unsigned, fl4SignFlip );
+}
+
+// Full-range version of _Sin01SIMD: reduces |val| into 0..1, evaluates the
+// blended approximation there and then fixes up the sign.
+FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val )
+{
+	// reduce |val| modulo 2
+	const fltx4 fl4Magnitude = fabs( val );
+	const fltx4 fl4Mod2 = Mod2SIMDPositiveInput( fl4Magnitude );
+
+	// lanes that landed in the upper half (>= 1) are shifted down by one
+	const fltx4 fl4UpperHalfMask = CmpGeSIMD( fl4Mod2, Four_Ones );
+	const fltx4 fl4Phase = SubSIMD( fl4Mod2, AndSIMD( Four_Ones, fl4UpperHalfMask ) );
+
+	const fltx4 fl4Unsigned = _Sin01SIMD( fl4Phase );
+
+	// flip the sign where exactly one of "val has its sign bit set" / "upper half" holds
+	const fltx4 fl4SignFlip = AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4UpperHalfMask ) );
+	return XorSIMD( fl4Unsigned, fl4SignFlip );
+}
+
+// Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1)
+ 
+// Precompute the (1/a - 2) term of the Schlick bias approximation for use with BiasSIMD.
+FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter )
+{
+	// convert perlin-style-bias parameter to the value right for the approximation
+	const fltx4 fl4OneOverBias = ReciprocalSIMD( bias_parameter );
+	return SubSIMD( fl4OneOverBias, Four_Twos );
+}
+
+// Schlick-style bias: val / ( precalc_param * (1 - val) + 1 ).
+// similar to bias function except pass precalced bias value from calling PreCalcBiasParameter.
+FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param )
+{
+	//!!speed!! use reciprocal est?
+	//!!speed!! could save one op by precalcing _2_ values
+	const fltx4 fl4OneMinusVal = SubSIMD( Four_Ones, val );
+	const fltx4 fl4Denominator = AddSIMD( MulSIMD( precalc_param, fl4OneMinusVal ), Four_Ones );
+	return DivSIMD( val, fl4Denominator );
+}
+
+//-----------------------------------------------------------------------------
+// Box/plane test 
+// NOTE: The w component of emins + emaxs must be 1 for this to work
+//
+// Return value is built from two tests: 1 is added when the box corner furthest
+// along the plane normal is at or beyond +tolerance, 2 is added when the opposite
+// corner is below -tolerance.
+//-----------------------------------------------------------------------------
+FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f )
+{
+	// plane as (nx, ny, nz, -dist), so a 4-wide dot with (px, py, pz, 1) is the signed distance
+	const fltx4 fl4NegDist = ReplicateX4( -p->dist );
+	const fltx4 fl4Plane = SetWSIMD( LoadUnalignedSIMD( p->normal.Base() ), fl4NegDist );
+
+	const fltx4 fl4Tolerance = ReplicateX4( tolerance );
+	const fltx4 fl4NegTolerance = ReplicateX4( -tolerance );
+
+	// per component, pick max where the plane component is >= 0 and min otherwise
+	// (and the reverse for the opposite corner)
+	const fltx4 fl4PositiveMask = CmpGeSIMD( fl4Plane, Four_Zeros );
+	const fltx4 fl4FarCorner = MaskedAssign( fl4PositiveMask, emaxs, emins );
+	const fltx4 fl4NearCorner = MaskedAssign( fl4PositiveMask, emins, emaxs );
+
+	const fltx4 fl4FarDist = Dot4SIMD( fl4Plane, fl4FarCorner );
+	const fltx4 fl4NearDist = Dot4SIMD( fl4Plane, fl4NearCorner );
+
+	const fltx4 fl4FrontMask = CmpGeSIMD( fl4FarDist, fl4Tolerance );
+	const fltx4 fl4BackMask = CmpGtSIMD( fl4NegTolerance, fl4NearDist );
+
+	// 1 for the front test plus 2 for the back test
+	const fltx4 fl4FrontBit = MaskedAssign( fl4FrontMask, Four_Ones, Four_Zeros );
+	const fltx4 fl4BackBit = MaskedAssign( fl4BackMask, Four_Twos, Four_Zeros );
+	const fltx4 fl4Sides = AddSIMD( fl4FrontBit, fl4BackBit );
+
+	// only the first lane of the converted result is returned
+	intx4 sides;
+	ConvertStoreAsIntsSIMD( &sides, fl4Sides );
+	return sides[0];
+}
+
+#endif // _ssemath_h
diff --git a/public/mathlib/ssequaternion.h b/public/mathlib/ssequaternion.h
new file mode 100644
index 0000000..825a9e4
--- /dev/null
+++ b/public/mathlib/ssequaternion.h
@@ -0,0 +1,367 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: - defines SIMD "structure of arrays" classes and functions.
+//
+//===========================================================================//
+#ifndef SSEQUATMATH_H
+#define SSEQUATMATH_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+
+#include "mathlib/ssemath.h"
+
+// Use this #define to allow SSE versions of Quaternion math
+// to exist on PC.
+// On PC, certain horizontal vector operations are not supported.
+// This causes the SSE implementation of quaternion math to mix the
+// vector and scalar floating point units, which is extremely 
+// performance negative if you don't compile to native SSE2 (which 
+// we don't as of Sept 1, 2007). So, it's best not to allow these
+// functions to exist at all. It's not good enough to simply replace
+// the contents of the functions with scalar math, because each call
+// to LoadAligned and StoreAligned will result in an unnecessary copy
+// of the quaternion, and several moves to and from the XMM registers.
+//
+// Basically, the problem you run into is that for efficient SIMD code,
+// you need to load the quaternions and vectors into SIMD registers and
+// keep them there as long as possible while doing only SIMD math,
+// whereas for efficient scalar code, each time you copy onto or ever
+// use a fltx4, it hoses your pipeline. So the difference has to be
+// in the management of temporary variables in the calling function,
+// not inside the math functions.
+//
+// If you compile assuming the presence of SSE2, the MSVC will abandon
+// the traditional x87 FPU operations altogether and make everything use
+// the SSE2 registers, which lessens this problem a little.
+
+// permitted only on 360, as we've done careful tuning on its Altivec math:
+#ifdef _X360
+#define ALLOW_SIMD_QUATERNION_MATH 1  // not on PC!
+#endif
+
+
+
+//---------------------------------------------------------------------
+// Load/store quaternions
+//---------------------------------------------------------------------
+#ifndef _X360
+#if ALLOW_SIMD_QUATERNION_MATH
+// Using STDC or SSE
+FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
+{
+	// Forward to the LoadAlignedSIMD overload that accepts the pointer returned by Base().
+	return LoadAlignedSIMD( pSIMD.Base() );
+}
+
+// Loads the quaternion pointed to by pSIMD into a fltx4.
+FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
+{
+	// Go through Base(), as the by-reference overload does. Passing pSIMD
+	// itself is an exact match for this very overload, so the call would
+	// resolve back to this function and recurse without end.
+	fltx4 retval = LoadAlignedSIMD( pSIMD->Base() );
+	return retval;
+}
+
+FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
+{
+	// Forward to the StoreAlignedSIMD overload that accepts the pointer
+	// returned by Base(), writing a into the quaternion's storage.
+	StoreAlignedSIMD( (*pSIMD).Base(), a );
+}
+#endif
+#else
+
+// for the transitional class -- load a QuaternionAligned
+FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
+{
+	// X360 path: hand the quaternion's base address to XMLoadVector4A.
+	return XMLoadVector4A( pSIMD.Base() );
+}
+
+FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
+{
+	// X360 path: the quaternion pointer is passed to XMLoadVector4A as-is.
+	return XMLoadVector4A( pSIMD );
+}
+
+FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
+{
+	// X360 path: store a at the quaternion's base address via XMStoreVector4A.
+	XMStoreVector4A( (*pSIMD).Base(), a );
+}
+
+#endif
+
+
+#if ALLOW_SIMD_QUATERNION_MATH
+//---------------------------------------------------------------------
+// Make sure quaternions are within 180 degrees of one another, if not, reverse q
+//---------------------------------------------------------------------
+FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q )
+{
+	// Compare |p - q|^2 with |p + q|^2. When the difference is the larger of
+	// the two, q is treated as backwards and its negation is returned.
+	const fltx4 diff = SubSIMD( p, q );
+	const fltx4 sum = AddSIMD( p, q );
+	const fltx4 diffLenSqr = Dot4SIMD( diff, diff );
+	const fltx4 sumLenSqr = Dot4SIMD( sum, sum );
+	const fltx4 flipMask = CmpGtSIMD( diffLenSqr, sumLenSqr );
+	return MaskedAssign( flipMask, NegSIMD(q), q );
+}
+
+//---------------------------------------------------------------------
+// Normalize Quaternion
+//---------------------------------------------------------------------
+#if USE_STDC_FOR_SIMD
+
+FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
+{
+	const fltx4 radius = Dot4SIMD( q, q );
+	const float flRadius = SubFloat( radius, 0 );
+
+	// A squared length of exactly zero cannot be normalized: return q as-is.
+	// (A stricter epsilon-based test was considered in the original comment:
+	//  > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON)) )
+	if ( flRadius == 0.0f )
+	{
+		return q;
+	}
+
+	// Scale every component by 1 / sqrt( q.q ).
+	const float iradius = 1.0f / sqrt( flRadius );
+	return MulSIMD( ReplicateX4( iradius ), q );
+}
+
+#else
+
+// SSE + X360 implementation
+FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
+{
+	const fltx4 lenSqr = Dot4SIMD( q, q );
+	const fltx4 zeroMask = CmpEqSIMD( lenSqr, Four_Zeros ); // all ones iff lenSqr = 0
+	const fltx4 scaled = MulSIMD( ReciprocalSqrtSIMD( lenSqr ), q );
+
+	// Where the squared length was 0 keep q itself, otherwise the scaled value.
+	return MaskedAssign( zeroMask, q, scaled );
+}
+
+#endif
+
+
+//---------------------------------------------------------------------
+// 0.0 returns p, 1.0 return q.
+//---------------------------------------------------------------------
+FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
+{
+	// Weighted sum (1 - t) * p + t * q, followed by a renormalize.
+	const fltx4 weightQ = ReplicateX4( t );
+	const fltx4 weightP = SubSIMD( Four_Ones, weightQ );
+	const fltx4 blended = MaddSIMD( weightQ, q, MulSIMD( weightP, p ) );
+	return QuaternionNormalizeSIMD( blended );
+}
+
+
+//---------------------------------------------------------------------
+// Blend Quaternions
+//---------------------------------------------------------------------
+FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t )
+{
+	// Flip q first if it is backwards relative to p, then blend as usual.
+	const fltx4 qAligned = QuaternionAlignSIMD( p, q );
+	return QuaternionBlendNoAlignSIMD( p, qAligned, t );
+}
+
+
+//---------------------------------------------------------------------
+// Multiply Quaternions
+//---------------------------------------------------------------------
+#ifndef _X360
+
+// SSE and STDC
+FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
+{
+	// Flip q first if it is backwards relative to p.
+	const fltx4 q2 = QuaternionAlignSIMD( p, q );
+
+	// Pull the components out once so the product below reads as plain algebra.
+	const float px = SubFloat( p, 0 );
+	const float py = SubFloat( p, 1 );
+	const float pz = SubFloat( p, 2 );
+	const float pw = SubFloat( p, 3 );
+	const float qx = SubFloat( q2, 0 );
+	const float qy = SubFloat( q2, 1 );
+	const float qz = SubFloat( q2, 2 );
+	const float qw = SubFloat( q2, 3 );
+
+	fltx4 result;
+	SubFloat( result, 0 ) =  px * qw + py * qz - pz * qy + pw * qx;
+	SubFloat( result, 1 ) = -px * qz + py * qw + pz * qx + pw * qy;
+	SubFloat( result, 2 ) =  px * qy - py * qx + pz * qw + pw * qz;
+	SubFloat( result, 3 ) = -px * qx - py * qy - pz * qz + pw * qw;
+	return result;
+}
+
+#else 
+
+// X360
+extern const fltx4 g_QuatMultRowSign[4];
+FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
+{
+	// Flip q first if it is backwards relative to p.
+	fltx4 qAligned = QuaternionAlignSIMD( p, q );
+	fltx4 signedRow, product, lane;
+
+	// Each output component is a dot product of p with a swizzled copy of
+	// qAligned multiplied by the matching row of g_QuatMultRowSign.
+	signedRow = MulSIMD( XMVectorSwizzle( qAligned, 3, 2, 1, 0 ), g_QuatMultRowSign[0] );
+	product = Dot4SIMD( signedRow, p );
+
+	signedRow = MulSIMD( XMVectorSwizzle( qAligned, 2, 3, 0, 1 ), g_QuatMultRowSign[1] );
+	lane = Dot4SIMD( signedRow, p );
+	product = __vrlimi( product, lane, 4, 0 );
+
+	signedRow = MulSIMD( XMVectorSwizzle( qAligned, 1, 0, 3, 2 ), g_QuatMultRowSign[2] );
+	lane = Dot4SIMD( signedRow, p );
+	product = __vrlimi( product, lane, 2, 0 );
+
+	// The last row uses qAligned unswizzled.
+	signedRow = MulSIMD( qAligned, g_QuatMultRowSign[3] );
+	lane = Dot4SIMD( signedRow, p );
+	product = __vrlimi( product, lane, 1, 0 );
+	return product;
+}
+
+#endif
+
+
+//---------------------------------------------------------------------
+// Quaternion scale
+//---------------------------------------------------------------------
+#ifndef _X360
+
+// SSE and STDC
+FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
+{
+	const float px = SubFloat( p, 0 );
+	const float py = SubFloat( p, 1 );
+	const float pz = SubFloat( p, 2 );
+
+	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
+	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
+	// Length of the xyz part, clamped to 1 before it is fed to asin.
+	float sinom = sqrt( px * px + py * py + pz * pz );
+	sinom = min( sinom, 1.f );
+
+	const float sinsom = sin( asin( sinom ) * t );
+
+	// Scale factor for xyz; FLT_EPSILON keeps the division finite when sinom is 0.
+	const float flScale = sinsom / (sinom + FLT_EPSILON);
+
+	fltx4 q;
+	SubFloat( q, 0 ) = flScale * px;
+	SubFloat( q, 1 ) = flScale * py;
+	SubFloat( q, 2 ) = flScale * pz;
+
+	// rescale rotation: w magnitude is sqrt( 1 - sinsom^2 ), clamped at 0
+	float r = 1.0f - sinsom * sinsom;
+	if ( r < 0.0f )
+	{
+		r = 0.0f;
+	}
+	r = sqrt( r );
+
+	// keep sign of rotation
+	SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r );
+	return q;
+}
+
+#else
+
+// X360
+FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
+{
+	// Length of the xyz part, clamped to 1 before it is fed to ArcSinSIMD.
+	fltx4 sinHalf = MinSIMD( SqrtSIMD( Dot3SIMD( p, p ) ), Four_Ones );
+
+	// sin( asin( sinHalf ) * t )
+	fltx4 sinScaled = SinSIMD( MulSIMD( ArcSinSIMD( sinHalf ), ReplicateX4( t ) ) );
+
+	// Scale p by sinScaled / ( sinHalf + epsilon ).
+	fltx4 invSin = ReciprocalSIMD( AddSIMD( sinHalf, Four_Epsilons ) );
+	fltx4 result = MulSIMD( p, MulSIMD( sinScaled, invSin ) );
+
+	// rescale rotation: sqrt( max( 1 - sinScaled^2, 0 ) )
+	fltx4 r = SubSIMD( Four_Ones, MulSIMD( sinScaled, sinScaled ) );
+	r = SqrtSIMD( MaxSIMD( r, Four_Zeros ) );
+
+	// keep sign of rotation
+	fltx4 nonNegMask = CmpGeSIMD( p, Four_Zeros );
+	r = MaskedAssign( nonNegMask, r, NegSIMD( r ) );
+
+	// Merge r into result with __vrlimi mask 1 (the PC path writes component 3 at this point).
+	result = __vrlimi(result, r, 1, 0);
+	return result;
+}
+
+#endif
+
+
+//-----------------------------------------------------------------------------
+// Quaternion spherical linear interpolation
+//-----------------------------------------------------------------------------
+#ifndef _X360
+
+// SSE and STDC
+FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
+{
+	// 0.0 returns p, 1.0 return q.
+	fltx4 result;
+	float sclp, sclq;
+
+	// Four-component dot product of p and q.
+	const float cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) + 
+		SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 );
+
+	if ( !( (1.0f + cosom ) > 0.000001f ) )
+	{
+		// cosom is within 0.000001 of -1 (or below): interpolate from p towards
+		// the quaternion ( -q.y, q.x, -q.w, q.z ) over a quarter turn instead.
+		// Only x, y and z are blended; w is taken from q.z unscaled.
+		sclp = sin( (1.0f - t) * (0.5f * M_PI));
+		sclq = sin( t * (0.5f * M_PI));
+		SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * -SubFloat( q, 1 );
+		SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq *  SubFloat( q, 0 );
+		SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * -SubFloat( q, 3 );
+		SubFloat( result, 3 ) = SubFloat( q, 2 );
+		return result;
+	}
+
+	if ( (1.0f - cosom ) > 0.000001f ) 
+	{
+		// General case: spherical weights sin((1-t)*omega)/sin(omega) and sin(t*omega)/sin(omega).
+		const float omega = acos( cosom );
+		const float sinom = sin( omega );
+		sclp = sin( (1.0f - t)*omega) / sinom;
+		sclq = sin( t*omega ) / sinom;
+	}
+	else 
+	{
+		// cosom is within 0.000001 of 1: fall back to linear weights.
+		// TODO: add short circuit for cosom == 1.0f?
+		sclp = 1.0f - t;
+		sclq = t;
+	}
+
+	for ( int i = 0; i < 4; ++i )
+	{
+		SubFloat( result, i ) = sclp * SubFloat( p, i ) + sclq * SubFloat( q, i );
+	}
+	return result;
+}
+
+#else
+
+// X360
+FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
+{
+	// X360 path: delegate the whole interpolation to XMQuaternionSlerp.
+	fltx4 result = XMQuaternionSlerp( p, q, t );
+	return result;
+}
+
+#endif
+
+
+FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t )
+{
+	// Flip q first if it is backwards relative to p, then slerp.
+	const fltx4 qAligned = QuaternionAlignSIMD( p, q );
+	return QuaternionSlerpNoAlignSIMD( p, qAligned, t );
+}
+
+
+#endif // ALLOW_SIMD_QUATERNION_MATH
+
+#endif // SSEQUATMATH_H
+
diff --git a/public/mathlib/vector.h b/public/mathlib/vector.h
new file mode 100644
index 0000000..c7654ba
--- /dev/null
+++ b/public/mathlib/vector.h
@@ -0,0 +1,2311 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef VECTOR_H
+#define VECTOR_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// For vec_t, put this somewhere else?
+#include "tier0/basetypes.h"
+
+// For rand(). We really need a library!
+#include <stdlib.h>
+
+#ifndef _X360
+// For MMX intrinsics
+#include <xmmintrin.h>
+#endif
+
+#include "tier0/dbg.h"
+#include "tier0/threadtools.h"
+#include "mathlib/vector2d.h"
+#include "mathlib/math_pfns.h"
+
+// Uncomment this to add extra Asserts to check for NANs, uninitialized vecs, etc.
+//#define VECTOR_PARANOIA	1
+
+// Uncomment this to make sure we don't do anything slow with our vectors
+//#define VECTOR_NO_SLOW_OPERATIONS 1
+
+
+// Used to make certain code easier to read.
+#define X_INDEX	0
+#define Y_INDEX	1
+#define Z_INDEX	2
+
+
+#ifdef VECTOR_PARANOIA
+#define CHECK_VALID( _v)	Assert( (_v).IsValid() )
+#else
+#ifdef GNUC
+#define CHECK_VALID( _v)
+#else
+#define CHECK_VALID( _v)	0
+#endif
+#endif
+
+#define VecToString(v)	(static_cast<const char *>(CFmtStr("(%f, %f, %f)", (v).x, (v).y, (v).z))) // ** Note: this generates a temporary, don't hold reference!
+
+class VectorByValue;
+
+//=========================================================
+// 3D Vector
+//=========================================================
+// Three-component vector of vec_t. The only data members are x, y and z,
+// declared in that order; the array accessors, Base() and AsVector2D() defined
+// later in this header reach the components by casting the this pointer.
+// When VECTOR_NO_SLOW_OPERATIONS is defined, the by-value arithmetic operators
+// are not declared and the copy constructor is made private.
+class Vector					
+{
+public:
+	// Members
+	vec_t x, y, z;
+
+	// Construction/destruction:
+	Vector(void); 
+	Vector(vec_t X, vec_t Y, vec_t Z);
+	explicit Vector(vec_t XYZ); ///< broadcast initialize
+
+	// Initialization
+	void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f);
+	 // TODO (Ilya): Should there be an init that takes a single float for consistency?
+
+	// Got any nasty NAN's?
+	bool IsValid() const;
+	// Sets all three components to VEC_T_NAN.
+	void Invalidate();
+
+	// array access...
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	// Base address...
+	vec_t* Base();
+	vec_t const* Base() const;
+
+	// Cast to Vector2D...
+	Vector2D& AsVector2D();
+	const Vector2D& AsVector2D() const;
+
+	// Initialization methods
+	// Random() draws each component from rand(), scaled into [minVal, maxVal].
+	void Random( vec_t minVal, vec_t maxVal );
+	inline void Zero(); ///< zero out a vector
+
+	// equality
+	// (exact component-wise comparison, no tolerance)
+	bool operator==(const Vector& v) const;
+	bool operator!=(const Vector& v) const;	
+
+	// arithmetic operations
+	FORCEINLINE Vector&	operator+=(const Vector &v);			
+	FORCEINLINE Vector&	operator-=(const Vector &v);		
+	FORCEINLINE Vector&	operator*=(const Vector &v);			
+	FORCEINLINE Vector&	operator*=(float s);
+	FORCEINLINE Vector&	operator/=(const Vector &v);		
+	FORCEINLINE Vector&	operator/=(float s);	
+	FORCEINLINE Vector&	operator+=(float fl) ; ///< broadcast add
+	FORCEINLINE Vector&	operator-=(float fl) ; ///< broadcast sub			
+
+// negate the vector components
+	void	Negate(); 
+
+	// Get the vector's magnitude.
+	inline vec_t	Length() const;
+
+	// Get the vector's magnitude squared.
+	FORCEINLINE vec_t LengthSqr(void) const
+	{ 
+		CHECK_VALID(*this);
+		return (x*x + y*y + z*z);		
+	}
+
+	// return true if this vector is (0,0,0) within tolerance
+	// (each component must lie strictly between -tolerance and +tolerance)
+	bool IsZero( float tolerance = 0.01f ) const
+	{
+		return (x > -tolerance && x < tolerance &&
+				y > -tolerance && y < tolerance &&
+				z > -tolerance && z < tolerance);
+	}
+
+	vec_t	NormalizeInPlace();
+	Vector	Normalized() const;
+	bool	IsLengthGreaterThan( float val ) const;
+	bool	IsLengthLessThan( float val ) const;
+
+	// check if a vector is within the box defined by two other vectors
+	// NOTE(review): declared non-const although it reads like a pure query --
+	// confirm against the definition before calling it on a const Vector.
+	FORCEINLINE bool WithinAABox( Vector const &boxmin, Vector const &boxmax);
+ 
+	// Get the distance from this vector to the other one.
+	vec_t	DistTo(const Vector &vOther) const;
+
+	// Get the distance from this vector to the other one squared.
+	// NJS: note, VC wasn't inlining it correctly in several deeply nested inlines due to being an 'out of line' inline.  
+	// may be able to tidy this up after switching to VC7
+	FORCEINLINE vec_t DistToSqr(const Vector &vOther) const
+	{
+		Vector delta;
+
+		delta.x = x - vOther.x;
+		delta.y = y - vOther.y;
+		delta.z = z - vOther.z;
+
+		return delta.LengthSqr();
+	}
+
+	// Copy
+	// Writes x, y, z to rgfl[0..2].
+	void	CopyToArray(float* rgfl) const;	
+
+	// Multiply, add, and assign to this (ie: *this = a + b * scalar). This
+	// is about 12% faster than the actual vector equation (because it's done per-component
+	// rather than per-vector).
+	void	MulAdd(const Vector& a, const Vector& b, float scalar);	
+
+	// Dot product.
+	vec_t	Dot(const Vector& vOther) const;			
+
+	// assignment
+	Vector& operator=(const Vector &vOther);
+
+	// 2d
+	vec_t	Length2D(void) const;					
+	vec_t	Length2DSqr(void) const;					
+
+	// View this object as a VectorByValue. This is a pointer cast of this,
+	// so no copy is made and the result refers to the same storage.
+	operator VectorByValue &()				{ return *((VectorByValue *)(this)); }
+	operator const VectorByValue &() const	{ return *((const VectorByValue *)(this)); }
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// copy constructors
+//	Vector(const Vector &vOther);
+
+	// arithmetic operations
+	// (these return a new Vector by value)
+	Vector	operator-(void) const;
+				
+	Vector	operator+(const Vector& v) const;	
+	Vector	operator-(const Vector& v) const;	
+	Vector	operator*(const Vector& v) const;	
+	Vector	operator/(const Vector& v) const;	
+	Vector	operator*(float fl) const;
+	Vector	operator/(float fl) const;			
+	
+	// Cross product between two vectors.
+	Vector	Cross(const Vector &vOther) const;		
+
+	// Returns a vector with the min or max in X, Y, and Z.
+	Vector	Min(const Vector &vOther) const;
+	Vector	Max(const Vector &vOther) const;
+
+#else
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	Vector(const Vector& vOther);
+#endif
+};
+
+// Construction hook for Vector: sets all three components of v to zero.
+FORCEINLINE void NetworkVarConstruct( Vector &v )
+{
+	v.Zero();
+}
+
+
+#define USE_M64S ( ( !defined( _X360 ) ) )
+
+
+
+//=========================================================
+// 4D Short Vector (aligned on 8-byte boundary)
+//=========================================================
+// Four 16-bit signed components (x, y, z, w), aligned on an 8-byte boundary
+// by ALIGN8 / ALIGN8_POST.
+class ALIGN8 ShortVector
+{
+public:
+
+	short x, y, z, w;
+
+	// Initialization
+	void Init(short ix = 0, short iy = 0, short iz = 0, short iw = 0 );
+
+
+	// The __m64 view is only offered off the Xbox 360. This tests _X360
+	// directly: USE_M64S is always defined (its value is the !defined(_X360)
+	// expression), so "#ifdef USE_M64S" was true on every platform and never
+	// excluded anything.
+#ifndef _X360
+	// Reinterpret the storage starting at x as an __m64 (no copy is made).
+	__m64 &AsM64() { return *(__m64*)&x; }
+	const __m64 &AsM64() const { return *(const __m64*)&x; } 
+#endif
+
+	// Setter
+	void Set( const ShortVector& vOther );
+	void Set( const short ix, const short iy, const short iz, const short iw );
+
+	// array access...
+	short operator[](int i) const;
+	short& operator[](int i);
+
+	// Base address...
+	short* Base();
+	short const* Base() const;
+
+	// equality
+	bool operator==(const ShortVector& v) const;
+	bool operator!=(const ShortVector& v) const;	
+
+	// Arithmetic operations
+	FORCEINLINE ShortVector& operator+=(const ShortVector &v);			
+	FORCEINLINE ShortVector& operator-=(const ShortVector &v);		
+	FORCEINLINE ShortVector& operator*=(const ShortVector &v);			
+	FORCEINLINE ShortVector& operator*=(float s);
+	FORCEINLINE ShortVector& operator/=(const ShortVector &v);		
+	FORCEINLINE ShortVector& operator/=(float s);					
+	FORCEINLINE ShortVector operator*(float fl) const;
+
+private:
+
+	// No copy constructors allowed if we're in optimal mode
+//	ShortVector(ShortVector const& vOther);
+
+	// No assignment operators either...
+//	ShortVector& operator=( ShortVector const& src );
+
+} ALIGN8_POST;
+
+
+
+
+
+
+//=========================================================
+// 4D Integer Vector
+//=========================================================
+// Four int components (x, y, z, w).
+class IntVector4D
+{
+public:
+
+	int x, y, z, w;
+
+	// Initialization
+	void Init(int ix = 0, int iy = 0, int iz = 0, int iw = 0 );
+
+	// The __m64 view is only offered off the Xbox 360. This tests _X360
+	// directly: USE_M64S is always defined (its value is the !defined(_X360)
+	// expression), so "#ifdef USE_M64S" was true on every platform and never
+	// excluded anything.
+#ifndef _X360
+	// Reinterpret the storage starting at x as an __m64 (no copy is made).
+	// NOTE(review): an __m64 is 64 bits wide, so this view presumably spans
+	// only x and y of the four ints -- confirm that is what callers expect.
+	__m64 &AsM64() { return *(__m64*)&x; }
+	const __m64 &AsM64() const { return *(const __m64*)&x; } 
+#endif
+
+	// Setter
+	void Set( const IntVector4D& vOther );
+	void Set( const int ix, const int iy, const int iz, const int iw );
+
+	// array access...
+	int operator[](int i) const;
+	int& operator[](int i);
+
+	// Base address...
+	int* Base();
+	int const* Base() const;
+
+	// equality
+	bool operator==(const IntVector4D& v) const;
+	bool operator!=(const IntVector4D& v) const;	
+
+	// Arithmetic operations
+	FORCEINLINE IntVector4D& operator+=(const IntVector4D &v);			
+	FORCEINLINE IntVector4D& operator-=(const IntVector4D &v);		
+	FORCEINLINE IntVector4D& operator*=(const IntVector4D &v);			
+	FORCEINLINE IntVector4D& operator*=(float s);
+	FORCEINLINE IntVector4D& operator/=(const IntVector4D &v);		
+	FORCEINLINE IntVector4D& operator/=(float s);					
+	FORCEINLINE IntVector4D operator*(float fl) const;
+
+private:
+
+	// No copy constructors allowed if we're in optimal mode
+	//	IntVector4D(IntVector4D const& vOther);
+
+	// No assignment operators either...
+	//	IntVector4D& operator=( IntVector4D const& src );
+
+};
+
+
+
+//-----------------------------------------------------------------------------
+// Allows us to specifically pass the vector by value when we need to
+//-----------------------------------------------------------------------------
+class VectorByValue : public Vector
+{
+public:
+	// Same construction options as Vector( void ) and Vector( X, Y, Z ).
+	VectorByValue(void)
+		: Vector()
+	{
+	}
+
+	VectorByValue(vec_t X, vec_t Y, vec_t Z)
+		: Vector( X, Y, Z )
+	{
+	}
+
+	// Public copy constructor: default-construct the base, then copy the
+	// components across with Vector's assignment operator.
+	VectorByValue(const VectorByValue& vOther)
+		: Vector()
+	{
+		Vector::operator=( vOther );
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Utility to simplify table construction. No constructor means can use
+// traditional C-style initialization
+//-----------------------------------------------------------------------------
+class TableVector
+{
+public:
+	vec_t x, y, z;
+
+	// Reinterpret this object in place as a Vector (pointer cast, no copy).
+	operator Vector &()				{ return *reinterpret_cast<Vector *>( this ); }
+	operator const Vector &() const	{ return *reinterpret_cast<const Vector *>( this ); }
+
+	// Indexed component access; i must be 0, 1 or 2 (checked with Assert).
+	inline vec_t& operator[](int i)
+	{
+		Assert( (i >= 0) && (i < 3) );
+		vec_t *pComponents = reinterpret_cast<vec_t *>( this );
+		return pComponents[i];
+	}
+
+	inline vec_t operator[](int i) const
+	{
+		Assert( (i >= 0) && (i < 3) );
+		const vec_t *pComponents = reinterpret_cast<const vec_t *>( this );
+		return pComponents[i];
+	}
+};
+
+
+//-----------------------------------------------------------------------------
+// Here's where we add all those lovely SSE optimized routines
+//-----------------------------------------------------------------------------
+
+class ALIGN16 VectorAligned : public Vector
+{
+public:
+	// Default construction assigns nothing here; w is left unset.
+	VectorAligned(void) {}
+
+	// Sets x, y, z through Init(); w is left unset.
+	VectorAligned(vec_t X, vec_t Y, vec_t Z)
+	{
+		Init( X, Y, Z );
+	}
+
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	VectorAligned(const VectorAligned& vOther);
+	VectorAligned(const Vector &vOther);
+
+#else
+public:
+	// Build from a plain Vector by copying its three components.
+	explicit VectorAligned(const Vector &vOther)
+	{
+		Init( vOther.x, vOther.y, vOther.z );
+	}
+
+	// Assign from a plain Vector; only x, y and z are written.
+	VectorAligned& operator=(const Vector &vOther)
+	{
+		Init( vOther.x, vOther.y, vOther.z );
+		return *this;
+	}
+
+#endif
+	float w;	// this space is used anyway
+} ALIGN16_POST;
+
+//-----------------------------------------------------------------------------
+// Vector related operations
+//-----------------------------------------------------------------------------
+
+// Vector clear
+FORCEINLINE void VectorClear( Vector& a );
+
+// Copy
+FORCEINLINE void VectorCopy( const Vector& src, Vector& dst );
+
+// Vector arithmetic
+FORCEINLINE void VectorAdd( const Vector& a, const Vector& b, Vector& result );
+FORCEINLINE void VectorSubtract( const Vector& a, const Vector& b, Vector& result );
+FORCEINLINE void VectorMultiply( const Vector& a, vec_t b, Vector& result );
+FORCEINLINE void VectorMultiply( const Vector& a, const Vector& b, Vector& result );
+FORCEINLINE void VectorDivide( const Vector& a, vec_t b, Vector& result );
+FORCEINLINE void VectorDivide( const Vector& a, const Vector& b, Vector& result );
+inline void VectorScale ( const Vector& in, vec_t scale, Vector& result );
+// Don't mark this as inline in its function declaration. That's only necessary on its
+// definition, and 'inline' here leads to gcc warnings.
+void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest );
+
+// Vector equality with tolerance
+bool VectorsAreEqual( const Vector& src1, const Vector& src2, float tolerance = 0.0f );
+
+#define VectorExpand(v) (v).x, (v).y, (v).z
+
+
+// Normalization
+// FIXME: Can't use quite yet
+//vec_t VectorNormalize( Vector& v );
+
+// Length
+inline vec_t VectorLength( const Vector& v );
+
+// Dot Product
+FORCEINLINE vec_t DotProduct(const Vector& a, const Vector& b);
+
+// Cross product
+void CrossProduct(const Vector& a, const Vector& b, Vector& result );
+
+// Store the min or max of each of x, y, and z into the result.
+void VectorMin( const Vector &a, const Vector &b, Vector &result );
+void VectorMax( const Vector &a, const Vector &b, Vector &result );
+
+// Linearly interpolate between two vectors
+void VectorLerp(const Vector& src1, const Vector& src2, vec_t t, Vector& dest );
+Vector VectorLerp(const Vector& src1, const Vector& src2, vec_t t );
+
+FORCEINLINE Vector ReplicateToVector( float x )
+{
+	// The explicit single-value constructor broadcasts x into all three components.
+	return Vector( x );
+}
+
+// check if a point is in the field of a view of an object. supports up to 180 degree fov.
+FORCEINLINE bool PointWithinViewAngle( Vector const &vecSrcPosition, 
+									   Vector const &vecTargetPosition, 
+									   Vector const &vecLookDirection, float flCosHalfFOV )
+{
+	Vector vecToTarget = vecTargetPosition - vecSrcPosition;
+	float flAlongLook = DotProduct( vecLookDirection, vecToTarget );
+
+	// A negative projection onto the look direction is rejected outright.
+	if ( flAlongLook < 0 )
+	{
+		return false;
+	}
+
+	float flDistSqr = vecToTarget.LengthSqr();
+
+	// Avoids the square root:  a/sqrt(b) > c  ==  a^2 > b * c^2
+	return ( flAlongLook * flAlongLook > flDistSqr * flCosHalfFOV * flCosHalfFOV );
+}
+
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Cross product
+Vector CrossProduct( const Vector& a, const Vector& b );
+
+// Random vector creation
+Vector RandomVector( vec_t minVal, vec_t maxVal );
+
+#endif
+
+float RandomVectorInUnitSphere( Vector *pVector );
+float RandomVectorInUnitCircle( Vector2D *pVector );
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Vector methods
+//
+//-----------------------------------------------------------------------------
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+inline Vector::Vector(void)
+{
+	// Components are left unset, except in debug builds with VECTOR_PARANOIA.
+#if defined( _DEBUG ) && defined( VECTOR_PARANOIA )
+	// Initialize to NAN to catch errors
+	x = y = z = VEC_T_NAN;
+#endif
+}
+
+inline Vector::Vector(vec_t X, vec_t Y, vec_t Z)
+	: x( X ), y( Y ), z( Z )
+{
+	// Component-wise construction; validity is checked only when CHECK_VALID is active.
+	CHECK_VALID(*this);
+}
+
+inline Vector::Vector(vec_t XYZ)
+	: x( XYZ ), y( XYZ ), z( XYZ )
+{
+	// Broadcast construction: every component receives the same value.
+	CHECK_VALID(*this);
+}
+
+//inline Vector::Vector(const float *pFloat)					
+//{
+//	Assert( pFloat );
+//	x = pFloat[0]; y = pFloat[1]; z = pFloat[2];	
+//	CHECK_VALID(*this);
+//} 
+
+#if 0
+//-----------------------------------------------------------------------------
+// copy constructor
+//-----------------------------------------------------------------------------
+
+inline Vector::Vector(const Vector &vOther)					
+{ 
+	CHECK_VALID(vOther);
+	x = vOther.x; y = vOther.y; z = vOther.z;
+}
+#endif
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+
+inline void Vector::Init( vec_t ix, vec_t iy, vec_t iz )
+{
+	// Overwrite all three components, then validate if CHECK_VALID is active.
+	x = ix;
+	y = iy;
+	z = iz;
+	CHECK_VALID(*this);
+}
+
+inline void Vector::Random( vec_t minVal, vec_t maxVal )
+{
+	// One rand() draw per component, taken in x, y, z order, each scaled by
+	// VALVE_RAND_MAX and mapped onto the span from minVal to maxVal.
+	const vec_t flRandX = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	const vec_t flRandY = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	const vec_t flRandZ = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+
+	x = flRandX;
+	y = flRandY;
+	z = flRandZ;
+	CHECK_VALID(*this);
+}
+
+// This should really be a single opcode on the PowerPC (move r0 onto the vec reg)
+inline void Vector::Zero()
+{
+	// Set every component to zero.
+	x = 0.0f;
+	y = 0.0f;
+	z = 0.0f;
+}
+
+inline void VectorClear( Vector& a )
+{
+	// Free-function form of zeroing: set every component of a to zero.
+	a.x = 0.0f;
+	a.y = 0.0f;
+	a.z = 0.0f;
+}
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+
+inline Vector& Vector::operator=(const Vector &vOther)
+{
+	// Validate the source (when CHECK_VALID is active), then copy component-wise.
+	CHECK_VALID(vOther);
+	x = vOther.x;
+	y = vOther.y;
+	z = vOther.z;
+	return *this;
+}
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+inline vec_t& Vector::operator[](int i)
+{
+	// i must be 0, 1 or 2; the component is addressed through the base pointer.
+	Assert( (i >= 0) && (i < 3) );
+	vec_t *pComponents = reinterpret_cast<vec_t *>( this );
+	return pComponents[i];
+}
+
+inline vec_t Vector::operator[](int i) const
+{
+	// i must be 0, 1 or 2; returns a copy of the selected component.
+	Assert( (i >= 0) && (i < 3) );
+	const vec_t *pComponents = reinterpret_cast<const vec_t *>( this );
+	return pComponents[i];
+}
+
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+inline vec_t* Vector::Base()
+{
+	// The object's own address, viewed as a pointer to vec_t.
+	return reinterpret_cast<vec_t *>( this );
+}
+
+inline vec_t const* Vector::Base() const
+{
+	// The object's own address, viewed as a pointer to const vec_t.
+	return reinterpret_cast<vec_t const *>( this );
+}
+
+//-----------------------------------------------------------------------------
+// Cast to Vector2D...
+//-----------------------------------------------------------------------------
+
+inline Vector2D& Vector::AsVector2D()
+{
+	// Pointer cast of this object to Vector2D; no copy is made.
+	Vector2D *pAs2D = reinterpret_cast<Vector2D *>( this );
+	return *pAs2D;
+}
+
+inline const Vector2D& Vector::AsVector2D() const
+{
+	// Pointer cast of this object to const Vector2D; no copy is made.
+	const Vector2D *pAs2D = reinterpret_cast<const Vector2D *>( this );
+	return *pAs2D;
+}
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+
+inline bool Vector::IsValid() const
+{
+	// Valid only if IsFinite accepts every component; stop at the first failure.
+	if ( !IsFinite(x) )
+		return false;
+	if ( !IsFinite(y) )
+		return false;
+	if ( !IsFinite(z) )
+		return false;
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+// Invalidate
+//-----------------------------------------------------------------------------
+
+inline void Vector::Invalidate()
+{
+	// Unconditionally mark every component as VEC_T_NAN. The _DEBUG /
+	// VECTOR_PARANOIA guards that once wrapped this line are disabled.
+	x = y = z = VEC_T_NAN;
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+inline bool Vector::operator==( const Vector& src ) const
+{
+	CHECK_VALID(src);
+	CHECK_VALID(*this);
+
+	// Equal only when no component differs (exact comparison, no tolerance).
+	return !( (src.x != x) || (src.y != y) || (src.z != z) );
+}
+
+inline bool Vector::operator!=( const Vector& src ) const
+{
+	CHECK_VALID(src);
+	CHECK_VALID(*this);
+
+	// Different unless all three components compare equal.
+	return !( (src.x == x) && (src.y == y) && (src.z == z) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+
+FORCEINLINE void VectorCopy( const Vector& src, Vector& dst )
+{
+	// Vector's assignment operator validates src (when CHECK_VALID is active)
+	// and then copies x, y and z.
+	dst = src;
+}
+
+inline void	Vector::CopyToArray(float* rgfl) const
+{
+	// rgfl must be non-null; its first three elements are written.
+	Assert( rgfl );
+	CHECK_VALID(*this);
+	rgfl[0] = x;
+	rgfl[1] = y;
+	rgfl[2] = z;
+}
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+// #pragma message("TODO: these should be SSE")
+
+inline void Vector::Negate()
+{
+	// Flip the sign of every component in place.
+	CHECK_VALID(*this);
+	x = -x;
+	y = -y;
+	z = -z;
+}
+
+FORCEINLINE  Vector& Vector::operator+=(const Vector& v)
+{
+	// Component-wise add of v into this vector.
+	CHECK_VALID(*this);
+	CHECK_VALID(v);
+	x += v.x;
+	y += v.y;
+	z += v.z;
+	return *this;
+}
+
+FORCEINLINE  Vector& Vector::operator-=(const Vector& v)
+{
+	// Component-wise subtract of v from this vector.
+	CHECK_VALID(*this);
+	CHECK_VALID(v);
+	x -= v.x;
+	y -= v.y;
+	z -= v.z;
+	return *this;
+}
+
+FORCEINLINE  Vector& Vector::operator*=(float fl)
+{
+	// Uniform scale: multiply every component by fl.
+	x *= fl; y *= fl; z *= fl;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+FORCEINLINE  Vector& Vector::operator*=(const Vector& v)
+{
+	// Component-wise multiply by v.
+	CHECK_VALID(v);
+	x *= v.x; y *= v.y; z *= v.z;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+// this ought to be an opcode.
+FORCEINLINE Vector&	Vector::operator+=(float fl)
+{
+	// Broadcast add: fl is added to every component.
+	x += fl; y += fl; z += fl;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+FORCEINLINE Vector&	Vector::operator-=(float fl)
+{
+	// Broadcast subtract: fl is subtracted from every component.
+	x -= fl; y -= fl; z -= fl;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+
+
+FORCEINLINE  Vector& Vector::operator/=(float fl)
+{
+	// Division by a scalar is done as one reciprocal and three multiplies.
+	// fl must be non-zero (checked with Assert).
+	Assert( fl != 0.0f );
+	float flReciprocal = 1.0f / fl;
+	x *= flReciprocal; y *= flReciprocal; z *= flReciprocal;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+// Component-wise in-place division by v. Every component of v is asserted
+// to be non-zero.
+FORCEINLINE Vector& Vector::operator/=( const Vector& v )
+{
+	CHECK_VALID(v);
+	Assert( v.x != 0.0f && v.y != 0.0f && v.z != 0.0f );
+	x = x / v.x;
+	y = y / v.y;
+	z = z / v.z;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Short Vector methods
+//
+//-----------------------------------------------------------------------------
+
+
+// Assigns all four short components.
+inline void ShortVector::Init( short ix, short iy, short iz, short iw )
+{
+	x = ix;
+	y = iy;
+	z = iz;
+	w = iw;
+}
+
+// Copies all four components from vOther.
+FORCEINLINE void ShortVector::Set( const ShortVector& vOther )
+{
+	x = vOther.x;	y = vOther.y;
+	z = vOther.z;	w = vOther.w;
+}
+
+// Assigns all four components from individual values.
+FORCEINLINE void ShortVector::Set( const short ix, const short iy, const short iz, const short iw )
+{
+	x = ix;	y = iy;
+	z = iz;	w = iw;
+}
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Read-only indexed access; treats the object as an array of shorts.
+// i is asserted to be in [0, 4).
+inline short ShortVector::operator[]( int i ) const
+{
+	Assert( (i >= 0) && (i < 4) );
+	const short *pComponents = (const short*)this;
+	return pComponents[i];
+}
+
+// Mutable indexed access; treats the object as an array of shorts.
+// i is asserted to be in [0, 4).
+inline short& ShortVector::operator[]( int i )
+{
+	Assert( (i >= 0) && (i < 4) );
+	short *pComponents = (short*)this;
+	return pComponents[i];
+}
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+// Returns the object's address reinterpreted as a pointer to short.
+inline short* ShortVector::Base()
+{
+	short *pFirst = (short*)this;
+	return pFirst;
+}
+
+// Const overload: the object's address as a pointer to const short.
+inline short const* ShortVector::Base() const
+{
+	short const *pFirst = (short const*)this;
+	return pFirst;
+}
+
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+// Exact equality over all four components.
+inline bool ShortVector::operator==( const ShortVector& src ) const
+{
+	if ( src.x != x )
+		return false;
+	if ( src.y != y )
+		return false;
+	if ( src.z != z )
+		return false;
+	return ( src.w == w );
+}
+
+// True when at least one of the four components differs.
+inline bool ShortVector::operator!=( const ShortVector& src ) const
+{
+	if ( src.x != x )
+		return true;
+	if ( src.y != y )
+		return true;
+	if ( src.z != z )
+		return true;
+	return ( src.w != w );
+}
+
+
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+
+// Component-wise in-place addition. The sum is computed after integer
+// promotion and then narrowed back to short.
+FORCEINLINE ShortVector& ShortVector::operator+=( const ShortVector& v )
+{
+	x = (short)( x + v.x );
+	y = (short)( y + v.y );
+	z = (short)( z + v.z );
+	w = (short)( w + v.w );
+	return *this;
+}
+
+// Component-wise in-place subtraction. The difference is computed after
+// integer promotion and then narrowed back to short.
+FORCEINLINE ShortVector& ShortVector::operator-=( const ShortVector& v )
+{
+	x = (short)( x - v.x );
+	y = (short)( y - v.y );
+	z = (short)( z - v.z );
+	w = (short)( w - v.w );
+	return *this;
+}
+
+// Scales every component by fl. Each product is evaluated in float and
+// converted back to short (fraction discarded).
+FORCEINLINE ShortVector& ShortVector::operator*=( float fl )
+{
+	x = (short)( x * fl );
+	y = (short)( y * fl );
+	z = (short)( z * fl );
+	w = (short)( w * fl );
+	return *this;
+}
+
+// Component-wise in-place multiply; products are narrowed back to short.
+FORCEINLINE ShortVector& ShortVector::operator*=( const ShortVector& v )
+{
+	x = (short)( x * v.x );
+	y = (short)( y * v.y );
+	z = (short)( z * v.z );
+	w = (short)( w * v.w );
+	return *this;
+}
+
+// Divides every component by fl in place; fl is asserted to be non-zero.
+//
+// Each component is divided directly instead of being multiplied by a
+// precomputed 1.0f / fl. With integer components the result is truncated,
+// so the tiny rounding error of the reciprocal can turn an exact quotient
+// into the next lower integer (e.g. 41 * (1.0f / 41.0f) evaluates to
+// 0.99999994f, which truncates to 0 instead of 1).
+FORCEINLINE  ShortVector& ShortVector::operator/=(float fl)	
+{
+	Assert( fl != 0.0f );
+	x = (short)( x / fl );
+	y = (short)( y / fl );
+	z = (short)( z / fl );
+	w = (short)( w / fl );
+	return *this;
+}
+
+// Component-wise in-place integer division. Every component of v is
+// asserted to be non-zero.
+FORCEINLINE ShortVector& ShortVector::operator/=( const ShortVector& v )
+{
+	Assert( v.x != 0 && v.y != 0 && v.z != 0 && v.w != 0 );
+	x = (short)( x / v.x );
+	y = (short)( y / v.y );
+	z = (short)( z / v.z );
+	w = (short)( w / v.w );
+	return *this;
+}
+
+// res = src * fl, component by component. fl is asserted finite; each
+// float product is converted to short on assignment.
+FORCEINLINE void ShortVectorMultiply( const ShortVector& src, float fl, ShortVector& res )
+{
+	Assert( IsFinite(fl) );
+	res.x = (short)( src.x * fl );
+	res.y = (short)( src.y * fl );
+	res.z = (short)( src.z * fl );
+	res.w = (short)( src.w * fl );
+}
+
+// Returns a copy of this vector scaled by fl (see ShortVectorMultiply).
+FORCEINLINE ShortVector ShortVector::operator*( float fl ) const
+{
+	ShortVector vScaled;
+	ShortVectorMultiply( *this, fl, vScaled );
+	return vScaled;
+}
+
+
+
+
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Integer Vector methods
+//
+//-----------------------------------------------------------------------------
+
+
+// Assigns all four int components.
+inline void IntVector4D::Init( int ix, int iy, int iz, int iw )
+{
+	x = ix;
+	y = iy;
+	z = iz;
+	w = iw;
+}
+
+// Copies all four components from vOther.
+FORCEINLINE void IntVector4D::Set( const IntVector4D& vOther )
+{
+	x = vOther.x;	y = vOther.y;
+	z = vOther.z;	w = vOther.w;
+}
+
+// Assigns all four components from individual values.
+FORCEINLINE void IntVector4D::Set( const int ix, const int iy, const int iz, const int iw )
+{
+	x = ix;	y = iy;
+	z = iz;	w = iw;
+}
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Read-only indexed access; treats the object as an array of ints.
+// i is asserted to be in [0, 4).
+inline int IntVector4D::operator[]( int i ) const
+{
+	Assert( (i >= 0) && (i < 4) );
+	const int *pComponents = (const int*)this;
+	return pComponents[i];
+}
+
+// Mutable indexed access; treats the object as an array of ints.
+// i is asserted to be in [0, 4).
+inline int& IntVector4D::operator[]( int i )
+{
+	Assert( (i >= 0) && (i < 4) );
+	int *pComponents = (int*)this;
+	return pComponents[i];
+}
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+// Returns the object's address reinterpreted as a pointer to int.
+inline int* IntVector4D::Base()
+{
+	int *pFirst = (int*)this;
+	return pFirst;
+}
+
+// Const overload: the object's address as a pointer to const int.
+inline int const* IntVector4D::Base() const
+{
+	int const *pFirst = (int const*)this;
+	return pFirst;
+}
+
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+// Exact equality over all four components.
+inline bool IntVector4D::operator==( const IntVector4D& src ) const
+{
+	if ( src.x != x )
+		return false;
+	if ( src.y != y )
+		return false;
+	if ( src.z != z )
+		return false;
+	return ( src.w == w );
+}
+
+// True when at least one of the four components differs.
+inline bool IntVector4D::operator!=( const IntVector4D& src ) const
+{
+	if ( src.x != x )
+		return true;
+	if ( src.y != y )
+		return true;
+	if ( src.z != z )
+		return true;
+	return ( src.w != w );
+}
+
+
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+
+// Component-wise in-place addition; returns *this.
+FORCEINLINE IntVector4D& IntVector4D::operator+=( const IntVector4D& v )
+{
+	x = x + v.x;
+	y = y + v.y;
+	z = z + v.z;
+	w = w + v.w;
+	return *this;
+}
+
+// Component-wise in-place subtraction; returns *this.
+FORCEINLINE IntVector4D& IntVector4D::operator-=( const IntVector4D& v )
+{
+	x = x - v.x;
+	y = y - v.y;
+	z = z - v.z;
+	w = w - v.w;
+	return *this;
+}
+
+// Scales every component by fl. Each product is evaluated in float and
+// converted back to int (fraction discarded).
+FORCEINLINE IntVector4D& IntVector4D::operator*=( float fl )
+{
+	x = (int)( x * fl );
+	y = (int)( y * fl );
+	z = (int)( z * fl );
+	w = (int)( w * fl );
+	return *this;
+}
+
+// Component-wise in-place integer multiply; returns *this.
+FORCEINLINE IntVector4D& IntVector4D::operator*=( const IntVector4D& v )
+{
+	x = x * v.x;
+	y = y * v.y;
+	z = z * v.z;
+	w = w * v.w;
+	return *this;
+}
+
+// Divides every component by fl in place; fl is asserted to be non-zero.
+//
+// Each component is divided directly instead of being multiplied by a
+// precomputed 1.0f / fl. With integer components the result is truncated,
+// so the tiny rounding error of the reciprocal can turn an exact quotient
+// into the next lower integer (e.g. 41 * (1.0f / 41.0f) evaluates to
+// 0.99999994f, which truncates to 0 instead of 1).
+FORCEINLINE  IntVector4D& IntVector4D::operator/=(float fl)	
+{
+	Assert( fl != 0.0f );
+	x = (int)( x / fl );
+	y = (int)( y / fl );
+	z = (int)( z / fl );
+	w = (int)( w / fl );
+	return *this;
+}
+
+// Component-wise in-place integer division. Every component of v is
+// asserted to be non-zero.
+FORCEINLINE IntVector4D& IntVector4D::operator/=( const IntVector4D& v )
+{
+	Assert( v.x != 0 && v.y != 0 && v.z != 0 && v.w != 0 );
+	x = x / v.x;
+	y = y / v.y;
+	z = z / v.z;
+	w = w / v.w;
+	return *this;
+}
+
+// res = src * fl, component by component. fl is asserted finite; each
+// float product is converted to int on assignment.
+FORCEINLINE void IntVector4DMultiply( const IntVector4D& src, float fl, IntVector4D& res )
+{
+	Assert( IsFinite(fl) );
+	res.x = (int)( src.x * fl );
+	res.y = (int)( src.y * fl );
+	res.z = (int)( src.z * fl );
+	res.w = (int)( src.w * fl );
+}
+
+// Returns a copy of this vector scaled by fl (see IntVector4DMultiply).
+FORCEINLINE IntVector4D IntVector4D::operator*( float fl ) const
+{
+	IntVector4D vScaled;
+	IntVector4DMultiply( *this, fl, vScaled );
+	return vScaled;
+}
+
+
+
+// =======================
+
+
+// c = a + b, component by component. c may alias a or b.
+FORCEINLINE void VectorAdd( const Vector& a, const Vector& b, Vector& c )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	c.x = a.x + b.x;	c.y = a.y + b.y;	c.z = a.z + b.z;
+}
+
+// c = a - b, component by component. c may alias a or b.
+FORCEINLINE void VectorSubtract( const Vector& a, const Vector& b, Vector& c )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	c.x = a.x - b.x;	c.y = a.y - b.y;	c.z = a.z - b.z;
+}
+
+// c = a * b where b is a scalar; b is asserted to be finite.
+FORCEINLINE void VectorMultiply( const Vector& a, vec_t b, Vector& c )
+{
+	CHECK_VALID(a);
+	Assert( IsFinite(b) );
+	c.x = a.x * b;	c.y = a.y * b;	c.z = a.z * b;
+}
+
+// c = a * b, multiplying matching components (not a dot or cross product).
+FORCEINLINE void VectorMultiply( const Vector& a, const Vector& b, Vector& c )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	c.x = a.x * b.x;	c.y = a.y * b.y;	c.z = a.z * b.z;
+}
+
+// for backwards compatibility
+// Legacy alias: forwards straight to the scalar VectorMultiply overload.
+inline void VectorScale( const Vector& in, vec_t scale, Vector& result )
+{
+	VectorMultiply( in, scale, result );
+}
+
+
+// c = a / b for scalar b (asserted non-zero). Uses a single reciprocal
+// followed by three multiplies.
+FORCEINLINE void VectorDivide( const Vector& a, vec_t b, Vector& c )
+{
+	CHECK_VALID(a);
+	Assert( b != 0.0f );
+	const vec_t flReciprocal = 1.0f / b;
+	c.x = a.x * flReciprocal;
+	c.y = a.y * flReciprocal;
+	c.z = a.z * flReciprocal;
+}
+
+// c = a / b, dividing matching components. Every component of b is
+// asserted to be non-zero.
+FORCEINLINE void VectorDivide( const Vector& a, const Vector& b, Vector& c )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	Assert( (b.x != 0.0f) && (b.y != 0.0f) && (b.z != 0.0f) );
+	c.x = a.x / b.x;	c.y = a.y / b.y;	c.z = a.z / b.z;
+}
+
+// FIXME: Remove
+// For backwards compatibility
+// Sets this vector to a + b * scalar (multiply-add), component by component.
+inline void Vector::MulAdd( const Vector& a, const Vector& b, float scalar )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	x = a.x + b.x * scalar;	y = a.y + b.y * scalar;	z = a.z + b.z * scalar;
+}
+
+// Linear interpolation: dest = src1 + (src2 - src1) * t. t is not clamped,
+// so values outside [0, 1] extrapolate.
+inline void VectorLerp( const Vector& src1, const Vector& src2, vec_t t, Vector& dest )
+{
+	CHECK_VALID(src1);
+	CHECK_VALID(src2);
+	dest.x = src1.x + (src2.x - src1.x) * t;	dest.y = src1.y + (src2.y - src1.y) * t;	dest.z = src1.z + (src2.z - src1.z) * t;
+}
+
+// Value-returning convenience wrapper around the four-argument VectorLerp.
+inline Vector VectorLerp( const Vector& src1, const Vector& src2, vec_t t )
+{
+	Vector vBlended;
+	VectorLerp( src1, src2, t, vBlended );
+	return vBlended;
+}
+
+//-----------------------------------------------------------------------------
+// Temporary storage for vector results so const Vector& results can be returned
+//-----------------------------------------------------------------------------
+// Hands out a reference to the next slot of a function-local static pool
+// of 128 Vectors. The slot index advances on every call and wraps around,
+// so a returned reference is overwritten after 128 further allocations.
+inline Vector &AllocTempVector()
+{
+	static Vector s_vecTemp[128];
+	static CInterlockedInt s_nIndex;
+
+	int nSlot;
+	bool bClaimed;
+	do
+	{
+		const int nSeen = s_nIndex;
+		// The & 0x7F mask keeps the slot in [0, 127]; after masking, adding
+		// 0x10001 is the same as adding 1 (0x10000 has no bits under the mask).
+		nSlot = ( (nSeen + 0x10001) & 0x7F );
+
+		// AssignIf presumably stores nSlot only if s_nIndex still equals
+		// nSeen (compare-and-swap) -- confirm against CInterlockedInt.
+		bClaimed = s_nIndex.AssignIf( nSeen, nSlot );
+		if ( !bClaimed )
+		{
+			ThreadPause();
+		}
+	} while ( !bClaimed );
+
+	return s_vecTemp[nSlot];
+}
+
+
+
+//-----------------------------------------------------------------------------
+// dot, cross
+//-----------------------------------------------------------------------------
+// Returns the scalar (dot) product a.x*b.x + a.y*b.y + a.z*b.z.
+FORCEINLINE vec_t DotProduct( const Vector& a, const Vector& b )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	const vec_t flDot = a.x*b.x + a.y*b.y + a.z*b.z;
+	return flDot;
+}
+
+// for backwards compatibility
+// Member-function form of DotProduct( *this, vOther ).
+inline vec_t Vector::Dot( const Vector& vOther ) const
+{
+	CHECK_VALID(vOther);
+	const vec_t flDot = DotProduct( *this, vOther );
+	return flDot;
+}
+
+// result = a x b. result is written one component at a time while a and b
+// are still being read, so it must not alias either input (asserted).
+inline void CrossProduct( const Vector& a, const Vector& b, Vector& result )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	Assert( &a != &result );
+	Assert( &b != &result );
+	result.x = (a.y * b.z) - (a.z * b.y);
+	result.y = (a.z * b.x) - (a.x * b.z);
+	result.z = (a.x * b.y) - (a.y * b.x);
+}
+
+// Sum of the absolute values of the per-component products of v0 and v1.
+inline vec_t DotProductAbs( const Vector &v0, const Vector &v1 )
+{
+	CHECK_VALID(v0);
+	CHECK_VALID(v1);
+	const vec_t flAbsX = FloatMakePositive( v0.x*v1.x );
+	const vec_t flAbsY = FloatMakePositive( v0.y*v1.y );
+	const vec_t flAbsZ = FloatMakePositive( v0.z*v1.z );
+	return flAbsX + flAbsY + flAbsZ;
+}
+
+// Same as the Vector/Vector overload, with the second operand read from
+// v1[0..2]. No validity or null checks are performed here.
+inline vec_t DotProductAbs( const Vector &v0, const float *v1 )
+{
+	const vec_t flAbsX = FloatMakePositive( v0.x * v1[0] );
+	const vec_t flAbsY = FloatMakePositive( v0.y * v1[1] );
+	const vec_t flAbsZ = FloatMakePositive( v0.z * v1[2] );
+	return flAbsX + flAbsY + flAbsZ;
+}
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+
+// Euclidean length of v, computed with FastSqrt on the squared length.
+inline vec_t VectorLength( const Vector& v )
+{
+	CHECK_VALID(v);
+	const vec_t flLengthSqr = v.x*v.x + v.y*v.y + v.z*v.z;
+	return (vec_t)FastSqrt( flLengthSqr );
+}
+
+
+// Member-function form of VectorLength( *this ).
+inline vec_t Vector::Length( void ) const
+{
+	CHECK_VALID(*this);
+	const vec_t flLength = VectorLength( *this );
+	return flLength;
+}
+
+
+//-----------------------------------------------------------------------------
+// Normalization
+//-----------------------------------------------------------------------------
+
+/*
+// FIXME: Can't use until we're un-macroed in mathlib.h
+inline vec_t VectorNormalize( Vector& v )
+{
+	Assert( v.IsValid() );
+	vec_t l = v.Length();
+	if (l != 0.0f)
+	{
+		v /= l;
+	}
+	else
+	{
+		// FIXME: 
+		// Just copying the existing implementation; shouldn't res.z == 0?
+		v.x = v.y = 0.0f; v.z = 1.0f;
+	}
+	return l;
+}
+*/
+
+
+// check a point against a box
+// Returns true when this point lies inside the axis-aligned box spanned by
+// boxmin and boxmax. Both bounds are inclusive on every axis.
+//
+// Marked inline like every other member defined in this header, so the
+// definition can safely appear in multiple translation units.
+inline bool Vector::WithinAABox( Vector const &boxmin, Vector const &boxmax)
+{
+	return ( 
+		( x >= boxmin.x ) && ( x <= boxmax.x) &&
+		( y >= boxmin.y ) && ( y <= boxmax.y) &&
+		( z >= boxmin.z ) && ( z <= boxmax.z)
+		);
+}
+
+//-----------------------------------------------------------------------------
+// Get the distance from this vector to the other one 
+//-----------------------------------------------------------------------------
+// Euclidean distance between this point and vOther.
+inline vec_t Vector::DistTo( const Vector &vOther ) const
+{
+	Vector vecOffset;
+	VectorSubtract( *this, vOther, vecOffset );
+	const vec_t flDistance = vecOffset.Length();
+	return flDistance;
+}
+
+
+//-----------------------------------------------------------------------------
+// Vector equality with tolerance
+//-----------------------------------------------------------------------------
+// True when every component of src1 is within `tolerance` (inclusive) of
+// the matching component of src2.
+inline bool VectorsAreEqual( const Vector& src1, const Vector& src2, float tolerance )
+{
+	const bool bXTooFar = ( FloatMakePositive(src1.x - src2.x) > tolerance );
+	if ( bXTooFar )
+		return false;
+
+	const bool bYTooFar = ( FloatMakePositive(src1.y - src2.y) > tolerance );
+	if ( bYTooFar )
+		return false;
+
+	return ( FloatMakePositive(src1.z - src2.z) <= tolerance );
+}
+
+
+//-----------------------------------------------------------------------------
+// Computes the closest point to vecTarget no farther than flMaxDist from vecStart
+//-----------------------------------------------------------------------------
+// Writes to *pResult the point nearest vecTarget that is at most flMaxDist
+// away from vecStart: vecTarget itself when it is in range, otherwise the
+// point flMaxDist along the direction from vecStart towards vecTarget.
+inline void ComputeClosestPoint( const Vector& vecStart, float flMaxDist, const Vector& vecTarget, Vector *pResult )
+{
+	Vector vecToTarget;
+	VectorSubtract( vecTarget, vecStart, vecToTarget );
+	const float flDistSqr = vecToTarget.LengthSqr();
+
+	// Compare squared distances so the in-range case needs no square root.
+	if ( flDistSqr <= flMaxDist * flMaxDist )
+	{
+		*pResult = vecTarget;
+		return;
+	}
+
+	// Out of range: normalize the offset and step flMaxDist along it.
+	vecToTarget /= FastSqrt( flDistSqr );
+	VectorMA( vecStart, flMaxDist, vecToTarget, *pResult );
+}
+
+
+//-----------------------------------------------------------------------------
+// Takes the absolute value of a vector
+//-----------------------------------------------------------------------------
+// dst receives the absolute value of each component of src.
+inline void VectorAbs( const Vector& src, Vector& dst )
+{
+	dst.x = FloatMakePositive( src.x );	dst.y = FloatMakePositive( src.y );	dst.z = FloatMakePositive( src.z );
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// Slow methods
+//
+//-----------------------------------------------------------------------------
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+//-----------------------------------------------------------------------------
+// Returns a vector with the min or max in X, Y, and Z.
+//-----------------------------------------------------------------------------
+// Returns a vector holding, per axis, the smaller of this and vOther.
+// When the components compare equal (or unordered) vOther's value is used.
+inline Vector Vector::Min( const Vector &vOther ) const
+{
+	vec_t flMinX = vOther.x;
+	if ( x < vOther.x )
+		flMinX = x;
+
+	vec_t flMinY = vOther.y;
+	if ( y < vOther.y )
+		flMinY = y;
+
+	vec_t flMinZ = vOther.z;
+	if ( z < vOther.z )
+		flMinZ = z;
+
+	return Vector( flMinX, flMinY, flMinZ );
+}
+
+// Returns a vector holding, per axis, the larger of this and vOther.
+// When the components compare equal (or unordered) vOther's value is used.
+inline Vector Vector::Max( const Vector &vOther ) const
+{
+	vec_t flMaxX = vOther.x;
+	if ( x > vOther.x )
+		flMaxX = x;
+
+	vec_t flMaxY = vOther.y;
+	if ( y > vOther.y )
+		flMaxY = y;
+
+	vec_t flMaxZ = vOther.z;
+	if ( z > vOther.z )
+		flMaxZ = z;
+
+	return Vector( flMaxX, flMaxY, flMaxZ );
+}
+
+
+//-----------------------------------------------------------------------------
+// arithmetic operations
+//-----------------------------------------------------------------------------
+
+// Unary minus: returns a new vector with every component negated.
+inline Vector Vector::operator-( void ) const
+{
+	return Vector( -x, -y, -z );
+}
+
+// Returns the component-wise sum of this vector and v.
+inline Vector Vector::operator+( const Vector& v ) const
+{
+	Vector vSum;
+	VectorAdd( *this, v, vSum );
+	return vSum;
+}
+
+// Returns the component-wise difference of this vector and v.
+inline Vector Vector::operator-( const Vector& v ) const
+{
+	Vector vDifference;
+	VectorSubtract( *this, v, vDifference );
+	return vDifference;
+}
+
+// Returns this vector scaled by fl.
+inline Vector Vector::operator*( float fl ) const
+{
+	Vector vScaled;
+	VectorMultiply( *this, fl, vScaled );
+	return vScaled;
+}
+
+// Returns the component-wise product of this vector and v.
+inline Vector Vector::operator*( const Vector& v ) const
+{
+	Vector vProduct;
+	VectorMultiply( *this, v, vProduct );
+	return vProduct;
+}
+
+// Returns this vector divided by the scalar fl.
+inline Vector Vector::operator/( float fl ) const
+{
+	Vector vQuotient;
+	VectorDivide( *this, fl, vQuotient );
+	return vQuotient;
+}
+
+// Returns the component-wise quotient of this vector and v.
+inline Vector Vector::operator/( const Vector& v ) const
+{
+	Vector vQuotient;
+	VectorDivide( *this, v, vQuotient );
+	return vQuotient;
+}
+
+// Allows the scalar on the left-hand side: fl * v is evaluated as v * fl.
+inline Vector operator*( float fl, const Vector& v )
+{
+	return ( v * fl );
+}
+
+//-----------------------------------------------------------------------------
+// cross product
+//-----------------------------------------------------------------------------
+
+// Returns the cross product of this vector with vOther.
+inline Vector Vector::Cross( const Vector& vOther ) const
+{
+	Vector vPerpendicular;
+	CrossProduct( *this, vOther, vPerpendicular );
+	return vPerpendicular;
+}
+
+//-----------------------------------------------------------------------------
+// 2D
+//-----------------------------------------------------------------------------
+
+// Length of the (x, y) part of the vector; z is ignored.
+inline vec_t Vector::Length2D( void ) const
+{
+	const vec_t flLengthSqr2D = x*x + y*y;
+	return (vec_t)FastSqrt( flLengthSqr2D );
+}
+
+// Squared length of the (x, y) part of the vector; z is ignored.
+inline vec_t Vector::Length2DSqr( void ) const
+{
+	const vec_t flLengthSqr2D = x*x + y*y;
+	return flLengthSqr2D;
+}
+
+// Value-returning cross product a x b.
+inline Vector CrossProduct( const Vector& a, const Vector& b )
+{
+	return Vector(
+		a.y*b.z - a.z*b.y,
+		a.z*b.x - a.x*b.z,
+		a.x*b.y - a.y*b.x );
+}
+
+// result receives fpmin of each pair of matching components of a and b.
+inline void VectorMin( const Vector &a, const Vector &b, Vector &result )
+{
+	result.x = fpmin( a.x, b.x );	result.y = fpmin( a.y, b.y );	result.z = fpmin( a.z, b.z );
+}
+
+// result receives fpmax of each pair of matching components of a and b.
+inline void VectorMax( const Vector &a, const Vector &b, Vector &result )
+{
+	result.x = fpmax( a.x, b.x );	result.y = fpmax( a.y, b.y );	result.z = fpmax( a.z, b.z );
+}
+
+// Returns the dot product of (vecMaxs - vecMins) with itself.
+// NOTE(review): despite the name this is the squared length of the box
+// diagonal (dx*dx + dy*dy + dz*dz), not the product of the extents.
+// Existing callers may depend on this value -- confirm before changing it.
+inline float ComputeVolume( const Vector &vecMins, const Vector &vecMaxs )
+{
+	Vector vecExtent;
+	VectorSubtract( vecMaxs, vecMins, vecExtent );
+	const float flResult = DotProduct( vecExtent, vecExtent );
+	return flResult;
+}
+
+// Get a random vector.
+// Builds a vector by calling Vector::Random( minVal, maxVal ) on a local.
+inline Vector RandomVector( float minVal, float maxVal )
+{
+	Vector vecResult;
+	vecResult.Random( minVal, maxVal );
+	return vecResult;
+}
+
+#endif //slow
+
+//-----------------------------------------------------------------------------
+// Helper debugging stuff....
+//-----------------------------------------------------------------------------
+
+// Trap overload: comparing a raw float pointer against a Vector always
+// asserts and reports "not equal".
+inline bool operator==( float const* f, const Vector& v )
+{
+	Assert(0);	// AIIIEEEE!!!!
+	return false;
+}
+
+// Trap overload: comparing a Vector against a raw float pointer always
+// asserts and reports "not equal".
+inline bool operator==( const Vector& v, float const* f )
+{
+	Assert(0);	// AIIIEEEE!!!!
+	return false;
+}
+
+// Trap overload: always asserts. Note that it returns false, exactly like
+// the matching operator== trap.
+inline bool operator!=( float const* f, const Vector& v )
+{
+	Assert(0);	// AIIIEEEE!!!!
+	return false;
+}
+
+// Trap overload: always asserts. Note that it returns false, exactly like
+// the matching operator== trap.
+inline bool operator!=( const Vector& v, float const* f )
+{
+	Assert(0);	// AIIIEEEE!!!!
+	return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// AngularImpulse
+//-----------------------------------------------------------------------------
+// AngularImpulse values are exponential maps (an axis scaled by a "twist" angle in degrees)
+typedef Vector AngularImpulse;
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Builds an AngularImpulse by calling Random( minVal, maxVal ) on a local.
+inline AngularImpulse RandomAngularImpulse( float minVal, float maxVal )
+{
+	AngularImpulse impulse;
+	impulse.Random( minVal, maxVal );
+	return impulse;
+}
+
+#endif
+
+
+//-----------------------------------------------------------------------------
+// Quaternion
+//-----------------------------------------------------------------------------
+
+class RadianEuler;
+
+// Four vec_t components x, y, z, w stored consecutively.
+class Quaternion				// same data-layout as engine's vec4_t,
+{								//		which is a vec_t[4]
+public:
+	// Default construction leaves the components unset, except in _DEBUG
+	// builds with VECTOR_PARANOIA defined, where they are filled with NaN
+	// so that use of an uninitialized quaternion is caught.
+	inline Quaternion( void )
+	{
+#if defined( _DEBUG ) && defined( VECTOR_PARANOIA )
+		x = y = z = w = VEC_T_NAN;
+#endif
+	}
+
+	// Component-wise construction.
+	inline Quaternion( vec_t ix, vec_t iy, vec_t iz, vec_t iw )
+		: x( ix ), y( iy ), z( iz ), w( iw )
+	{
+	}
+
+	// Implicit conversion from Euler angles in radians.
+	inline Quaternion( RadianEuler const &angle );	// evil auto type promotion!!!
+
+	// (Re)assigns all four components; every argument defaults to zero.
+	inline void Init( vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f, vec_t iw = 0.0f )
+	{
+		x = ix;
+		y = iy;
+		z = iz;
+		w = iw;
+	}
+
+	// Finite-ness check and its NaN-filling counterpart.
+	bool IsValid() const;
+	void Invalidate();
+
+	// Exact component-wise comparison.
+	bool operator==( const Quaternion &src ) const;
+	bool operator!=( const Quaternion &src ) const;
+
+	// Address of the object viewed as an array of vec_t.
+	vec_t* Base()				{ return (vec_t*)this; }
+	const vec_t* Base() const	{ return (vec_t*)this; }
+
+	// array access...
+	vec_t operator[]( int i ) const;
+	vec_t& operator[]( int i );
+
+	vec_t x, y, z, w;
+};
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Mutable indexed access; treats the object as an array of vec_t.
+// i is asserted to be in [0, 4).
+inline vec_t& Quaternion::operator[]( int i )
+{
+	Assert( (i >= 0) && (i < 4) );
+	vec_t *pComponents = (vec_t*)this;
+	return pComponents[i];
+}
+
+// Read-only indexed access; treats the object as an array of vec_t.
+// i is asserted to be in [0, 4).
+inline vec_t Quaternion::operator[]( int i ) const
+{
+	Assert( (i >= 0) && (i < 4) );
+	const vec_t *pComponents = (const vec_t*)this;
+	return pComponents[i];
+}
+
+
+//-----------------------------------------------------------------------------
+// Equality test
+//-----------------------------------------------------------------------------
+// Exact (bitwise-value) equality over x, y, z and w; no tolerance.
+inline bool Quaternion::operator==( const Quaternion &src ) const
+{
+	const bool bSameXY = ( x == src.x ) && ( y == src.y );
+	return bSameXY && ( z == src.z ) && ( w == src.w );
+}
+
+// Logical negation of operator==.
+inline bool Quaternion::operator!=( const Quaternion &src ) const
+{
+	const bool bEqual = operator==( src );
+	return !bEqual;
+}
+
+
+//-----------------------------------------------------------------------------
+// Quaternion equality with tolerance
+//-----------------------------------------------------------------------------
+// True when every component of src1 is within `tolerance` (inclusive) of
+// the matching component of src2.
+inline bool QuaternionsAreEqual( const Quaternion& src1, const Quaternion& src2, float tolerance )
+{
+	const bool bXTooFar = ( FloatMakePositive(src1.x - src2.x) > tolerance );
+	if ( bXTooFar )
+		return false;
+
+	const bool bYTooFar = ( FloatMakePositive(src1.y - src2.y) > tolerance );
+	if ( bYTooFar )
+		return false;
+
+	const bool bZTooFar = ( FloatMakePositive(src1.z - src2.z) > tolerance );
+	if ( bZTooFar )
+		return false;
+
+	return ( FloatMakePositive(src1.w - src2.w) <= tolerance );
+}
+
+
+//-----------------------------------------------------------------------------
+// Here's where we add all those lovely SSE optimized routines
+//-----------------------------------------------------------------------------
+// Quaternion variant declared with the ALIGN16 / ALIGN16_POST macros.
+// Adds no data members of its own.
+class ALIGN16 QuaternionAligned : public Quaternion
+{
+public:
+	// Components are left as the Quaternion default constructor leaves them.
+	inline QuaternionAligned( void ) {}
+
+	// Component-wise construction via Quaternion::Init.
+	inline QuaternionAligned( vec_t X, vec_t Y, vec_t Z, vec_t W )
+	{
+		Init( X, Y, Z, W );
+	}
+
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	// (declared private and never defined here).
+	QuaternionAligned( const QuaternionAligned& vOther );
+	QuaternionAligned( const Quaternion &vOther );
+
+#else
+public:
+	// Explicit copy from a plain Quaternion.
+	explicit QuaternionAligned( const Quaternion &vOther )
+	{
+		Init( vOther.x, vOther.y, vOther.z, vOther.w );
+	}
+
+	// Assignment from a plain Quaternion copies all four components.
+	QuaternionAligned& operator=( const Quaternion &vOther )
+	{
+		Init( vOther.x, vOther.y, vOther.z, vOther.w );
+		return *this;
+	}
+
+#endif
+} ALIGN16_POST;
+
+
+//-----------------------------------------------------------------------------
+// Radian Euler angle aligned to axis (NOT ROLL/PITCH/YAW)
+//-----------------------------------------------------------------------------
+class QAngle;
+// Three vec_t angle components x, y, z stored consecutively.
+class RadianEuler
+{
+public:
+	// Default construction leaves x, y and z uninitialized.
+	inline RadianEuler( void )
+	{
+	}
+
+	// Component-wise construction.
+	inline RadianEuler( vec_t X, vec_t Y, vec_t Z )
+	{
+		x = X;
+		y = Y;
+		z = Z;
+	}
+
+	// Implicit conversions from the other rotation representations.
+	inline RadianEuler( Quaternion const &q );	// evil auto type promotion!!!
+	inline RadianEuler( QAngle const &angles );	// evil auto type promotion!!!
+
+	// Initialization: (re)assigns all components; arguments default to zero.
+	inline void Init( vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f )
+	{
+		x = ix;
+		y = iy;
+		z = iz;
+	}
+
+	//	conversion to qangle
+	QAngle ToQAngle( void ) const;
+
+	// Finite-ness check and its NaN-filling counterpart.
+	bool IsValid() const;
+	void Invalidate();
+
+	// array access...
+	vec_t operator[]( int i ) const;
+	vec_t& operator[]( int i );
+
+	vec_t x, y, z;
+};
+
+
+extern void AngleQuaternion( RadianEuler const &angles, Quaternion &qt );
+extern void QuaternionAngles( Quaternion const &q, RadianEuler &angles );
+
+// Zero-fills all four components of q.
+FORCEINLINE void NetworkVarConstruct( Quaternion &q )
+{
+	q.x = q.y = q.z = q.w = 0.0f;
+}
+
+// Builds the quaternion from radian Euler angles by handing *this to
+// AngleQuaternion as the output.
+inline Quaternion::Quaternion( RadianEuler const &angle )
+{
+	Quaternion &self = *this;
+	AngleQuaternion( angle, self );
+}
+
+// True only when all four components pass IsFinite.
+inline bool Quaternion::IsValid() const
+{
+	if ( !IsFinite(x) )
+		return false;
+	if ( !IsFinite(y) )
+		return false;
+	if ( !IsFinite(z) )
+		return false;
+	return IsFinite(w);
+}
+
+// Fills every component with VEC_T_NAN. The _DEBUG / VECTOR_PARANOIA
+// guards around the assignment are commented out, so this happens in
+// every build configuration.
+inline void Quaternion::Invalidate()
+{
+//#ifdef _DEBUG
+//#ifdef VECTOR_PARANOIA
+	w = VEC_T_NAN;
+	x = y = z = w;
+//#endif
+//#endif
+}
+
+// Builds the Euler angles from a quaternion by handing *this to
+// QuaternionAngles as the output.
+inline RadianEuler::RadianEuler( Quaternion const &q )
+{
+	RadianEuler &self = *this;
+	QuaternionAngles( q, self );
+}
+
+// Copies x, y and z from src into dst. Only src is validity-checked.
+inline void VectorCopy( RadianEuler const& src, RadianEuler &dst )
+{
+	CHECK_VALID(src);
+	dst.x = src.x;	dst.y = src.y;	dst.z = src.z;
+}
+
+// dst = src * b, component by component; b is asserted to be finite.
+inline void VectorScale( RadianEuler const& src, float b, RadianEuler &dst )
+{
+	CHECK_VALID(src);
+	Assert( IsFinite(b) );
+	dst.x = src.x * b;	dst.y = src.y * b;	dst.z = src.z * b;
+}
+
+// True only when all three components pass IsFinite.
+inline bool RadianEuler::IsValid() const
+{
+	if ( !IsFinite(x) )
+		return false;
+	if ( !IsFinite(y) )
+		return false;
+	return IsFinite(z);
+}
+
+// Fills every component with VEC_T_NAN. The _DEBUG / VECTOR_PARANOIA
+// guards around the assignment are commented out, so this happens in
+// every build configuration.
+inline void RadianEuler::Invalidate()
+{
+//#ifdef _DEBUG
+//#ifdef VECTOR_PARANOIA
+	z = VEC_T_NAN;
+	x = y = z;
+//#endif
+//#endif
+}
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Mutable indexed access; treats the object as an array of vec_t.
+// i is asserted to be in [0, 3).
+inline vec_t& RadianEuler::operator[]( int i )
+{
+	Assert( (i >= 0) && (i < 3) );
+	vec_t *pComponents = (vec_t*)this;
+	return pComponents[i];
+}
+
+// Read-only indexed access; treats the object as an array of vec_t.
+// i is asserted to be in [0, 3).
+inline vec_t RadianEuler::operator[]( int i ) const
+{
+	Assert( (i >= 0) && (i < 3) );
+	const vec_t *pComponents = (const vec_t*)this;
+	return pComponents[i];
+}
+
+
+//-----------------------------------------------------------------------------
+// Degree Euler QAngle pitch, yaw, roll
+//-----------------------------------------------------------------------------
+class QAngleByValue;
+
+// Three vec_t angle components x, y, z stored consecutively.
+class QAngle
+{
+public:
+	// Members
+	vec_t x, y, z;
+
+	// Construction/destruction
+	QAngle( void );
+	QAngle( vec_t X, vec_t Y, vec_t Z );
+//	QAngle(RadianEuler const &angles);	// evil auto type promotion!!!
+
+	// Allow pass-by-value: reinterprets this object as a QAngleByValue.
+	operator QAngleByValue &()
+	{
+		return *((QAngleByValue *)(this));
+	}
+	operator const QAngleByValue &() const
+	{
+		return *((const QAngleByValue *)(this));
+	}
+
+	// Initialization
+	void Init( vec_t ix = 0.0f, vec_t iy = 0.0f, vec_t iz = 0.0f );
+	void Random( vec_t minVal, vec_t maxVal );
+
+	// Got any nasty NAN's?
+	bool IsValid() const;
+	void Invalidate();
+
+	// array access...
+	vec_t operator[]( int i ) const;
+	vec_t& operator[]( int i );
+
+	// Base address...
+	vec_t* Base();
+	vec_t const* Base() const;
+
+	// equality
+	bool operator==( const QAngle& v ) const;
+	bool operator!=( const QAngle& v ) const;
+
+	// arithmetic operations (in place)
+	QAngle&	operator+=( const QAngle &v );
+	QAngle&	operator-=( const QAngle &v );
+	QAngle&	operator*=( float s );
+	QAngle&	operator/=( float s );
+
+	// Get the vector's magnitude.
+	vec_t	Length() const;
+	vec_t	LengthSqr() const;
+
+	// negate the QAngle components
+	//void	Negate(); 
+
+	// Copy assignment (always public, in both build modes).
+	QAngle& operator=( const QAngle& src );
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// copy constructors
+
+	// arithmetic operations that return a new QAngle by value
+	QAngle	operator-( void ) const;
+
+	QAngle	operator+( const QAngle& v ) const;
+	QAngle	operator-( const QAngle& v ) const;
+	QAngle	operator*( float fl ) const;
+	QAngle	operator/( float fl ) const;
+#else
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	QAngle( const QAngle& vOther );
+
+#endif
+};
+
+// Zero-fills all three components of q.
+FORCEINLINE void NetworkVarConstruct( QAngle &q )
+{
+	q.x = q.y = q.z = 0.0f;
+}
+
+//-----------------------------------------------------------------------------
+// Allows us to specifically pass the vector by value when we need to
+//-----------------------------------------------------------------------------
+// QAngle subclass with a public copy constructor; adds no data members.
+class QAngleByValue : public QAngle
+{
+public:
+	// Construction/destruction:
+	QAngleByValue( void )
+		: QAngle()
+	{
+	}
+
+	QAngleByValue( vec_t X, vec_t Y, vec_t Z )
+		: QAngle( X, Y, Z )
+	{
+	}
+
+	// Copying is implemented through assignment.
+	QAngleByValue( const QAngleByValue& vOther )
+	{
+		*this = vOther;
+	}
+};
+
+
+// result = a + b, component by component.
+inline void VectorAdd( const QAngle& a, const QAngle& b, QAngle& result )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	result.x = a.x + b.x;	result.y = a.y + b.y;	result.z = a.z + b.z;
+}
+
+// Multiply-add: dest = start + scale * direction, component by component.
+inline void VectorMA( const QAngle &start, float scale, const QAngle &direction, QAngle &dest )
+{
+	CHECK_VALID(start);
+	CHECK_VALID(direction);
+	dest.x = start.x + scale * direction.x;	dest.y = start.y + scale * direction.y;	dest.z = start.z + scale * direction.z;
+}
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+// Default construction leaves the components unset, except in _DEBUG
+// builds with VECTOR_PARANOIA defined, where they are filled with NaN.
+inline QAngle::QAngle( void )
+{
+#if defined( _DEBUG ) && defined( VECTOR_PARANOIA )
+	// Initialize to NAN to catch errors
+	x = y = z = VEC_T_NAN;
+#endif
+}
+
+// Component-wise construction; the finished angle is validity-checked.
+inline QAngle::QAngle( vec_t X, vec_t Y, vec_t Z )
+{
+	x = X;
+	y = Y;
+	z = Z;
+	CHECK_VALID(*this);
+}
+
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+// Reassigns all three components, then validity-checks the result.
+inline void QAngle::Init( vec_t ix, vec_t iy, vec_t iz )
+{
+	x = ix;
+	y = iy;
+	z = iz;
+	CHECK_VALID(*this);
+}
+
+// Sets each component to minVal plus a rand()-derived fraction of
+// (maxVal - minVal). rand() is called once per component, in x, y, z order.
+inline void QAngle::Random( vec_t minVal, vec_t maxVal )
+{
+	const vec_t flRange = maxVal - minVal;
+	x = minVal + ((float)rand() / VALVE_RAND_MAX) * flRange;
+	y = minVal + ((float)rand() / VALVE_RAND_MAX) * flRange;
+	z = minVal + ((float)rand() / VALVE_RAND_MAX) * flRange;
+	CHECK_VALID(*this);
+}
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Generates the three components with Vector::Random, then repackages
+// them as a QAngle.
+inline QAngle RandomAngle( float minVal, float maxVal )
+{
+	Vector vecSource;
+	vecSource.Random( minVal, maxVal );
+	return QAngle( vecSource.x, vecSource.y, vecSource.z );
+}
+
+#endif
+
+
+// Converts degrees to radians and reorders the components:
+// (x, y, z) of the RadianEuler come from (z, x, y) of the QAngle.
+inline RadianEuler::RadianEuler( QAngle const &angles )
+{
+	const vec_t flRadX = angles.z * 3.14159265358979323846f / 180.f;
+	const vec_t flRadY = angles.x * 3.14159265358979323846f / 180.f;
+	const vec_t flRadZ = angles.y * 3.14159265358979323846f / 180.f;
+	Init( flRadX, flRadY, flRadZ );
+}
+
+
+
+
+// Converts radians to degrees and reorders the components:
+// (x, y, z) of the QAngle come from (y, z, x) of this RadianEuler.
+inline QAngle RadianEuler::ToQAngle( void ) const
+{
+	const vec_t flDegX = y * 180.f / 3.14159265358979323846f;
+	const vec_t flDegY = z * 180.f / 3.14159265358979323846f;
+	const vec_t flDegZ = x * 180.f / 3.14159265358979323846f;
+	return QAngle( flDegX, flDegY, flDegZ );
+}
+
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+// Copy assignment; the source is validity-checked before copying.
+inline QAngle& QAngle::operator=( const QAngle &vOther )
+{
+	CHECK_VALID(vOther);
+	x = vOther.x;
+	y = vOther.y;
+	z = vOther.z;
+	return *this;
+}
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Mutable indexed access; treats the object as an array of vec_t.
+// i is asserted to be in [0, 3).
+inline vec_t& QAngle::operator[]( int i )
+{
+	Assert( (i >= 0) && (i < 3) );
+	vec_t *pComponents = (vec_t*)this;
+	return pComponents[i];
+}
+
+// Read-only indexed access; treats the object as an array of vec_t.
+// i is asserted to be in [0, 3).
+inline vec_t QAngle::operator[]( int i ) const
+{
+	Assert( (i >= 0) && (i < 3) );
+	const vec_t *pComponents = (const vec_t*)this;
+	return pComponents[i];
+}
+
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+// Returns the object's address reinterpreted as a pointer to vec_t.
+inline vec_t* QAngle::Base()
+{
+	vec_t *pFirst = (vec_t*)this;
+	return pFirst;
+}
+
+// Const overload: the object's address as a pointer to const vec_t.
+inline vec_t const* QAngle::Base() const
+{
+	vec_t const *pFirst = (vec_t const*)this;
+	return pFirst;
+}
+
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+// True only when all three components pass IsFinite.
+inline bool QAngle::IsValid() const
+{
+	if ( !IsFinite(x) )
+		return false;
+	if ( !IsFinite(y) )
+		return false;
+	return IsFinite(z);
+}
+
+//-----------------------------------------------------------------------------
+// Invalidate
+//-----------------------------------------------------------------------------
+
+// Fills every component with VEC_T_NAN. The _DEBUG / VECTOR_PARANOIA
+// guards around the assignment are commented out, so this happens in
+// every build configuration.
+inline void QAngle::Invalidate()
+{
+//#ifdef _DEBUG
+//#ifdef VECTOR_PARANOIA
+	z = VEC_T_NAN;
+	x = y = z;
+//#endif
+//#endif
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+// Exact equality over x, y and z; both operands are validity-checked.
+inline bool QAngle::operator==( const QAngle& src ) const
+{
+	CHECK_VALID(src);
+	CHECK_VALID(*this);
+	const bool bSameXY = (src.x == x) && (src.y == y);
+	return bSameXY && (src.z == z);
+}
+
+// True as soon as any one component of src differs from ours.
+inline bool QAngle::operator!=( const QAngle& src ) const
+{
+	CHECK_VALID(src);
+	CHECK_VALID(*this);
+	if ( src.x != x )
+		return true;
+	if ( src.y != y )
+		return true;
+	return ( src.z != z );
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+// Copies x, y and z from src into dst. Only src is validity-checked.
+inline void VectorCopy( const QAngle& src, QAngle& dst )
+{
+	CHECK_VALID(src);
+	dst.x = src.x;	dst.y = src.y;	dst.z = src.z;
+}
+
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+// Component-wise in-place addition of v; returns *this.
+inline QAngle& QAngle::operator+=( const QAngle& v )
+{
+	CHECK_VALID(*this);
+	CHECK_VALID(v);
+	x = x + v.x;
+	y = y + v.y;
+	z = z + v.z;
+	return *this;
+}
+
+// Component-wise in-place subtraction of v; returns *this.
+inline QAngle& QAngle::operator-=( const QAngle& v )
+{
+	CHECK_VALID(*this);
+	CHECK_VALID(v);
+	x = x - v.x;
+	y = y - v.y;
+	z = z - v.z;
+	return *this;
+}
+
+// Scales every component by fl in place; CHECK_VALID runs on the result, not the input.
+inline QAngle& QAngle::operator*=(float fl)
+{
+	x *= fl;	y *= fl;	z *= fl;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+// Divides every component by fl in place. fl must be non-zero (asserted).
+// The division is done as one reciprocal followed by three multiplies.
+inline QAngle& QAngle::operator/=(float fl)
+{
+	Assert( fl != 0.0f );
+	const float flReciprocal = 1.0f / fl;
+	x *= flReciprocal;	y *= flReciprocal;	z *= flReciprocal;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+// Magnitude of the (x,y,z) triple: FastSqrt of LengthSqr().
+inline vec_t QAngle::Length( ) const
+{
+	CHECK_VALID(*this);
+	return static_cast<vec_t>( FastSqrt( LengthSqr() ) );
+}
+
+
+// Sum of the squares of the three components.
+inline vec_t QAngle::LengthSqr( ) const
+{
+	CHECK_VALID(*this);
+	return (x * x) + (y * y) + (z * z);
+}
+	
+
+//-----------------------------------------------------------------------------
+// Vector equality with tolerance
+//-----------------------------------------------------------------------------
+// True when every component of src1 and src2 differs by no more than tolerance.
+// The x/y tests are written as "not greater than" and the z test as "less or equal",
+// so a NaN difference in z makes the result false while one in x or y does not by itself.
+inline bool QAnglesAreEqual( const QAngle& src1, const QAngle& src2, float tolerance = 0.0f )
+{
+	return !( FloatMakePositive(src1.x - src2.x) > tolerance )
+		&& !( FloatMakePositive(src1.y - src2.y) > tolerance )
+		&& ( FloatMakePositive(src1.z - src2.z) <= tolerance );
+}
+
+
+//-----------------------------------------------------------------------------
+// arithmetic operations (SLOW!!)
+//-----------------------------------------------------------------------------
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns a new QAngle with each component negated.
+inline QAngle QAngle::operator-(void) const
+{
+	return QAngle( -x, -y, -z );
+}
+
+// Component-wise sum returned by value; the operand is not validity-checked here.
+inline QAngle QAngle::operator+(const QAngle& v) const
+{
+	QAngle sum;
+	sum.x = x + v.x;	sum.y = y + v.y;	sum.z = z + v.z;
+	return sum;
+}
+
+// Component-wise difference (this minus v) returned by value.
+inline QAngle QAngle::operator-(const QAngle& v) const
+{
+	QAngle diff;
+	diff.x = x - v.x;	diff.y = y - v.y;	diff.z = z - v.z;
+	return diff;
+}
+
+// Returns a copy of this angle with every component multiplied by fl.
+inline QAngle QAngle::operator*(float fl) const
+{
+	QAngle scaled;
+	scaled.x = x * fl;	scaled.y = y * fl;	scaled.z = z * fl;
+	return scaled;
+}
+
+// Returns a copy of this angle with every component divided by fl.
+// fl must be non-zero; this is asserted so the by-value operator enforces the
+// same precondition as operator/=.
+inline QAngle QAngle::operator/(float fl) const	
+{ 
+	Assert( fl != 0.0f );
+	QAngle res;
+	res.x = x / fl;
+	res.y = y / fl;
+	res.z = z / fl;
+	return res;	
+}
+
+// Scalar-on-the-left multiply; forwards to QAngle::operator*(float).
+inline QAngle operator*(float fl, const QAngle& v)
+{
+	return QAngle( v * fl );
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+
+//-----------------------------------------------------------------------------
+// NOTE: These are not completely correct.  The representations are not equivalent
+// unless the QAngle represents a rotational impulse along a coordinate axis (x,y,z)
+// Permutes components: angles (x,y,z) land in impulse (y,z,x).
+inline void QAngleToAngularImpulse( const QAngle &angles, AngularImpulse &impulse )
+{
+	impulse.y = angles.x;
+	impulse.z = angles.y;
+	impulse.x = angles.z;
+}
+
+// Inverse permutation of QAngleToAngularImpulse: impulse (x,y,z) land in angles (z,x,y).
+inline void AngularImpulseToQAngle( const AngularImpulse &impulse, QAngle &angles )
+{
+	angles.z = impulse.x;
+	angles.x = impulse.y;
+	angles.y = impulse.z;
+}
+
+#if !defined( _X360 )
+
+FORCEINLINE vec_t InvRSquared( float const *v ) // reads v[0..2]; yields roughly 1 / max(1, v[0]^2 + v[1]^2 + v[2]^2)
+{
+#if defined(__i386__) || defined(_M_IX86)
+	float sqrlen = v[0]*v[0]+v[1]*v[1]+v[2]*v[2] + 1.0e-10f, result; // squared length plus a tiny 1e-10 bias
+	_mm_store_ss(&result, _mm_rcp_ss( _mm_max_ss( _mm_set_ss(1.0f), _mm_load_ss(&sqrlen) ) )); // clamp to >= 1, then take the SSE reciprocal estimate (rcpss is an approximation, not an exact divide)
+	return result;
+#else
+	return 1.f/fpmax(1.f, v[0]*v[0]+v[1]*v[1]+v[2]*v[2]); // non-x86 path: true divide, no 1e-10 bias; fpmax presumably returns the larger argument - confirm
+#endif
+}
+
+// Vector overload: hands the address of v.x to the float-pointer version.
+FORCEINLINE vec_t InvRSquared( const Vector &v )
+{
+	float const *pComponents = &v.x;
+	return InvRSquared( pComponents );
+}
+
+#if defined(__i386__) || defined(_M_IX86)
+inline void _SSE_RSqrtInline( float a, float* out ) // writes an estimate of 1/sqrt(a) to *out: rsqrtss seed refined by one Newton-Raphson step
+{
+	__m128  xx = _mm_load_ss( &a ); // xx = a
+	__m128  xr = _mm_rsqrt_ss( xx ); // xr = hardware estimate of 1/sqrt(a)
+	__m128  xt;
+	xt = _mm_mul_ss( xr, xr ); // xt = xr^2
+	xt = _mm_mul_ss( xt, xx ); // xt = a * xr^2
+	xt = _mm_sub_ss( _mm_set_ss(3.f), xt ); // xt = 3 - a * xr^2
+	xt = _mm_mul_ss( xt, _mm_set_ss(0.5f) ); // xt = 0.5 * (3 - a * xr^2)
+	xr = _mm_mul_ss( xr, xt ); // refined estimate: xr * 0.5 * (3 - a * xr^2)
+	_mm_store_ss( out, xr );
+}
+#endif
+
+// FIXME: Change this back to a #define once we get rid of the vec_t version
+FORCEINLINE float VectorNormalize( Vector& vec ) // scales vec in place; the return value is the pre-scale length (approximate on the SSE path)
+{
+#ifndef DEBUG // stop crashing my edit-and-continue!  NOTE(review): tests DEBUG, while other code in this diff tests _DEBUG - confirm which macro is intended
+	#if defined(__i386__) || defined(_M_IX86)
+		#define DO_SSE_OPTIMIZATION // NOTE: defined inside the function body and not #undef'd in the rest of this header, so it stays visible to later code
+	#endif
+#endif
+
+#if defined( DO_SSE_OPTIMIZATION )
+	float sqrlen = vec.LengthSqr() + 1.0e-10f, invlen; // the 1e-10 bias keeps the reciprocal square root finite for a zero vector
+	_SSE_RSqrtInline(sqrlen, &invlen);
+	vec.x *= invlen;
+	vec.y *= invlen;
+	vec.z *= invlen;
+	return sqrlen * invlen; // len^2 * (1/len) == len
+#else
+	extern float (FASTCALL *pfVectorNormalize)(Vector& v); // function pointer defined elsewhere; this path simply dispatches through it
+	return (*pfVectorNormalize)(vec);
+#endif
+}
+
+// FIXME: Obsolete version of VectorNormalize, once we remove all the friggin float*s
+// float* overload: treats v as the storage of a Vector and forwards to the Vector& version.
+FORCEINLINE float VectorNormalize( float * v )
+{
+	Vector *pAsVector = reinterpret_cast<Vector *>(v);
+	return VectorNormalize( *pAsVector );
+}
+
+// On this (non-360) path the "fast" variant is just VectorNormalize with the length discarded.
+FORCEINLINE void VectorNormalizeFast( Vector &vec )
+{
+	(void)VectorNormalize( vec );
+}
+
+#else
+
+// Returns 1 / |v|^2 via the VMX reciprocal-length routine.
+// XMVector3ReciprocalLength replicates 1/|v| into every lane, so taking
+// XMVector3Dot of that result with itself would add three copies of (1/|v|)^2
+// and produce 3/|v|^2. Squaring a single lane gives the intended value.
+// NOTE(review): unlike the non-360 InvRSquared, the squared length is not
+// clamped to >= 1 here - confirm whether callers depend on that difference.
+FORCEINLINE float _VMX_InvRSquared( const Vector &v )
+{
+	const float flInvLength = XMVector3ReciprocalLength( XMLoadVector3( v.Base() ) ).x;
+	return flInvLength * flInvLength;
+}
+
+// call directly
+// Normalizes vec in place and returns the length it had before scaling.
+// FLT_EPSILON is added to the divisor so a zero-length vector does not divide by zero.
+FORCEINLINE float _VMX_VectorNormalize( Vector &vec )
+{
+	const float flLength = XMVector3Length( XMLoadVector3( vec.Base() ) ).x;
+	const float flScale = 1.f / (flLength + FLT_EPSILON );
+	vec.x *= flScale;	vec.y *= flScale;	vec.z *= flScale;
+	return flLength;
+}
+
+#define InvRSquared(x) _VMX_InvRSquared(x)
+
+// FIXME: Change this back to a #define once we get rid of the vec_t version
+// 360 build: thin forwarder to _VMX_VectorNormalize.
+FORCEINLINE float VectorNormalize( Vector& v )
+{
+	const float flOldLength = _VMX_VectorNormalize( v );
+	return flOldLength;
+}
+// FIXME: Obsolete version of VectorNormalize, once we remove all the friggin float*s
+// 360 build, float* overload: views pV as a Vector and forwards to _VMX_VectorNormalize.
+FORCEINLINE float VectorNormalize( float *pV )
+{
+	Vector *pAsVector = reinterpret_cast<Vector*>(pV);
+	return _VMX_VectorNormalize( *pAsVector );
+}
+
+// call directly
+// 360 build: normalizes vec in place using XMVector3LengthEst for the length.
+// FLT_EPSILON is added to the divisor so a zero-length vector does not divide by zero.
+FORCEINLINE void VectorNormalizeFast( Vector &vec )
+{
+	XMVECTOR xmLength = XMVector3LengthEst( XMLoadVector3( vec.Base() ) );
+	const float flScale = 1.f / (xmLength.x + FLT_EPSILON);
+	vec.x *= flScale;	vec.y *= flScale;	vec.z *= flScale;
+}
+
+#endif // _X360
+
+
+// Normalizes this vector via VectorNormalize and passes its return value through.
+inline vec_t Vector::NormalizeInPlace()
+{
+	Vector &self = *this;
+	return VectorNormalize( self );
+}
+
+// Returns a normalized copy; this vector itself is left untouched.
+inline Vector Vector::Normalized() const
+{
+	Vector unit = *this;
+	(void)VectorNormalize( unit );
+	return unit;
+}
+
+// Compares squared length against val squared, so no square root is taken.
+inline bool Vector::IsLengthGreaterThan( float val ) const
+{
+	return (val*val) < LengthSqr();
+}
+
+// Compares squared length against val squared, so no square root is taken.
+inline bool Vector::IsLengthLessThan( float val ) const
+{
+	return (val*val) > LengthSqr();
+}
+
+#endif
+
diff --git a/public/mathlib/vector2d.h b/public/mathlib/vector2d.h
new file mode 100644
index 0000000..4138558
--- /dev/null
+++ b/public/mathlib/vector2d.h
@@ -0,0 +1,670 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef VECTOR2D_H
+#define VECTOR2D_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// For vec_t, put this somewhere else?
+#include "tier0/basetypes.h"
+
+// For rand(). We really need a library!
+#include <stdlib.h>
+
+#include "tier0/dbg.h"
+#include "mathlib/math_pfns.h"
+
+//=========================================================
+// 2D Vector2D
+//=========================================================
+
+class Vector2D					
+{
+public:
+	// Members
+	vec_t x, y;
+
+	// Construction/destruction
+	Vector2D(void); // leaves x,y untouched except in _DEBUG builds, where they are set to VEC_T_NAN
+	Vector2D(vec_t X, vec_t Y);
+	Vector2D(const float *pFloat); // reads pFloat[0] and pFloat[1]
+
+	// Initialization
+	void Init(vec_t ix=0.0f, vec_t iy=0.0f);
+
+	// Got any nasty NAN's?
+	bool IsValid() const;
+
+	// array access...
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	// Base address...
+	vec_t* Base();
+	vec_t const* Base() const;
+
+	// Initialization methods
+	void Random( float minVal, float maxVal );
+
+	// equality
+	bool operator==(const Vector2D& v) const;
+	bool operator!=(const Vector2D& v) const;	
+
+	// arithmetic operations
+	Vector2D&	operator+=(const Vector2D &v);			
+	Vector2D&	operator-=(const Vector2D &v);		
+	Vector2D&	operator*=(const Vector2D &v);			
+	Vector2D&	operator*=(float s);
+	Vector2D&	operator/=(const Vector2D &v);		
+	Vector2D&	operator/=(float s);					
+
+	// negate the Vector2D components
+	void	Negate(); 
+
+	// Get the Vector2D's magnitude.
+	vec_t	Length() const;
+
+	// Get the Vector2D's magnitude squared.
+	vec_t	LengthSqr(void) const;
+
+	// return true if this vector is (0,0) within tolerance
+	bool IsZero( float tolerance = 0.01f ) const
+	{
+		return (x > -tolerance && x < tolerance &&
+				y > -tolerance && y < tolerance);
+	}
+
+	// Normalize in place and return the old length.
+	vec_t	NormalizeInPlace();
+
+	// Compare length.
+	bool	IsLengthGreaterThan( float val ) const;
+	bool	IsLengthLessThan( float val ) const;
+
+	// Get the distance from this Vector2D to the other one.
+	vec_t	DistTo(const Vector2D &vOther) const;
+
+	// Get the distance from this Vector2D to the other one squared.
+	vec_t	DistToSqr(const Vector2D &vOther) const;		
+
+	// Copy
+	void	CopyToArray(float* rgfl) const;	
+
+	// Multiply, add, and assign to this (ie: *this = a + b * scalar). This
+	// is about 12% faster than the actual Vector2D equation (because it's done per-component
+	// rather than per-Vector2D).
+	void	MulAdd(const Vector2D& a, const Vector2D& b, float scalar);	
+
+	// Dot product.
+	vec_t	Dot(const Vector2D& vOther) const;			
+
+	// assignment
+	Vector2D& operator=(const Vector2D &vOther);
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// copy constructors
+	Vector2D(const Vector2D &vOther);
+
+	// arithmetic operations
+	Vector2D	operator-(void) const;
+				
+	Vector2D	operator+(const Vector2D& v) const;	
+	Vector2D	operator-(const Vector2D& v) const;	
+	Vector2D	operator*(const Vector2D& v) const;	
+	Vector2D	operator/(const Vector2D& v) const;	
+	Vector2D	operator*(float fl) const;
+	Vector2D	operator/(float fl) const;			
+	
+	// Cross product between two vectors. NOTE(review): declared here, but no definition appears in this header - confirm one exists before calling.
+	Vector2D	Cross(const Vector2D &vOther) const;		
+
+	// Returns a Vector2D with the min or max in X and Y.
+	Vector2D	Min(const Vector2D &vOther) const;
+	Vector2D	Max(const Vector2D &vOther) const;
+
+#else
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	Vector2D(const Vector2D& vOther);
+#endif
+};
+
+//-----------------------------------------------------------------------------
+
+const Vector2D vec2_origin(0,0); // the (0,0) vector
+const Vector2D vec2_invalid( FLT_MAX, FLT_MAX ); // both components FLT_MAX; by its name a sentinel for "no valid value" - confirm against callers
+
+//-----------------------------------------------------------------------------
+// Vector2D related operations
+//-----------------------------------------------------------------------------
+
+// Vector2D clear
+void Vector2DClear( Vector2D& a );
+
+// Copy
+void Vector2DCopy( const Vector2D& src, Vector2D& dst );
+
+// Vector2D arithmetic
+void Vector2DAdd( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DSubtract( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DMultiply( const Vector2D& a, vec_t b, Vector2D& result );
+void Vector2DMultiply( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DDivide( const Vector2D& a, vec_t b, Vector2D& result );
+void Vector2DDivide( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DMA( const Vector2D& start, float s, const Vector2D& dir, Vector2D& result );
+
+// Store the min or max of each of x and y into the result.
+void Vector2DMin( const Vector2D &a, const Vector2D &b, Vector2D &result );
+void Vector2DMax( const Vector2D &a, const Vector2D &b, Vector2D &result );
+
+#define Vector2DExpand( v ) (v).x, (v).y
+
+// Normalization
+vec_t Vector2DNormalize( Vector2D& v );
+
+// Length
+vec_t Vector2DLength( const Vector2D& v );
+
+// Dot Product
+vec_t DotProduct2D(const Vector2D& a, const Vector2D& b);
+
+// Linearly interpolate between two vectors
+void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& dest );
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Vector2D methods
+//
+//-----------------------------------------------------------------------------
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+
+// Default construction leaves x and y unset, except in _DEBUG builds where
+// both are set to VEC_T_NAN so that use-before-init shows up.
+inline Vector2D::Vector2D(void)
+{
+#ifdef _DEBUG
+	y = VEC_T_NAN;
+	x = y;
+#endif
+}
+
+// Builds the vector from explicit components and asserts the result is valid.
+inline Vector2D::Vector2D(vec_t X, vec_t Y)
+	: x( X ), y( Y )
+{
+	Assert( IsValid() );
+}
+
+// Builds the vector from the first two floats at pFloat (asserted non-null).
+inline Vector2D::Vector2D(const float *pFloat)
+{
+	Assert( pFloat );
+	x = pFloat[0];
+	y = pFloat[1];
+	Assert( IsValid() );
+}
+
+
+//-----------------------------------------------------------------------------
+// copy constructor
+//-----------------------------------------------------------------------------
+
+// Copies both components after asserting that the source is valid.
+inline Vector2D::Vector2D(const Vector2D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+}
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+
+// Overwrites both components, then asserts the result is valid.
+inline void Vector2D::Init( vec_t ix, vec_t iy )
+{
+	x = ix;
+	y = iy;
+	Assert( IsValid() );
+}
+
+// Sets each component to minVal + (rand() / VALVE_RAND_MAX) * (maxVal - minVal).
+// rand() is called once per component, x first and then y.
+inline void Vector2D::Random( float minVal, float maxVal )
+{
+	x = minVal + (static_cast<float>( rand() ) / VALVE_RAND_MAX) * (maxVal - minVal);
+	y = minVal + (static_cast<float>( rand() ) / VALVE_RAND_MAX) * (maxVal - minVal);
+}
+
+// Zeroes both components of a.
+inline void Vector2DClear( Vector2D& a )
+{
+	a.y = 0.0f;
+	a.x = a.y;
+}
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+
+// Component-wise assignment; the source is asserted valid first.
+inline Vector2D& Vector2D::operator=(const Vector2D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+	return *this;
+}
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+
+// Mutable indexed access: 0 -> x, 1 -> y. The index is only range-checked by Assert.
+inline vec_t& Vector2D::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 2) );
+	vec_t *pComponents = reinterpret_cast<vec_t*>( this );
+	return pComponents[i];
+}
+
+// Read-only indexed access: 0 -> x, 1 -> y. The index is only range-checked by Assert.
+// The storage is viewed through a pointer-to-const so this const member
+// no longer casts away its own constness.
+inline vec_t Vector2D::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 2) );
+	return reinterpret_cast<vec_t const*>( this )[i];
+}
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+
+// Returns this Vector2D's own address reinterpreted as a pointer to vec_t.
+inline vec_t* Vector2D::Base()
+{
+	return reinterpret_cast<vec_t*>( this );
+}
+
+// Read-only flavour: this Vector2D's address reinterpreted as a pointer to const vec_t.
+inline vec_t const* Vector2D::Base() const
+{
+	return reinterpret_cast<vec_t const*>( this );
+}
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+
+// True only when both x and y pass IsFinite().
+inline bool Vector2D::IsValid() const
+{
+	if ( !IsFinite(x) )
+		return false;
+	return IsFinite(y);
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+// Exact (tolerance-free) equality of both components; both operands are asserted valid.
+inline bool Vector2D::operator==( const Vector2D& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	if ( !(src.x == x) )
+		return false;
+	return (src.y == y);
+}
+
+// True when either component differs; both operands are asserted valid.
+inline bool Vector2D::operator!=( const Vector2D& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	if ( src.x != x )
+		return true;
+	return (src.y != y);
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+
+// Copies both components of src into dst; src is asserted valid.
+inline void Vector2DCopy( const Vector2D& src, Vector2D& dst )
+{
+	Assert( src.IsValid() );
+	dst.x = src.x;	dst.y = src.y;
+}
+
+// Writes x and y to rgfl[0] and rgfl[1]. rgfl is asserted non-null.
+inline void	Vector2D::CopyToArray(float* rgfl) const
+{
+	Assert( IsValid() );
+	Assert( rgfl );
+	rgfl[0] = x;
+	rgfl[1] = y;
+}
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+
+// Flips the sign of both components in place.
+inline void Vector2D::Negate()
+{
+	Assert( IsValid() );
+	x = -x;
+	y = -y;
+}
+
+// In-place component-wise add; both operands are asserted valid beforehand.
+inline Vector2D& Vector2D::operator+=(const Vector2D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x += v.x;
+	y += v.y;
+	return *this;
+}
+
+// In-place component-wise subtract; both operands are asserted valid beforehand.
+inline Vector2D& Vector2D::operator-=(const Vector2D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x -= v.x;
+	y -= v.y;
+	return *this;
+}
+
+// Scales both components by fl in place; validity is asserted on the result.
+inline Vector2D& Vector2D::operator*=(float fl)
+{
+	x *= fl;	y *= fl;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Component-wise multiply in place; validity is asserted on the result.
+inline Vector2D& Vector2D::operator*=(const Vector2D& v)
+{
+	x *= v.x;	y *= v.y;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Divides both components by fl in place. fl must be non-zero (asserted).
+// The division is done as one reciprocal followed by two multiplies.
+inline Vector2D& Vector2D::operator/=(float fl)
+{
+	Assert( fl != 0.0f );
+	const float flReciprocal = 1.0f / fl;
+	x *= flReciprocal;	y *= flReciprocal;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Component-wise divide in place. Neither component of v may be zero (asserted).
+inline Vector2D& Vector2D::operator/=(const Vector2D& v)
+{
+	Assert( v.x != 0.0f && v.y != 0.0f );
+	x /= v.x;	y /= v.y;
+	Assert( IsValid() );
+	return *this;
+}
+
+// c = a + b, component by component. Inputs are asserted valid.
+inline void Vector2DAdd( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x + b.x;	c.y = a.y + b.y;
+}
+
+// c = a - b, component by component. Inputs are asserted valid.
+inline void Vector2DSubtract( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x - b.x;	c.y = a.y - b.y;
+}
+
+// c = a scaled by b. The vector is asserted valid and the scalar finite.
+inline void Vector2DMultiply( const Vector2D& a, vec_t b, Vector2D& c )
+{
+	Assert( a.IsValid() && IsFinite(b) );
+	c.x = a.x * b;	c.y = a.y * b;
+}
+
+// c = component-wise product of a and b. Inputs are asserted valid.
+inline void Vector2DMultiply( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x * b.x;	c.y = a.y * b.y;
+}
+
+
+// c = a divided by the scalar b. b must be non-zero (asserted).
+// Done as one reciprocal followed by two multiplies.
+inline void Vector2DDivide( const Vector2D& a, vec_t b, Vector2D& c )
+{
+	Assert( a.IsValid() );
+	Assert( b != 0.0f );
+	const vec_t flReciprocal = 1.0f / b;
+	c.x = a.x * flReciprocal;	c.y = a.y * flReciprocal;
+}
+
+// c = a divided by b, component by component. Neither component of b may be zero (asserted).
+inline void Vector2DDivide( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() );
+	Assert( (b.x != 0.0f) && (b.y != 0.0f) );
+	c.x = a.x / b.x;	c.y = a.y / b.y;
+}
+
+// Multiply-add: result = start + s * dir. All three inputs are asserted valid/finite.
+inline void Vector2DMA( const Vector2D& start, float s, const Vector2D& dir, Vector2D& result )
+{
+	Assert( start.IsValid() && IsFinite(s) && dir.IsValid() );
+	result.x = start.x + s*dir.x;	result.y = start.y + s*dir.y;
+}
+
+// FIXME: Remove
+// Kept for backwards compatibility: sets *this = a + b * scalar, per component.
+// No validity asserts are performed here.
+inline void	Vector2D::MulAdd(const Vector2D& a, const Vector2D& b, float scalar)
+{
+	x = a.x + b.x * scalar;	y = a.y + b.y * scalar;
+}
+
+// Linear interpolation: dest = src1 + (src2 - src1) * t, per component.
+// t is not clamped, so values outside [0,1] extrapolate.
+inline void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& dest )
+{
+	dest.x = src1.x + (src2.x - src1.x) * t;
+	dest.y = src1.y + (src2.y - src1.y) * t;
+}
+
+//-----------------------------------------------------------------------------
+// dot, cross
+//-----------------------------------------------------------------------------
+// Dot product a.x*b.x + a.y*b.y. Both inputs are asserted valid.
+inline vec_t DotProduct2D(const Vector2D& a, const Vector2D& b)
+{
+	Assert( a.IsValid() && b.IsValid() );
+	return (a.x * b.x) + (a.y * b.y);
+}
+
+// Kept for backwards compatibility: member form of DotProduct2D.
+inline vec_t Vector2D::Dot( const Vector2D& vOther ) const
+{
+	const Vector2D &self = *this;
+	return DotProduct2D( self, vOther );
+}
+
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+// Magnitude of v: FastSqrt of x*x + y*y. v is asserted valid.
+inline vec_t Vector2DLength( const Vector2D& v )
+{
+	Assert( v.IsValid() );
+	return static_cast<vec_t>( FastSqrt( v.x*v.x + v.y*v.y ) );
+}
+
+// Sum of the squares of x and y (no square root taken).
+inline vec_t Vector2D::LengthSqr(void) const
+{
+	Assert( IsValid() );
+	return (x * x) + (y * y);
+}
+
+// Normalizes this vector via Vector2DNormalize and passes its return value through.
+inline vec_t Vector2D::NormalizeInPlace()
+{
+	Vector2D &self = *this;
+	return Vector2DNormalize( self );
+}
+
+// Compares squared length against val squared, so no square root is taken.
+inline bool Vector2D::IsLengthGreaterThan( float val ) const
+{
+	return (val*val) < LengthSqr();
+}
+
+// Compares squared length against val squared, so no square root is taken.
+inline bool Vector2D::IsLengthLessThan( float val ) const
+{
+	return (val*val) > LengthSqr();
+}
+
+// Member form of Vector2DLength.
+inline vec_t Vector2D::Length(void) const
+{
+	const Vector2D &self = *this;
+	return Vector2DLength( self );
+}
+
+
+// Per-component minimum of a and b written to result.
+// Each component is written exactly once, so result may alias a or b.
+inline void Vector2DMin( const Vector2D &a, const Vector2D &b, Vector2D &result )
+{
+	if ( a.x < b.x )
+		result.x = a.x;
+	else
+		result.x = b.x;
+
+	if ( a.y < b.y )
+		result.y = a.y;
+	else
+		result.y = b.y;
+}
+
+
+// Per-component maximum of a and b written to result.
+// Each component is written exactly once, so result may alias a or b.
+inline void Vector2DMax( const Vector2D &a, const Vector2D &b, Vector2D &result )
+{
+	if ( a.x > b.x )
+		result.x = a.x;
+	else
+		result.x = b.x;
+
+	if ( a.y > b.y )
+		result.y = a.y;
+	else
+		result.y = b.y;
+}
+
+
+//-----------------------------------------------------------------------------
+// Normalization
+//-----------------------------------------------------------------------------
+// Divides v by its own length in place and returns the length it had beforehand.
+// A vector whose length is exactly zero is written as (0,0) instead of being divided.
+inline vec_t Vector2DNormalize( Vector2D& v )
+{
+	Assert( v.IsValid() );
+	const vec_t flOldLength = v.Length();
+	if ( flOldLength == 0.0f )
+	{
+		v.x = v.y = 0.0f;
+	}
+	else
+	{
+		v /= flOldLength;
+	}
+	return flOldLength;
+}
+
+
+//-----------------------------------------------------------------------------
+// Get the distance from this Vector2D to the other one 
+//-----------------------------------------------------------------------------
+// Length of (this - vOther).
+inline vec_t Vector2D::DistTo(const Vector2D &vOther) const
+{
+	Vector2D vecOffset;
+	Vector2DSubtract( *this, vOther, vecOffset );
+	return vecOffset.Length();
+}
+
+// Squared length of (this - vOther); no square root is taken.
+inline vec_t Vector2D::DistToSqr(const Vector2D &vOther) const
+{
+	Vector2D vecOffset;
+	Vector2DSubtract( *this, vOther, vecOffset );
+	return vecOffset.LengthSqr();
+}
+
+
+//-----------------------------------------------------------------------------
+// Computes the closest point to vecTarget no farther than flMaxDist from vecStart
+//-----------------------------------------------------------------------------
+// Writes to *pResult the point nearest vecTarget that lies within flMaxDist of vecStart:
+// vecTarget itself when it is already in range, otherwise the point flMaxDist
+// along the direction from vecStart towards vecTarget.
+inline void ComputeClosestPoint2D( const Vector2D& vecStart, float flMaxDist, const Vector2D& vecTarget, Vector2D *pResult )
+{
+	Vector2D vecToTarget;
+	Vector2DSubtract( vecTarget, vecStart, vecToTarget );
+	const float flDistSqr = vecToTarget.LengthSqr();
+
+	// Target already within reach: nothing to clamp.
+	if ( flDistSqr <= flMaxDist * flMaxDist )
+	{
+		*pResult = vecTarget;
+		return;
+	}
+
+	// Out of reach: make the offset unit length, then step flMaxDist along it.
+	vecToTarget /= FastSqrt( flDistSqr );
+	Vector2DMA( vecStart, flMaxDist, vecToTarget, *pResult );
+}
+
+
+
+//-----------------------------------------------------------------------------
+//
+// Slow methods
+//
+//-----------------------------------------------------------------------------
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+//-----------------------------------------------------------------------------
+// Returns a Vector2D with the min or max in X and Y.
+//-----------------------------------------------------------------------------
+
+// New vector holding the smaller of each component pair.
+inline Vector2D Vector2D::Min(const Vector2D &vOther) const
+{
+	const vec_t flMinX = ( x < vOther.x ) ? x : vOther.x;
+	const vec_t flMinY = ( y < vOther.y ) ? y : vOther.y;
+	return Vector2D( flMinX, flMinY );
+}
+
+// New vector holding the larger of each component pair.
+inline Vector2D Vector2D::Max(const Vector2D &vOther) const
+{
+	const vec_t flMaxX = ( x > vOther.x ) ? x : vOther.x;
+	const vec_t flMaxY = ( y > vOther.y ) ? y : vOther.y;
+	return Vector2D( flMaxX, flMaxY );
+}
+
+
+//-----------------------------------------------------------------------------
+// arithmetic operations
+//-----------------------------------------------------------------------------
+
+// Returns a new Vector2D with both components negated.
+inline Vector2D Vector2D::operator-(void) const
+{
+	Vector2D negated( -x, -y );
+	return negated;
+}
+
+// By-value sum; the arithmetic is delegated to Vector2DAdd.
+inline Vector2D Vector2D::operator+(const Vector2D& v) const
+{
+	Vector2D sum;
+	Vector2DAdd( *this, v, sum );
+	return sum;
+}
+
+// By-value difference (this minus v); the arithmetic is delegated to Vector2DSubtract.
+inline Vector2D Vector2D::operator-(const Vector2D& v) const
+{
+	Vector2D difference;
+	Vector2DSubtract( *this, v, difference );
+	return difference;
+}
+
+// By-value scalar multiply; the arithmetic is delegated to Vector2DMultiply.
+inline Vector2D Vector2D::operator*(float fl) const
+{
+	Vector2D scaled;
+	Vector2DMultiply( *this, fl, scaled );
+	return scaled;
+}
+
+// By-value component-wise product; the arithmetic is delegated to Vector2DMultiply.
+inline Vector2D Vector2D::operator*(const Vector2D& v) const
+{
+	Vector2D product;
+	Vector2DMultiply( *this, v, product );
+	return product;
+}
+
+// By-value scalar divide; the arithmetic and the non-zero assert live in Vector2DDivide.
+inline Vector2D Vector2D::operator/(float fl) const
+{
+	Vector2D quotient;
+	Vector2DDivide( *this, fl, quotient );
+	return quotient;
+}
+
+// By-value component-wise divide; the arithmetic and the non-zero assert live in Vector2DDivide.
+inline Vector2D Vector2D::operator/(const Vector2D& v) const
+{
+	Vector2D quotient;
+	Vector2DDivide( *this, v, quotient );
+	return quotient;
+}
+
+// Scalar-on-the-left multiply; forwards to Vector2D::operator*(float).
+inline Vector2D operator*(float fl, const Vector2D& v)
+{
+	return v.operator*( fl );
+}
+
+#endif //slow
+
+#endif // VECTOR2D_H
+
diff --git a/public/mathlib/vector4d.h b/public/mathlib/vector4d.h
new file mode 100644
index 0000000..2b20c88
--- /dev/null
+++ b/public/mathlib/vector4d.h
@@ -0,0 +1,686 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef VECTOR4D_H
+#define VECTOR4D_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <stdlib.h>		// For rand(). We really need a library!
+#include <float.h>
+#if !defined( _X360 )
+#include <xmmintrin.h>	// For SSE
+#endif
+#include "basetypes.h"	// For vec_t, put this somewhere else?
+#include "tier0/dbg.h"
+#include "mathlib/math_pfns.h"
+
+// forward declarations
+class Vector;
+class Vector2D;
+
+//=========================================================
+// 4D Vector4D
+//=========================================================
+
+class Vector4D					
+{
+public:
+	// Members
+	vec_t x, y, z, w;
+
+	// Construction/destruction
+	Vector4D(void); // leaves the components untouched except in _DEBUG builds, where they are set to VEC_T_NAN
+	Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W);
+	Vector4D(const float *pFloat); // reads pFloat[0] through pFloat[3]
+
+	// Initialization
+	void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f, vec_t iw=0.0f);
+
+	// Got any nasty NAN's?
+	bool IsValid() const;
+
+	// array access...
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	// Base address...
+	inline vec_t* Base();
+	inline vec_t const* Base() const;
+
+	// Cast to Vector and Vector2D... (these reinterpret this object's address; no copy is made)
+	Vector& AsVector3D();
+	Vector const& AsVector3D() const;
+
+	Vector2D& AsVector2D();
+	Vector2D const& AsVector2D() const;
+
+	// Initialization methods
+	void Random( vec_t minVal, vec_t maxVal );
+
+	// equality
+	bool operator==(const Vector4D& v) const;
+	bool operator!=(const Vector4D& v) const;	
+
+	// arithmetic operations
+	Vector4D&	operator+=(const Vector4D &v);			
+	Vector4D&	operator-=(const Vector4D &v);		
+	Vector4D&	operator*=(const Vector4D &v);			
+	Vector4D&	operator*=(float s);
+	Vector4D&	operator/=(const Vector4D &v);		
+	Vector4D&	operator/=(float s);					
+
+	// negate the Vector4D components
+	void	Negate(); 
+
+	// Get the Vector4D's magnitude.
+	vec_t	Length() const;
+
+	// Get the Vector4D's magnitude squared.
+	vec_t	LengthSqr(void) const;
+
+	// return true if this vector is (0,0,0,0) within tolerance
+	bool IsZero( float tolerance = 0.01f ) const
+	{
+		return (x > -tolerance && x < tolerance &&
+				y > -tolerance && y < tolerance &&
+				z > -tolerance && z < tolerance &&
+				w > -tolerance && w < tolerance);
+	}
+
+	// Get the distance from this Vector4D to the other one.
+	vec_t	DistTo(const Vector4D &vOther) const;
+
+	// Get the distance from this Vector4D to the other one squared.
+	vec_t	DistToSqr(const Vector4D &vOther) const;		
+
+	// Copy
+	void	CopyToArray(float* rgfl) const;	
+
+	// Multiply, add, and assign to this (ie: *this = a + b * scalar). This
+	// is about 12% faster than the actual Vector4D equation (because it's done per-component
+	// rather than per-Vector4D).
+	void	MulAdd(Vector4D const& a, Vector4D const& b, float scalar);	
+
+	// Dot product.
+	vec_t	Dot(Vector4D const& vOther) const;			
+
+	// No copy constructors allowed if we're in optimal mode
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+private:
+#else
+public:
+#endif
+	Vector4D(Vector4D const& vOther);
+
+	// No assignment operators either... (operator= shares the access level chosen by the #ifdef above)
+	Vector4D& operator=( Vector4D const& src );
+};
+
+const Vector4D vec4_origin( 0.0f, 0.0f, 0.0f, 0.0f ); // the all-zero vector
+const Vector4D vec4_invalid( FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX ); // every component FLT_MAX; by its name a sentinel for "no valid value" - confirm against callers
+
+//-----------------------------------------------------------------------------
+// SSE optimized routines
+//-----------------------------------------------------------------------------
+
+class ALIGN16 Vector4DAligned : public Vector4D
+{
+public:
+	Vector4DAligned(void) {}
+	Vector4DAligned( vec_t X, vec_t Y, vec_t Z, vec_t W );
+
+	inline void Set( vec_t X, vec_t Y, vec_t Z, vec_t W );
+	inline void InitZero( void );
+
+	inline __m128 &AsM128() { return *(__m128*)&x; } // views the four floats starting at x as one __m128 (no copy)
+	inline const __m128 &AsM128() const { return *(const __m128*)&x; } 
+
+private:
+	// No copy constructors allowed if we're in optimal mode (here the copy constructor is private unconditionally)
+	Vector4DAligned( Vector4DAligned const& vOther );
+
+	// No assignment operators either...
+	Vector4DAligned& operator=( Vector4DAligned const& src );
+} ALIGN16_POST;
+
+//-----------------------------------------------------------------------------
+// Vector4D related operations
+//-----------------------------------------------------------------------------
+
+// Vector4D clear
+void Vector4DClear( Vector4D& a );
+
+// Copy
+void Vector4DCopy( Vector4D const& src, Vector4D& dst );
+
+// Vector4D arithmetic
+void Vector4DAdd( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DSubtract( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DMultiply( Vector4D const& a, vec_t b, Vector4D& result );
+void Vector4DMultiply( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DDivide( Vector4D const& a, vec_t b, Vector4D& result );
+void Vector4DDivide( Vector4D const& a, Vector4D const& b, Vector4D& result );
+void Vector4DMA( Vector4D const& start, float s, Vector4D const& dir, Vector4D& result );
+
+// Vector4DAligned arithmetic
+void Vector4DMultiplyAligned( Vector4DAligned const& a, vec_t b, Vector4DAligned& result );
+
+
+#define Vector4DExpand( v ) (v).x, (v).y, (v).z, (v).w
+
+// Normalization
+vec_t Vector4DNormalize( Vector4D& v );
+
+// Length
+vec_t Vector4DLength( Vector4D const& v );
+
+// Dot Product
+vec_t DotProduct4D(Vector4D const& a, Vector4D const& b);
+
+// Linearly interpolate between two vectors
+void Vector4DLerp(Vector4D const& src1, Vector4D const& src2, vec_t t, Vector4D& dest );
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Vector4D methods
+//
+//-----------------------------------------------------------------------------
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+
+// Default construction leaves the components unset, except in _DEBUG builds
+// where all four are set to VEC_T_NAN so that use-before-init shows up.
+inline Vector4D::Vector4D(void)
+{
+#ifdef _DEBUG
+	w = VEC_T_NAN;
+	z = w;
+	y = z;
+	x = y;
+#endif
+}
+
+// Builds the vector from explicit components and asserts the result is valid.
+inline Vector4D::Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W )
+	: x( X ), y( Y ), z( Z ), w( W )
+{
+	Assert( IsValid() );
+}
+
+// Builds the vector from the first four floats at pFloat (asserted non-null).
+inline Vector4D::Vector4D(const float *pFloat)
+{
+	Assert( pFloat );
+	x = pFloat[0];
+	y = pFloat[1];
+	z = pFloat[2];
+	w = pFloat[3];
+	Assert( IsValid() );
+}
+
+
+//-----------------------------------------------------------------------------
+// copy constructor
+//-----------------------------------------------------------------------------
+
+// Copies all four components after asserting that the source is valid.
+inline Vector4D::Vector4D(const Vector4D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+	z = vOther.z;
+	w = vOther.w;
+}
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+
+// Overwrites all four components, then asserts the result is valid.
+inline void Vector4D::Init( vec_t ix, vec_t iy, vec_t iz, vec_t iw )
+{
+	x = ix;
+	y = iy;
+	z = iz;
+	w = iw;
+	Assert( IsValid() );
+}
+
+// Sets each component to minVal + (rand() / VALVE_RAND_MAX) * (maxVal - minVal).
+// rand() is called once per component, in x, y, z, w order.
+inline void Vector4D::Random( vec_t minVal, vec_t maxVal )
+{
+	x = minVal + (static_cast<vec_t>( rand() ) / VALVE_RAND_MAX) * (maxVal - minVal);
+	y = minVal + (static_cast<vec_t>( rand() ) / VALVE_RAND_MAX) * (maxVal - minVal);
+	z = minVal + (static_cast<vec_t>( rand() ) / VALVE_RAND_MAX) * (maxVal - minVal);
+	w = minVal + (static_cast<vec_t>( rand() ) / VALVE_RAND_MAX) * (maxVal - minVal);
+}
+
+// Zeroes all four components of a.
+inline void Vector4DClear( Vector4D& a )
+{
+	a.w = 0.0f;
+	a.z = a.w;
+	a.y = a.z;
+	a.x = a.y;
+}
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+
+// Component-wise assignment; the source is asserted valid first.
+inline Vector4D& Vector4D::operator=(const Vector4D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x;
+	y = vOther.y;
+	z = vOther.z;
+	w = vOther.w;
+	return *this;
+}
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+
+// Mutable indexed access: 0..3 map to x, y, z, w. The index is only range-checked by Assert.
+inline vec_t& Vector4D::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 4) );
+	vec_t *pComponents = reinterpret_cast<vec_t*>( this );
+	return pComponents[i];
+}
+
+// Read-only indexed access: 0..3 map to x, y, z, w. The index is only range-checked by Assert.
+// The storage is viewed through a pointer-to-const so this const member
+// no longer casts away its own constness.
+inline vec_t Vector4D::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 4) );
+	return reinterpret_cast<vec_t const*>( this )[i];
+}
+
+//-----------------------------------------------------------------------------
+// Cast to Vector and Vector2D...
+//-----------------------------------------------------------------------------
+
+// Reinterprets this object's address as a Vector; no copy is made, so the
+// returned reference aliases this Vector4D.
+inline Vector& Vector4D::AsVector3D()
+{
+	Vector *pAsVector = (Vector*)this;
+	return *pAsVector;
+}
+
+// Const overload of the cast above.
+inline Vector const& Vector4D::AsVector3D() const
+{
+	Vector const *pAsVector = (Vector const*)this;
+	return *pAsVector;
+}
+
+// Reinterprets this object's address as a Vector2D; no copy is made, so the
+// returned reference aliases this Vector4D.
+inline Vector2D& Vector4D::AsVector2D()
+{
+	Vector2D *pAsVector2D = (Vector2D*)this;
+	return *pAsVector2D;
+}
+
+// Const overload of the cast above.
+inline Vector2D const& Vector4D::AsVector2D() const
+{
+	Vector2D const *pAsVector2D = (Vector2D const*)this;
+	return *pAsVector2D;
+}
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+
+// Returns this object's address viewed as a pointer to vec_t.
+inline vec_t* Vector4D::Base()
+{
+	vec_t *pBase = (vec_t*)this;
+	return pBase;
+}
+
+// Const overload: same address, read-only.
+inline vec_t const* Vector4D::Base() const
+{
+	vec_t const *pBase = (vec_t const*)this;
+	return pBase;
+}
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+
+// True only when every component passes IsFinite(). Components are tested in
+// x, y, z, w order and testing stops at the first failure.
+inline bool Vector4D::IsValid() const
+{
+	if ( !IsFinite(x) )
+		return false;
+	if ( !IsFinite(y) )
+		return false;
+	if ( !IsFinite(z) )
+		return false;
+	return IsFinite(w);
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+// Exact (bitwise-value) equality of all four components; no tolerance is
+// applied. Both operands are Asserted valid first.
+inline bool Vector4D::operator==( Vector4D const& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	if ( src.x != x )
+		return false;
+	if ( src.y != y )
+		return false;
+	if ( src.z != z )
+		return false;
+	return ( src.w == w );
+}
+
+// True when any component differs exactly; no tolerance is applied.
+// Both operands are Asserted valid first.
+inline bool Vector4D::operator!=( Vector4D const& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	if ( src.x != x )
+		return true;
+	if ( src.y != y )
+		return true;
+	if ( src.z != z )
+		return true;
+	return ( src.w != w );
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+
+// Copies src into dst component by component (does not go through
+// Vector4D::operator=). src is Asserted valid first.
+inline void Vector4DCopy( Vector4D const& src, Vector4D& dst )
+{
+	Assert( src.IsValid() );
+	dst.x = src.x; dst.y = src.y; dst.z = src.z; dst.w = src.w;
+}
+
+// Writes x, y, z, w into rgfl[0..3]. The caller must supply room for four
+// floats; only non-null-ness of rgfl is Asserted.
+inline void	Vector4D::CopyToArray(float* rgfl) const
+{
+	Assert( IsValid() );
+	Assert( rgfl );
+	rgfl[0] = x;
+	rgfl[1] = y;
+	rgfl[2] = z;
+	rgfl[3] = w;
+}
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+
+// Flips the sign of every component in place.
+inline void Vector4D::Negate()
+{
+	Assert( IsValid() );
+	x = -x;
+	y = -y;
+	z = -z;
+	w = -w;
+}
+
+// In-place component-wise addition. Both operands are Asserted valid before
+// the add; the result itself is not re-checked.
+inline Vector4D& Vector4D::operator+=(const Vector4D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x += v.x;
+	y += v.y;
+	z += v.z;
+	w += v.w;
+	return *this;
+}
+
+// In-place component-wise subtraction. Both operands are Asserted valid
+// before the subtract; the result itself is not re-checked.
+inline Vector4D& Vector4D::operator-=(const Vector4D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x -= v.x;
+	y -= v.y;
+	z -= v.z;
+	w -= v.w;
+	return *this;
+}
+
+// In-place uniform scale by fl. Validity is Asserted on the result, after
+// the multiply.
+inline Vector4D& Vector4D::operator*=(float fl)
+{
+	x *= fl; y *= fl; z *= fl; w *= fl;
+	Assert( IsValid() );
+	return *this;
+}
+
+// In-place component-wise (Hadamard) multiply by v. Validity is Asserted on
+// the result, after the multiply.
+inline Vector4D& Vector4D::operator*=(Vector4D const& v)
+{
+	x *= v.x; y *= v.y; z *= v.z; w *= v.w;
+	Assert( IsValid() );
+	return *this;
+}
+
+// In-place uniform divide by fl, implemented as a multiply by the
+// reciprocal. fl is Asserted non-zero; the scaled result is Asserted valid
+// by operator*=(float).
+inline Vector4D& Vector4D::operator/=(float fl)
+{
+	Assert( fl != 0.0f );
+	const float flInverse = 1.0f / fl;
+	return ( *this *= flInverse );
+}
+
+// In-place component-wise divide by v. Every component of v is Asserted
+// non-zero up front; the quotient is Asserted valid afterwards.
+inline Vector4D& Vector4D::operator/=(Vector4D const& v)
+{
+	Assert( v.x != 0.0f && v.y != 0.0f && v.z != 0.0f && v.w != 0.0f );
+	x /= v.x; y /= v.y; z /= v.z; w /= v.w;
+	Assert( IsValid() );
+	return *this;
+}
+
+// c = a + b, component-wise. c may alias a or b: each output component
+// depends only on the matching input components.
+inline void Vector4DAdd( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	const vec_t flSumX = a.x + b.x;
+	const vec_t flSumY = a.y + b.y;
+	const vec_t flSumZ = a.z + b.z;
+	const vec_t flSumW = a.w + b.w;
+	c.x = flSumX;
+	c.y = flSumY;
+	c.z = flSumZ;
+	c.w = flSumW;
+}
+
+// c = a - b, component-wise. c may alias a or b: each output component
+// depends only on the matching input components.
+inline void Vector4DSubtract( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	const vec_t flDiffX = a.x - b.x;
+	const vec_t flDiffY = a.y - b.y;
+	const vec_t flDiffZ = a.z - b.z;
+	const vec_t flDiffW = a.w - b.w;
+	c.x = flDiffX;
+	c.y = flDiffY;
+	c.z = flDiffZ;
+	c.w = flDiffW;
+}
+
+// c = a * b for a scalar b (uniform scale). Both inputs are Asserted
+// valid/finite; the product is not re-checked.
+inline void Vector4DMultiply( Vector4D const& a, vec_t b, Vector4D& c )
+{
+	Assert( a.IsValid() && IsFinite(b) );
+	const vec_t flProdX = a.x * b;
+	const vec_t flProdY = a.y * b;
+	const vec_t flProdZ = a.z * b;
+	const vec_t flProdW = a.w * b;
+	c.x = flProdX;
+	c.y = flProdY;
+	c.z = flProdZ;
+	c.w = flProdW;
+}
+
+// c = a * b, component-wise (Hadamard product). Both inputs are Asserted
+// valid; the product is not re-checked.
+inline void Vector4DMultiply( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	const vec_t flProdX = a.x * b.x;
+	const vec_t flProdY = a.y * b.y;
+	const vec_t flProdZ = a.z * b.z;
+	const vec_t flProdW = a.w * b.w;
+	c.x = flProdX;
+	c.y = flProdY;
+	c.z = flProdZ;
+	c.w = flProdW;
+}
+
+// c = a / b for a scalar b, computed as a multiply by the reciprocal of b.
+// b is Asserted non-zero.
+inline void Vector4DDivide( Vector4D const& a, vec_t b, Vector4D& c )
+{
+	Assert( a.IsValid() );
+	Assert( b != 0.0f );
+	const vec_t flInverseB = 1.0f / b;
+	c.x = a.x * flInverseB;
+	c.y = a.y * flInverseB;
+	c.z = a.z * flInverseB;
+	c.w = a.w * flInverseB;
+}
+
+// c = a / b, component-wise. Every component of b is Asserted non-zero.
+inline void Vector4DDivide( Vector4D const& a, Vector4D const& b, Vector4D& c )
+{
+	Assert( a.IsValid() );
+	Assert( (b.x != 0.0f) && (b.y != 0.0f) && (b.z != 0.0f) && (b.w != 0.0f) );
+	const vec_t flQuotX = a.x / b.x;
+	const vec_t flQuotY = a.y / b.y;
+	const vec_t flQuotZ = a.z / b.z;
+	const vec_t flQuotW = a.w / b.w;
+	c.x = flQuotX;
+	c.y = flQuotY;
+	c.z = flQuotZ;
+	c.w = flQuotW;
+}
+
+// Multiply-add: result = start + s * dir, component-wise. All three inputs
+// are Asserted valid/finite; the result is not re-checked.
+inline void Vector4DMA( Vector4D const& start, float s, Vector4D const& dir, Vector4D& result )
+{
+	Assert( start.IsValid() && IsFinite(s) && dir.IsValid() );
+	const vec_t flOutX = start.x + s*dir.x;
+	const vec_t flOutY = start.y + s*dir.y;
+	const vec_t flOutZ = start.z + s*dir.z;
+	const vec_t flOutW = start.w + s*dir.w;
+	result.x = flOutX;
+	result.y = flOutY;
+	result.z = flOutZ;
+	result.w = flOutW;
+}
+
+// FIXME: Remove
+// For backwards compatibility.
+// Sets this = a + b * scalar, component-wise. Unlike Vector4DMA, no validity
+// Asserts are performed on the inputs or the result.
+inline void	Vector4D::MulAdd(Vector4D const& a, Vector4D const& b, float scalar)
+{
+	x = a.x + b.x * scalar;
+	y = a.y + b.y * scalar;
+	z = a.z + b.z * scalar;
+	w = a.w + b.w * scalar;
+}
+
+// Linear interpolation: dest = src1 + (src2 - src1) * t for each of the four
+// components. t is not clamped, and no validity Asserts are made beyond the
+// index checks inside operator[].
+inline void Vector4DLerp(const Vector4D& src1, const Vector4D& src2, vec_t t, Vector4D& dest )
+{
+	for ( int iComponent = 0; iComponent < 4; ++iComponent )
+	{
+		dest[iComponent] = src1[iComponent] + (src2[iComponent] - src1[iComponent]) * t;
+	}
+}
+
+//-----------------------------------------------------------------------------
+// dot, cross
+//-----------------------------------------------------------------------------
+
+// Four-component dot product of a and b. Both operands are Asserted valid.
+inline vec_t DotProduct4D(const Vector4D& a, const Vector4D& b)
+{
+	Assert( a.IsValid() && b.IsValid() );
+	const vec_t flDot = a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+	return flDot;
+}
+
+// For backwards compatibility: member-function spelling of DotProduct4D.
+// Returns the dot product of this vector with vOther.
+inline vec_t Vector4D::Dot( Vector4D const& vOther ) const
+{
+	return DotProduct4D( *this, vOther );
+}
+
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+
+// Length of v: FastSqrt of the sum of squared components.
+// v is Asserted valid first.
+inline vec_t Vector4DLength( Vector4D const& v )
+{
+	Assert( v.IsValid() );
+	const vec_t flLengthSqr = v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w;
+	return (vec_t)FastSqrt( flLengthSqr );
+}
+
+// Sum of the squared components (squared length); no square root is taken.
+inline vec_t Vector4D::LengthSqr(void) const
+{
+	Assert( IsValid() );
+	const vec_t flLengthSqr = x*x + y*y + z*z + w*w;
+	return flLengthSqr;
+}
+
+// Member-function spelling of Vector4DLength: returns the length of this
+// vector (validity is Asserted inside Vector4DLength).
+inline vec_t Vector4D::Length(void) const	
+{
+	return Vector4DLength( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Normalization
+//-----------------------------------------------------------------------------
+
+// FIXME: Can't use until we're un-macroed in mathlib.h
+// Scales v to unit length in place and returns the length it had before
+// normalization. A zero-length input is left as the zero vector and 0 is
+// returned.
+inline vec_t Vector4DNormalize( Vector4D& v )
+{
+	Assert( v.IsValid() );
+	const vec_t flLength = v.Length();
+	if ( flLength == 0.0f )
+	{
+		// Nothing to scale by; make the zero result explicit.
+		v.x = v.y = v.z = v.w = 0.0f;
+		return flLength;
+	}
+
+	v /= flLength;
+	return flLength;
+}
+
+//-----------------------------------------------------------------------------
+// Get the distance from this Vector4D to the other one 
+//-----------------------------------------------------------------------------
+
+// Distance between this vector and vOther: the length of (this - vOther).
+inline vec_t Vector4D::DistTo(const Vector4D &vOther) const
+{
+	Vector4D vOffset;
+	Vector4DSubtract( *this, vOther, vOffset );
+	return vOffset.Length();
+}
+
+// Squared distance between this vector and vOther: the squared length of
+// (this - vOther). Avoids the square root taken by DistTo.
+inline vec_t Vector4D::DistToSqr(const Vector4D &vOther) const
+{
+	Vector4D vOffset;
+	Vector4DSubtract( *this, vOther, vOffset );
+	return vOffset.LengthSqr();
+}
+
+
+//-----------------------------------------------------------------------------
+// Vector4DAligned routines
+//-----------------------------------------------------------------------------
+
+// Constructs from four components. Set() performs the assignment and the
+// trailing IsValid() Assert.
+inline Vector4DAligned::Vector4DAligned( vec_t X, vec_t Y, vec_t Z, vec_t W )
+{
+	Set( X, Y, Z, W );
+}
+
+// Assigns all four components, then Asserts that the result is valid.
+inline void Vector4DAligned::Set( vec_t X, vec_t Y, vec_t Z, vec_t W )
+{
+	x = X;
+	y = Y;
+	z = Z;
+	w = W;
+	Assert( IsValid() );
+}
+
+// Zeroes the vector with a single whole-register store through AsM128().
+// The Xbox 360 build uses its own splat intrinsic; all other builds use SSE.
+inline void Vector4DAligned::InitZero( void )
+{
+#if defined( _X360 )
+	this->AsM128() = __vspltisw( 0 );
+#else
+	this->AsM128() = _mm_set1_ps( 0.0f );
+#endif
+	Assert( IsValid() );
+}
+
+// c = a * b, component-wise, for aligned vectors. The Xbox 360 build issues a
+// single vector multiply; all other builds multiply the four components
+// individually.
+inline void Vector4DMultiplyAligned( Vector4DAligned const& a, Vector4DAligned const& b, Vector4DAligned& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+#if defined( _X360 )
+	c.AsM128() = __vmulfp( a.AsM128(), b.AsM128() );
+#else
+	c.x = a.x * b.x;
+	c.y = a.y * b.y;
+	c.z = a.z * b.z;
+	c.w = a.w * b.w;
+#endif
+}
+
+// Weighted accumulate on two vector pairs at once:
+//   vOutA += vInA * w
+//   vOutB += vInB * w
+// Only the inputs and the weight are Asserted valid; the accumulators are not.
+inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB )
+{
+	Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) );
+
+#if defined( _X360 )
+	// Load the scalar weight and replicate it across all four lanes.
+	__vector4 vWeight = __lvlx( &w, 0 );
+	vWeight = __vspltw( vWeight, 0 );
+
+	vOutA.AsM128() = __vmaddfp( vInA.AsM128(), vWeight, vOutA.AsM128() );
+	vOutB.AsM128() = __vmaddfp( vInB.AsM128(), vWeight, vOutB.AsM128() );
+#else
+	vOutA.x += vInA.x * w;
+	vOutA.y += vInA.y * w;
+	vOutA.z += vInA.z * w;
+	vOutA.w += vInA.w * w;
+
+	vOutB.x += vInB.x * w;
+	vOutB.y += vInB.y * w;
+	vOutB.z += vInB.z * w;
+	vOutB.w += vInB.w * w;
+#endif
+}
+
+// SIMD form of the weighted accumulate on two vector pairs:
+//   vOutA += vInA * w
+//   vOutB += vInB * w
+// Non-360 builds use SSE intrinsics; the Xbox 360 build uses its fused
+// multiply-add intrinsic.
+inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB )
+{
+	Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) );
+
+#if defined( _X360 )
+	// Load the scalar weight and replicate it across all four lanes.
+	__vector4 vWeight = __lvlx( &w, 0 );
+	vWeight = __vspltw( vWeight, 0 );
+
+	vOutA.AsM128() = __vmaddfp( vInA.AsM128(), vWeight, vOutA.AsM128() );
+	vOutB.AsM128() = __vmaddfp( vInB.AsM128(), vWeight, vOutB.AsM128() );
+#else
+	// Broadcast the scalar weight to all four lanes.
+	__m128 vWeight = _mm_set1_ps( w );
+
+	// 4D SSE vector multiply-add on each pair.
+	vOutA.AsM128() = _mm_add_ps( vOutA.AsM128(), _mm_mul_ps( vInA.AsM128(), vWeight ) );
+	vOutB.AsM128() = _mm_add_ps( vOutB.AsM128(), _mm_mul_ps( vInB.AsM128(), vWeight ) );
+#endif
+}
+
+#endif // VECTOR4D_H
+
diff --git a/public/mathlib/vmatrix.h b/public/mathlib/vmatrix.h
new file mode 100644
index 0000000..e49a888
--- /dev/null
+++ b/public/mathlib/vmatrix.h
@@ -0,0 +1,947 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+//
+// VMatrix always postmultiply vectors as in Ax = b.
+// Given a set of basis vectors ((F)orward, (L)eft, (U)p), and a (T)ranslation, 
+// a matrix to transform a vector into that space looks like this:
+// Fx Lx Ux Tx
+// Fy Ly Uy Ty
+// Fz Lz Uz Tz
+// 0   0  0  1
+
+// Note that concatenating matrices needs to multiply them in reverse order.
+// ie: if I want to apply matrix A, B, then C, the equation needs to look like this:
+// C * B * A * v
+// ie:
+// v = A * v;
+// v = B * v;
+// v = C * v;
+//=============================================================================
+
+#ifndef VMATRIX_H
+#define VMATRIX_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <string.h>
+#include "mathlib/vector.h"
+#include "mathlib/vplane.h"
+#include "mathlib/vector4d.h"
+#include "mathlib/mathlib.h"
+
+struct cplane_t;
+
+
+// 4x4 matrix of vec_t stored as m[row][column]. Per the inline accessors in
+// this header, columns 0..2 of the upper three rows hold the forward, left and
+// up basis vectors and column 3 holds the translation.
+class VMatrix
+{
+public:
+
+	// Default constructor. The inline definition has an empty body, so the
+	// elements of m are left uninitialized.
+	VMatrix();
+	// Construct from 16 explicit elements, given row by row (mRC = row R, column C).
+	VMatrix(
+		vec_t m00, vec_t m01, vec_t m02, vec_t m03,
+		vec_t m10, vec_t m11, vec_t m12, vec_t m13,
+		vec_t m20, vec_t m21, vec_t m22, vec_t m23,
+		vec_t m30, vec_t m31, vec_t m32, vec_t m33
+		);
+
+	// Creates a matrix where the X axis = forward
+	// the Y axis = left, and the Z axis = up
+	VMatrix( const Vector& forward, const Vector& left, const Vector& up );
+	VMatrix( const Vector& forward, const Vector& left, const Vector& up, const Vector& translation );
+	
+	// Construct from a 3x4 matrix
+	VMatrix( const matrix3x4_t& matrix3x4 );
+
+	// Set the values in the matrix.
+	void		Init( 
+		vec_t m00, vec_t m01, vec_t m02, vec_t m03,
+		vec_t m10, vec_t m11, vec_t m12, vec_t m13,
+		vec_t m20, vec_t m21, vec_t m22, vec_t m23,
+		vec_t m30, vec_t m31, vec_t m32, vec_t m33 
+		);
+
+
+	// Initialize from a 3x4
+	void		Init( const matrix3x4_t& matrix3x4 );
+
+	// array access (returns a pointer to row i; i is not range-checked)
+	inline float* operator[](int i)
+	{ 
+		return m[i]; 
+	}
+
+	inline const float* operator[](int i) const
+	{ 
+		return m[i]; 
+	}
+
+	// Get a pointer to m[0][0]
+	inline float *Base()
+	{
+		return &m[0][0];
+	}
+
+	inline const float *Base() const
+	{
+		return &m[0][0];
+	}
+
+	void		SetLeft(const Vector &vLeft);
+	void		SetUp(const Vector &vUp);
+	void		SetForward(const Vector &vForward);
+
+	void		GetBasisVectors(Vector &vForward, Vector &vLeft, Vector &vUp) const;
+	void		SetBasisVectors(const Vector &vForward, const Vector &vLeft, const Vector &vUp);
+
+	// Get/set the translation.
+	Vector &	GetTranslation( Vector &vTrans ) const;
+	void		SetTranslation(const Vector &vTrans);
+
+	void		PreTranslate(const Vector &vTrans);
+	void		PostTranslate(const Vector &vTrans);
+
+	// View the first three rows as a matrix3x4_t without copying.
+	const matrix3x4_t& As3x4() const;
+	void		CopyFrom3x4( const matrix3x4_t &m3x4 );
+	void		Set3x4( matrix3x4_t& matrix3x4 ) const;
+
+	bool		operator==( const VMatrix& src ) const;
+	bool		operator!=( const VMatrix& src ) const { return !( *this == src ); }
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// Access the basis vectors.
+	Vector		GetLeft() const;
+	Vector		GetUp() const;
+	Vector		GetForward() const;
+	Vector		GetTranslation() const;
+#endif
+
+
+// Matrix->vector operations.
+public:
+	// Multiply by a 3D vector (same as operator*).
+	void		V3Mul(const Vector &vIn, Vector &vOut) const;
+
+	// Multiply by a 4D vector.
+	void		V4Mul(const Vector4D &vIn, Vector4D &vOut) const;
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// Applies the rotation (ignores translation in the matrix). (This just calls VMul3x3).
+	Vector		ApplyRotation(const Vector &vVec) const;
+
+	// Multiply by a vector, treating it as a point (input w is assumed to be 1).
+	// NOTE: the inline definition in this header uses only the upper three rows
+	// and does NOT divide by w.
+	Vector		operator*(const Vector &vVec) const;
+
+	// Multiply by the upper 3x3 part of the matrix (ie: only apply rotation).
+	Vector		VMul3x3(const Vector &vVec) const;
+
+	// Apply the inverse (transposed) rotation (only works on pure rotation matrix)
+	Vector		VMul3x3Transpose(const Vector &vVec) const;
+
+	// Multiply by the upper 3 rows.
+	Vector		VMul4x3(const Vector &vVec) const;
+
+	// Apply the inverse (transposed) transformation (only works on pure rotation/translation)
+	Vector		VMul4x3Transpose(const Vector &vVec) const;
+#endif
+
+
+// Matrix->plane operations.
+public:
+	// Transform the plane. The matrix can only contain translation and rotation.
+	void		TransformPlane( const VPlane &inPlane, VPlane &outPlane ) const;
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// Just calls TransformPlane and returns the result.
+	VPlane		operator*(const VPlane &thePlane) const;
+#endif
+
+// Matrix->matrix operations.
+public:
+
+	VMatrix&	operator=(const VMatrix &mOther);
+	
+	// Multiply two matrices (out = this * vm).
+	void		MatrixMul( const VMatrix &vm, VMatrix &out ) const;
+
+	// Add two matrices.
+	const VMatrix& operator+=(const VMatrix &other);
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// Just calls MatrixMul and returns the result.	
+	VMatrix		operator*(const VMatrix &mOther) const;
+
+	// Add/Subtract two matrices.
+	VMatrix		operator+(const VMatrix &other) const;
+	VMatrix		operator-(const VMatrix &other) const;
+
+	// Negation.
+	VMatrix		operator-() const;
+
+	// Return inverse matrix. Be careful because the results are undefined 
+	// if the matrix doesn't have an inverse (ie: InverseGeneral returns false).
+	VMatrix		operator~() const;
+#endif
+
+// Matrix operations.
+public:
+	// Set to identity.
+	void		Identity();
+
+	bool		IsIdentity() const;
+
+	// Setup a matrix for origin and angles.
+	void		SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles );
+	
+	// Setup a matrix for angles and no translation.
+	void		SetupMatrixAngles( const QAngle &vAngles );
+
+	// General inverse. This may fail so check the return!
+	bool		InverseGeneral(VMatrix &vInverse) const;
+	
+	// Does a fast inverse, assuming the matrix only contains translation and rotation.
+	void		InverseTR( VMatrix &mRet ) const;
+
+	// Usually used for debug checks. Returns true if the upper 3x3 contains
+	// unit vectors and they are all orthogonal.
+	bool		IsRotationMatrix() const;
+	
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// This calls the other InverseTR and returns the result.
+	VMatrix		InverseTR() const;
+
+	// Get the scale of the matrix's basis vectors.
+	Vector		GetScale() const;
+
+	// (Fast) multiply by a scaling matrix setup from vScale.
+	VMatrix		Scale(const Vector &vScale);	
+
+	// Normalize the basis vectors.
+	VMatrix		NormalizeBasisVectors() const;
+
+	// Transpose.
+	VMatrix		Transpose() const;
+
+	// Transpose upper-left 3x3.
+	VMatrix		Transpose3x3() const;
+#endif
+
+public:
+	// The matrix.
+	vec_t		m[4][4];
+};
+
+
+
+//-----------------------------------------------------------------------------
+// Helper functions.
+//-----------------------------------------------------------------------------
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Setup an identity matrix.
+VMatrix		SetupMatrixIdentity();
+
+// Setup as a scaling matrix.
+VMatrix		SetupMatrixScale(const Vector &vScale);
+
+// Setup a translation matrix.
+VMatrix		SetupMatrixTranslation(const Vector &vTranslation);
+
+// Setup a matrix to reflect around the plane.
+VMatrix		SetupMatrixReflection(const VPlane &thePlane);
+
+// Setup a matrix to project from vOrigin onto thePlane.
+VMatrix		SetupMatrixProjection(const Vector &vOrigin, const VPlane &thePlane);
+
+// Setup a matrix to rotate the specified amount around the specified axis.
+VMatrix		SetupMatrixAxisRot(const Vector &vAxis, vec_t fDegrees);
+
+// Setup a matrix from euler angles. Just sets identity and calls MatrixAngles.
+VMatrix		SetupMatrixAngles(const QAngle &vAngles);
+
+// Setup a matrix for origin and angles.
+VMatrix		SetupMatrixOrgAngles(const Vector &origin, const QAngle &vAngles);
+
+#endif
+
+// Formats all 16 elements of a VMatrix for debug output, one parenthesized
+// group of four values per row. The format string must contain exactly one
+// %f per argument passed below.
+// ** Note: this generates a temporary, don't hold reference!
+#define VMatToString(mat)	(static_cast<const char *>(CFmtStr("[ (%f, %f, %f, %f), (%f, %f, %f, %f), (%f, %f, %f, %f), (%f, %f, %f, %f) ]", \
+	(mat).m[0][0], (mat).m[0][1], (mat).m[0][2], (mat).m[0][3], \
+	(mat).m[1][0], (mat).m[1][1], (mat).m[1][2], (mat).m[1][3], \
+	(mat).m[2][0], (mat).m[2][1], (mat).m[2][2], (mat).m[2][3], \
+	(mat).m[3][0], (mat).m[3][1], (mat).m[3][2], (mat).m[3][3] )))
+
+//-----------------------------------------------------------------------------
+// Returns the point at the intersection on the 3 planes.
+// Returns false if it can't be solved (2 or more planes are parallel).
+//-----------------------------------------------------------------------------
+bool PlaneIntersection( const VPlane &vp1, const VPlane &vp2, const VPlane &vp3, Vector &vOut );
+
+
+//-----------------------------------------------------------------------------
+// These methods are faster. Use them if you want faster code
+//-----------------------------------------------------------------------------
+void MatrixSetIdentity( VMatrix &dst );
+void MatrixTranspose( const VMatrix& src, VMatrix& dst );
+void MatrixCopy( const VMatrix& src, VMatrix& dst );
+void MatrixMultiply( const VMatrix& src1, const VMatrix& src2, VMatrix& dst );
+
+// Accessors
+// Column accessors take a column index; row accessors take a row index.
+// (Parameter names on the row accessors previously said "Col"/"Column".)
+void MatrixGetColumn( const VMatrix &src, int nCol, Vector *pColumn );
+void MatrixSetColumn( VMatrix &src, int nCol, const Vector &column );
+void MatrixGetRow( const VMatrix &src, int nRow, Vector *pRow );
+void MatrixSetRow( VMatrix &src, int nRow, const Vector &row );
+
+// Vector3DMultiply treats src2 as if it's a direction vector
+void Vector3DMultiply( const VMatrix& src1, const Vector& src2, Vector& dst );
+
+// Vector3DMultiplyPosition treats src2 as if it's a point (adds the translation)
+inline void Vector3DMultiplyPosition( const VMatrix& src1, const VectorByValue src2, Vector& dst );
+
+// Vector3DMultiplyPositionProjective treats src2 as if it's a point 
+// and does the perspective divide at the end
+void Vector3DMultiplyPositionProjective( const VMatrix& src1, const Vector &src2, Vector& dst );
+
+// Vector3DMultiplyProjective treats src2 as if it's a direction 
+// and does the perspective divide at the end
+// NOTE: src1 had better be an inverse transpose to use this correctly
+void Vector3DMultiplyProjective( const VMatrix& src1, const Vector &src2, Vector& dst );
+
+void Vector4DMultiply( const VMatrix& src1, const Vector4D& src2, Vector4D& dst );
+
+// Same as Vector4DMultiply except that src2 has an implicit W of 1
+void Vector4DMultiplyPosition( const VMatrix& src1, const Vector &src2, Vector4D& dst );
+
+// Multiplies the vector by the transpose of the matrix
+void Vector3DMultiplyTranspose( const VMatrix& src1, const Vector& src2, Vector& dst );
+void Vector4DMultiplyTranspose( const VMatrix& src1, const Vector4D& src2, Vector4D& dst );
+
+// Transform a plane
+void MatrixTransformPlane( const VMatrix &src, const cplane_t &inPlane, cplane_t &outPlane );
+
+// Transform a plane that has an axis-aligned normal
+void MatrixTransformAxisAlignedPlane( const VMatrix &src, int nDim, float flSign, float flDist, cplane_t &outPlane );
+
+void MatrixBuildTranslation( VMatrix& dst, float x, float y, float z );
+void MatrixBuildTranslation( VMatrix& dst, const Vector &translation );
+
+// Post-multiplies dst by a translation matrix built from `translation`:
+// dst = dst * T. The product is formed in a scratch matrix because
+// MatrixMultiply is given dst as an input.
+inline void MatrixTranslate( VMatrix& dst, const Vector &translation )
+{
+	VMatrix mTranslate;
+	MatrixBuildTranslation( mTranslate, translation );
+
+	VMatrix mProduct;
+	MatrixMultiply( dst, mTranslate, mProduct );
+	dst = mProduct;
+}
+
+
+void MatrixBuildRotationAboutAxis( VMatrix& dst, const Vector& vAxisOfRot, float angleDegrees );
+void MatrixBuildRotateZ( VMatrix& dst, float angleDegrees );
+
+// Post-multiplies dst by a rotation of angleDegrees about vAxisOfRot:
+// dst = dst * R. The product is formed in a scratch matrix because
+// MatrixMultiply is given dst as an input.
+inline void MatrixRotate( VMatrix& dst, const Vector& vAxisOfRot, float angleDegrees )
+{
+	VMatrix mRotate;
+	MatrixBuildRotationAboutAxis( mRotate, vAxisOfRot, angleDegrees );
+
+	VMatrix mProduct;
+	MatrixMultiply( dst, mRotate, mProduct );
+	dst = mProduct;
+}
+
+// Builds a rotation matrix that rotates one direction vector into another
+void MatrixBuildRotation( VMatrix &dst, const Vector& initialDirection, const Vector& finalDirection );
+
+// Builds a scale matrix
+void MatrixBuildScale( VMatrix &dst, float x, float y, float z );
+void MatrixBuildScale( VMatrix &dst, const Vector& scale );
+
+// Build a perspective matrix.
+// zNear and zFar are assumed to be positive.
+// You end up looking down positive Z, X is to the right, Y is up.
+// X range: [0..1]
+// Y range: [0..1]
+// Z range: [0..1]
+void MatrixBuildPerspective( VMatrix &dst, float fovX, float fovY, float zNear, float zFar );
+
+//-----------------------------------------------------------------------------
+// Given a projection matrix, take the extremes of its volume, transform them into world space,
+// and get a bounding box.
+//-----------------------------------------------------------------------------
+void CalculateAABBFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pMins, Vector *pMaxs );
+
+//-----------------------------------------------------------------------------
+// Given a projection matrix, take the extremes of its volume, transform them into world space,
+// and get a bounding sphere.
+//-----------------------------------------------------------------------------
+void CalculateSphereFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pCenter, float *pflRadius );
+
+//-----------------------------------------------------------------------------
+// Given an inverse projection matrix, take the extremes of its volume, transform them into world space,
+// and get a bounding box.
+//-----------------------------------------------------------------------------
+void CalculateAABBFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pMins, Vector *pMaxs );
+
+//-----------------------------------------------------------------------------
+// Given an inverse projection matrix, take the extremes of its volume, transform them into world space,
+// and get a bounding sphere.
+//-----------------------------------------------------------------------------
+void CalculateSphereFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pCenter, float *pflRadius );
+
+//-----------------------------------------------------------------------------
+// Calculate frustum planes given a clip->world space transform.
+//-----------------------------------------------------------------------------
+void FrustumPlanesFromMatrix( const VMatrix &clipToWorld, Frustum_t &frustum );
+
+//-----------------------------------------------------------------------------
+// Setup a matrix from euler angles. 
+//-----------------------------------------------------------------------------
+void MatrixFromAngles( const QAngle& vAngles, VMatrix& dst );
+
+//-----------------------------------------------------------------------------
+// Creates euler angles from a matrix 
+//-----------------------------------------------------------------------------
+void MatrixToAngles( const VMatrix& src, QAngle& vAngles );
+
+//-----------------------------------------------------------------------------
+// Does a fast inverse, assuming the matrix only contains translation and rotation.
+//-----------------------------------------------------------------------------
+void MatrixInverseTR( const VMatrix& src, VMatrix &dst );
+
+//-----------------------------------------------------------------------------
+// Inverts any matrix at all
+//-----------------------------------------------------------------------------
+bool MatrixInverseGeneral(const VMatrix& src, VMatrix& dst);
+
+//-----------------------------------------------------------------------------
+// Computes the inverse transpose
+//-----------------------------------------------------------------------------
+void MatrixInverseTranspose( const VMatrix& src, VMatrix& dst );
+
+
+
+//-----------------------------------------------------------------------------
+// VMatrix inlines.
+//-----------------------------------------------------------------------------
+// Default constructor: intentionally does nothing, so the 16 elements of m
+// are left uninitialized. Call Init()/Identity() before reading them.
+inline VMatrix::VMatrix()
+{
+}
+
+// Constructs from 16 explicit elements given row by row (mRC = row R,
+// column C); forwards straight to Init().
+inline VMatrix::VMatrix(
+	vec_t m00, vec_t m01, vec_t m02, vec_t m03,
+	vec_t m10, vec_t m11, vec_t m12, vec_t m13,
+	vec_t m20, vec_t m21, vec_t m22, vec_t m23,
+	vec_t m30, vec_t m31, vec_t m32, vec_t m33)
+{
+	Init( m00, m01, m02, m03,
+		  m10, m11, m12, m13,
+		  m20, m21, m22, m23,
+		  m30, m31, m32, m33 );
+}
+
+
+// Constructs from a 3x4 matrix by forwarding to Init( matrix3x4 ), which
+// copies the 3x4 into the top three rows and sets the last row to (0, 0, 0, 1).
+inline VMatrix::VMatrix( const matrix3x4_t& matrix3x4 )
+{
+	Init( matrix3x4 );
+}
+
+
+//-----------------------------------------------------------------------------
+// Creates a matrix where the X axis = forward
+// the Y axis = left, and the Z axis = up
+//-----------------------------------------------------------------------------
+// The three axes become columns 0, 1 and 2 of the upper 3x3; the translation
+// column is zeroed and the bottom row is set to (0, 0, 0, 1).
+inline VMatrix::VMatrix( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis )
+{
+	SetBasisVectors( xAxis, yAxis, zAxis );
+
+	// No translation.
+	m[0][3] = 0.0f;
+	m[1][3] = 0.0f;
+	m[2][3] = 0.0f;
+
+	// Affine bottom row.
+	m[3][0] = 0.0f;
+	m[3][1] = 0.0f;
+	m[3][2] = 0.0f;
+	m[3][3] = 1.0f;
+}
+
+// The three axes become columns 0, 1 and 2 of the upper 3x3, `translation`
+// becomes column 3, and the bottom row is set to (0, 0, 0, 1).
+inline VMatrix::VMatrix( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector& translation )
+{
+	SetBasisVectors( xAxis, yAxis, zAxis );
+	SetTranslation( translation );
+
+	// Affine bottom row.
+	m[3][0] = 0.0f;
+	m[3][1] = 0.0f;
+	m[3][2] = 0.0f;
+	m[3][3] = 1.0f;
+}
+
+
+// Sets all 16 elements from explicit values given row by row
+// (mRC is stored at m[R][C]).
+inline void VMatrix::Init(
+	vec_t m00, vec_t m01, vec_t m02, vec_t m03,
+	vec_t m10, vec_t m11, vec_t m12, vec_t m13,
+	vec_t m20, vec_t m21, vec_t m22, vec_t m23,
+	vec_t m30, vec_t m31, vec_t m32, vec_t m33
+	)
+{
+	// Gather the arguments in matrix layout, then copy element by element.
+	const vec_t rows[4][4] =
+	{
+		{ m00, m01, m02, m03 },
+		{ m10, m11, m12, m13 },
+		{ m20, m21, m22, m23 },
+		{ m30, m31, m32, m33 },
+	};
+
+	for ( int iRow = 0; iRow < 4; ++iRow )
+	{
+		for ( int iCol = 0; iCol < 4; ++iCol )
+		{
+			m[iRow][iCol] = rows[iRow][iCol];
+		}
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Initialize from a 3x4
+//-----------------------------------------------------------------------------
+// Copies sizeof( matrix3x4_t ) bytes from the 3x4 into the start of m, then
+// writes the last row as (0, 0, 0, 1).
+inline void VMatrix::Init( const matrix3x4_t& matrix3x4 )
+{
+	memcpy( m, matrix3x4.Base(), sizeof( matrix3x4_t ) );
+
+	m[3][0] = m[3][1] = m[3][2] = 0.0f;
+	m[3][3] = 1.0f;
+}
+
+
+//-----------------------------------------------------------------------------
+// Methods related to the basis vectors of the matrix
+//-----------------------------------------------------------------------------
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns column 0 of the upper 3x3 (the forward basis vector).
+inline Vector VMatrix::GetForward() const
+{
+	Vector vForward( m[0][0], m[1][0], m[2][0] );
+	return vForward;
+}
+
+// Returns column 1 of the upper 3x3 (the left basis vector).
+inline Vector VMatrix::GetLeft() const
+{
+	Vector vLeft( m[0][1], m[1][1], m[2][1] );
+	return vLeft;
+}
+
+// Returns column 2 of the upper 3x3 (the up basis vector).
+inline Vector VMatrix::GetUp() const
+{
+	Vector vUp( m[0][2], m[1][2], m[2][2] );
+	return vUp;
+}
+
+#endif
+
+// Writes vForward into column 0 of the upper three rows; m[3][0] is untouched.
+inline void VMatrix::SetForward(const Vector &vForward)
+{
+	const int nForwardColumn = 0;
+	m[0][nForwardColumn] = vForward.x;
+	m[1][nForwardColumn] = vForward.y;
+	m[2][nForwardColumn] = vForward.z;
+}
+
+// Writes vLeft into column 1 of the upper three rows; m[3][1] is untouched.
+inline void VMatrix::SetLeft(const Vector &vLeft)
+{
+	const int nLeftColumn = 1;
+	m[0][nLeftColumn] = vLeft.x;
+	m[1][nLeftColumn] = vLeft.y;
+	m[2][nLeftColumn] = vLeft.z;
+}
+
+// Writes vUp into column 2 of the upper three rows; m[3][2] is untouched.
+inline void VMatrix::SetUp(const Vector &vUp)
+{
+	const int nUpColumn = 2;
+	m[0][nUpColumn] = vUp.x;
+	m[1][nUpColumn] = vUp.y;
+	m[2][nUpColumn] = vUp.z;
+}
+
+// Reads all three basis vectors in one call, without returning Vectors by
+// value: columns 0, 1 and 2 of the upper 3x3 are written to vForward, vLeft
+// and vUp respectively.
+inline void VMatrix::GetBasisVectors(Vector &vForward, Vector &vLeft, Vector &vUp) const
+{
+	vForward.Init( m[0][0], m[1][0], m[2][0] );
+	vLeft.Init( m[0][1], m[1][1], m[2][1] );
+	vUp.Init( m[0][2], m[1][2], m[2][2] );
+}
+
+// Sets columns 0, 1 and 2 of the upper 3x3 from vForward, vLeft and vUp.
+// The translation column and the bottom row are not modified.
+inline void VMatrix::SetBasisVectors(const Vector &vForward, const Vector &vLeft, const Vector &vUp)
+{
+	SetForward(vForward);
+	SetLeft(vLeft);
+	SetUp(vUp);
+}
+
+
+//-----------------------------------------------------------------------------
+// Methods related to the translation component of the matrix
+//-----------------------------------------------------------------------------
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns the translation (column 3 of the upper three rows) by value.
+inline Vector VMatrix::GetTranslation() const
+{
+	Vector vTranslation( m[0][3], m[1][3], m[2][3] );
+	return vTranslation;
+}
+
+#endif
+
+// Writes the translation (column 3 of the upper three rows) into vTrans and
+// returns a reference to vTrans.
+inline Vector& VMatrix::GetTranslation( Vector &vTrans ) const
+{
+	const int nTranslationColumn = 3;
+	vTrans.x = m[0][nTranslationColumn];
+	vTrans.y = m[1][nTranslationColumn];
+	vTrans.z = m[2][nTranslationColumn];
+	return vTrans;
+}
+
+// Overwrites the translation (column 3 of the upper three rows) with vTrans;
+// m[3][3] is untouched.
+inline void VMatrix::SetTranslation(const Vector &vTrans)
+{
+	const int nTranslationColumn = 3;
+	m[0][nTranslationColumn] = vTrans.x;
+	m[1][nTranslationColumn] = vTrans.y;
+	m[2][nTranslationColumn] = vTrans.z;
+}
+
+		  
+//-----------------------------------------------------------------------------
+// Apply translation to this matrix in the input space: vTrans is transformed
+// as a point by the current matrix and the result becomes the new translation.
+//-----------------------------------------------------------------------------
+inline void VMatrix::PreTranslate(const Vector &vTrans)
+{
+	Vector vNewTranslation;
+	Vector3DMultiplyPosition( *this, vTrans, vNewTranslation );
+	SetTranslation( vNewTranslation );
+}
+
+
+//-----------------------------------------------------------------------------
+// Apply translation to this matrix in the output space: vTrans is added
+// directly to the existing translation column.
+//-----------------------------------------------------------------------------
+inline void VMatrix::PostTranslate(const Vector &vTrans)
+{
+	const int nTranslationColumn = 3;
+	m[0][nTranslationColumn] += vTrans.x;
+	m[1][nTranslationColumn] += vTrans.y;
+	m[2][nTranslationColumn] += vTrans.z;
+}
+
+// Reinterprets this object's address as a matrix3x4_t; no copy is made, so
+// the returned reference aliases this VMatrix.
+inline const matrix3x4_t& VMatrix::As3x4() const
+{
+	const matrix3x4_t *pAs3x4 = (const matrix3x4_t*)this;
+	return *pAs3x4;
+}
+
+// Same operation as Init( const matrix3x4_t& ): copies the 3x4 into the top
+// three rows and sets the last row to (0, 0, 0, 1).
+inline void VMatrix::CopyFrom3x4( const matrix3x4_t &m3x4 )
+{
+	Init( m3x4 );
+}
+
+// Copies sizeof( matrix3x4_t ) bytes from the start of m into matrix3x4;
+// the fourth row of this matrix is not transferred.
+inline void	VMatrix::Set3x4( matrix3x4_t& matrix3x4 ) const
+{
+	const size_t nBytes = sizeof( matrix3x4_t );
+	memcpy( matrix3x4.Base(), m, nBytes );
+}
+
+
+//-----------------------------------------------------------------------------
+// Matrix math operations
+//-----------------------------------------------------------------------------
+// Element-wise in-place addition of `other` over all 16 entries.
+inline const VMatrix& VMatrix::operator+=(const VMatrix &other)
+{
+	vec_t *pDst = Base();
+	const vec_t *pSrc = other.Base();
+
+	for ( int iElem = 0; iElem < 16; ++iElem )
+	{
+		pDst[iElem] += pSrc[iElem];
+	}
+
+	return *this;
+}
+
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns the element-wise sum of this matrix and `other`.
+inline VMatrix VMatrix::operator+(const VMatrix &other) const
+{
+	VMatrix mSum;
+	for ( int iRow = 0; iRow < 4; ++iRow )
+	{
+		for ( int iCol = 0; iCol < 4; ++iCol )
+		{
+			mSum.m[iRow][iCol] = m[iRow][iCol] + other.m[iRow][iCol];
+		}
+	}
+	return mSum;
+}
+
+// Returns the element-wise difference (this - other).
+inline VMatrix VMatrix::operator-(const VMatrix &other) const
+{
+	VMatrix mDiff;
+
+	vec_t *pOut = mDiff.Base();
+	const vec_t *pLhs = Base();
+	const vec_t *pRhs = other.Base();
+	for ( int iElem = 0; iElem < 16; ++iElem )
+	{
+		pOut[iElem] = pLhs[iElem] - pRhs[iElem];
+	}
+
+	return mDiff;
+}
+
+// Unary minus: returns a new matrix in which every one of the 16 elements is
+// the negation of the corresponding element of this matrix.
+// (The previous body copied the elements without negating them, so -M == M.)
+inline VMatrix VMatrix::operator-() const
+{
+	VMatrix ret;
+	for( int i=0; i < 4; i++ )
+	{
+		for( int j=0; j < 4; j++ )
+		{
+			ret.m[i][j] = -m[i][j];
+		}
+	}
+	return ret;
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+
+//-----------------------------------------------------------------------------
+// Vector transformation
+//-----------------------------------------------------------------------------
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Multiplies the upper 3x3 of this matrix by vVec and adds column 3 of each
+// row; row 3 of the matrix is not used.
+inline Vector VMatrix::operator*(const Vector &vVec) const
+{
+	return Vector(
+		m[0][0]*vVec.x + m[0][1]*vVec.y + m[0][2]*vVec.z + m[0][3],
+		m[1][0]*vVec.x + m[1][1]*vVec.y + m[1][2]*vVec.z + m[1][3],
+		m[2][0]*vVec.x + m[2][1]*vVec.y + m[2][2]*vVec.z + m[2][3]
+		);
+}
+
+// Returns the result of Vector3DMultiplyPosition( *this, vVec, ... ) by value.
+inline Vector VMatrix::VMul4x3(const Vector &vVec) const
+{
+	Vector vTransformed;
+	Vector3DMultiplyPosition( *this, vVec, vTransformed );
+
+	return vTransformed;
+}
+
+
+// Subtracts column 3 (rows 0..2) from vVec, then multiplies the result by the
+// transpose of the upper 3x3.
+inline Vector VMatrix::VMul4x3Transpose(const Vector &vVec) const
+{
+	// Remove the translation first.
+	const vec_t dx = vVec.x - m[0][3];
+	const vec_t dy = vVec.y - m[1][3];
+	const vec_t dz = vVec.z - m[2][3];
+
+	// Each output component is a dot product with a column of the 3x3.
+	Vector vOut;
+	vOut.x = m[0][0]*dx + m[1][0]*dy + m[2][0]*dz;
+	vOut.y = m[0][1]*dx + m[1][1]*dy + m[2][1]*dz;
+	vOut.z = m[0][2]*dx + m[1][2]*dy + m[2][2]*dz;
+	return vOut;
+}
+
+// Multiplies the upper 3x3 of this matrix by vVec; column 3 and row 3 are ignored.
+inline Vector VMatrix::VMul3x3(const Vector &vVec) const
+{
+	Vector vOut;
+	vOut.x = m[0][0]*vVec.x + m[0][1]*vVec.y + m[0][2]*vVec.z;
+	vOut.y = m[1][0]*vVec.x + m[1][1]*vVec.y + m[1][2]*vVec.z;
+	vOut.z = m[2][0]*vVec.x + m[2][1]*vVec.y + m[2][2]*vVec.z;
+	return vOut;
+}
+
+// Multiplies the transpose of the upper 3x3 by vVec: each output component is
+// the dot product of vVec with one column of the 3x3.
+inline Vector VMatrix::VMul3x3Transpose(const Vector &vVec) const
+{
+	Vector vOut;
+	vOut.x = m[0][0]*vVec.x + m[1][0]*vVec.y + m[2][0]*vVec.z;
+	vOut.y = m[0][1]*vVec.x + m[1][1]*vVec.y + m[2][1]*vVec.z;
+	vOut.z = m[0][2]*vVec.x + m[1][2]*vVec.y + m[2][2]*vVec.z;
+	return vOut;
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+
+// Transforms vIn by the full 4x4 matrix, treating it as (x, y, z, 1), and
+// divides the x/y/z results by the resulting w (row 3).
+//   vIn  - input vector.
+//   vOut - receives the result; may be the same object as vIn.
+// The w term is not checked for zero before the reciprocal is taken.
+inline void VMatrix::V3Mul(const Vector &vIn, Vector &vOut) const
+{
+	// Snapshot the input so the result stays correct when vIn and vOut alias:
+	// writing vOut.x must not change the x used for the y and z rows.
+	const vec_t x = vIn.x;
+	const vec_t y = vIn.y;
+	const vec_t z = vIn.z;
+
+	const vec_t rw = 1.0f / (m[3][0]*x + m[3][1]*y + m[3][2]*z + m[3][3]);
+
+	vOut.x = (m[0][0]*x + m[0][1]*y + m[0][2]*z + m[0][3]) * rw;
+	vOut.y = (m[1][0]*x + m[1][1]*y + m[1][2]*z + m[1][3]) * rw;
+	vOut.z = (m[2][0]*x + m[2][1]*y + m[2][2]*z + m[2][3]) * rw;
+}
+
+// Multiplies the full 4x4 matrix by the 4-component vector vIn.
+//   vIn  - input vector.
+//   vOut - receives the result; may be the same object as vIn.
+inline void VMatrix::V4Mul(const Vector4D &vIn, Vector4D &vOut) const
+{
+	// Snapshot the input so the result stays correct when vIn and vOut alias:
+	// each output component depends on all four input components.
+	const vec_t x = vIn[0];
+	const vec_t y = vIn[1];
+	const vec_t z = vIn[2];
+	const vec_t w = vIn[3];
+
+	vOut[0] = m[0][0]*x + m[0][1]*y + m[0][2]*z + m[0][3]*w;
+	vOut[1] = m[1][0]*x + m[1][1]*y + m[1][2]*z + m[1][3]*w;
+	vOut[2] = m[2][0]*x + m[2][1]*y + m[2][2]*z + m[2][3]*w;
+	vOut[3] = m[3][0]*x + m[3][1]*y + m[3][2]*z + m[3][3]*w;
+}
+
+
+//-----------------------------------------------------------------------------
+// Plane transformation
+//-----------------------------------------------------------------------------
+// Writes a transformed copy of inPlane into outPlane:
+//   normal = Vector3DMultiply( *this, inPlane.m_Normal )
+//   dist   = inPlane.m_Dist * dot( normal, normal ) + dot( normal, translation )
+inline void VMatrix::TransformPlane( const VPlane &inPlane, VPlane &outPlane ) const
+{
+	Vector3DMultiply( *this, inPlane.m_Normal, outPlane.m_Normal );
+	const Vector &vNewNormal = outPlane.m_Normal;
+
+	outPlane.m_Dist = inPlane.m_Dist * DotProduct( vNewNormal, vNewNormal );
+
+	Vector vTranslation;
+	GetTranslation( vTranslation );
+	outPlane.m_Dist += DotProduct( vNewNormal, vTranslation );
+}
+
+
+//-----------------------------------------------------------------------------
+// Other random stuff
+//-----------------------------------------------------------------------------
+// Member-function convenience wrapper: forwards this matrix to MatrixSetIdentity.
+inline void VMatrix::Identity()
+{
+	MatrixSetIdentity( *this );
+}
+
+
+// Returns true only if every element matches the identity matrix exactly
+// (1.0f on the diagonal, 0.0f elsewhere). The comparison uses no tolerance.
+inline bool VMatrix::IsIdentity() const
+{
+	for ( int iRow = 0; iRow < 4; ++iRow )
+	{
+		for ( int iCol = 0; iCol < 4; ++iCol )
+		{
+			const float flExpected = ( iRow == iCol ) ? 1.0f : 0.0f;
+			if ( m[iRow][iCol] != flExpected )
+				return false;
+		}
+	}
+
+	return true;
+}
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Alias for VMul3x3: multiplies vVec by the upper 3x3 only, so the translation
+// column does not contribute to the result.
+inline Vector VMatrix::ApplyRotation(const Vector &vVec) const
+{
+	return VMul3x3(vVec);
+}
+
+// Returns the matrix filled in by InverseGeneral(). Whatever InverseGeneral
+// returns is ignored here, so a failed inversion is not reported to the caller.
+inline VMatrix VMatrix::operator~() const
+{
+	VMatrix mInverse;
+	InverseGeneral( mInverse );
+
+	return mInverse;
+}
+
+#endif
+
+
+//-----------------------------------------------------------------------------
+// Accessors
+//-----------------------------------------------------------------------------
+// Copies rows 0..2 of column nCol into *pColumn. Row 3 of the column is not read.
+// nCol must be in [0, 3] (checked with Assert).
+inline void MatrixGetColumn( const VMatrix &src, int nCol, Vector *pColumn )
+{
+	Assert( (nCol >= 0) && (nCol <= 3) );
+
+	Vector &vColumn = *pColumn;
+	vColumn.x = src[0][nCol];
+	vColumn.y = src[1][nCol];
+	vColumn.z = src[2][nCol];
+}
+
+// Writes 'column' into rows 0..2 of column nCol. Row 3 of that column is left
+// untouched. nCol must be in [0, 3] (checked with Assert).
+inline void MatrixSetColumn( VMatrix &src, int nCol, const Vector &column )
+{
+	Assert( (nCol >= 0) && (nCol <= 3) );
+
+	const Vector &vNewColumn = column;
+	src.m[0][nCol] = vNewColumn.x;
+	src.m[1][nCol] = vNewColumn.y;
+	src.m[2][nCol] = vNewColumn.z;
+}
+
+// Copies the start of row nRow into *pRow by reinterpreting the row's storage
+// as a Vector. nRow must be in [0, 3] (checked with Assert).
+inline void MatrixGetRow( const VMatrix &src, int nRow, Vector *pRow )
+{
+	Assert( (nRow >= 0) && (nRow <= 3) );
+
+	const Vector *pSrcRow = (const Vector*)src[nRow];
+	*pRow = *pSrcRow;
+}
+
+// Overwrites the start of row nRow with 'row' by reinterpreting the row's
+// storage as a Vector. nRow must be in [0, 3] (checked with Assert).
+inline void MatrixSetRow( VMatrix &dst, int nRow, const Vector &row )
+{
+	Assert( (nRow >= 0) && (nRow <= 3) );
+
+	Vector *pDstRow = (Vector*)dst[nRow];
+	*pDstRow = row;
+}
+
+
+//-----------------------------------------------------------------------------
+// Vector3DMultiplyPosition treats src2 as if it's a point (adds the translation)
+//-----------------------------------------------------------------------------
+// NJS: src2 is passed in as a full vector rather than a reference to prevent the need
+// for 2 branches and a potential copy in the body.  (ie, handling the case when the src2
+// reference is the same as the dst reference ).
+inline void Vector3DMultiplyPosition( const VMatrix& src1, const VectorByValue src2, Vector& dst )
+{
+	// One output component per matrix row: 3x3 product plus that row's column-3 entry.
+	for ( int iRow = 0; iRow < 3; ++iRow )
+	{
+		dst[iRow] = src1[iRow][0] * src2.x + src1[iRow][1] * src2.y + src1[iRow][2] * src2.z + src1[iRow][3];
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// Transform a plane that has an axis-aligned normal
+//-----------------------------------------------------------------------------
+// Fills outPlane from the plane whose normal is flSign times axis nDim and whose
+// distance is flDist:
+//   normal = flSign * (column nDim of src, rows 0..2)
+//   dist   = flDist * dot( normal, normal ) + dot( normal, translation column )
+inline void MatrixTransformAxisAlignedPlane( const VMatrix &src, int nDim, float flSign, float flDist, cplane_t &outPlane )
+{
+	// See MatrixTransformPlane in the .cpp file for an explanation of the algorithm.
+	Vector &vNormal = outPlane.normal;
+	MatrixGetColumn( src, nDim, &vNormal );
+	vNormal *= flSign;
+
+	outPlane.dist = flDist * DotProduct( vNormal, vNormal );
+
+	// NOTE: The dot product with the translation is written out by hand because the
+	// call doesn't inline (inline depth isn't large enough).
+	outPlane.dist += vNormal.x * src.m[0][3] + vNormal.y * src.m[1][3] + vNormal.z * src.m[2][3];
+}
+
+
+//-----------------------------------------------------------------------------
+// Matrix equality test
+//-----------------------------------------------------------------------------
+// Returns true when every one of the 16 elements of src1 is within flTolerance
+// of the corresponding element of src2 (absolute difference).
+// The previous version only examined the upper-left 3x3, so two matrices that
+// differed in the translation column or in the last row compared as equal.
+inline bool MatricesAreEqual( const VMatrix &src1, const VMatrix &src2, float flTolerance )
+{
+	for ( int i = 0; i < 4; ++i )
+	{
+		for ( int j = 0; j < 4; ++j )
+		{
+			if ( fabs( src1[i][j] - src2[i][j] ) > flTolerance )
+				return false;
+		}
+	}
+	return true;
+}
+
+//-----------------------------------------------------------------------------
+//
+//-----------------------------------------------------------------------------
+// Matrix builders: declarations only, the definitions are not in this part of
+// the header. Each one receives the matrix to fill as 'dst'.
+// NOTE: the argument order differs between them: MatrixBuildOrtho takes
+// (left, top, right, bottom) while MatrixBuildPerspectiveOffCenterX takes
+// (bottom, top, left, right).
+// NOTE(review): units of flFovX (degrees vs. radians) are not visible here; confirm
+// against the definitions.
+void MatrixBuildOrtho( VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar );
+void MatrixBuildPerspectiveX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar );
+void MatrixBuildPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right );
+void MatrixBuildPerspectiveZRange( VMatrix& dst, double flZNear, double flZFar );
+
+// Builds a matrix with MatrixBuildOrtho from the given arguments and replaces
+// dst with the result of MatrixMultiply( dst, <built matrix> ).
+inline void MatrixOrtho( VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar )
+{
+	VMatrix mOrtho;
+	MatrixBuildOrtho( mOrtho, left, top, right, bottom, zNear, zFar );
+
+	// Multiply into a scratch matrix, then copy back into dst.
+	VMatrix mProduct;
+	MatrixMultiply( dst, mOrtho, mProduct );
+	dst = mProduct;
+}
+
+// Builds a matrix with MatrixBuildPerspectiveX from the given arguments and
+// replaces dst with the result of MatrixMultiply( dst, <built matrix> ).
+inline void MatrixPerspectiveX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar )
+{
+	VMatrix mPerspective;
+	MatrixBuildPerspectiveX( mPerspective, flFovX, flAspect, flZNear, flZFar );
+
+	// Multiply into a scratch matrix, then copy back into dst.
+	VMatrix mProduct;
+	MatrixMultiply( dst, mPerspective, mProduct );
+	dst = mProduct;
+}
+
+// Builds a matrix with MatrixBuildPerspectiveOffCenterX from the given arguments
+// and replaces dst with the result of MatrixMultiply( dst, <built matrix> ).
+inline void MatrixPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right )
+{
+	VMatrix mPerspective;
+	MatrixBuildPerspectiveOffCenterX( mPerspective, flFovX, flAspect, flZNear, flZFar, bottom, top, left, right );
+
+	// Multiply into a scratch matrix, then copy back into dst.
+	VMatrix mProduct;
+	MatrixMultiply( dst, mPerspective, mProduct );
+	dst = mProduct;
+}
+
+#endif
+
+
diff --git a/public/mathlib/vplane.h b/public/mathlib/vplane.h
new file mode 100644
index 0000000..dd3d4a9
--- /dev/null
+++ b/public/mathlib/vplane.h
@@ -0,0 +1,182 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $Workfile:     $
+// $Date:         $
+// $NoKeywords: $
+//=============================================================================//
+
+#ifndef VPLANE_H
+#define VPLANE_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include "mathlib/vector.h"
+
+typedef int SideType;
+
+// Used to represent sides of things like planes.
+#define	SIDE_FRONT	0
+#define	SIDE_BACK	1
+#define	SIDE_ON		2
+
+#define VP_EPSILON	0.01f
+
+
+// A plane stored as a normal vector and a distance. DistTo() evaluates
+// dot( point, m_Normal ) - m_Dist, which the side-classification helpers
+// below use to decide front / back / on.
+class VPlane
+{
+public:
+	// The default constructor has an empty body: it assigns nothing to the members.
+	VPlane();
+	VPlane( const Vector &vNormal, vec_t dist );
+
+	// Assigns both members.
+	void Init( const Vector &vNormal, vec_t dist );
+
+	// Signed distance from the point to the plane.
+	vec_t DistTo( const Vector &vVec ) const;
+
+	// Member-wise copy.
+	VPlane& operator=( const VPlane &thePlane );
+
+	// Classifies a point as SIDE_FRONT, SIDE_BACK or SIDE_ON.
+	// sideEpsilon is the half-thickness of the SIDE_ON band.
+	SideType GetPointSide( const Vector &vPoint, vec_t sideEpsilon=VP_EPSILON ) const;
+
+	// Classifies a point as SIDE_FRONT or SIDE_BACK only (never SIDE_ON).
+	SideType GetPointSideExact( const Vector &vPoint ) const;
+
+	// Classifies an axis-aligned box against the plane.
+	// Returns SIDE_ON, SIDE_FRONT, or SIDE_BACK.
+	SideType BoxOnPlaneSide( const Vector &vMin, const Vector &vMax ) const;
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// Returns a plane with negated normal and distance; this plane is unchanged.
+	VPlane Flip();
+
+	// Returns normal*dist.
+	Vector GetPointOnPlane() const;
+
+	// Moves the given point onto the plane along the plane's normal.
+	Vector SnapPointToPlane( const Vector &vPoint ) const;
+#endif
+
+public:
+	Vector m_Normal;
+	vec_t m_Dist;
+
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+private:
+	// Copy construction is disallowed in optimal mode: declared private here.
+	VPlane( const VPlane& vOther );
+#endif
+};
+
+
+//-----------------------------------------------------------------------------
+// Inlines.
+//-----------------------------------------------------------------------------
+// Default constructor. Assigns nothing to m_Normal or m_Dist; their contents are
+// whatever their own default initialization leaves. Call Init() before use.
+inline VPlane::VPlane()
+{
+}
+
+// Constructs the plane from a normal and a distance (same assignments as Init).
+inline VPlane::VPlane(const Vector &vNormal, vec_t dist)
+{
+	Init( vNormal, dist );
+}
+
+// Assigns the plane's normal and distance.
+inline void VPlane::Init(const Vector &vNormal, vec_t dist)
+{
+	m_Dist = dist;
+	m_Normal = vNormal;
+}
+
+// Signed distance from vVec to the plane: dot( vVec, m_Normal ) - m_Dist.
+inline vec_t VPlane::DistTo(const Vector &vVec) const
+{
+	const vec_t flAlongNormal = vVec.Dot(m_Normal);
+	return flAlongNormal - m_Dist;
+}
+
+// Copies both members from thePlane and returns *this.
+inline VPlane& VPlane::operator=(const VPlane &thePlane)
+{
+	m_Dist = thePlane.m_Dist;
+	m_Normal = thePlane.m_Normal;
+
+	return *this;
+}
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns a new plane whose normal and distance are the negation of this
+// plane's; this plane itself is not modified.
+inline VPlane VPlane::Flip()
+{
+	VPlane flipped;
+	flipped.Init( -m_Normal, -m_Dist );
+	return flipped;
+}
+
+// Returns m_Normal scaled by m_Dist.
+// NOTE(review): this lies on the plane (DistTo == 0) only if m_Normal is unit
+// length; nothing in this header enforces that, so confirm with callers.
+inline Vector VPlane::GetPointOnPlane() const
+{
+	return m_Normal * m_Dist;
+}
+
+// Returns vPoint moved along the normal by its signed distance to the plane:
+// vPoint - m_Normal * DistTo( vPoint ).
+inline Vector VPlane::SnapPointToPlane(const Vector &vPoint) const
+{
+	const vec_t flSignedDist = DistTo(vPoint);
+	return vPoint - m_Normal * flSignedDist;
+}
+
+#endif
+
+// Classifies vPoint by its signed distance to the plane:
+//   distance >=  sideEpsilon -> SIDE_FRONT
+//   distance <= -sideEpsilon -> SIDE_BACK
+//   otherwise                -> SIDE_ON
+inline SideType VPlane::GetPointSide(const Vector &vPoint, vec_t sideEpsilon) const
+{
+	const vec_t flDist = DistTo(vPoint);
+
+	if ( flDist >= sideEpsilon )
+		return SIDE_FRONT;
+
+	if ( flDist <= -sideEpsilon )
+		return SIDE_BACK;
+
+	return SIDE_ON;
+}
+
+// Classifies vPoint with no tolerance band: SIDE_FRONT when the signed distance
+// is strictly positive, otherwise SIDE_BACK (a distance of exactly zero counts
+// as back).
+inline SideType VPlane::GetPointSideExact(const Vector &vPoint) const
+{
+	if ( DistTo(vPoint) > 0.0f )
+		return SIDE_FRONT;
+
+	return SIDE_BACK;
+}
+
+
+// BUGBUG: This should either simply use the implementation in mathlib or cease to exist.
+// mathlib implementation is much more efficient.  Check to see that VPlane isn't used in
+// performance critical code.
+// Classifies the axis-aligned box [vMin, vMax] by testing all eight corners with
+// GetPointSideExact. Returns SIDE_ON as soon as two corners disagree; otherwise
+// returns the side shared by every corner.
+inline SideType VPlane::BoxOnPlaneSide(const Vector &vMin, const Vector &vMax) const
+{
+	// The eight corners of the box.
+	TableVector vCorners[8] = 
+	{
+		{ vMin.x, vMin.y, vMin.z },
+		{ vMin.x, vMin.y, vMax.z },
+		{ vMin.x, vMax.y, vMax.z },
+		{ vMin.x, vMax.y, vMin.z },
+
+		{ vMax.x, vMin.y, vMin.z },
+		{ vMax.x, vMin.y, vMax.z },
+		{ vMax.x, vMax.y, vMax.z },
+		{ vMax.x, vMax.y, vMin.z },
+	};
+
+	// Corner 0 sets the reference side; any corner that differs means the box
+	// straddles the plane.
+	const int referenceSide = GetPointSideExact(vCorners[0]);
+	for ( int iCorner = 1; iCorner < 8; ++iCorner )
+	{
+		if ( GetPointSideExact(vCorners[iCorner]) != referenceSide )
+			return SIDE_ON;
+	}
+
+	return referenceSide;
+}
+
+
+
+
+#endif // VPLANE_H
author	FluorescentCIAAfricanAmerican <[email protected]>	2020-04-22 12:56:21 -0400
committer	FluorescentCIAAfricanAmerican <[email protected]>	2020-04-22 12:56:21 -0400
commit	3bf9df6b2785fa6d951086978a3e66f49427166a (patch)
tree	2c0f1f0c63c4832882bc93814ebd2c2b1c6224e5 /public/mathlib
download	archived-source-engine-2018-hl2-src-master.tar.xz archived-source-engine-2018-hl2-src-master.zip