diff options
| author | FluorescentCIAAfricanAmerican <[email protected]> | 2020-04-22 12:56:21 -0400 |
|---|---|---|
| committer | FluorescentCIAAfricanAmerican <[email protected]> | 2020-04-22 12:56:21 -0400 |
| commit | 3bf9df6b2785fa6d951086978a3e66f49427166a (patch) | |
| tree | 2c0f1f0c63c4832882bc93814ebd2c2b1c6224e5 /public/mathlib | |
| download | archived-source-engine-2018-hl2-src-master.tar.xz archived-source-engine-2018-hl2-src-master.zip | |
Diffstat (limited to 'public/mathlib')
| -rw-r--r-- | public/mathlib/IceKey.H | 62 | ||||
| -rw-r--r-- | public/mathlib/amd3dx.h | 1188 | ||||
| -rw-r--r-- | public/mathlib/anorms.h | 25 | ||||
| -rw-r--r-- | public/mathlib/bumpvects.h | 37 | ||||
| -rw-r--r-- | public/mathlib/compressed_3d_unitvec.h | 284 | ||||
| -rw-r--r-- | public/mathlib/compressed_light_cube.h | 24 | ||||
| -rw-r--r-- | public/mathlib/compressed_vector.h | 608 | ||||
| -rw-r--r-- | public/mathlib/halton.h | 71 | ||||
| -rw-r--r-- | public/mathlib/lightdesc.h | 173 | ||||
| -rw-r--r-- | public/mathlib/math_pfns.h | 80 | ||||
| -rw-r--r-- | public/mathlib/mathlib.h | 2187 | ||||
| -rw-r--r-- | public/mathlib/matrixmath.h | 385 | ||||
| -rw-r--r-- | public/mathlib/noise.h | 35 | ||||
| -rw-r--r-- | public/mathlib/polyhedron.h | 73 | ||||
| -rw-r--r-- | public/mathlib/quantize.h | 141 | ||||
| -rw-r--r-- | public/mathlib/simdvectormatrix.h | 142 | ||||
| -rw-r--r-- | public/mathlib/spherical_geometry.h | 73 | ||||
| -rw-r--r-- | public/mathlib/ssemath.h | 3107 | ||||
| -rw-r--r-- | public/mathlib/ssequaternion.h | 367 | ||||
| -rw-r--r-- | public/mathlib/vector.h | 2311 | ||||
| -rw-r--r-- | public/mathlib/vector2d.h | 670 | ||||
| -rw-r--r-- | public/mathlib/vector4d.h | 686 | ||||
| -rw-r--r-- | public/mathlib/vmatrix.h | 947 | ||||
| -rw-r--r-- | public/mathlib/vplane.h | 182 |
24 files changed, 13858 insertions, 0 deletions
diff --git a/public/mathlib/IceKey.H b/public/mathlib/IceKey.H new file mode 100644 index 0000000..f8641d0 --- /dev/null +++ b/public/mathlib/IceKey.H @@ -0,0 +1,62 @@ +// Purpose: Header file for the C++ ICE encryption class. +// Taken from public domain code, as written by Matthew Kwan - July 1996 +// http://www.darkside.com.au/ice/ + +#ifndef _IceKey_H +#define _IceKey_H + +/* +The IceKey class is used for encrypting and decrypting 64-bit blocks of data +with the ICE (Information Concealment Engine) encryption algorithm. + +The constructor creates a new IceKey object that can be used to encrypt and decrypt data. +The level of encryption determines the size of the key, and hence its speed. +Level 0 uses the Thin-ICE variant, which is an 8-round cipher taking an 8-byte key. +This is the fastest option, and is generally considered to be at least as secure as DES, +although it is not yet certain whether it is as secure as its key size. + +For levels n greater than zero, a 16n-round cipher is used, taking 8n-byte keys. +Although not as fast as level 0, these are very very secure. + +Before an IceKey can be used to encrypt data, its key schedule must be set with the set() member function. +The length of the key required is determined by the level, as described above. + +The member functions encrypt() and decrypt() encrypt and decrypt data, respectively, +in blocks of eight characters, using the specified key. + +Two functions keySize() and blockSize() are provided +which return the key and block size respectively, measured in bytes. +The key size is determined by the level, while the block size is always 8. + +The destructor zeroes out and frees up all memory associated with the key. 
+*/ + +class IceSubkey; + +class IceKey { + public: + IceKey (int n); + ~IceKey (); + + void set (const unsigned char *key); + + void encrypt (const unsigned char *plaintext, + unsigned char *ciphertext) const; + + void decrypt (const unsigned char *ciphertext, + unsigned char *plaintext) const; + + int keySize () const; + + int blockSize () const; + + private: + void scheduleBuild (unsigned short *k, int n, + const int *keyrot); + + int _size; + int _rounds; + IceSubkey *_keysched; +}; + +#endif diff --git a/public/mathlib/amd3dx.h b/public/mathlib/amd3dx.h new file mode 100644 index 0000000..9dab1bf --- /dev/null +++ b/public/mathlib/amd3dx.h @@ -0,0 +1,1188 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +/****************************************************************************** + + Copyright (c) 1999 Advanced Micro Devices, Inc. + + LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY + EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, + NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY + PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY + DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS, + BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR + INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY + OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION + OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY + NOT APPLY TO YOU. + + AMD does not assume any responsibility for any errors which may appear in the + Materials nor any responsibility to support or update the Materials. AMD retains + the right to make changes to its test specifications at any time, without notice. 
+ + NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any + further information, software, technical information, know-how, or show-how + available to you. + + So that all may benefit from your experience, please report any problems + or suggestions about this software to [email protected] + + AMD Developer Technologies, M/S 585 + Advanced Micro Devices, Inc. + 5900 E. Ben White Blvd. + Austin, TX 78741 + +******************************************************************************* + + AMD3DX.H + + MACRO FORMAT + ============ + This file contains inline assembly macros that + generate AMD-3D instructions in binary format. + Therefore, C or C++ programmer can use AMD-3D instructions + without any penalty in their C or C++ source code. + + The macro's name and format conventions are as follows: + + + 1. First argument of macro is a destination and + second argument is a source operand. + ex) _asm PFCMPEQ (mm3, mm4) + | | + dst src + + 2. The destination operand can be m0 to m7 only. + The source operand can be any one of the register + m0 to m7 or _eax, _ecx, _edx, _ebx, _esi, or _edi + that contains effective address. + ex) _asm PFRCP (MM7, MM6) + ex) _asm PFRCPIT2 (mm0, mm4) + ex) _asm PFMUL (mm3, _edi) + + 3. The prefetch(w) takes one src operand _eax, _ecx, _edx, + _ebx, _esi, or _edi that contains effective address. + ex) _asm PREFETCH (_edi) + + For WATCOM C/C++ users, when using #pragma aux instead of + _asm, all macro names should be prefixed by a p_ or P_. + Macros should not be enclosed in quotes. + ex) p_pfrcp (MM7,MM6) + + NOTE: Not all instruction macros, nor all possible + combinations of operands have been explicitly + tested. If any errors are found, please report + them. + + EXAMPLE + ======= + Following program doesn't do anything but it shows you + how to use inline assembly AMD-3D instructions in C. 
+ Note that this will only work in flat memory model in which + segment registers cs, ds, ss and es point to the same + linear address space total less than 4GB. + + Used Microsoft VC++ 5.0 + + #include <stdio.h> + #include "amd3dx.h" + + void main () + { + float x = (float)1.25; + float y = (float)1.25; + float z, zz; + + _asm { + movd mm1, x + movd mm2, y + pfmul (mm1, mm2) + movd z, mm1 + femms + } + + printf ("value of z = %f\n", z); + + // + // Demonstration of using the memory instead of + // multimedia register + // + _asm { + movd mm3, x + lea esi, y // load effective address of y + pfmul (mm3, _esi) + movd zz, mm3 + femms + } + + printf ("value of zz = %f\n", zz); + } + + #pragma aux EXAMPLE with WATCOM C/C++ v11.x + =========================================== + + extern void Add(float *__Dest, float *__A, float *__B); + #pragma aux Add = \ + p_femms \ + "movd mm6,[esi]" \ + p_pfadd(mm6,_edi) \ + "movd [ebx],mm6" \ + p_femms \ + parm [ebx] [esi] [edi]; + +*******************************************************************************/ + +#ifndef _K3DMACROSINCLUDED_ +#define _K3DMACROSINCLUDED_ + +#if defined (__WATCOMC__) + +// The WATCOM C/C++ version of the 3DNow! macros. +// +// The older, combined register style for WATCOM C/C++ macros is not +// supported. 
+ +/* Operand defines for instructions two operands */ +#define _k3d_mm0_mm0 0xc0 +#define _k3d_mm0_mm1 0xc1 +#define _k3d_mm0_mm2 0xc2 +#define _k3d_mm0_mm3 0xc3 +#define _k3d_mm0_mm4 0xc4 +#define _k3d_mm0_mm5 0xc5 +#define _k3d_mm0_mm6 0xc6 +#define _k3d_mm0_mm7 0xc7 +#define _k3d_mm0_eax 0x00 +#define _k3d_mm0_ecx 0x01 +#define _k3d_mm0_edx 0x02 +#define _k3d_mm0_ebx 0x03 +#define _k3d_mm0_esi 0x06 +#define _k3d_mm0_edi 0x07 +#define _k3d_mm1_mm0 0xc8 +#define _k3d_mm1_mm1 0xc9 +#define _k3d_mm1_mm2 0xca +#define _k3d_mm1_mm3 0xcb +#define _k3d_mm1_mm4 0xcc +#define _k3d_mm1_mm5 0xcd +#define _k3d_mm1_mm6 0xce +#define _k3d_mm1_mm7 0xcf +#define _k3d_mm1_eax 0x08 +#define _k3d_mm1_ecx 0x09 +#define _k3d_mm1_edx 0x0a +#define _k3d_mm1_ebx 0x0b +#define _k3d_mm1_esi 0x0e +#define _k3d_mm1_edi 0x0f +#define _k3d_mm2_mm0 0xd0 +#define _k3d_mm2_mm1 0xd1 +#define _k3d_mm2_mm2 0xd2 +#define _k3d_mm2_mm3 0xd3 +#define _k3d_mm2_mm4 0xd4 +#define _k3d_mm2_mm5 0xd5 +#define _k3d_mm2_mm6 0xd6 +#define _k3d_mm2_mm7 0xd7 +#define _k3d_mm2_eax 0x10 +#define _k3d_mm2_ecx 0x11 +#define _k3d_mm2_edx 0x12 +#define _k3d_mm2_ebx 0x13 +#define _k3d_mm2_esi 0x16 +#define _k3d_mm2_edi 0x17 +#define _k3d_mm3_mm0 0xd8 +#define _k3d_mm3_mm1 0xd9 +#define _k3d_mm3_mm2 0xda +#define _k3d_mm3_mm3 0xdb +#define _k3d_mm3_mm4 0xdc +#define _k3d_mm3_mm5 0xdd +#define _k3d_mm3_mm6 0xde +#define _k3d_mm3_mm7 0xdf +#define _k3d_mm3_eax 0x18 +#define _k3d_mm3_ecx 0x19 +#define _k3d_mm3_edx 0x1a +#define _k3d_mm3_ebx 0x1b +#define _k3d_mm3_esi 0x1e +#define _k3d_mm3_edi 0x1f +#define _k3d_mm4_mm0 0xe0 +#define _k3d_mm4_mm1 0xe1 +#define _k3d_mm4_mm2 0xe2 +#define _k3d_mm4_mm3 0xe3 +#define _k3d_mm4_mm4 0xe4 +#define _k3d_mm4_mm5 0xe5 +#define _k3d_mm4_mm6 0xe6 +#define _k3d_mm4_mm7 0xe7 +#define _k3d_mm4_eax 0x20 +#define _k3d_mm4_ecx 0x21 +#define _k3d_mm4_edx 0x22 +#define _k3d_mm4_ebx 0x23 +#define _k3d_mm4_esi 0x26 +#define _k3d_mm4_edi 0x27 +#define _k3d_mm5_mm0 0xe8 +#define _k3d_mm5_mm1 0xe9 
+#define _k3d_mm5_mm2 0xea +#define _k3d_mm5_mm3 0xeb +#define _k3d_mm5_mm4 0xec +#define _k3d_mm5_mm5 0xed +#define _k3d_mm5_mm6 0xee +#define _k3d_mm5_mm7 0xef +#define _k3d_mm5_eax 0x28 +#define _k3d_mm5_ecx 0x29 +#define _k3d_mm5_edx 0x2a +#define _k3d_mm5_ebx 0x2b +#define _k3d_mm5_esi 0x2e +#define _k3d_mm5_edi 0x2f +#define _k3d_mm6_mm0 0xf0 +#define _k3d_mm6_mm1 0xf1 +#define _k3d_mm6_mm2 0xf2 +#define _k3d_mm6_mm3 0xf3 +#define _k3d_mm6_mm4 0xf4 +#define _k3d_mm6_mm5 0xf5 +#define _k3d_mm6_mm6 0xf6 +#define _k3d_mm6_mm7 0xf7 +#define _k3d_mm6_eax 0x30 +#define _k3d_mm6_ecx 0x31 +#define _k3d_mm6_edx 0x32 +#define _k3d_mm6_ebx 0x33 +#define _k3d_mm6_esi 0x36 +#define _k3d_mm6_edi 0x37 +#define _k3d_mm7_mm0 0xf8 +#define _k3d_mm7_mm1 0xf9 +#define _k3d_mm7_mm2 0xfa +#define _k3d_mm7_mm3 0xfb +#define _k3d_mm7_mm4 0xfc +#define _k3d_mm7_mm5 0xfd +#define _k3d_mm7_mm6 0xfe +#define _k3d_mm7_mm7 0xff +#define _k3d_mm7_eax 0x38 +#define _k3d_mm7_ecx 0x39 +#define _k3d_mm7_edx 0x3a +#define _k3d_mm7_ebx 0x3b +#define _k3d_mm7_esi 0x3e +#define _k3d_mm7_edi 0x3f + +#define _k3d_name_xlat_m0 _mm0 +#define _k3d_name_xlat_m1 _mm1 +#define _k3d_name_xlat_m2 _mm2 +#define _k3d_name_xlat_m3 _mm3 +#define _k3d_name_xlat_m4 _mm4 +#define _k3d_name_xlat_m5 _mm5 +#define _k3d_name_xlat_m6 _mm6 +#define _k3d_name_xlat_m7 _mm7 +#define _k3d_name_xlat_M0 _mm0 +#define _k3d_name_xlat_M1 _mm1 +#define _k3d_name_xlat_M2 _mm2 +#define _k3d_name_xlat_M3 _mm3 +#define _k3d_name_xlat_M4 _mm4 +#define _k3d_name_xlat_M5 _mm5 +#define _k3d_name_xlat_M6 _mm6 +#define _k3d_name_xlat_M7 _mm7 +#define _k3d_name_xlat_mm0 _mm0 +#define _k3d_name_xlat_mm1 _mm1 +#define _k3d_name_xlat_mm2 _mm2 +#define _k3d_name_xlat_mm3 _mm3 +#define _k3d_name_xlat_mm4 _mm4 +#define _k3d_name_xlat_mm5 _mm5 +#define _k3d_name_xlat_mm6 _mm6 +#define _k3d_name_xlat_mm7 _mm7 +#define _k3d_name_xlat_MM0 _mm0 +#define _k3d_name_xlat_MM1 _mm1 +#define _k3d_name_xlat_MM2 _mm2 +#define _k3d_name_xlat_MM3 _mm3 +#define 
_k3d_name_xlat_MM4 _mm4 +#define _k3d_name_xlat_MM5 _mm5 +#define _k3d_name_xlat_MM6 _mm6 +#define _k3d_name_xlat_MM7 _mm7 +#define _k3d_name_xlat_eax _eax +#define _k3d_name_xlat_ebx _ebx +#define _k3d_name_xlat_ecx _ecx +#define _k3d_name_xlat_edx _edx +#define _k3d_name_xlat_esi _esi +#define _k3d_name_xlat_edi _edi +#define _k3d_name_xlat_ebp _ebp +#define _k3d_name_xlat_EAX _eax +#define _k3d_name_xlat_EBX _ebx +#define _k3d_name_xlat_ECX _ecx +#define _k3d_name_xlat_EDX _edx +#define _k3d_name_xlat_ESI _esi +#define _k3d_name_xlat_EDI _edi +#define _k3d_name_xlat_EBP _ebp +#define _k3d_name_xlat__eax _eax +#define _k3d_name_xlat__ebx _ebx +#define _k3d_name_xlat__ecx _ecx +#define _k3d_name_xlat__edx _edx +#define _k3d_name_xlat__esi _esi +#define _k3d_name_xlat__edi _edi +#define _k3d_name_xlat__ebp _ebp +#define _k3d_name_xlat__EAX _eax +#define _k3d_name_xlat__EBX _ebx +#define _k3d_name_xlat__ECX _ecx +#define _k3d_name_xlat__EDX _edx +#define _k3d_name_xlat__ESI _esi +#define _k3d_name_xlat__EDI _edi +#define _k3d_name_xlat__EBP _ebp + +#define _k3d_xglue3(a,b,c) a##b##c +#define _k3d_glue3(a,b,c) _k3d_xglue3(a,b,c) +#define _k3d_MODRM(dst, src) _k3d_glue3(_k3d,_k3d_name_xlat_##dst,_k3d_name_xlat_##src) + +/* Operand defines for prefetch and prefetchw */ + +#define _k3d_pref_eax 0x00 +#define _k3d_pref_ecx 0x01 +#define _k3d_pref_edx 0x02 +#define _k3d_pref_ebx 0x03 +#define _k3d_pref_esi 0x06 +#define _k3d_pref_edi 0x07 +#define _k3d_pref_EAX 0x00 +#define _k3d_pref_ECX 0x01 +#define _k3d_pref_EDX 0x02 +#define _k3d_pref_EBX 0x03 +#define _k3d_pref_ESI 0x06 +#define _k3d_pref_EDI 0x07 +#define _k3d_prefw_eax 0x08 +#define _k3d_prefw_ecx 0x09 +#define _k3d_prefw_edx 0x0A +#define _k3d_prefw_ebx 0x0B +#define _k3d_prefw_esi 0x0E +#define _k3d_prefw_edi 0x0F +#define _k3d_prefw_EAX 0x08 +#define _k3d_prefw_ECX 0x09 +#define _k3d_prefw_EDX 0x0A +#define _k3d_prefw_EBX 0x0B +#define _k3d_prefw_ESI 0x0E +#define _k3d_prefw_EDI 0x0F + +/* Defines for 3DNow! 
instructions */ +#define PF2ID(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x1d +#define PFACC(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xae +#define PFADD(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x9e +#define PFCMPEQ(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb0 +#define PFCMPGE(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x90 +#define PFCMPGT(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa0 +#define PFMAX(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa4 +#define PFMIN(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x94 +#define PFMUL(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb4 +#define PFRCP(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x96 +#define PFRCPIT1(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa6 +#define PFRCPIT2(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb6 +#define PFRSQRT(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x97 +#define PFRSQIT1(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xa7 +#define PFSUB(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x9a +#define PFSUBR(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xaa +#define PI2FD(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0x0d +#define FEMMS db 0x0f, 0x0e +#define PAVGUSB(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xbf +#define PMULHRW(dst, src) db 0x0f, 0x0f, _k3d_MODRM(dst, src), 0xb7 +#define PREFETCH(src) db 0x0f, 0x0d, _k3d_pref_##src +#define PREFETCHW(src) db 0x0f, 0x0d, _k3d_prefw_##src +#define CPUID db 0x0f, 0xa2 + +/* Defines for new, K7 opcodes */ +#define PFNACC(dst,src) db 0x0f, 0x0f, _k3d_MODRM(dst,src), 0x8a +#define FPPNACC(dst,src) db 0x0f, 0x0f, _k3d_MODRM(dst,src), 0x8e +#define PSWAPD(dst,src) db 0x0f, 0x0f, _k3d_MODRM(dst,src), 0xbb +#define PMINUB(dst,src) db 0x0f, 0xda, _k3d_MODRM(dst,src) +#define PMAXUB(dst,src) db 0x0f, 0xde, _k3d_MODRM(dst,src) +#define PMINSW(dst,src) db 0x0f, 0xea, _k3d_MODRM(dst,src) +#define PMAXSW(dst,src) db 0x0f, 0xee, _k3d_MODRM(dst,src) +#define PMULHUW(dst,src) db 0x0f, 0xe4, 
_k3d_MODRM(dst,src) +#define PAVGB(dst,src) db 0x0f, 0xe0, _k3d_MODRM(dst,src) +#define PAVGW(dst,src) db 0x0f, 0xe3, _k3d_MODRM(dst,src) +#define PSADBW(dst,src) db 0x0f, 0xf6, _k3d_MODRM(dst,src) +#define PMOVMSKB(dst,src) db 0x0f, 0xd7, _k3d_MODRM(dst,src) +#define PMASKMOVQ(dst,src) db 0x0f, 0xf7, _k3d_MODRM(dst,src) +#define PINSRW(dst,src,msk) db 0x0f, 0xc4, _k3d_MODRM(dst,src), msk +#define PEXTRW(dst,src,msk) db 0x0f, 0xc5, _k3d_MODRM(dst,src), msk +#define PSHUFW(dst,src,msk) db 0x0f, 0x70, _k3d_MODRM(dst,src), msk +#define MOVNTQ(dst,src) db 0x0f, 0xe7, _k3d_MODRM(src,dst) +#define SFENCE db 0x0f, 0xae, 0xf8 + +/* Memory/offset versions of the opcodes */ +#define PF2IDM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x1d +#define PFACCM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xae +#define PFADDM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x9e +#define PFCMPEQM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb0 +#define PFCMPGEM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x90 +#define PFCMPGTM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa0 +#define PFMAXM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa4 +#define PFMINM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x94 +#define PFMULM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb4 +#define PFRCPM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x96 +#define PFRCPIT1M(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa6 +#define PFRCPIT2M(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb6 +#define PFRSQRTM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x97 +#define PFRSQIT1M(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xa7 +#define PFSUBM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x9a +#define PFSUBRM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 
0xaa +#define PI2FDM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x0d +#define PAVGUSBM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xbf +#define PMULHRWM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xb7 + + +/* Memory/offset versions of the new, K7 opcodes */ +#define PFNACCM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x8a +#define FPPNACCM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0x8e +#define PSWAPDM(dst,src,off) db 0x0f, 0x0f, _k3d_MODRM(dst,src) | 0x40, off, 0xbb +#define PMINUBM(dst,src,off) db 0x0f, 0xda, _k3d_MODRM(dst,src) | 0x40, off +#define PMAXUBM(dst,src,off) db 0x0f, 0xde, _k3d_MODRM(dst,src) | 0x40, off +#define PMINSWM(dst,src,off) db 0x0f, 0xea, _k3d_MODRM(dst,src) | 0x40, off +#define PMAXSWM(dst,src,off) db 0x0f, 0xee, _k3d_MODRM(dst,src) | 0x40, off +#define PMULHUWM(dst,src,off) db 0x0f, 0xe4, _k3d_MODRM(dst,src) | 0x40, off +#define PAVGBM(dst,src,off) db 0x0f, 0xe0, _k3d_MODRM(dst,src) | 0x40, off +#define PAVGWM(dst,src,off) db 0x0f, 0xe3, _k3d_MODRM(dst,src) | 0x40, off +#define PSADBWM(dst,src,off) db 0x0f, 0xf6, _k3d_MODRM(dst,src) | 0x40, off +#define PMOVMSKBM(dst,src,off) db 0x0f, 0xd7, _k3d_MODRM(dst,src) | 0x40, off +#define PMASKMOVQM(dst,src,off) db 0x0f, 0xf7, _k3d_MODRM(dst,src) | 0x40, off +#define MOVNTQM(dst,src,off) db 0x0f, 0xe7, _k3d_MODRM(src,dst) | 0x40, off +#define PINSRWM(dst,src,off,msk) db 0x0f, 0xc4, _k3d_MODRM(dst,src) | 0x40, off, msk +#define PSHUFWM(dst,src,off,msk) db 0x0f, 0x70, _k3d_MODRM(dst,src) | 0x40, off, msk + + +/* Defines for 3DNow! 
instructions for use in pragmas */ +#define p_pf2id(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x1d +#define p_pfacc(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xae +#define p_pfadd(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x9e +#define p_pfcmpeq(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xb0 +#define p_pfcmpge(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x90 +#define p_pfcmpgt(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xa0 +#define p_pfmax(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xa4 +#define p_pfmin(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x94 +#define p_pfmul(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xb4 +#define p_pfrcp(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x96 +#define p_pfrcpit1(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xa6 +#define p_pfrcpit2(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xb6 +#define p_pfrsqrt(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x97 +#define p_pfrsqit1(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xa7 +#define p_pfsub(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x9a +#define p_pfsubr(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xaa +#define p_pi2fd(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x0d +#define p_femms 0x0f 0x0e +#define p_pavgusb(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xbf +#define p_pmulhrw(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xb7 +#define p_prefetch(src) 0x0f 0x0d _k3d_pref_##src +#define p_prefetchw(src) 0x0f 0x0d _k3d_prefw_##src +/* K7 opcodes for use in pragmas, plain operand form: no 0x40 mod bits and no + displacement byte, since these macros take no offset argument. 
*/ +#define p_pfnacc(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x8a +#define p_pfpnacc(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0x8e +#define p_pswapd(dst,src) 0x0f 0x0f _k3d_MODRM(dst,src) 0xbb +#define p_pminub(dst,src) 0x0f 0xda _k3d_MODRM(dst,src) +#define p_pmaxub(dst,src) 0x0f 0xde _k3d_MODRM(dst,src) +#define p_pminsw(dst,src) 0x0f 0xea _k3d_MODRM(dst,src) +#define p_pmaxsw(dst,src) 0x0f 0xee _k3d_MODRM(dst,src) +#define p_pmulhuw(dst,src) 0x0f 0xe4 _k3d_MODRM(dst,src) +#define p_pavgb(dst,src) 0x0f 0xe0 _k3d_MODRM(dst,src) 
+#define p_pavgw(dst,src) 0x0f 0xe3 _k3d_MODRM(dst,src) +#define p_psadbw(dst,src) 0x0f 0xf6 _k3d_MODRM(dst,src) +#define p_pmovmskb(dst,src) 0x0f 0xd7 _k3d_MODRM(dst,src) +#define p_pmaskmovq(dst,src) 0x0f 0xf7 _k3d_MODRM(dst,src) +#define p_pinsrw(dst,src,msk) 0x0f 0xc4 _k3d_MODRM(dst,src) msk +#define p_pextrw(dst,src,msk) 0x0f 0xc5 _k3d_MODRM(dst,src) msk +#define p_pshufw(dst,src,msk) 0x0f 0x70 _k3d_MODRM(dst,src) msk +#define p_movntq(dst,src) 0x0f 0xe7 _k3d_MODRM(src,dst) + +#define P_PF2IDM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x1d +#define P_PFACCM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xae +#define P_PFADDM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x9e +#define P_PFCMPEQM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb0 +#define P_PFCMPGEM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x90 +#define P_PFCMPGTM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa0 +#define P_PFMAXM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa4 +#define P_PFMINM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x94 +#define P_PFMULM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb4 +#define P_PFRCPM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x96 +#define P_PFRCPIT1M(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa6 +#define P_PFRCPIT2M(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb6 +#define P_PFRSQRTM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x97 +#define P_PFRSQIT1M(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xa7 +#define P_PFSUBM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x9a +#define P_PFSUBRM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xaa +#define P_PI2FDM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x0d +#define P_PAVGUSBM(dst,src,off) 0x0f 0x0f 
(_k3d_MODRM(dst,src) | 0x40) off 0xbf +#define P_PMULHRWM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xb7 +#define P_PFNACCM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x8a +#define P_FPPNACCM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0x8e +#define P_PSWAPDM(dst,src,off) 0x0f 0x0f (_k3d_MODRM(dst,src) | 0x40) off 0xbb +#define P_PMINUBM(dst,src,off) 0x0f 0xda (_k3d_MODRM(dst,src) | 0x40) off +#define P_PMAXUBM(dst,src,off) 0x0f 0xde (_k3d_MODRM(dst,src) | 0x40) off +#define P_PMINSWM(dst,src,off) 0x0f 0xea (_k3d_MODRM(dst,src) | 0x40) off +#define P_PMAXSWM(dst,src,off) 0x0f 0xee (_k3d_MODRM(dst,src) | 0x40) off +#define P_PMULHUWM(dst,src,off) 0x0f 0xe4 (_k3d_MODRM(dst,src) | 0x40) off +#define P_PAVGBM(dst,src,off) 0x0f 0xe0 (_k3d_MODRM(dst,src) | 0x40) off +#define P_PAVGWM(dst,src,off) 0x0f 0xe3 (_k3d_MODRM(dst,src) | 0x40) off +#define P_PSADBWM(dst,src,off) 0x0f 0xf6 (_k3d_MODRM(dst,src) | 0x40) off +#define P_PMOVMSKBM(dst,src,off) 0x0f 0xd7 (_k3d_MODRM(dst,src) | 0x40) off +#define P_MOVNTQM(dst,src,off) 0x0f 0xe7 (_k3d_MODRM(src,dst) | 0x40) off +#define P_PMASKMOVQM(dst,src,off) 0x0f 0xf7 (_k3d_MODRM(dst,src) | 0x40) off +#define P_PINSRWM(dst,src,off,msk) 0x0f 0xc4 (_k3d_MODRM(dst,src) | 0x40) off msk +#define P_PSHUFWM(dst,src,off,msk) 0x0f 0x70 (_k3d_MODRM(dst,src) | 0x40) off msk + + +#define P_PF2ID(dst,src) p_pf2id(dst,src) +#define P_PFACC(dst,src) p_pfacc(dst,src) +#define P_PFADD(dst,src) p_pfadd(dst,src) +#define P_PFCMPEQ(dst,src) p_pfcmpeq(dst,src) +#define P_PFCMPGE(dst,src) p_pfcmpge(dst,src) +#define P_PFCMPGT(dst,src) p_pfcmpgt(dst,src) +#define P_PFMAX(dst,src) p_pfmax(dst,src) +#define P_PFMIN(dst,src) p_pfmin(dst,src) +#define P_PFMUL(dst,src) p_pfmul(dst,src) +#define P_PFRCP(dst,src) p_pfrcp(dst,src) +#define P_PFRCPIT1(dst,src) p_pfrcpit1(dst,src) +#define P_PFRCPIT2(dst,src) p_pfrcpit2(dst,src) +#define P_PFRSQRT(dst,src) p_pfrsqrt(dst,src) +#define P_PFRSQIT1(dst,src) p_pfrsqit1(dst,src) 
+#define P_PFSUB(dst,src) p_pfsub(dst,src) +#define P_PFSUBR(dst,src) p_pfsubr(dst,src) +#define P_PI2FD(dst,src) p_pi2fd(dst,src) +#define P_FEMMS p_femms +#define P_PAVGUSB(dst,src) p_pavgusb(dst,src) +#define P_PMULHRW(dst,src) p_pmulhrw(dst,src) +#define P_PREFETCH(src) p_prefetch(src) +#define P_PREFETCHW(src) p_prefetchw(src) +#define p_CPUID 0x0f 0xa2 +#define p_pf2idm(dst,src,off) P_PF2IDM(dst,src,off) +#define p_pfaccm(dst,src,off) P_PFACCM(dst,src,off) +#define p_pfaddm(dst,src,off) P_PFADDM(dst,src,off) +#define p_pfcmpeqm(dst,src,off) P_PFCMPEQM(dst,src,off) +#define p_pfcmpgem(dst,src,off) P_PFCMPGEM(dst,src,off) +#define p_pfcmpgtm(dst,src,off) P_PFCMPGTM(dst,src,off) +#define p_pfmaxm(dst,src,off) P_PFMAXM(dst,src,off) +#define p_pfminm(dst,src,off) P_PFMINM(dst,src,off) +#define p_pfmulm(dst,src,off) P_PFMULM(dst,src,off) +#define p_pfrcpm(dst,src,off) P_PFRCPM(dst,src,off) +#define p_pfrcpit1m(dst,src,off) P_PFRCPIT1M(dst,src,off) +#define p_pfrcpit2m(dst,src,off) P_PFRCPIT2M(dst,src,off) +#define p_pfrsqrtm(dst,src,off) P_PFRSQRTM(dst,src,off) +#define p_pfrsqit1m(dst,src,off) P_PFRSQIT1M(dst,src,off) +#define p_pfsubm(dst,src,off) P_PFSUBM(dst,src,off) +#define p_pfsubrm(dst,src,off) P_PFSUBRM(dst,src,off) +#define p_pi2fdm(dst,src,off) P_PI2FDM(dst,src,off) +#define p_pavgusbm(dst,src,off) P_PAVGUSBM(dst,src,off) +#define p_pmulhrwm(dst,src,off) P_PMULHRWM(dst,src,off) + +/* Upper-case aliases for the plain-operand K7 pragma macros */ +#define P_PFNACC(dst,src) p_pfnacc(dst,src) +#define P_FPPNACC(dst,src) p_pfpnacc(dst,src) +#define P_PSWAPD(dst,src) p_pswapd(dst,src) +#define P_PMINUB(dst,src) p_pminub(dst,src) +#define P_PMAXUB(dst,src) p_pmaxub(dst,src) +#define P_PMINSW(dst,src) p_pminsw(dst,src) +#define P_PMAXSW(dst,src) p_pmaxsw(dst,src) +#define P_PMULHUW(dst,src) p_pmulhuw(dst,src) +#define P_PAVGB(dst,src) p_pavgb(dst,src) +#define P_PAVGW(dst,src) p_pavgw(dst,src) +#define P_PSADBW(dst,src) p_psadbw(dst,src) +#define P_PMOVMSKB(dst,src) p_pmovmskb(dst,src) +#define P_PMASKMOVQ(dst,src) 
p_pmaskmovq(dst,src) +#define P_PINSRW(dst,src,msk) p_pinsrw(dst,src,msk) +#define P_PEXTRW(dst,src,msk) p_pextrw(dst,src,msk) +#define P_PSHUFW(dst,src,msk) p_pshufw(dst,src,msk) +#define P_MOVNTQ(dst,src) p_movntq(dst,src) + +/* Lower-case aliases for the memory/offset K7 pragma macros (the upper-case + P_...M names carry the byte sequences). */ +#define p_pfnaccm(dst,src,off) P_PFNACCM(dst,src,off) +#define p_pfpnaccm(dst,src,off) P_FPPNACCM(dst,src,off) +#define p_pswapdm(dst,src,off) P_PSWAPDM(dst,src,off) +#define p_pminubm(dst,src,off) P_PMINUBM(dst,src,off) +#define p_pmaxubm(dst,src,off) P_PMAXUBM(dst,src,off) +#define p_pminswm(dst,src,off) P_PMINSWM(dst,src,off) +#define p_pmaxswm(dst,src,off) P_PMAXSWM(dst,src,off) +#define p_pmulhuwm(dst,src,off) P_PMULHUWM(dst,src,off) +#define p_pavgbm(dst,src,off) P_PAVGBM(dst,src,off) +#define p_pavgwm(dst,src,off) P_PAVGWM(dst,src,off) +#define p_psadbwm(dst,src,off) P_PSADBWM(dst,src,off) +#define p_pmovmskbm(dst,src,off) P_PMOVMSKBM(dst,src,off) +#define p_pmaskmovqm(dst,src,off) P_PMASKMOVQM(dst,src,off) +#define p_pinsrwm(dst,src,off,msk) P_PINSRWM(dst,src,off,msk) +#define p_pshufwm(dst,src,off,msk) P_PSHUFWM(dst,src,off,msk) +#define p_movntqm(dst,src,off) P_MOVNTQM(dst,src,off) + +#elif defined (_MSC_VER) && !defined (__MWERKS__) +// The Microsoft Visual C++ version of the 3DNow! macros. + +// Stop the "no EMMS" warning, since it doesn't detect FEMMS properly +#pragma warning(disable:4799) + +// Defines for operands. 
+#define _K3D_MM0 0xc0 +#define _K3D_MM1 0xc1 +#define _K3D_MM2 0xc2 +#define _K3D_MM3 0xc3 +#define _K3D_MM4 0xc4 +#define _K3D_MM5 0xc5 +#define _K3D_MM6 0xc6 +#define _K3D_MM7 0xc7 +#define _K3D_mm0 0xc0 +#define _K3D_mm1 0xc1 +#define _K3D_mm2 0xc2 +#define _K3D_mm3 0xc3 +#define _K3D_mm4 0xc4 +#define _K3D_mm5 0xc5 +#define _K3D_mm6 0xc6 +#define _K3D_mm7 0xc7 +#define _K3D_EAX 0x00 +#define _K3D_ECX 0x01 +#define _K3D_EDX 0x02 +#define _K3D_EBX 0x03 +#define _K3D_ESI 0x06 +#define _K3D_EDI 0x07 +#define _K3D_eax 0x00 +#define _K3D_ecx 0x01 +#define _K3D_edx 0x02 +#define _K3D_ebx 0x03 +#define _K3D_esi 0x06 +#define _K3D_edi 0x07 + +// These defines are for compatibility with the previous version of the header file. +#define _K3D_M0 0xc0 +#define _K3D_M1 0xc1 +#define _K3D_M2 0xc2 +#define _K3D_M3 0xc3 +#define _K3D_M4 0xc4 +#define _K3D_M5 0xc5 +#define _K3D_M6 0xc6 +#define _K3D_M7 0xc7 +#define _K3D_m0 0xc0 +#define _K3D_m1 0xc1 +#define _K3D_m2 0xc2 +#define _K3D_m3 0xc3 +#define _K3D_m4 0xc4 +#define _K3D_m5 0xc5 +#define _K3D_m6 0xc6 +#define _K3D_m7 0xc7 +#define _K3D__EAX 0x00 +#define _K3D__ECX 0x01 +#define _K3D__EDX 0x02 +#define _K3D__EBX 0x03 +#define _K3D__ESI 0x06 +#define _K3D__EDI 0x07 +#define _K3D__eax 0x00 +#define _K3D__ecx 0x01 +#define _K3D__edx 0x02 +#define _K3D__ebx 0x03 +#define _K3D__esi 0x06 +#define _K3D__edi 0x07 + +// General 3DNow! instruction format that is supported by +// these macros. Note that only the most basic form of memory +// operands are supported by these macros. 
+ +// Emit a 3DNow! instruction: 0x0f 0x0f, a ModRM byte built from dst/src, then the opcode suffix byte. +#define InjK3DOps(dst,src,inst) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0f \ + _asm _emit ((_K3D_##dst & 0x3f) << 3) | _K3D_##src \ + _asm _emit _3DNowOpcode##inst \ +} + +#define InjK3DMOps(dst,src,off,inst) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0f \ + _asm _emit (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40) \ + _asm _emit off \ + _asm _emit _3DNowOpcode##inst \ +} + +#define InjMMXOps(dst,src,inst) \ +{ \ + _asm _emit 0x0f \ + _asm _emit _3DNowOpcode##inst \ + _asm _emit ((_K3D_##dst & 0x3f) << 3) | _K3D_##src \ +} + +#define InjMMXMOps(dst,src,off,inst) \ +{ \ + _asm _emit 0x0f \ + _asm _emit _3DNowOpcode##inst \ + _asm _emit (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40) \ + _asm _emit off \ +} + +// Opcode bytes, looked up by the Inj* macros through _3DNowOpcode##inst. +#define _3DNowOpcodePF2ID 0x1d +#define _3DNowOpcodePFACC 0xae +#define _3DNowOpcodePFADD 0x9e +#define _3DNowOpcodePFCMPEQ 0xb0 +#define _3DNowOpcodePFCMPGE 0x90 +#define _3DNowOpcodePFCMPGT 0xa0 +#define _3DNowOpcodePFMAX 0xa4 +#define _3DNowOpcodePFMIN 0x94 +#define _3DNowOpcodePFMUL 0xb4 +#define _3DNowOpcodePFRCP 0x96 +#define _3DNowOpcodePFRCPIT1 0xa6 +#define _3DNowOpcodePFRCPIT2 0xb6 +#define _3DNowOpcodePFRSQRT 0x97 +#define _3DNowOpcodePFRSQIT1 0xa7 +#define _3DNowOpcodePFSUB 0x9a +#define _3DNowOpcodePFSUBR 0xaa +#define _3DNowOpcodePI2FD 0x0d +#define _3DNowOpcodePAVGUSB 0xbf +#define _3DNowOpcodePMULHRW 0xb7 +#define _3DNowOpcodePFNACC 0x8a +#define _3DNowOpcodeFPPNACC 0x8e +// The PFPNACC() macro pastes "PFPNACC" onto _3DNowOpcode, so the opcode must also +// exist under that spelling; the FPPNACC name above is kept for compatibility. 
+#define _3DNowOpcodePFPNACC 0x8e +#define _3DNowOpcodePSWAPD 0xbb +#define _3DNowOpcodePMINUB 0xda +#define _3DNowOpcodePMAXUB 0xde +#define _3DNowOpcodePMINSW 0xea +#define _3DNowOpcodePMAXSW 0xee +#define _3DNowOpcodePMULHUW 0xe4 +#define _3DNowOpcodePAVGB 0xe0 +#define _3DNowOpcodePAVGW 0xe3 +#define _3DNowOpcodePSADBW 0xf6 +#define _3DNowOpcodePMOVMSKB 0xd7 +#define _3DNowOpcodePMASKMOVQ 0xf7 +#define _3DNowOpcodePINSRW 0xc4 +#define _3DNowOpcodePEXTRW 0xc5 +#define _3DNowOpcodePSHUFW 0x70 +#define _3DNowOpcodeMOVNTQ 0xe7 +#define _3DNowOpcodePREFETCHT 0x18 + + +#define PF2ID(dst,src) InjK3DOps(dst, 
src, PF2ID) +#define PFACC(dst,src) InjK3DOps(dst, src, PFACC) +#define PFADD(dst,src) InjK3DOps(dst, src, PFADD) +#define PFCMPEQ(dst,src) InjK3DOps(dst, src, PFCMPEQ) +#define PFCMPGE(dst,src) InjK3DOps(dst, src, PFCMPGE) +#define PFCMPGT(dst,src) InjK3DOps(dst, src, PFCMPGT) +#define PFMAX(dst,src) InjK3DOps(dst, src, PFMAX) +#define PFMIN(dst,src) InjK3DOps(dst, src, PFMIN) +#define PFMUL(dst,src) InjK3DOps(dst, src, PFMUL) +#define PFRCP(dst,src) InjK3DOps(dst, src, PFRCP) +#define PFRCPIT1(dst,src) InjK3DOps(dst, src, PFRCPIT1) +#define PFRCPIT2(dst,src) InjK3DOps(dst, src, PFRCPIT2) +#define PFRSQRT(dst,src) InjK3DOps(dst, src, PFRSQRT) +#define PFRSQIT1(dst,src) InjK3DOps(dst, src, PFRSQIT1) +#define PFSUB(dst,src) InjK3DOps(dst, src, PFSUB) +#define PFSUBR(dst,src) InjK3DOps(dst, src, PFSUBR) +#define PI2FD(dst,src) InjK3DOps(dst, src, PI2FD) +#define PAVGUSB(dst,src) InjK3DOps(dst, src, PAVGUSB) +#define PMULHRW(dst,src) InjK3DOps(dst, src, PMULHRW) + +#define FEMMS \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0e \ +} + +#define PREFETCH(src) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0d \ + _asm _emit (_K3D_##src & 0x07) \ +} + +/* Prefetch with a short offset, < 127 or > -127 + Carefull! Doesn't check for your offset being + in range. 
*/ + +#define PREFETCHM(src,off) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0d \ + _asm _emit (0x40 | (_K3D_##src & 0x07)) \ + _asm _emit off \ +} + +/* Prefetch with a long offset */ + +#define PREFETCHMLONG(src,off) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0d \ + _asm _emit (0x80 | (_K3D_##src & 0x07)) \ + _asm _emit (off & 0x000000ff) \ + _asm _emit (off & 0x0000ff00) >> 8 \ + _asm _emit (off & 0x00ff0000) >> 16 \ + _asm _emit (off & 0xff000000) >> 24 \ +} + +#define PREFETCHW(src) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0d \ + _asm _emit (0x08 | (_K3D_##src & 0x07)) \ +} + +#define PREFETCHWM(src,off) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0d \ + _asm _emit 0x48 | (_K3D_##src & 0x07) \ + _asm _emit off \ +} + +#define PREFETCHWMLONG(src,off) \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0x0d \ + _asm _emit 0x88 | (_K3D_##src & 0x07) \ + _asm _emit (off & 0x000000ff) \ + _asm _emit (off & 0x0000ff00) >> 8 \ + _asm _emit (off & 0x00ff0000) >> 16 \ + _asm _emit (off & 0xff000000) >> 24 \ +} + +#define CPUID \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0xa2 \ +} + + +/* Defines for new, K7 opcodes */ +#define SFENCE \ +{ \ + _asm _emit 0x0f \ + _asm _emit 0xae \ + _asm _emit 0xf8 \ +} + +#define PFNACC(dst,src) InjK3DOps(dst,src,PFNACC) +#define PFPNACC(dst,src) InjK3DOps(dst,src,PFPNACC) +#define PSWAPD(dst,src) InjK3DOps(dst,src,PSWAPD) +#define PMINUB(dst,src) InjMMXOps(dst,src,PMINUB) +#define PMAXUB(dst,src) InjMMXOps(dst,src,PMAXUB) +#define PMINSW(dst,src) InjMMXOps(dst,src,PMINSW) +#define PMAXSW(dst,src) InjMMXOps(dst,src,PMAXSW) +#define PMULHUW(dst,src) InjMMXOps(dst,src,PMULHUW) +#define PAVGB(dst,src) InjMMXOps(dst,src,PAVGB) +#define PAVGW(dst,src) InjMMXOps(dst,src,PAVGW) +#define PSADBW(dst,src) InjMMXOps(dst,src,PSADBW) +#define PMOVMSKB(dst,src) InjMMXOps(dst,src,PMOVMSKB) +#define PMASKMOVQ(dst,src) InjMMXOps(dst,src,PMASKMOVQ) +#define PINSRW(dst,src,msk) InjMMXOps(dst,src,PINSRW) _asm _emit msk +#define PEXTRW(dst,src,msk) InjMMXOps(dst,src,PEXTRW) 
_asm _emit msk +#define PSHUFW(dst,src,msk) InjMMXOps(dst,src,PSHUFW) _asm _emit msk +#define MOVNTQ(dst,src) InjMMXOps(src,dst,MOVNTQ) +#define PREFETCHNTA(mem) InjMMXOps(mm0,mem,PREFETCHT) +#define PREFETCHT0(mem) InjMMXOps(mm1,mem,PREFETCHT) +#define PREFETCHT1(mem) InjMMXOps(mm2,mem,PREFETCHT) +#define PREFETCHT2(mem) InjMMXOps(mm3,mem,PREFETCHT) + + +/* Memory/offset versions of the opcodes */ +#define PAVGUSBM(dst,src,off) InjK3DMOps(dst,src,off,PAVGUSB) +#define PF2IDM(dst,src,off) InjK3DMOps(dst,src,off,PF2ID) +#define PFACCM(dst,src,off) InjK3DMOps(dst,src,off,PFACC) +#define PFADDM(dst,src,off) InjK3DMOps(dst,src,off,PFADD) +#define PFCMPEQM(dst,src,off) InjK3DMOps(dst,src,off,PFCMPEQ) +#define PFCMPGEM(dst,src,off) InjK3DMOps(dst,src,off,PFCMPGE) +#define PFCMPGTM(dst,src,off) InjK3DMOps(dst,src,off,PFCMPGT) +#define PFMAXM(dst,src,off) InjK3DMOps(dst,src,off,PFMAX) +#define PFMINM(dst,src,off) InjK3DMOps(dst,src,off,PFMIN) +#define PFMULM(dst,src,off) InjK3DMOps(dst,src,off,PFMUL) +#define PFRCPM(dst,src,off) InjK3DMOps(dst,src,off,PFRCP) +#define PFRCPIT1M(dst,src,off) InjK3DMOps(dst,src,off,PFRCPIT1) +#define PFRCPIT2M(dst,src,off) InjK3DMOps(dst,src,off,PFRCPIT2) +#define PFRSQRTM(dst,src,off) InjK3DMOps(dst,src,off,PFRSQRT) +#define PFRSQIT1M(dst,src,off) InjK3DMOps(dst,src,off,PFRSQIT1) +#define PFSUBM(dst,src,off) InjK3DMOps(dst,src,off,PFSUB) +#define PFSUBRM(dst,src,off) InjK3DMOps(dst,src,off,PFSUBR) +#define PI2FDM(dst,src,off) InjK3DMOps(dst,src,off,PI2FD) +#define PMULHRWM(dst,src,off) InjK3DMOps(dst,src,off,PMULHRW) + + +/* Memory/offset versions of the K7 opcodes */ +#define PFNACCM(dst,src,off) InjK3DMOps(dst,src,off,PFNACC) +#define PFPNACCM(dst,src,off) InjK3DMOps(dst,src,off,PFPNACC) +#define PSWAPDM(dst,src,off) InjK3DMOps(dst,src,off,PSWAPD) +#define PMINUBM(dst,src,off) InjMMXMOps(dst,src,off,PMINUB) +#define PMAXUBM(dst,src,off) InjMMXMOps(dst,src,off,PMAXUB) +#define PMINSWM(dst,src,off) InjMMXMOps(dst,src,off,PMINSW) +#define 
PMAXSWM(dst,src,off) InjMMXMOps(dst,src,off,PMAXSW) +#define PMULHUWM(dst,src,off) InjMMXMOps(dst,src,off,PMULHUW) +#define PAVGBM(dst,src,off) InjMMXMOps(dst,src,off,PAVGB) +#define PAVGWM(dst,src,off) InjMMXMOps(dst,src,off,PAVGW) +#define PSADBWM(dst,src,off) InjMMXMOps(dst,src,off,PSADBW) +#define PMOVMSKBM(dst,src,off) InjMMXMOps(dst,src,off,PMOVMSKB) +#define PMASKMOVQM(dst,src,off) InjMMXMOps(dst,src,off,PMASKMOVQ) +#define PINSRWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PINSRW) _asm _emit msk +#define PSHUFWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PSHUFW) _asm _emit msk +#define MOVNTQM(dst,src,off) InjMMXMOps(src,dst,off,MOVNTQ) +#define PREFETCHNTAM(mem,off) InjMMXMOps(mm0,mem,off,PREFETCHT) +#define PREFETCHT0M(mem,off) InjMMXMOps(mm1,mem,off,PREFETCHT) +#define PREFETCHT1M(mem,off) InjMMXMOps(mm2,mem,off,PREFETCHT) +#define PREFETCHT2M(mem,off) InjMMXMOps(mm3,mem,off,PREFETCHT) + + +#else + +/* Assume built-in support for 3DNow! opcodes, replace macros with opcodes */ +#define PAVGUSB(dst,src) pavgusb dst,src +#define PF2ID(dst,src) pf2id dst,src +#define PFACC(dst,src) pfacc dst,src +#define PFADD(dst,src) pfadd dst,src +#define PFCMPEQ(dst,src) pfcmpeq dst,src +#define PFCMPGE(dst,src) pfcmpge dst,src +#define PFCMPGT(dst,src) pfcmpgt dst,src +#define PFMAX(dst,src) pfmax dst,src +#define PFMIN(dst,src) pfmin dst,src +#define PFMUL(dst,src) pfmul dst,src +#define PFRCP(dst,src) pfrcp dst,src +#define PFRCPIT1(dst,src) pfrcpit1 dst,src +#define PFRCPIT2(dst,src) pfrcpit2 dst,src +#define PFRSQRT(dst,src) pfrsqrt dst,src +#define PFRSQIT1(dst,src) pfrsqit1 dst,src +#define PFSUB(dst,src) pfsub dst,src +#define PFSUBR(dst,src) pfsubr dst,src +#define PI2FD(dst,src) pi2fd dst,src +#define PMULHRW(dst,src) pmulhrw dst,src +#define PREFETCH(src) prefetch src +#define PREFETCHW(src) prefetchw src + +#define PAVGUSBM(dst,src,off) pavgusb dst,[src+off] +#define PF2IDM(dst,src,off) PF2ID dst,[src+off] +#define PFACCM(dst,src,off) PFACC dst,[src+off] +#define 
PFADDM(dst,src,off) PFADD dst,[src+off] +#define PFCMPEQM(dst,src,off) PFCMPEQ dst,[src+off] +#define PFCMPGEM(dst,src,off) PFCMPGE dst,[src+off] +#define PFCMPGTM(dst,src,off) PFCMPGT dst,[src+off] +#define PFMAXM(dst,src,off) PFMAX dst,[src+off] +#define PFMINM(dst,src,off) PFMIN dst,[src+off] +#define PFMULM(dst,src,off) PFMUL dst,[src+off] +#define PFRCPM(dst,src,off) PFRCP dst,[src+off] +#define PFRCPIT1M(dst,src,off) PFRCPIT1 dst,[src+off] +#define PFRCPIT2M(dst,src,off) PFRCPIT2 dst,[src+off] +#define PFRSQRTM(dst,src,off) PFRSQRT dst,[src+off] +#define PFRSQIT1M(dst,src,off) PFRSQIT1 dst,[src+off] +#define PFSUBM(dst,src,off) PFSUB dst,[src+off] +#define PFSUBRM(dst,src,off) PFSUBR dst,[src+off] +#define PI2FDM(dst,src,off) PI2FD dst,[src+off] +#define PMULHRWM(dst,src,off) PMULHRW dst,[src+off] + + +#if defined (__MWERKS__) +// At the moment, CodeWarrior does not support these opcodes, so hand-assemble them + +// Defines for operands. +#define _K3D_MM0 0xc0 +#define _K3D_MM1 0xc1 +#define _K3D_MM2 0xc2 +#define _K3D_MM3 0xc3 +#define _K3D_MM4 0xc4 +#define _K3D_MM5 0xc5 +#define _K3D_MM6 0xc6 +#define _K3D_MM7 0xc7 +#define _K3D_mm0 0xc0 +#define _K3D_mm1 0xc1 +#define _K3D_mm2 0xc2 +#define _K3D_mm3 0xc3 +#define _K3D_mm4 0xc4 +#define _K3D_mm5 0xc5 +#define _K3D_mm6 0xc6 +#define _K3D_mm7 0xc7 +#define _K3D_EAX 0x00 +#define _K3D_ECX 0x01 +#define _K3D_EDX 0x02 +#define _K3D_EBX 0x03 +#define _K3D_ESI 0x06 +#define _K3D_EDI 0x07 +#define _K3D_eax 0x00 +#define _K3D_ecx 0x01 +#define _K3D_edx 0x02 +#define _K3D_ebx 0x03 +#define _K3D_esi 0x06 +#define _K3D_edi 0x07 +#define _K3D_EAX 0x00 +#define _K3D_ECX 0x01 +#define _K3D_EDX 0x02 +#define _K3D_EBX 0x03 +#define _K3D_ESI 0x06 +#define _K3D_EDI 0x07 +#define _K3D_eax 0x00 +#define _K3D_ecx 0x01 +#define _K3D_edx 0x02 +#define _K3D_ebx 0x03 +#define _K3D_esi 0x06 +#define _K3D_edi 0x07 + +#define InjK3DOps(dst,src,inst) \ + db 0x0f, 0x0f, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src), _3DNowOpcode##inst + 
+#define InjK3DMOps(dst,src,off,inst) \ + db 0x0f, 0x0f, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40), off, _3DNowOpcode##inst + +#define InjMMXOps(dst,src,inst) \ + db 0x0f, _3DNowOpcode##inst, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src) + +#define InjMMXMOps(dst,src,off,inst) \ + db 0x0f, _3DNowOpcode##inst, (((_K3D_##dst & 0x3f) << 3) | _K3D_##src | 0x40), off + +#define PFNACC(dst,src) InjK3DOps(dst,src,PFNACC) +#define PFPNACC(dst,src) InjK3DOps(dst,src,PFPNACC) +#define PSWAPD(dst,src) InjK3DOps(dst,src,PSWAPD) +#define PMINUB(dst,src) InjMMXOps(dst,src,PMINUB) +#define PMAXUB(dst,src) InjMMXOps(dst,src,PMAXUB) +#define PMINSW(dst,src) InjMMXOps(dst,src,PMINSW) +#define PMAXSW(dst,src) InjMMXOps(dst,src,PMAXSW) +#define PMULHUW(dst,src) InjMMXOps(dst,src,PMULHUW) +#define PAVGB(dst,src) InjMMXOps(dst,src,PAVGB) +#define PAVGW(dst,src) InjMMXOps(dst,src,PAVGW) +#define PSADBW(dst,src) InjMMXOps(dst,src,PSADBW) +#define PMOVMSKB(dst,src) InjMMXOps(dst,src,PMOVMSKB) +#define PMASKMOVQ(dst,src) InjMMXOps(dst,src,PMASKMOVQ) +#define PINSRW(dst,src,msk) InjMMXOps(dst,src,PINSRW) db msk +#define PEXTRW(dst,src,msk) InjMMXOps(dst,src,PEXTRW) db msk +#define PSHUFW(dst,src,msk) InjMMXOps(dst,src,PSHUFW) db msk +#define MOVNTQ(dst,src) InjMMXOps(src,dst,MOVNTQ) +#define PREFETCHNTA(mem) InjMMXOps(mm0,mem,PREFETCHT) +#define PREFETCHT0(mem) InjMMXOps(mm1,mem,PREFETCHT) +#define PREFETCHT1(mem) InjMMXOps(mm2,mem,PREFETCHT) +#define PREFETCHT2(mem) InjMMXOps(mm3,mem,PREFETCHT) + + +/* Memory/offset versions of the K7 opcodes */ +#define PFNACCM(dst,src,off) InjK3DMOps(dst,src,off,PFNACC) +#define PFPNACCM(dst,src,off) InjK3DMOps(dst,src,off,PFPNACC) +#define PSWAPDM(dst,src,off) InjK3DMOps(dst,src,off,PSWAPD) +#define PMINUBM(dst,src,off) InjMMXMOps(dst,src,off,PMINUB) +#define PMAXUBM(dst,src,off) InjMMXMOps(dst,src,off,PMAXUB) +#define PMINSWM(dst,src,off) InjMMXMOps(dst,src,off,PMINSW) +#define PMAXSWM(dst,src,off) InjMMXMOps(dst,src,off,PMAXSW) +#define 
PMULHUWM(dst,src,off) InjMMXMOps(dst,src,off,PMULHUW) +#define PAVGBM(dst,src,off) InjMMXMOps(dst,src,off,PAVGB) +#define PAVGWM(dst,src,off) InjMMXMOps(dst,src,off,PAVGW) +#define PSADBWM(dst,src,off) InjMMXMOps(dst,src,off,PSADBW) +#define PMOVMSKBM(dst,src,off) InjMMXMOps(dst,src,off,PMOVMSKB) +#define PMASKMOVQM(dst,src,off) InjMMXMOps(dst,src,off,PMASKMOVQ) +#define PINSRWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PINSRW), msk +#define PEXTRWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PEXTRW), msk +#define PSHUFWM(dst,src,off,msk) InjMMXMOps(dst,src,off,PSHUFW), msk +#define MOVNTQM(dst,src,off) InjMMXMOps(src,dst,off,MOVNTQ) +#define PREFETCHNTAM(mem,off) InjMMXMOps(mm0,mem,off,PREFETCHT) +#define PREFETCHT0M(mem,off) InjMMXMOps(mm1,mem,off,PREFETCHT) +#define PREFETCHT1M(mem,off) InjMMXMOps(mm2,mem,off,PREFETCHT) +#define PREFETCHT2M(mem,off) InjMMXMOps(mm3,mem,off,PREFETCHT) + + +#else + +#define PFNACC(dst,src) PFNACC dst,src +#define PFPNACC(dst,src) PFPNACC dst,src +#define PSWAPD(dst,src) PSWAPD dst,src +#define PMINUB(dst,src) PMINUB dst,src +#define PMAXUB(dst,src) PMAXUB dst,src +#define PMINSW(dst,src) PMINSW dst,src +#define PMAXSW(dst,src) PMAXSW dst,src +#define PMULHUW(dst,src) PMULHUW dst,src +#define PAVGB(dst,src) PAVGB dst,src +#define PAVGW(dst,src) PAVGW dst,src +#define PSADBW(dst,src) PSADBW dst,src +#define PMOVMSKB(dst,src) PMOVMSKB dst,src +#define PMASKMOVQ(dst,src) PMASKMOVQ dst,src +#define PINSRW(dst,src,msk) PINSRW dst,src,msk +#define PEXTRW(dst,src,msk) PEXTRW dst,src,msk +#define PSHUFW(dst,src,msk) PSHUFW dst,src,msk +#define MOVNTQ(dst,src) MOVNTQ dst,src + +#define PFNACCM(dst,src,off) PFNACC dst,[src+off] +#define PFPNACCM(dst,src,off) PFPNACC dst,[src+off] +#define PSWAPDM(dst,src,off) PSWAPD dst,[src+off] +#define PMINUBM(dst,src,off) PMINUB dst,[src+off] +#define PMAXUBM(dst,src,off) PMAXUB dst,[src+off] +#define PMINSWM(dst,src,off) PMINSW dst,[src+off] +#define PMAXSWM(dst,src,off) PMAXSW dst,[src+off] +#define 
PMULHUWM(dst,src,off) PMULHUW dst,[src+off] +#define PAVGBM(dst,src,off) PAVGB dst,[src+off] +#define PAVGWM(dst,src,off) PAVGW dst,[src+off] +#define PSADBWM(dst,src,off) PSADBW dst,[src+off] +#define PMOVMSKBM(dst,src,off) PMOVMSKB dst,[src+off] +#define PMASKMOVQM(dst,src,off) PMASKMOVQ dst,[src+off] +#define PINSRWM(dst,src,off,msk) PINSRW dst,[src+off],msk +#define PEXTRWM(dst,src,off,msk) PEXTRW dst,[src+off],msk +#define PSHUFWM(dst,src,off,msk) PSHUFW dst,[src+off],msk +#define MOVNTQM(dst,src,off) MOVNTQ dst,[src+off] + +#endif + +#endif + +/* Just to deal with lower case. */ +#define pf2id(dst,src) PF2ID(dst,src) +#define pfacc(dst,src) PFACC(dst,src) +#define pfadd(dst,src) PFADD(dst,src) +#define pfcmpeq(dst,src) PFCMPEQ(dst,src) +#define pfcmpge(dst,src) PFCMPGE(dst,src) +#define pfcmpgt(dst,src) PFCMPGT(dst,src) +#define pfmax(dst,src) PFMAX(dst,src) +#define pfmin(dst,src) PFMIN(dst,src) +#define pfmul(dst,src) PFMUL(dst,src) +#define pfrcp(dst,src) PFRCP(dst,src) +#define pfrcpit1(dst,src) PFRCPIT1(dst,src) +#define pfrcpit2(dst,src) PFRCPIT2(dst,src) +#define pfrsqrt(dst,src) PFRSQRT(dst,src) +#define pfrsqit1(dst,src) PFRSQIT1(dst,src) +#define pfsub(dst,src) PFSUB(dst,src) +#define pfsubr(dst,src) PFSUBR(dst,src) +#define pi2fd(dst,src) PI2FD(dst,src) +#define femms FEMMS +#define pavgusb(dst,src) PAVGUSB(dst,src) +#define pmulhrw(dst,src) PMULHRW(dst,src) +#define prefetch(src) PREFETCH(src) +#define prefetchw(src) PREFETCHW(src) + +#define prefetchm(src,off) PREFETCHM(src,off) +#define prefetchmlong(src,off) PREFETCHMLONG(src,off) +#define prefetchwm(src,off) PREFETCHWM(src,off) +#define prefetchwmlong(src,off) PREFETCHWMLONG(src,off) + +#define pfnacc(dst,src) PFNACC(dst,src) +#define pfpnacc(dst,src) PFPNACC(dst,src) +#define pswapd(dst,src) PSWAPD(dst,src) +#define pminub(dst,src) PMINUB(dst,src) +#define pmaxub(dst,src) PMAXUB(dst,src) +#define pminsw(dst,src) PMINSW(dst,src) +#define pmaxsw(dst,src) PMAXSW(dst,src) +#define 
pmulhuw(dst,src) PMULHUW(dst,src) +#define pavgb(dst,src) PAVGB(dst,src) +#define pavgw(dst,src) PAVGW(dst,src) +#define psadbw(dst,src) PSADBW(dst,src) +#define pmovmskb(dst,src) PMOVMSKB(dst,src) +#define pmaskmovq(dst,src) PMASKMOVQ(dst,src) +#define pinsrw(dst,src,msk) PINSRW(dst,src,msk) +#define pextrw(dst,src,msk) PEXTRW(dst,src,msk) +#define pshufw(dst,src,msk) PSHUFW(dst,src,msk) +#define movntq(dst,src) MOVNTQ(dst,src) +#define prefetchnta(mem) PREFETCHNTA(mem) +#define prefetcht0(mem) PREFETCHT0(mem) +#define prefetcht1(mem) PREFETCHT1(mem) +#define prefetcht2(mem) PREFETCHT2(mem) + + +#define pavgusbm(dst,src,off) PAVGUSBM(dst,src,off) +#define pf2idm(dst,src,off) PF2IDM(dst,src,off) +#define pfaccm(dst,src,off) PFACCM(dst,src,off) +#define pfaddm(dst,src,off) PFADDM(dst,src,off) +#define pfcmpeqm(dst,src,off) PFCMPEQM(dst,src,off) +#define pfcmpgem(dst,src,off) PFCMPGEM(dst,src,off) +#define pfcmpgtm(dst,src,off) PFCMPGTM(dst,src,off) +#define pfmaxm(dst,src,off) PFMAXM(dst,src,off) +#define pfminm(dst,src,off) PFMINM(dst,src,off) +#define pfmulm(dst,src,off) PFMULM(dst,src,off) +#define pfrcpm(dst,src,off) PFRCPM(dst,src,off) +#define pfrcpit1m(dst,src,off) PFRCPIT1M(dst,src,off) +#define pfrcpit2m(dst,src,off) PFRCPIT2M(dst,src,off) +#define pfrsqrtm(dst,src,off) PFRSQRTM(dst,src,off) +#define pfrsqit1m(dst,src,off) PFRSQIT1M(dst,src,off) +#define pfsubm(dst,src,off) PFSUBM(dst,src,off) +#define pfsubrm(dst,src,off) PFSUBRM(dst,src,off) +#define pi2fdm(dst,src,off) PI2FDM(dst,src,off) +#define pmulhrwm(dst,src,off) PMULHRWM(dst,src,off) +#define cpuid CPUID +#define sfence SFENCE + +#define pfnaccm(dst,src,off) PFNACCM(dst,src,off) +#define pfpnaccm(dst,src,off) PFPNACCM(dst,src,off) +#define pswapdm(dst,src,off) PSWAPDM(dst,src,off) +#define pminubm(dst,src,off) PMINUBM(dst,src,off) +#define pmaxubm(dst,src,off) PMAXUBM(dst,src,off) +#define pminswm(dst,src,off) PMINSWM(dst,src,off) +#define pmaxswm(dst,src,off) PMAXSWM(dst,src,off) +#define 
pmulhuwm(dst,src,off) PMULHUWM(dst,src,off) +#define pavgbm(dst,src,off) PAVGBM(dst,src,off) +#define pavgwm(dst,src,off) PAVGWM(dst,src,off) +#define psadbwm(dst,src,off) PSADBWM(dst,src,off) +#define pmovmskbm(dst,src,off) PMOVMSKBM(dst,src,off) +#define pmaskmovqm(dst,src,off) PMASKMOVQM(dst,src,off) +#define pinsrwm(dst,src,off,msk) PINSRWM(dst,src,off,msk) +#define pextrwm(dst,src,off,msk) PEXTRWM(dst,src,off,msk) +#define pshufwm(dst,src,off,msk) PSHUFWM(dst,src,off,msk) +#define movntqm(dst,src,off) MOVNTQM(dst,src,off) +/* The (mem,off) prefetch aliases forward to the two-argument ...M macros; the non-M macros take only (mem). */ +#define prefetchntam(mem,off) PREFETCHNTAM(mem,off) +#define prefetcht0m(mem,off) PREFETCHT0M(mem,off) +#define prefetcht1m(mem,off) PREFETCHT1M(mem,off) +#define prefetcht2m(mem,off) PREFETCHT2M(mem,off) + +#endif diff --git a/public/mathlib/anorms.h b/public/mathlib/anorms.h new file mode 100644 index 0000000..4f65383 --- /dev/null +++ b/public/mathlib/anorms.h @@ -0,0 +1,25 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//=============================================================================// + +#ifndef ANORMS_H +#define ANORMS_H +#ifdef _WIN32 +#pragma once +#endif + + +#include "mathlib/vector.h" + + +#define NUMVERTEXNORMALS 162 + +// the angle between consecutive g_anorms[] vectors is ~14.55 degrees +#define VERTEXNORMAL_CONE_INNER_ANGLE DEG2RAD(7.275) + +extern Vector g_anorms[NUMVERTEXNORMALS]; + + +#endif // ANORMS_H diff --git a/public/mathlib/bumpvects.h b/public/mathlib/bumpvects.h new file mode 100644 index 0000000..6939ca0 --- /dev/null +++ b/public/mathlib/bumpvects.h @@ -0,0 +1,37 @@ +//========= Copyright Valve Corporation, All rights reserved. 
============// +// +// Purpose: +// +// $Workfile: $ +// $Date: $ +// $NoKeywords: $ +//=============================================================================// + +#ifndef BUMPVECTS_H +#define BUMPVECTS_H + +#ifdef _WIN32 +#pragma once +#endif + +#include "mathlib/mathlib.h" + +#define OO_SQRT_2 0.70710676908493042f +#define OO_SQRT_3 0.57735025882720947f +#define OO_SQRT_6 0.40824821591377258f +// sqrt( 2 / 3 ) +#define OO_SQRT_2_OVER_3 0.81649661064147949f + +#define NUM_BUMP_VECTS 3 + +const TableVector g_localBumpBasis[NUM_BUMP_VECTS] = +{ + { OO_SQRT_2_OVER_3, 0.0f, OO_SQRT_3 }, + { -OO_SQRT_6, OO_SQRT_2, OO_SQRT_3 }, + { -OO_SQRT_6, -OO_SQRT_2, OO_SQRT_3 } +}; + +void GetBumpNormals( const Vector& sVect, const Vector& tVect, const Vector& flatNormal, + const Vector& phongNormal, Vector bumpNormals[NUM_BUMP_VECTS] ); + +#endif // BUMPVECTS_H diff --git a/public/mathlib/compressed_3d_unitvec.h b/public/mathlib/compressed_3d_unitvec.h new file mode 100644 index 0000000..a92dba2 --- /dev/null +++ b/public/mathlib/compressed_3d_unitvec.h @@ -0,0 +1,284 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: Unit vector to 16-bit word conversion (cUnitVector) +// +// $NoKeywords: $ +// +//=============================================================================// +#ifndef _3D_UNITVEC_H +#define _3D_UNITVEC_H + + +#define UNITVEC_DECLARE_STATICS \ + float cUnitVector::mUVAdjustment[0x2000]; \ + Vector cUnitVector::mTmpVec; + +// upper 3 bits +#define SIGN_MASK 0xe000 +#define XSIGN_MASK 0x8000 +#define YSIGN_MASK 0x4000 +#define ZSIGN_MASK 0x2000 + +// middle 6 bits - xbits +#define TOP_MASK 0x1f80 + +// lower 7 bits - ybits +#define BOTTOM_MASK 0x007f + +// unitcomp.cpp : A Unit Vector to 16-bit word conversion +// algorithm based on work of Rafael Baptista ([email protected]) +// Accuracy improved by O.D. ([email protected]) +// Used with Permission. + +// a compressed unit vector. reasonable fidelity for unit +// vectors in a 16 bit package. 
Good enough for surface normals +// we hope. +class cUnitVector // : public c3dMathObject +{ +public: + cUnitVector() { mVec = 0; } + cUnitVector( const Vector& vec ) + { + packVector( vec ); + } + cUnitVector( unsigned short val ) { mVec = val; } + + cUnitVector& operator=( const Vector& vec ) + { packVector( vec ); return *this; } + + // decode into a local so the conversion does not go through the shared static mTmpVec + operator Vector() + { + Vector tmp; + unpackVector( tmp ); + return tmp; + } + + // quantize vec (any non-zero length) into the 16-bit mVec: 3 sign bits, then xbits and ybits + void packVector( const Vector& vec ) + { + // convert from Vector to cUnitVector + + Assert( vec.IsValid()); + Vector tmp = vec; + + // input vector does not have to be unit length + // Assert( tmp.length() <= 1.001f ); + + mVec = 0; + if ( tmp.x < 0 ) { mVec |= XSIGN_MASK; tmp.x = -tmp.x; } + if ( tmp.y < 0 ) { mVec |= YSIGN_MASK; tmp.y = -tmp.y; } + if ( tmp.z < 0 ) { mVec |= ZSIGN_MASK; tmp.z = -tmp.z; } + + // project the normal onto the plane that goes through + // X0=(1,0,0),Y0=(0,1,0),Z0=(0,0,1). + // on that plane we choose an (projective!) coordinate system + // such that X0->(0,0), Y0->(126,0), Z0->(0,126),(0,0,0)->Infinity + + // a little slower... old pack was 4 multiplies and 2 adds. + // This is 2 multiplies, 2 adds, and a divide.... + float sum = tmp.x + tmp.y + tmp.z; + if ( sum <= 0.0f ) + { + // zero-length input has no direction and would divide by zero below; + // keep xbits = ybits = 0, which unpackVector() decodes as the Z axis + return; + } + float w = 126.0f / sum; + long xbits = (long)( tmp.x * w ); + long ybits = (long)( tmp.y * w ); + + Assert( xbits < 127 ); + Assert( xbits >= 0 ); + Assert( ybits < 127 ); + Assert( ybits >= 0 ); + + // Now we can be sure that 0<=xp<=126, 0<=yp<=126, 0<=xp+yp<=126 + // however for the sampling we want to transform this triangle + // into a rectangle. 
+ if ( xbits >= 64 ) + { + xbits = 127 - xbits; + ybits = 127 - ybits; + } + + // now that we have xbits in the range [0,63] (6 bits) and ybits in + // the range [0,127] (7 bits), we can pack all the bits together + mVec |= ( xbits << 7 ); + mVec |= ybits; + } + + // decode mVec into vec; scales by the mUVAdjustment table that initializeStatics() fills + void unpackVector( Vector& vec ) + { + // if we do a straightforward backward transform + // we will get points on the plane X0,Y0,Z0 + // however we need points on a sphere that goes through + // these points. Therefore we need to adjust x,y,z so + // that x^2+y^2+z^2=1 by normalizing the vector. We have + // already precalculated the amount by which we need to + // scale, so all we do is a table lookup and a + // multiplication + + // get the x and y bits + long xbits = (( mVec & TOP_MASK ) >> 7 ); + long ybits = ( mVec & BOTTOM_MASK ); + + // map the numbers back to the triangle (0,0)-(0,126)-(126,0) + if (( xbits + ybits ) >= 127 ) + { + xbits = 127 - xbits; + ybits = 127 - ybits; + } + + // do the inverse transform and normalization + // costs 3 extra multiplies and 2 subtracts. No big deal. 
+ float uvadj = mUVAdjustment[mVec & ~SIGN_MASK]; + vec.x = uvadj * (float) xbits; + vec.y = uvadj * (float) ybits; + vec.z = uvadj * (float)( 126 - xbits - ybits ); + + // set all the sign bits + if ( mVec & XSIGN_MASK ) vec.x = -vec.x; + if ( mVec & YSIGN_MASK ) vec.y = -vec.y; + if ( mVec & ZSIGN_MASK ) vec.z = -vec.z; + + Assert( vec.IsValid()); + } + + static void initializeStatics() + { + for ( int idx = 0; idx < 0x2000; idx++ ) + { + long xbits = idx >> 7; + long ybits = idx & BOTTOM_MASK; + + // map the numbers back to the triangle (0,0)-(0,127)-(127,0) + if (( xbits + ybits ) >= 127 ) + { + xbits = 127 - xbits; + ybits = 127 - ybits; + } + + // convert to 3D vectors + float x = (float)xbits; + float y = (float)ybits; + float z = (float)( 126 - xbits - ybits ); + + // calculate the amount of normalization required + mUVAdjustment[idx] = 1.0f / sqrtf( y*y + z*z + x*x ); + Assert( _finite( mUVAdjustment[idx])); + + //cerr << mUVAdjustment[idx] << "\t"; + //if ( xbits == 0 ) cerr << "\n"; + } + } + +#if 0 + void test() + { + #define TEST_RANGE 4 + #define TEST_RANDOM 100 + #define TEST_ANGERROR 1.0 + + float maxError = 0; + float avgError = 0; + int numVecs = 0; + + {for ( int x = -TEST_RANGE; x < TEST_RANGE; x++ ) + { + for ( int y = -TEST_RANGE; y < TEST_RANGE; y++ ) + { + for ( int z = -TEST_RANGE; z < TEST_RANGE; z++ ) + { + if (( x + y + z ) == 0 ) continue; + + Vector vec( (float)x, (float)y, (float)z ); + Vector vec2; + + vec.normalize(); + packVector( vec ); + unpackVector( vec2 ); + + float ang = vec.dot( vec2 ); + ang = (( fabs( ang ) > 0.99999f ) ? 
0 : (float)acos(ang)); + + if (( ang > TEST_ANGERROR ) | ( !_finite( ang ))) + { + cerr << "error: " << ang << endl; + cerr << "orig vec: " << vec.x << ",\t" + << vec.y << ",\t" << vec.z << "\tmVec: " + << mVec << endl; + cerr << "quantized vec2: " << vec2.x + << ",\t" << vec2.y << ",\t" + << vec2.z << endl << endl; + } + avgError += ang; + numVecs++; + if ( maxError < ang ) maxError = ang; + } + } + }} + + for ( int w = 0; w < TEST_RANDOM; w++ ) + { + Vector vec( genRandom(), genRandom(), genRandom()); + Vector vec2; + vec.normalize(); + + packVector( vec ); + unpackVector( vec2 ); + + float ang =vec.dot( vec2 ); + ang = (( ang > 0.999f ) ? 0 : (float)acos(ang)); + + if (( ang > TEST_ANGERROR ) | ( !_finite( ang ))) + { + cerr << "error: " << ang << endl; + cerr << "orig vec: " << vec.x << ",\t" + << vec.y << ",\t" << vec.z << "\tmVec: " + << mVec << endl; + cerr << "quantized vec2: " << vec2.x << ",\t" + << vec2.y << ",\t" + << vec2.z << endl << endl; + } + avgError += ang; + numVecs++; + if ( maxError < ang ) maxError = ang; + } + + { for ( int x = 0; x < 50; x++ ) + { + Vector vec( (float)x, 25.0f, 0.0f ); + Vector vec2; + + vec.normalize(); + packVector( vec ); + unpackVector( vec2 ); + + float ang = vec.dot( vec2 ); + ang = (( fabs( ang ) > 0.999f ) ? 0 : (float)acos(ang)); + + if (( ang > TEST_ANGERROR ) | ( !_finite( ang ))) + { + cerr << "error: " << ang << endl; + cerr << "orig vec: " << vec.x << ",\t" + << vec.y << ",\t" << vec.z << "\tmVec: " + << mVec << endl; + cerr << " quantized vec2: " << vec2.x << ",\t" + << vec2.y << ",\t" << vec2.z << endl << endl; + } + + avgError += ang; + numVecs++; + if ( maxError < ang ) maxError = ang; + }} + + cerr << "max angle error: " << maxError + << ", average error: " << avgError / numVecs + << ", num tested vecs: " << numVecs << endl; + } + + friend ostream& operator<< ( ostream& os, const cUnitVector& vec ) + { os << vec.mVec; return os; } +#endif + +//protected: // !!!! 
+ + unsigned short mVec; + static float mUVAdjustment[0x2000]; + static Vector mTmpVec; +}; + +#endif // _3D_UNITVEC_H + + diff --git a/public/mathlib/compressed_light_cube.h b/public/mathlib/compressed_light_cube.h new file mode 100644 index 0000000..207f92d --- /dev/null +++ b/public/mathlib/compressed_light_cube.h @@ -0,0 +1,24 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//=============================================================================// + +#ifndef COMPRESSED_LIGHT_CUBE_H +#define COMPRESSED_LIGHT_CUBE_H +#ifdef _WIN32 +#pragma once +#endif + + +#include "mathlib/mathlib.h" + + +struct CompressedLightCube +{ + DECLARE_BYTESWAP_DATADESC(); + ColorRGBExp32 m_Color[6]; +}; + + +#endif // COMPRESSED_LIGHT_CUBE_H diff --git a/public/mathlib/compressed_vector.h b/public/mathlib/compressed_vector.h new file mode 100644 index 0000000..6a49522 --- /dev/null +++ b/public/mathlib/compressed_vector.h @@ -0,0 +1,608 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +// $NoKeywords: $ +// +//=============================================================================// + +#ifndef COMPRESSED_VECTOR_H +#define COMPRESSED_VECTOR_H + +#ifdef _WIN32 +#pragma once +#endif + +#include <math.h> +#include <float.h> + +// For vec_t, put this somewhere else? +#include "basetypes.h" + +// For rand(). We really need a library! 
+#include <stdlib.h> + +#include "tier0/dbg.h" +#include "mathlib/vector.h" + +#include "mathlib/mathlib.h" + +#if defined( _X360 ) +#pragma bitfield_order( push, lsb_to_msb ) +#endif +//========================================================= +// fit a 3D vector into 32 bits +//========================================================= + +class Vector32 +{ +public: + // Construction/destruction: + Vector32(void); + Vector32(vec_t X, vec_t Y, vec_t Z); + + // assignment + Vector32& operator=(const Vector &vOther); + operator Vector (); + +private: + unsigned short x:10; + unsigned short y:10; + unsigned short z:10; + unsigned short exp:2; +}; + +inline Vector32& Vector32::operator=(const Vector &vOther) +{ + CHECK_VALID(vOther); + + static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f }; + + float fmax = Max( fabs( vOther.x ), fabs( vOther.y ) ); + fmax = Max( fmax, (float)fabs( vOther.z ) ); + + for (exp = 0; exp < 3; exp++) + { + if (fmax < expScale[exp]) + break; + } + Assert( fmax < expScale[exp] ); + + float fexp = 512.0f / expScale[exp]; + + x = Clamp( (int)(vOther.x * fexp) + 512, 0, 1023 ); + y = Clamp( (int)(vOther.y * fexp) + 512, 0, 1023 ); + z = Clamp( (int)(vOther.z * fexp) + 512, 0, 1023 ); + return *this; +} + + +inline Vector32::operator Vector () +{ + Vector tmp; + + static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f }; + + float fexp = expScale[exp] / 512.0f; + + tmp.x = (((int)x) - 512) * fexp; + tmp.y = (((int)y) - 512) * fexp; + tmp.z = (((int)z) - 512) * fexp; + return tmp; +} + + +//========================================================= +// Fit a unit vector into 32 bits +//========================================================= + +class Normal32 +{ +public: + // Construction/destruction: + Normal32(void); + Normal32(vec_t X, vec_t Y, vec_t Z); + + // assignment + Normal32& operator=(const Vector &vOther); + operator Vector (); + +private: + unsigned short x:15; + unsigned short y:15; + unsigned short zneg:1; +}; + + +inline Normal32& 
Normal32::operator=(const Vector &vOther) +{ + CHECK_VALID(vOther); + + // x and y are stored as 15-bit offsets around 16384; z keeps only its sign + x = Clamp( (int)(vOther.x * 16384) + 16384, 0, 32767 ); + y = Clamp( (int)(vOther.y * 16384) + 16384, 0, 32767 ); + zneg = (vOther.z < 0); + //x = vOther.x; + //y = vOther.y; + //z = vOther.z; + return *this; +} + + +inline Normal32::operator Vector () +{ + Vector tmp; + + tmp.x = ((int)x - 16384) * (1 / 16384.0); + tmp.y = ((int)y - 16384) * (1 / 16384.0); + // rebuild |z| from x and y; a negative radicand (stored x,y whose squares sum above 1) + // would make sqrt() return NaN, so treat it as zero + tmp.z = 1 - tmp.x * tmp.x - tmp.y * tmp.y; + tmp.z = ( tmp.z > 0 ) ? sqrt( tmp.z ) : 0; + if (zneg) + tmp.z = -tmp.z; + return tmp; +} + + +//========================================================= +// 64 bit Quaternion +//========================================================= + +class Quaternion64 +{ +public: + // Construction/destruction: + Quaternion64(void); + Quaternion64(vec_t X, vec_t Y, vec_t Z); + + // assignment + // Quaternion& operator=(const Quaternion64 &vOther); + Quaternion64& operator=(const Quaternion &vOther); + operator Quaternion (); +private: + uint64 x:21; + uint64 y:21; + uint64 z:21; + uint64 wneg:1; +}; + + +inline Quaternion64::operator Quaternion () +{ + Quaternion tmp; + + // shift to -1048576, + 1048575, then round down slightly to -1.0 < x < 1.0 + tmp.x = ((int)x - 1048576) * (1 / 1048576.5f); + tmp.y = ((int)y - 1048576) * (1 / 1048576.5f); + tmp.z = ((int)z - 1048576) * (1 / 1048576.5f); + // rebuild |w| from x, y, z; a negative radicand would make sqrt() return NaN, so treat it as zero + tmp.w = 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z; + tmp.w = ( tmp.w > 0 ) ? sqrt( tmp.w ) : 0; + if (wneg) + tmp.w = -tmp.w; + return tmp; +} + +inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther) +{ + CHECK_VALID(vOther); + + // x, y, z are stored as 21-bit offsets around 1048576; w keeps only its sign + x = Clamp( (int)(vOther.x * 1048576) + 1048576, 0, 2097151 ); + y = Clamp( (int)(vOther.y * 1048576) + 1048576, 0, 2097151 ); + z = Clamp( (int)(vOther.z * 1048576) + 1048576, 0, 2097151 ); + wneg = (vOther.w < 0); + return *this; +} + +//========================================================= +// 48 bit Quaternion +//========================================================= + +class Quaternion48 +{ +public: + // 
Construction/destruction: + Quaternion48(void); + Quaternion48(vec_t X, vec_t Y, vec_t Z); + + // assignment + // Quaternion& operator=(const Quaternion48 &vOther); + Quaternion48& operator=(const Quaternion &vOther); + operator Quaternion (); +private: + unsigned short x:16; + unsigned short y:16; + unsigned short z:15; + unsigned short wneg:1; +}; + + +inline Quaternion48::operator Quaternion () +{ + Quaternion tmp; + + tmp.x = ((int)x - 32768) * (1 / 32768.0); + tmp.y = ((int)y - 32768) * (1 / 32768.0); + tmp.z = ((int)z - 16384) * (1 / 16384.0); + tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); + if (wneg) + tmp.w = -tmp.w; + return tmp; +} + +inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther) +{ + CHECK_VALID(vOther); + + x = Clamp( (int)(vOther.x * 32768) + 32768, 0, 65535 ); + y = Clamp( (int)(vOther.y * 32768) + 32768, 0, 65535 ); + z = Clamp( (int)(vOther.z * 16384) + 16384, 0, 32767 ); + wneg = (vOther.w < 0); + return *this; +} + +//========================================================= +// 32 bit Quaternion +//========================================================= + +class Quaternion32 +{ +public: + // Construction/destruction: + Quaternion32(void); + Quaternion32(vec_t X, vec_t Y, vec_t Z); + + // assignment + // Quaternion& operator=(const Quaternion48 &vOther); + Quaternion32& operator=(const Quaternion &vOther); + operator Quaternion (); +private: + unsigned int x:11; + unsigned int y:10; + unsigned int z:10; + unsigned int wneg:1; +}; + + +inline Quaternion32::operator Quaternion () +{ + Quaternion tmp; + + tmp.x = ((int)x - 1024) * (1 / 1024.0); + tmp.y = ((int)y - 512) * (1 / 512.0); + tmp.z = ((int)z - 512) * (1 / 512.0); + tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); + if (wneg) + tmp.w = -tmp.w; + return tmp; +} + +inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther) +{ + CHECK_VALID(vOther); + + x = Clamp( (int)(vOther.x * 1024) + 1024, 0, 2047 ); + y = 
Clamp( (int)(vOther.y * 512) + 512, 0, 1023 ); + z = Clamp( (int)(vOther.z * 512) + 512, 0, 1023 ); + wneg = (vOther.w < 0); + return *this; +} + +//========================================================= +// 16 bit float +//========================================================= + + +const int float32bias = 127; +const int float16bias = 15; + +const float maxfloat16bits = 65504.0f; + +class float16 +{ +public: + //float16() {} + //float16( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); } + + void Init() { m_storage.rawWord = 0; } +// float16& operator=(const float16 &other) { m_storage.rawWord = other.m_storage.rawWord; return *this; } +// float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; } +// operator unsigned short () { return m_storage.rawWord; } +// operator float () { return Convert16bitFloatTo32bits( m_storage.rawWord ); } + unsigned short GetBits() const + { + return m_storage.rawWord; + } + float GetFloat() const + { + return Convert16bitFloatTo32bits( m_storage.rawWord ); + } + void SetFloat( float in ) + { + m_storage.rawWord = ConvertFloatTo16bits( in ); + } + + bool IsInfinity() const + { + return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa == 0; + } + bool IsNaN() const + { + return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa != 0; + } + + bool operator==(const float16 other) const { return m_storage.rawWord == other.m_storage.rawWord; } + bool operator!=(const float16 other) const { return m_storage.rawWord != other.m_storage.rawWord; } + +// bool operator< (const float other) const { return GetFloat() < other; } +// bool operator> (const float other) const { return GetFloat() > other; } + +protected: + union float32bits + { + float rawFloat; + struct + { + unsigned int mantissa : 23; + unsigned int biased_exponent : 8; + unsigned int sign : 1; + } bits; + }; + + union float16bits + { + unsigned short rawWord; + struct + { + unsigned short 
mantissa : 10; + unsigned short biased_exponent : 5; + unsigned short sign : 1; + } bits; + }; + + static bool IsNaN( float16bits in ) + { + return in.bits.biased_exponent == 31 && in.bits.mantissa != 0; + } + static bool IsInfinity( float16bits in ) + { + return in.bits.biased_exponent == 31 && in.bits.mantissa == 0; + } + + // 0x0001 - 0x03ff + static unsigned short ConvertFloatTo16bits( float input ) + { + if ( input > maxfloat16bits ) + input = maxfloat16bits; + else if ( input < -maxfloat16bits ) + input = -maxfloat16bits; + + float16bits output; + float32bits inFloat; + + inFloat.rawFloat = input; + + output.bits.sign = inFloat.bits.sign; + + if ( (inFloat.bits.biased_exponent==0) && (inFloat.bits.mantissa==0) ) + { + // zero + output.bits.mantissa = 0; + output.bits.biased_exponent = 0; + } + else if ( (inFloat.bits.biased_exponent==0) && (inFloat.bits.mantissa!=0) ) + { + // denorm -- denorm float maps to 0 half + output.bits.mantissa = 0; + output.bits.biased_exponent = 0; + } + else if ( (inFloat.bits.biased_exponent==0xff) && (inFloat.bits.mantissa==0) ) + { +#if 0 + // infinity + output.bits.mantissa = 0; + output.bits.biased_exponent = 31; +#else + // infinity maps to maxfloat + output.bits.mantissa = 0x3ff; + output.bits.biased_exponent = 0x1e; +#endif + } + else if ( (inFloat.bits.biased_exponent==0xff) && (inFloat.bits.mantissa!=0) ) + { +#if 0 + // NaN + output.bits.mantissa = 1; + output.bits.biased_exponent = 31; +#else + // NaN maps to zero + output.bits.mantissa = 0; + output.bits.biased_exponent = 0; +#endif + } + else + { + // regular number + int new_exp = inFloat.bits.biased_exponent-127; + + if (new_exp<-24) + { + // this maps to 0 + output.bits.mantissa = 0; + output.bits.biased_exponent = 0; + } + + if (new_exp<-14) + { + // this maps to a denorm + output.bits.biased_exponent = 0; + unsigned int exp_val = ( unsigned int )( -14 - ( inFloat.bits.biased_exponent - float32bias ) ); + if( exp_val > 0 && exp_val < 11 ) + { + 
output.bits.mantissa = ( 1 << ( 10 - exp_val ) ) + ( inFloat.bits.mantissa >> ( 13 + exp_val ) ); + } + } + else if (new_exp>15) + { +#if 0 + // map this value to infinity + output.bits.mantissa = 0; + output.bits.biased_exponent = 31; +#else + // to big. . . maps to maxfloat + output.bits.mantissa = 0x3ff; + output.bits.biased_exponent = 0x1e; +#endif + } + else + { + output.bits.biased_exponent = new_exp+15; + output.bits.mantissa = (inFloat.bits.mantissa >> 13); + } + } + return output.rawWord; + } + + static float Convert16bitFloatTo32bits( unsigned short input ) + { + float32bits output; + const float16bits &inFloat = *((float16bits *)&input); + + if( IsInfinity( inFloat ) ) + { + return maxfloat16bits * ( ( inFloat.bits.sign == 1 ) ? -1.0f : 1.0f ); + } + if( IsNaN( inFloat ) ) + { + return 0.0; + } + if( inFloat.bits.biased_exponent == 0 && inFloat.bits.mantissa != 0 ) + { + // denorm + const float half_denorm = (1.0f/16384.0f); // 2^-14 + float mantissa = ((float)(inFloat.bits.mantissa)) / 1024.0f; + float sgn = (inFloat.bits.sign)? 
-1.0f :1.0f; + output.rawFloat = sgn*mantissa*half_denorm; + } + else + { + // regular number + unsigned mantissa = inFloat.bits.mantissa; + unsigned biased_exponent = inFloat.bits.biased_exponent; + unsigned sign = ((unsigned)inFloat.bits.sign) << 31; + biased_exponent = ( (biased_exponent - float16bias + float32bias) * (biased_exponent != 0) ) << 23; + mantissa <<= (23-10); + + *((unsigned *)&output) = ( mantissa | biased_exponent | sign ); + } + + return output.rawFloat; + } + + + float16bits m_storage; +}; + +class float16_with_assign : public float16 +{ +public: + float16_with_assign() {} + float16_with_assign( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); } + + float16& operator=(const float16 &other) { m_storage.rawWord = ((float16_with_assign &)other).m_storage.rawWord; return *this; } + float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; } +// operator unsigned short () const { return m_storage.rawWord; } + operator float () const { return Convert16bitFloatTo32bits( m_storage.rawWord ); } +}; + +//========================================================= +// Fit a 3D vector in 48 bits +//========================================================= + +class Vector48 +{ +public: + // Construction/destruction: + Vector48(void) {} + Vector48(vec_t X, vec_t Y, vec_t Z) { x.SetFloat( X ); y.SetFloat( Y ); z.SetFloat( Z ); } + + // assignment + Vector48& operator=(const Vector &vOther); + operator Vector (); + + const float operator[]( int i ) const { return (((float16 *)this)[i]).GetFloat(); } + + float16 x; + float16 y; + float16 z; +}; + +inline Vector48& Vector48::operator=(const Vector &vOther) +{ + CHECK_VALID(vOther); + + x.SetFloat( vOther.x ); + y.SetFloat( vOther.y ); + z.SetFloat( vOther.z ); + return *this; +} + + +inline Vector48::operator Vector () +{ + Vector tmp; + + tmp.x = x.GetFloat(); + tmp.y = y.GetFloat(); + tmp.z = z.GetFloat(); + + return tmp; +} + 
+//========================================================= +// Fit a 2D vector in 32 bits +//========================================================= + +class Vector2d32 +{ +public: + // Construction/destruction: + Vector2d32(void) {} + Vector2d32(vec_t X, vec_t Y) { x.SetFloat( X ); y.SetFloat( Y ); } + + // assignment + Vector2d32& operator=(const Vector &vOther); + Vector2d32& operator=(const Vector2D &vOther); + + operator Vector2D (); + + void Init( vec_t ix = 0.f, vec_t iy = 0.f); + + float16_with_assign x; + float16_with_assign y; +}; + +inline Vector2d32& Vector2d32::operator=(const Vector2D &vOther) +{ + x.SetFloat( vOther.x ); + y.SetFloat( vOther.y ); + return *this; +} + +inline Vector2d32::operator Vector2D () +{ + Vector2D tmp; + + tmp.x = x.GetFloat(); + tmp.y = y.GetFloat(); + + return tmp; +} + +inline void Vector2d32::Init( vec_t ix, vec_t iy ) +{ + x.SetFloat(ix); + y.SetFloat(iy); +} + +#if defined( _X360 ) +#pragma bitfield_order( pop ) +#endif + +#endif + diff --git a/public/mathlib/halton.h b/public/mathlib/halton.h new file mode 100644 index 0000000..44df68f --- /dev/null +++ b/public/mathlib/halton.h @@ -0,0 +1,71 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// $Id$ + +// halton.h - classes, etc for generating numbers using the Halton pseudo-random sequence. See +// http://halton-sequences.wikiverse.org/. +// +// what this function is useful for is any sort of sampling/integration problem where +// you want to solve it by random sampling. Each call the NextValue() generates +// a random number between 0 and 1, in an unclumped manner, so that the space can be more +// or less evenly sampled with a minimum number of samples. +// +// It is NOT useful for generating random numbers dynamically, since the outputs aren't +// particularly random. +// +// To generate multidimensional sample values (points in a plane, etc), use two +// HaltonSequenceGenerator_t's, with different (primes) bases. 
+ +#ifndef HALTON_H +#define HALTON_H + +#include <tier0/platform.h> +#include <mathlib/vector.h> + +class HaltonSequenceGenerator_t +{ + int seed; + int base; + float fbase; //< base as a float + +public: + HaltonSequenceGenerator_t(int base); //< base MUST be prime, >=2 + + float GetElement(int element); + + inline float NextValue(void) + { + return GetElement(seed++); + } + +}; + + +class DirectionalSampler_t //< pseudo-random sphere sampling +{ + HaltonSequenceGenerator_t zdot; + HaltonSequenceGenerator_t vrot; +public: + DirectionalSampler_t(void) + : zdot(2),vrot(3) + { + } + + Vector NextValue(void) + { + float zvalue=zdot.NextValue(); + zvalue=2*zvalue-1.0; // map from 0..1 to -1..1 + float phi=acos(zvalue); + // now, generate a random rotation angle for x/y + float theta=2.0*M_PI*vrot.NextValue(); + float sin_p=sin(phi); + return Vector(cos(theta)*sin_p, + sin(theta)*sin_p, + zvalue); + + } +}; + + + + +#endif // halton_h diff --git a/public/mathlib/lightdesc.h b/public/mathlib/lightdesc.h new file mode 100644 index 0000000..1096d62 --- /dev/null +++ b/public/mathlib/lightdesc.h @@ -0,0 +1,173 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//===========================================================================// + +// light structure definitions. 
+#ifndef LIGHTDESC_H +#define LIGHTDESC_H + +#include <mathlib/ssemath.h> +#include <mathlib/vector.h> + +//----------------------------------------------------------------------------- +// Light structure +//----------------------------------------------------------------------------- + +enum LightType_t +{ + MATERIAL_LIGHT_DISABLE = 0, + MATERIAL_LIGHT_POINT, + MATERIAL_LIGHT_DIRECTIONAL, + MATERIAL_LIGHT_SPOT, +}; + +enum LightType_OptimizationFlags_t +{ + LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 = 1, + LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 = 2, + LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 = 4, + LIGHTTYPE_OPTIMIZATIONFLAGS_DERIVED_VALUES_CALCED = 8, +}; + +struct LightDesc_t +{ + LightType_t m_Type; //< MATERIAL_LIGHT_xxx + Vector m_Color; //< color+intensity + Vector m_Position; //< light source center position + Vector m_Direction; //< for SPOT, direction it is pointing + float m_Range; //< distance range for light.0=infinite + float m_Falloff; //< angular falloff exponent for spot lights + float m_Attenuation0; //< constant distance falloff term + float m_Attenuation1; //< linear term of falloff + float m_Attenuation2; //< quadatic term of falloff + float m_Theta; //< inner cone angle. no angular falloff + //< within this cone + float m_Phi; //< outer cone angle + + // the values below are derived from the above settings for optimizations + // These aren't used by DX8. . used for software lighting. + float m_ThetaDot; + float m_PhiDot; + unsigned int m_Flags; +protected: + float OneOver_ThetaDot_Minus_PhiDot; + float m_RangeSquared; +public: + + void RecalculateDerivedValues(void); // calculate m_xxDot, m_Type for changed parms + + LightDesc_t(void) + { + } + + // constructors for various useful subtypes + + // a point light with infinite range + LightDesc_t( const Vector &pos, const Vector &color ) + { + InitPoint( pos, color ); + } + + /// a simple light. cone boundaries in radians. 
you pass a look_at point and the + /// direciton is derived from that. + LightDesc_t( const Vector &pos, const Vector &color, const Vector &point_at, + float inner_cone_boundary, float outer_cone_boundary ) + { + InitSpot( pos, color, point_at, inner_cone_boundary, outer_cone_boundary ); + } + + void InitPoint( const Vector &pos, const Vector &color ); + void InitDirectional( const Vector &dir, const Vector &color ); + void InitSpot(const Vector &pos, const Vector &color, const Vector &point_at, + float inner_cone_boundary, float outer_cone_boundary ); + + /// Given 4 points and 4 normals, ADD lighting from this light into "color". + void ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal, + FourVectors &color, bool DoHalfLambert=false ) const; + void ComputeNonincidenceLightAtPoints( const FourVectors &pos, FourVectors &color ) const; + void ComputeLightAtPointsForDirectional( const FourVectors &pos, + const FourVectors &normal, + FourVectors &color, bool DoHalfLambert=false ) const; + + // warning - modifies color!!! set color first!! 
+ void SetupOldStyleAttenuation( float fQuadatricAttn, float fLinearAttn, float fConstantAttn ); + + void SetupNewStyleAttenuation( float fFiftyPercentDistance, float fZeroPercentDistance ); + + +/// given a direction relative to the light source position, is this ray within the + /// light cone (for spotlights..non spots consider all rays to be within their cone) + bool IsDirectionWithinLightCone(const Vector &rdir) const + { + return ((m_Type!=MATERIAL_LIGHT_SPOT) || (rdir.Dot(m_Direction)>=m_PhiDot)); + } + + float OneOverThetaDotMinusPhiDot() const + { + return OneOver_ThetaDot_Minus_PhiDot; + } +}; + + +//----------------------------------------------------------------------------- +// a point light with infinite range +//----------------------------------------------------------------------------- +inline void LightDesc_t::InitPoint( const Vector &pos, const Vector &color ) +{ + m_Type=MATERIAL_LIGHT_POINT; + m_Color=color; + m_Position=pos; + m_Range=0.0; // infinite + m_Attenuation0=1.0; + m_Attenuation1=0; + m_Attenuation2=0; + RecalculateDerivedValues(); +} + + +//----------------------------------------------------------------------------- +// a directional light with infinite range +//----------------------------------------------------------------------------- +inline void LightDesc_t::InitDirectional( const Vector &dir, const Vector &color ) +{ + m_Type=MATERIAL_LIGHT_DIRECTIONAL; + m_Color=color; + m_Direction=dir; + m_Range=0.0; // infinite + m_Attenuation0=1.0; + m_Attenuation1=0; + m_Attenuation2=0; + RecalculateDerivedValues(); +} + + +//----------------------------------------------------------------------------- +// a simple light. cone boundaries in radians. you pass a look_at point and the +// direciton is derived from that. 
+//----------------------------------------------------------------------------- +inline void LightDesc_t::InitSpot(const Vector &pos, const Vector &color, const Vector &point_at, + float inner_cone_boundary, float outer_cone_boundary) +{ + m_Type=MATERIAL_LIGHT_SPOT; + m_Color=color; + m_Position=pos; + m_Direction=point_at; + m_Direction-=pos; + VectorNormalizeFast(m_Direction); + m_Falloff=5.0; // linear angle falloff + m_Theta=inner_cone_boundary; + m_Phi=outer_cone_boundary; + + m_Range=0.0; // infinite + + m_Attenuation0=1.0; + m_Attenuation1=0; + m_Attenuation2=0; + RecalculateDerivedValues(); +} + + +#endif + diff --git a/public/mathlib/math_pfns.h b/public/mathlib/math_pfns.h new file mode 100644 index 0000000..d43411c --- /dev/null +++ b/public/mathlib/math_pfns.h @@ -0,0 +1,80 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//=====================================================================================// + +#ifndef _MATH_PFNS_H_ +#define _MATH_PFNS_H_ + +#if defined( _X360 ) +#include <xboxmath.h> +#endif + +#if !defined( _X360 ) + +// These globals are initialized by mathlib and redirected based on available fpu features +extern float (*pfSqrt)(float x); +extern float (*pfRSqrt)(float x); +extern float (*pfRSqrtFast)(float x); +extern void (*pfFastSinCos)(float x, float *s, float *c); +extern float (*pfFastCos)(float x); + +// The following are not declared as macros because they are often used in limiting situations, +// and sometimes the compiler simply refuses to inline them for some reason +#define FastSqrt(x) (*pfSqrt)(x) +#define FastRSqrt(x) (*pfRSqrt)(x) +#define FastRSqrtFast(x) (*pfRSqrtFast)(x) +#define FastSinCos(x,s,c) (*pfFastSinCos)(x,s,c) +#define FastCos(x) (*pfFastCos)(x) + +#if defined(__i386__) || defined(_M_IX86) +// On x86, the inline FPU or SSE sqrt instruction is faster than +// the overhead of setting up a function call and saving/restoring +// the FPU or SSE 
register state and can be scheduled better, too. +#undef FastSqrt +#define FastSqrt(x) ::sqrtf(x) +#endif + +#endif // !_X360 + +#if defined( _X360 ) + +FORCEINLINE float _VMX_Sqrt( float x ) +{ + return __fsqrts( x ); +} + +FORCEINLINE float _VMX_RSqrt( float x ) +{ + float rroot = __frsqrte( x ); + + // Single iteration NewtonRaphson on reciprocal square root estimate + return (0.5f * rroot) * (3.0f - (x * rroot) * rroot); +} + +FORCEINLINE float _VMX_RSqrtFast( float x ) +{ + return __frsqrte( x ); +} + +FORCEINLINE void _VMX_SinCos( float a, float *pS, float *pC ) +{ + XMScalarSinCos( pS, pC, a ); +} + +FORCEINLINE float _VMX_Cos( float a ) +{ + return XMScalarCos( a ); +} + +// the 360 has fixed hw and calls directly +#define FastSqrt(x) _VMX_Sqrt(x) +#define FastRSqrt(x) _VMX_RSqrt(x) +#define FastRSqrtFast(x) _VMX_RSqrtFast(x) +#define FastSinCos(x,s,c) _VMX_SinCos(x,s,c) +#define FastCos(x) _VMX_Cos(x) + +#endif // _X360 + +#endif // _MATH_PFNS_H_ diff --git a/public/mathlib/mathlib.h b/public/mathlib/mathlib.h new file mode 100644 index 0000000..a6d302f --- /dev/null +++ b/public/mathlib/mathlib.h @@ -0,0 +1,2187 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//===========================================================================// + +#ifndef MATH_LIB_H +#define MATH_LIB_H + +#include <math.h> +#include "minmax.h" +#include "tier0/basetypes.h" +#include "tier0/commonmacros.h" +#include "mathlib/vector.h" +#include "mathlib/vector2d.h" +#include "tier0/dbg.h" + +#include "mathlib/math_pfns.h" + +#if defined(__i386__) || defined(_M_IX86) +// For MMX intrinsics +#include <xmmintrin.h> +#endif + +// XXX remove me +#undef clamp + +// Uncomment this to enable FP exceptions in parts of the code. +// This can help track down FP bugs. However the code is not +// FP exception clean so this not a turnkey operation. 
+//#define FP_EXCEPTIONS_ENABLED + + +#ifdef FP_EXCEPTIONS_ENABLED +#include <float.h> // For _clearfp and _controlfp_s +#endif + +// FPExceptionDisabler and FPExceptionEnabler taken from my blog post +// at http://www.altdevblogaday.com/2012/04/20/exceptional-floating-point/ + +// Declare an object of this type in a scope in order to suppress +// all floating-point exceptions temporarily. The old exception +// state will be reset at the end. +class FPExceptionDisabler +{ +public: +#ifdef FP_EXCEPTIONS_ENABLED + FPExceptionDisabler(); + ~FPExceptionDisabler(); + +private: + unsigned int mOldValues; +#else + FPExceptionDisabler() {} + ~FPExceptionDisabler() {} +#endif + +private: + // Make the copy constructor and assignment operator private + // and unimplemented to prohibit copying. + FPExceptionDisabler(const FPExceptionDisabler&); + FPExceptionDisabler& operator=(const FPExceptionDisabler&); +}; + +// Declare an object of this type in a scope in order to enable a +// specified set of floating-point exceptions temporarily. The old +// exception state will be reset at the end. +// This class can be nested. +class FPExceptionEnabler +{ +public: + // Overflow, divide-by-zero, and invalid-operation are the FP + // exceptions most frequently associated with bugs. +#ifdef FP_EXCEPTIONS_ENABLED + FPExceptionEnabler(unsigned int enableBits = _EM_OVERFLOW | _EM_ZERODIVIDE | _EM_INVALID); + ~FPExceptionEnabler(); + +private: + unsigned int mOldValues; +#else + FPExceptionEnabler(unsigned int enableBits = 0) + { + } + ~FPExceptionEnabler() + { + } +#endif + +private: + // Make the copy constructor and assignment operator private + // and unimplemented to prohibit copying. 
+ FPExceptionEnabler(const FPExceptionEnabler&); + FPExceptionEnabler& operator=(const FPExceptionEnabler&); +}; + + + +#ifdef DEBUG // stop crashing edit-and-continue +FORCEINLINE float clamp( float val, float minVal, float maxVal ) +{ + if ( maxVal < minVal ) + return maxVal; + else if( val < minVal ) + return minVal; + else if( val > maxVal ) + return maxVal; + else + return val; +} +#else // DEBUG +FORCEINLINE float clamp( float val, float minVal, float maxVal ) +{ +#if defined(__i386__) || defined(_M_IX86) + _mm_store_ss( &val, + _mm_min_ss( + _mm_max_ss( + _mm_load_ss(&val), + _mm_load_ss(&minVal) ), + _mm_load_ss(&maxVal) ) ); +#else + val = fpmax(minVal, val); + val = fpmin(maxVal, val); +#endif + return val; +} +#endif // DEBUG + +// +// Returns a clamped value in the range [min, max]. +// +template< class T > +inline T clamp( T const &val, T const &minVal, T const &maxVal ) +{ + if ( maxVal < minVal ) + return maxVal; + else if( val < minVal ) + return minVal; + else if( val > maxVal ) + return maxVal; + else + return val; +} + + +// plane_t structure +// !!! if this is changed, it must be changed in asm code too !!! +// FIXME: does the asm code even exist anymore? 
+// FIXME: this should move to a different file +struct cplane_t +{ + Vector normal; + float dist; + byte type; // for fast side tests + byte signbits; // signx + (signy<<1) + (signz<<1) + byte pad[2]; + +#ifdef VECTOR_NO_SLOW_OPERATIONS + cplane_t() {} + +private: + // No copy constructors allowed if we're in optimal mode + cplane_t(const cplane_t& vOther); +#endif +}; + +// structure offset for asm code +#define CPLANE_NORMAL_X 0 +#define CPLANE_NORMAL_Y 4 +#define CPLANE_NORMAL_Z 8 +#define CPLANE_DIST 12 +#define CPLANE_TYPE 16 +#define CPLANE_SIGNBITS 17 +#define CPLANE_PAD0 18 +#define CPLANE_PAD1 19 + +// 0-2 are axial planes +#define PLANE_X 0 +#define PLANE_Y 1 +#define PLANE_Z 2 + +// 3-5 are non-axial planes snapped to the nearest +#define PLANE_ANYX 3 +#define PLANE_ANYY 4 +#define PLANE_ANYZ 5 + + +//----------------------------------------------------------------------------- +// Frustum plane indices. +// WARNING: there is code that depends on these values +//----------------------------------------------------------------------------- + +enum +{ + FRUSTUM_RIGHT = 0, + FRUSTUM_LEFT = 1, + FRUSTUM_TOP = 2, + FRUSTUM_BOTTOM = 3, + FRUSTUM_NEARZ = 4, + FRUSTUM_FARZ = 5, + FRUSTUM_NUMPLANES = 6 +}; + +extern int SignbitsForPlane( cplane_t *out ); + +class Frustum_t +{ +public: + void SetPlane( int i, int nType, const Vector &vecNormal, float dist ) + { + m_Plane[i].normal = vecNormal; + m_Plane[i].dist = dist; + m_Plane[i].type = nType; + m_Plane[i].signbits = SignbitsForPlane( &m_Plane[i] ); + m_AbsNormal[i].Init( fabs(vecNormal.x), fabs(vecNormal.y), fabs(vecNormal.z) ); + } + + inline const cplane_t *GetPlane( int i ) const { return &m_Plane[i]; } + inline const Vector &GetAbsNormal( int i ) const { return m_AbsNormal[i]; } + +private: + cplane_t m_Plane[FRUSTUM_NUMPLANES]; + Vector m_AbsNormal[FRUSTUM_NUMPLANES]; +}; + +// Computes Y fov from an X fov and a screen aspect ratio + X from Y +float CalcFovY( float flFovX, float flScreenAspect ); +float 
CalcFovX( float flFovY, float flScreenAspect ); + +// Generate a frustum based on perspective view parameters +// NOTE: FOV is specified in degrees, as the *full* view angle (not half-angle) +void GeneratePerspectiveFrustum( const Vector& origin, const QAngle &angles, float flZNear, float flZFar, float flFovX, float flAspectRatio, Frustum_t &frustum ); +void GeneratePerspectiveFrustum( const Vector& origin, const Vector &forward, const Vector &right, const Vector &up, float flZNear, float flZFar, float flFovX, float flFovY, Frustum_t &frustum ); + +// Cull the world-space bounding box to the specified frustum. +bool R_CullBox( const Vector& mins, const Vector& maxs, const Frustum_t &frustum ); +bool R_CullBoxSkipNear( const Vector& mins, const Vector& maxs, const Frustum_t &frustum ); + +struct matrix3x4_t +{ + matrix3x4_t() {} + matrix3x4_t( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23 ) + { + m_flMatVal[0][0] = m00; m_flMatVal[0][1] = m01; m_flMatVal[0][2] = m02; m_flMatVal[0][3] = m03; + m_flMatVal[1][0] = m10; m_flMatVal[1][1] = m11; m_flMatVal[1][2] = m12; m_flMatVal[1][3] = m13; + m_flMatVal[2][0] = m20; m_flMatVal[2][1] = m21; m_flMatVal[2][2] = m22; m_flMatVal[2][3] = m23; + } + + //----------------------------------------------------------------------------- + // Creates a matrix where the X axis = forward + // the Y axis = left, and the Z axis = up + //----------------------------------------------------------------------------- + void Init( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin ) + { + m_flMatVal[0][0] = xAxis.x; m_flMatVal[0][1] = yAxis.x; m_flMatVal[0][2] = zAxis.x; m_flMatVal[0][3] = vecOrigin.x; + m_flMatVal[1][0] = xAxis.y; m_flMatVal[1][1] = yAxis.y; m_flMatVal[1][2] = zAxis.y; m_flMatVal[1][3] = vecOrigin.y; + m_flMatVal[2][0] = xAxis.z; m_flMatVal[2][1] = yAxis.z; m_flMatVal[2][2] = zAxis.z; m_flMatVal[2][3] 
= vecOrigin.z; + } + + //----------------------------------------------------------------------------- + // Creates a matrix where the X axis = forward + // the Y axis = left, and the Z axis = up + //----------------------------------------------------------------------------- + matrix3x4_t( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector &vecOrigin ) + { + Init( xAxis, yAxis, zAxis, vecOrigin ); + } + + inline void Invalidate( void ) + { + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 4; j++) + { + m_flMatVal[i][j] = VEC_T_NAN; + } + } + } + + float *operator[]( int i ) { Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; } + const float *operator[]( int i ) const { Assert(( i >= 0 ) && ( i < 3 )); return m_flMatVal[i]; } + float *Base() { return &m_flMatVal[0][0]; } + const float *Base() const { return &m_flMatVal[0][0]; } + + float m_flMatVal[3][4]; +}; + + +#ifndef M_PI + #define M_PI 3.14159265358979323846 // matches value in gcc v2 math.h +#endif + +#define M_PI_F ((float)(M_PI)) // Shouldn't collide with anything. + +// NJS: Inlined to prevent floats from being autopromoted to doubles, as with the old system. +#ifndef RAD2DEG + #define RAD2DEG( x ) ( (float)(x) * (float)(180.f / M_PI_F) ) +#endif + +#ifndef DEG2RAD + #define DEG2RAD( x ) ( (float)(x) * (float)(M_PI_F / 180.f) ) +#endif + +// Used to represent sides of things like planes. +#define SIDE_FRONT 0 +#define SIDE_BACK 1 +#define SIDE_ON 2 +#define SIDE_CROSS -2 // necessary for polylib.c + +#define ON_VIS_EPSILON 0.01 // necessary for vvis (flow.c) -- again look into moving later! +#define EQUAL_EPSILON 0.001 // necessary for vbsp (faces.c) -- should look into moving it there? 
+ +extern bool s_bMathlibInitialized; + +extern const Vector vec3_origin; +extern const QAngle vec3_angle; +extern const Quaternion quat_identity; +extern const Vector vec3_invalid; +extern const int nanmask; + +#define IS_NAN(x) (((*(int *)&x)&nanmask)==nanmask) + +FORCEINLINE vec_t DotProduct(const vec_t *v1, const vec_t *v2) +{ + return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]; +} +FORCEINLINE void VectorSubtract(const vec_t *a, const vec_t *b, vec_t *c) +{ + c[0]=a[0]-b[0]; + c[1]=a[1]-b[1]; + c[2]=a[2]-b[2]; +} +FORCEINLINE void VectorAdd(const vec_t *a, const vec_t *b, vec_t *c) +{ + c[0]=a[0]+b[0]; + c[1]=a[1]+b[1]; + c[2]=a[2]+b[2]; +} +FORCEINLINE void VectorCopy(const vec_t *a, vec_t *b) +{ + b[0]=a[0]; + b[1]=a[1]; + b[2]=a[2]; +} +FORCEINLINE void VectorClear(vec_t *a) +{ + a[0]=a[1]=a[2]=0; +} + +FORCEINLINE float VectorMaximum(const vec_t *v) +{ + return max( v[0], max( v[1], v[2] ) ); +} + +FORCEINLINE float VectorMaximum(const Vector& v) +{ + return max( v.x, max( v.y, v.z ) ); +} + +FORCEINLINE void VectorScale (const float* in, vec_t scale, float* out) +{ + out[0] = in[0]*scale; + out[1] = in[1]*scale; + out[2] = in[2]*scale; +} + + +// Cannot be forceinline as they have overloads: +inline void VectorFill(vec_t *a, float b) +{ + a[0]=a[1]=a[2]=b; +} + +inline void VectorNegate(vec_t *a) +{ + a[0]=-a[0]; + a[1]=-a[1]; + a[2]=-a[2]; +} + + +//#define VectorMaximum(a) ( max( (a)[0], max( (a)[1], (a)[2] ) ) ) +#define Vector2Clear(x) {(x)[0]=(x)[1]=0;} +#define Vector2Negate(x) {(x)[0]=-((x)[0]);(x)[1]=-((x)[1]);} +#define Vector2Copy(a,b) {(b)[0]=(a)[0];(b)[1]=(a)[1];} +#define Vector2Subtract(a,b,c) {(c)[0]=(a)[0]-(b)[0];(c)[1]=(a)[1]-(b)[1];} +#define Vector2Add(a,b,c) {(c)[0]=(a)[0]+(b)[0];(c)[1]=(a)[1]+(b)[1];} +#define Vector2Scale(a,b,c) {(c)[0]=(b)*(a)[0];(c)[1]=(b)*(a)[1];} + +// NJS: Some functions in VBSP still need to use these for dealing with mixing vec4's and shorts with vec_t's. +// remove when no longer needed. 
+#define VECTOR_COPY( A, B ) do { (B)[0] = (A)[0]; (B)[1] = (A)[1]; (B)[2]=(A)[2]; } while(0) +#define DOT_PRODUCT( A, B ) ( (A)[0]*(B)[0] + (A)[1]*(B)[1] + (A)[2]*(B)[2] ) + +FORCEINLINE void VectorMAInline( const float* start, float scale, const float* direction, float* dest ) +{ + dest[0]=start[0]+direction[0]*scale; + dest[1]=start[1]+direction[1]*scale; + dest[2]=start[2]+direction[2]*scale; +} + +FORCEINLINE void VectorMAInline( const Vector& start, float scale, const Vector& direction, Vector& dest ) +{ + dest.x=start.x+direction.x*scale; + dest.y=start.y+direction.y*scale; + dest.z=start.z+direction.z*scale; +} + +FORCEINLINE void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest ) +{ + VectorMAInline(start, scale, direction, dest); +} + +FORCEINLINE void VectorMA( const float * start, float scale, const float *direction, float *dest ) +{ + VectorMAInline(start, scale, direction, dest); +} + + +int VectorCompare (const float *v1, const float *v2); + +inline float VectorLength(const float *v) +{ + return FastSqrt( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] + FLT_EPSILON ); +} + +void CrossProduct (const float *v1, const float *v2, float *cross); + +qboolean VectorsEqual( const float *v1, const float *v2 ); + +inline vec_t RoundInt (vec_t in) +{ + return floor(in + 0.5f); +} + +int Q_log2(int val); + +// Math routines done in optimized assembly math package routines +void inline SinCos( float radians, float *sine, float *cosine ) +{ +#if defined( _X360 ) + XMScalarSinCos( sine, cosine, radians ); +#elif defined( PLATFORM_WINDOWS_PC32 ) + _asm + { + fld DWORD PTR [radians] + fsincos + + mov edx, DWORD PTR [cosine] + mov eax, DWORD PTR [sine] + + fstp DWORD PTR [edx] + fstp DWORD PTR [eax] + } +#elif defined( PLATFORM_WINDOWS_PC64 ) + *sine = sin( radians ); + *cosine = cos( radians ); +#elif defined( POSIX ) + double __cosr, __sinr; + __asm ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians)); + + *sine = __sinr; + *cosine = 
__cosr; +#endif +} + +#define SIN_TABLE_SIZE 256 +#define FTOIBIAS 12582912.f +extern float SinCosTable[SIN_TABLE_SIZE]; + +inline float TableCos( float theta ) +{ + union + { + int i; + float f; + } ftmp; + + // ideally, the following should compile down to: theta * constant + constant, changing any of these constants from defines sometimes fubars this. + ftmp.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + ( FTOIBIAS + ( SIN_TABLE_SIZE / 4 ) ); + return SinCosTable[ ftmp.i & ( SIN_TABLE_SIZE - 1 ) ]; +} + +inline float TableSin( float theta ) +{ + union + { + int i; + float f; + } ftmp; + + // ideally, the following should compile down to: theta * constant + constant + ftmp.f = theta * ( float )( SIN_TABLE_SIZE / ( 2.0f * M_PI ) ) + FTOIBIAS; + return SinCosTable[ ftmp.i & ( SIN_TABLE_SIZE - 1 ) ]; +} + +template<class T> +FORCEINLINE T Square( T const &a ) +{ + return a * a; +} + + +// return the smallest power of two >= x. +// returns 0 if x == 0 or x > 0x80000000 (ie numbers that would be negative if x was signed) +// NOTE: the old code took an int, and if you pass in an int of 0x80000000 casted to a uint, +// you'll get 0x80000000, which is correct for uints, instead of 0, which was correct for ints +FORCEINLINE uint SmallestPowerOfTwoGreaterOrEqual( uint x ) +{ + x -= 1; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return x + 1; +} + +// return the largest power of two <= x. 
Will return 0 if passed 0 +FORCEINLINE uint LargestPowerOfTwoLessThanOrEqual( uint x ) +{ + if ( x >= 0x80000000 ) + return 0x80000000; + + return SmallestPowerOfTwoGreaterOrEqual( x + 1 ) >> 1; +} + + +// Math routines for optimizing division +void FloorDivMod (double numer, double denom, int *quotient, int *rem); +int GreatestCommonDivisor (int i1, int i2); + +// Test for FPU denormal mode +bool IsDenormal( const float &val ); + +// MOVEMENT INFO +enum +{ + PITCH = 0, // up / down + YAW, // left / right + ROLL // fall over +}; + +void MatrixAngles( const matrix3x4_t & matrix, float *angles ); // !!!! +void MatrixVectors( const matrix3x4_t &matrix, Vector* pForward, Vector *pRight, Vector *pUp ); +void VectorTransform (const float *in1, const matrix3x4_t & in2, float *out); +void VectorITransform (const float *in1, const matrix3x4_t & in2, float *out); +void VectorRotate( const float *in1, const matrix3x4_t & in2, float *out); +void VectorRotate( const Vector &in1, const QAngle &in2, Vector &out ); +void VectorRotate( const Vector &in1, const Quaternion &in2, Vector &out ); +void VectorIRotate( const float *in1, const matrix3x4_t & in2, float *out); + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +QAngle TransformAnglesToLocalSpace( const QAngle &angles, const matrix3x4_t &parentMatrix ); +QAngle TransformAnglesToWorldSpace( const QAngle &angles, const matrix3x4_t &parentMatrix ); + +#endif + +void MatrixInitialize( matrix3x4_t &mat, const Vector &vecOrigin, const Vector &vecXAxis, const Vector &vecYAxis, const Vector &vecZAxis ); +void MatrixCopy( const matrix3x4_t &in, matrix3x4_t &out ); +void MatrixInvert( const matrix3x4_t &in, matrix3x4_t &out ); + +// Matrix equality test +bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float flTolerance = 1e-5 ); + +void MatrixGetColumn( const matrix3x4_t &in, int column, Vector &out ); +void MatrixSetColumn( const Vector &in, int column, matrix3x4_t &out ); + +inline void MatrixGetTranslation( const 
matrix3x4_t &in, Vector &out ) +{ + MatrixGetColumn ( in, 3, out ); +} + +inline void MatrixSetTranslation( const Vector &in, matrix3x4_t &out ) +{ + MatrixSetColumn ( in, 3, out ); +} + +void MatrixScaleBy ( const float flScale, matrix3x4_t &out ); +void MatrixScaleByZero ( matrix3x4_t &out ); + +//void DecomposeRotation( const matrix3x4_t &mat, float *out ); +void ConcatRotations (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out); +void ConcatTransforms (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out); + +// For identical interface w/ VMatrix +inline void MatrixMultiply ( const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out ) +{ + ConcatTransforms( in1, in2, out ); +} + +void QuaternionSlerp( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt ); +void QuaternionSlerpNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt ); +void QuaternionBlend( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt ); +void QuaternionBlendNoAlign( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt ); +void QuaternionIdentityBlend( const Quaternion &p, float t, Quaternion &qt ); +float QuaternionAngleDiff( const Quaternion &p, const Quaternion &q ); +void QuaternionScale( const Quaternion &p, float t, Quaternion &q ); +void QuaternionAlign( const Quaternion &p, const Quaternion &q, Quaternion &qt ); +float QuaternionDotProduct( const Quaternion &p, const Quaternion &q ); +void QuaternionConjugate( const Quaternion &p, Quaternion &q ); +void QuaternionInvert( const Quaternion &p, Quaternion &q ); +float QuaternionNormalize( Quaternion &q ); +void QuaternionAdd( const Quaternion &p, const Quaternion &q, Quaternion &qt ); +void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt ); +void QuaternionMatrix( const Quaternion &q, matrix3x4_t &matrix ); +void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t &matrix ); +void QuaternionAngles( 
const Quaternion &q, QAngle &angles ); +void AngleQuaternion( const QAngle& angles, Quaternion &qt ); +void QuaternionAngles( const Quaternion &q, RadianEuler &angles ); +void AngleQuaternion( RadianEuler const &angles, Quaternion &qt ); +void QuaternionAxisAngle( const Quaternion &q, Vector &axis, float &angle ); +void AxisAngleQuaternion( const Vector &axis, float angle, Quaternion &q ); +void BasisToQuaternion( const Vector &vecForward, const Vector &vecRight, const Vector &vecUp, Quaternion &q ); +void MatrixQuaternion( const matrix3x4_t &mat, Quaternion &q ); + +// A couple methods to find the dot product of a vector with a matrix row or column... +inline float MatrixRowDotProduct( const matrix3x4_t &in1, int row, const Vector& in2 ) +{ + Assert( (row >= 0) && (row < 3) ); + return DotProduct( in1[row], in2.Base() ); +} + +inline float MatrixColumnDotProduct( const matrix3x4_t &in1, int col, const Vector& in2 ) +{ + Assert( (col >= 0) && (col < 4) ); + return in1[0][col] * in2[0] + in1[1][col] * in2[1] + in1[2][col] * in2[2]; +} + +int __cdecl BoxOnPlaneSide (const float *emins, const float *emaxs, const cplane_t *plane); + +inline float anglemod(float a) +{ + a = (360.f/65536) * ((int)(a*(65536.f/360.0f)) & 65535); + return a; +} + +// Remap a value in the range [A,B] to [C,D]. +inline float RemapVal( float val, float A, float B, float C, float D) +{ + if ( A == B ) + return val >= B ? D : C; + return C + (D - C) * (val - A) / (B - A); +} + +inline float RemapValClamped( float val, float A, float B, float C, float D) +{ + if ( A == B ) + return val >= B ? D : C; + float cVal = (val - A) / (B - A); + cVal = clamp( cVal, 0.0f, 1.0f ); + + return C + (D - C) * cVal; +} + +// Returns A + (B-A)*flPercent. 
+// float Lerp( float flPercent, float A, float B ); +template <class T> +FORCEINLINE T Lerp( float flPercent, T const &A, T const &B ) +{ + return A + (B - A) * flPercent; +} + +FORCEINLINE float Sqr( float f ) +{ + return f*f; +} + +// 5-argument floating point linear interpolation. +// FLerp(f1,f2,i1,i2,x)= +// f1 at x=i1 +// f2 at x=i2 +// smooth lerp between f1 and f2 at x>i1 and x<i2 +// extrapolation for x<i1 or x>i2 +// +// If you know a function f(x)'s value (f1) at position i1, and its value (f2) at position i2, +// the function can be linearly interpolated with FLerp(f1,f2,i1,i2,x) +// i2=i1 will cause a divide by zero. +static inline float FLerp(float f1, float f2, float i1, float i2, float x) +{ + return f1+(f2-f1)*(x-i1)/(i2-i1); +} + + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +// YWB: Specialization for interpolating euler angles via quaternions... +template<> FORCEINLINE QAngle Lerp<QAngle>( float flPercent, const QAngle& q1, const QAngle& q2 ) +{ + // Avoid precision errors + if ( q1 == q2 ) + return q1; + + Quaternion src, dest; + + // Convert to quaternions + AngleQuaternion( q1, src ); + AngleQuaternion( q2, dest ); + + Quaternion result; + + // Slerp + QuaternionSlerp( src, dest, flPercent, result ); + + // Convert to euler + QAngle output; + QuaternionAngles( result, output ); + return output; +} + +#else + +#pragma error + +// NOTE NOTE: I haven't tested this!! It may not work! 
Check out interpolatedvar.cpp in the client dll to try it +template<> FORCEINLINE QAngleByValue Lerp<QAngleByValue>( float flPercent, const QAngleByValue& q1, const QAngleByValue& q2 ) +{ + // Avoid precision errors + if ( q1 == q2 ) + return q1; + + Quaternion src, dest; + + // Convert to quaternions + AngleQuaternion( q1, src ); + AngleQuaternion( q2, dest ); + + Quaternion result; + + // Slerp + QuaternionSlerp( src, dest, flPercent, result ); + + // Convert to euler + QAngleByValue output; + QuaternionAngles( result, output ); + return output; +} + +#endif // VECTOR_NO_SLOW_OPERATIONS + + +/// Same as swap(), but won't cause problems with std::swap +template <class T> +FORCEINLINE void V_swap( T& x, T& y ) +{ + T temp = x; + x = y; + y = temp; +} + +template <class T> FORCEINLINE T AVG(T a, T b) +{ + return (a+b)/2; +} + +// number of elements in an array of static size +#define NELEMS(x) ARRAYSIZE(x) + +// XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myvector)); +#define XYZ(v) (v).x,(v).y,(v).z + + +inline float Sign( float x ) +{ + return (x <0.0f) ? -1.0f : 1.0f; +} + +// +// Clamps the input integer to the given array bounds. +// Equivalent to the following, but without using any branches: +// +// if( n < 0 ) return 0; +// else if ( n > maxindex ) return maxindex; +// else return n; +// +// This is not always a clear performance win, but when you have situations where a clamped +// value is thrashing against a boundary this is a big win. (ie, valid, invalid, valid, invalid, ...) +// +// Note: This code has been run against all possible integers. 
+// +inline int ClampArrayBounds( int n, unsigned maxindex ) +{ + // mask is 0 if less than 4096, 0xFFFFFFFF if greater than + unsigned int inrangemask = 0xFFFFFFFF + (((unsigned) n) > maxindex ); + unsigned int lessthan0mask = 0xFFFFFFFF + ( n >= 0 ); + + // If the result was valid, set the result, (otherwise sets zero) + int result = (inrangemask & n); + + // if the result was out of range or zero. + result |= ((~inrangemask) & (~lessthan0mask)) & maxindex; + + return result; +} + + +#define BOX_ON_PLANE_SIDE(emins, emaxs, p) \ + (((p)->type < 3)? \ + ( \ + ((p)->dist <= (emins)[(p)->type])? \ + 1 \ + : \ + ( \ + ((p)->dist >= (emaxs)[(p)->type])?\ + 2 \ + : \ + 3 \ + ) \ + ) \ + : \ + BoxOnPlaneSide( (emins), (emaxs), (p))) + +//----------------------------------------------------------------------------- +// FIXME: Vector versions.... the float versions will go away hopefully soon! +//----------------------------------------------------------------------------- + +void AngleVectors (const QAngle& angles, Vector *forward); +void AngleVectors (const QAngle& angles, Vector *forward, Vector *right, Vector *up); +void AngleVectorsTranspose (const QAngle& angles, Vector *forward, Vector *right, Vector *up); +void AngleMatrix (const QAngle &angles, matrix3x4_t &mat ); +void AngleMatrix( const QAngle &angles, const Vector &position, matrix3x4_t &mat ); +void AngleMatrix (const RadianEuler &angles, matrix3x4_t &mat ); +void AngleMatrix( RadianEuler const &angles, const Vector &position, matrix3x4_t &mat ); +void AngleIMatrix (const QAngle &angles, matrix3x4_t &mat ); +void AngleIMatrix (const QAngle &angles, const Vector &position, matrix3x4_t &mat ); +void AngleIMatrix (const RadianEuler &angles, matrix3x4_t &mat ); +void VectorAngles( const Vector &forward, QAngle &angles ); +void VectorAngles( const Vector &forward, const Vector &pseudoup, QAngle &angles ); +void VectorMatrix( const Vector &forward, matrix3x4_t &mat ); +void VectorVectors( const Vector &forward, 
Vector &right, Vector &up ); +void SetIdentityMatrix( matrix3x4_t &mat ); +void SetScaleMatrix( float x, float y, float z, matrix3x4_t &dst ); +void MatrixBuildRotationAboutAxis( const Vector &vAxisOfRot, float angleDegrees, matrix3x4_t &dst ); + +inline void SetScaleMatrix( float flScale, matrix3x4_t &dst ) +{ + SetScaleMatrix( flScale, flScale, flScale, dst ); +} + +inline void SetScaleMatrix( const Vector& scale, matrix3x4_t &dst ) +{ + SetScaleMatrix( scale.x, scale.y, scale.z, dst ); +} + +// Computes the inverse transpose +void MatrixTranspose( matrix3x4_t& mat ); +void MatrixTranspose( const matrix3x4_t& src, matrix3x4_t& dst ); +void MatrixInverseTranspose( const matrix3x4_t& src, matrix3x4_t& dst ); + +inline void PositionMatrix( const Vector &position, matrix3x4_t &mat ) +{ + MatrixSetColumn( position, 3, mat ); +} + +inline void MatrixPosition( const matrix3x4_t &matrix, Vector &position ) +{ + MatrixGetColumn( matrix, 3, position ); +} + +inline void VectorRotate( const Vector& in1, const matrix3x4_t &in2, Vector &out) +{ + VectorRotate( &in1.x, in2, &out.x ); +} + +inline void VectorIRotate( const Vector& in1, const matrix3x4_t &in2, Vector &out) +{ + VectorIRotate( &in1.x, in2, &out.x ); +} + +inline void MatrixAngles( const matrix3x4_t &matrix, QAngle &angles ) +{ + MatrixAngles( matrix, &angles.x ); +} + +inline void MatrixAngles( const matrix3x4_t &matrix, QAngle &angles, Vector &position ) +{ + MatrixAngles( matrix, angles ); + MatrixPosition( matrix, position ); +} + +inline void MatrixAngles( const matrix3x4_t &matrix, RadianEuler &angles ) +{ + MatrixAngles( matrix, &angles.x ); + + angles.Init( DEG2RAD( angles.z ), DEG2RAD( angles.x ), DEG2RAD( angles.y ) ); +} + +void MatrixAngles( const matrix3x4_t &mat, RadianEuler &angles, Vector &position ); + +void MatrixAngles( const matrix3x4_t &mat, Quaternion &q, Vector &position ); + +inline int VectorCompare (const Vector& v1, const Vector& v2) +{ + return v1 == v2; +} + +inline void 
VectorTransform (const Vector& in1, const matrix3x4_t &in2, Vector &out) +{ + VectorTransform( &in1.x, in2, &out.x ); +} + +inline void VectorITransform (const Vector& in1, const matrix3x4_t &in2, Vector &out) +{ + VectorITransform( &in1.x, in2, &out.x ); +} + +/* +inline void DecomposeRotation( const matrix3x4_t &mat, Vector &out ) +{ + DecomposeRotation( mat, &out.x ); +} +*/ + +inline int BoxOnPlaneSide (const Vector& emins, const Vector& emaxs, const cplane_t *plane ) +{ + return BoxOnPlaneSide( &emins.x, &emaxs.x, plane ); +} + +inline void VectorFill(Vector& a, float b) +{ + a[0]=a[1]=a[2]=b; +} + +inline void VectorNegate(Vector& a) +{ + a[0] = -a[0]; + a[1] = -a[1]; + a[2] = -a[2]; +} + +inline vec_t VectorAvg(Vector& a) +{ + return ( a[0] + a[1] + a[2] ) / 3; +} + +//----------------------------------------------------------------------------- +// Box/plane test (slow version) +//----------------------------------------------------------------------------- +inline int FASTCALL BoxOnPlaneSide2 (const Vector& emins, const Vector& emaxs, const cplane_t *p, float tolerance = 0.f ) +{ + Vector corners[2]; + + if (p->normal[0] < 0) + { + corners[0][0] = emins[0]; + corners[1][0] = emaxs[0]; + } + else + { + corners[1][0] = emins[0]; + corners[0][0] = emaxs[0]; + } + + if (p->normal[1] < 0) + { + corners[0][1] = emins[1]; + corners[1][1] = emaxs[1]; + } + else + { + corners[1][1] = emins[1]; + corners[0][1] = emaxs[1]; + } + + if (p->normal[2] < 0) + { + corners[0][2] = emins[2]; + corners[1][2] = emaxs[2]; + } + else + { + corners[1][2] = emins[2]; + corners[0][2] = emaxs[2]; + } + + int sides = 0; + + float dist1 = DotProduct (p->normal, corners[0]) - p->dist; + if (dist1 >= tolerance) + sides = 1; + + float dist2 = DotProduct (p->normal, corners[1]) - p->dist; + if (dist2 < -tolerance) + sides |= 2; + + return sides; +} + +//----------------------------------------------------------------------------- +// Helpers for bounding box construction 
+//----------------------------------------------------------------------------- + +void ClearBounds (Vector& mins, Vector& maxs); +void AddPointToBounds (const Vector& v, Vector& mins, Vector& maxs); + +// +// COLORSPACE/GAMMA CONVERSION STUFF +// +void BuildGammaTable( float gamma, float texGamma, float brightness, int overbright ); + +// convert texture to linear 0..1 value +inline float TexLightToLinear( int c, int exponent ) +{ + extern float power2_n[256]; + Assert( exponent >= -128 && exponent <= 127 ); + return ( float )c * power2_n[exponent+128]; +} + + +// convert texture to linear 0..1 value +int LinearToTexture( float f ); +// converts 0..1 linear value to screen gamma (0..255) +int LinearToScreenGamma( float f ); +float TextureToLinear( int c ); + +// compressed color format +struct ColorRGBExp32 +{ + byte r, g, b; + signed char exponent; +}; + +void ColorRGBExp32ToVector( const ColorRGBExp32& in, Vector& out ); +void VectorToColorRGBExp32( const Vector& v, ColorRGBExp32 &c ); + +// solve for "x" where "a x^2 + b x + c = 0", return true if solution exists +bool SolveQuadratic( float a, float b, float c, float &root1, float &root2 ); + +// solves for "a, b, c" where "a x^2 + b x + c = y", return true if solution exists +bool SolveInverseQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c ); + +// solves for a,b,c specified as above, except that it always creates a monotonically increasing or +// decreasing curve if the data is monotonically increasing or decreasing. In order to enforce the +// monoticity condition, it is possible that the resulting quadratic will only approximate the data +// instead of interpolating it. This code is not especially fast. 
+bool SolveInverseQuadraticMonotonic( float x1, float y1, float x2, float y2, + float x3, float y3, float &a, float &b, float &c ); + + + + +// solves for "a, b, c" where "1/(a x^2 + b x + c ) = y", return true if solution exists +bool SolveInverseReciprocalQuadratic( float x1, float y1, float x2, float y2, float x3, float y3, float &a, float &b, float &c ); + +// rotate a vector around the Z axis (YAW) +void VectorYawRotate( const Vector& in, float flYaw, Vector &out); + + +// Bias takes an X value between 0 and 1 and returns another value between 0 and 1 +// The curve is biased towards 0 or 1 based on biasAmt, which is between 0 and 1. +// Lower values of biasAmt bias the curve towards 0 and higher values bias it towards 1. +// +// For example, with biasAmt = 0.2, the curve looks like this: +// +// 1 +// | * +// | * +// | * +// | ** +// | ** +// | **** +// |********* +// |___________________ +// 0 1 +// +// +// With biasAmt = 0.8, the curve looks like this: +// +// 1 +// | ************** +// | ** +// | * +// | * +// |* +// |* +// |* +// |___________________ +// 0 1 +// +// With a biasAmt of 0.5, Bias returns X. +float Bias( float x, float biasAmt ); + + +// Gain is similar to Bias, but biasAmt biases towards or away from 0.5. +// Lower bias values bias towards 0.5 and higher bias values bias away from it. +// +// For example, with biasAmt = 0.2, the curve looks like this: +// +// 1 +// | * +// | * +// | ** +// | *************** +// | ** +// | * +// |* +// |___________________ +// 0 1 +// +// +// With biasAmt = 0.8, the curve looks like this: +// +// 1 +// | ***** +// | *** +// | * +// | * +// | * +// | *** +// |***** +// |___________________ +// 0 1 +float Gain( float x, float biasAmt ); + + +// SmoothCurve maps a 0-1 value into another 0-1 value based on a cosine wave +// where the derivatives of the function at 0 and 1 (and 0.5) are 0. This is useful for +// any fadein/fadeout effect where it should start and end smoothly. 
+// +// The curve looks like this: +// +// 1 +// | ** +// | * * +// | * * +// | * * +// | * * +// | ** ** +// |*** *** +// |___________________ +// 0 1 +// +float SmoothCurve( float x ); + + +// This works like SmoothCurve, with two changes: +// +// 1. Instead of the curve peaking at 0.5, it will peak at flPeakPos. +// (So if you specify flPeakPos=0.2, then the peak will slide to the left). +// +// 2. flPeakSharpness is a 0-1 value controlling the sharpness of the peak. +// Low values blunt the peak and high values sharpen the peak. +float SmoothCurve_Tweak( float x, float flPeakPos=0.5, float flPeakSharpness=0.5 ); + + +//float ExponentialDecay( float halflife, float dt ); +//float ExponentialDecay( float decayTo, float decayTime, float dt ); + +// halflife is time for value to reach 50% +inline float ExponentialDecay( float halflife, float dt ) +{ + // log(0.5) == -0.69314718055994530941723212145818 + return expf( -0.69314718f / halflife * dt); +} + +// decayTo is factor the value should decay to in decayTime +inline float ExponentialDecay( float decayTo, float decayTime, float dt ) +{ + return expf( logf( decayTo ) / decayTime * dt); +} + +// Get the integrated distanced traveled +// decayTo is factor the value should decay to in decayTime +// dt is the time relative to the last velocity update +inline float ExponentialDecayIntegral( float decayTo, float decayTime, float dt ) +{ + return (powf( decayTo, dt / decayTime) * decayTime - decayTime) / logf( decayTo ); +} + +// hermite basis function for smooth interpolation +// Similar to Gain() above, but very cheap to call +// value should be between 0 & 1 inclusive +inline float SimpleSpline( float value ) +{ + float valueSquared = value * value; + + // Nice little ease-in, ease-out spline-like curve + return (3 * valueSquared - 2 * valueSquared * value); +} + +// remaps a value in [startInterval, startInterval+rangeInterval] from linear to +// spline using SimpleSpline +inline float SimpleSplineRemapVal( float 
val, float A, float B, float C, float D) +{ + if ( A == B ) + return val >= B ? D : C; + float cVal = (val - A) / (B - A); + return C + (D - C) * SimpleSpline( cVal ); +} + +// remaps a value in [startInterval, startInterval+rangeInterval] from linear to +// spline using SimpleSpline +inline float SimpleSplineRemapValClamped( float val, float A, float B, float C, float D ) +{ + if ( A == B ) + return val >= B ? D : C; + float cVal = (val - A) / (B - A); + cVal = clamp( cVal, 0.0f, 1.0f ); + return C + (D - C) * SimpleSpline( cVal ); +} + +FORCEINLINE int RoundFloatToInt(float f) +{ +#if defined(__i386__) || defined(_M_IX86) || defined( PLATFORM_WINDOWS_PC64 ) || defined(__x86_64__) + return _mm_cvtss_si32(_mm_load_ss(&f)); +#elif defined( _X360 ) +#ifdef Assert + Assert( IsFPUControlWordSet() ); +#endif + union + { + double flResult; + int pResult[2]; + }; + flResult = __fctiw( f ); + return pResult[1]; +#else +#error Unknown architecture +#endif +} + +FORCEINLINE unsigned char RoundFloatToByte(float f) +{ + int nResult = RoundFloatToInt(f); +#ifdef Assert + Assert( (nResult & ~0xFF) == 0 ); +#endif + return (unsigned char) nResult; +} + +FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f) +{ +#if defined( _X360 ) +#ifdef Assert + Assert( IsFPUControlWordSet() ); +#endif + union + { + double flResult; + int pIntResult[2]; + unsigned long pResult[2]; + }; + flResult = __fctiw( f ); + Assert( pIntResult[1] >= 0 ); + return pResult[1]; +#else // !X360 + +#if defined( PLATFORM_WINDOWS_PC64 ) + uint nRet = ( uint ) f; + if ( nRet & 1 ) + { + if ( ( f - floor( f ) >= 0.5 ) ) + { + nRet++; + } + } + else + { + if ( ( f - floor( f ) > 0.5 ) ) + { + nRet++; + } + } + return nRet; +#else // PLATFORM_WINDOWS_PC64 + unsigned char nResult[8]; + + #if defined( _WIN32 ) + __asm + { + fld f + fistp qword ptr nResult + } + #elif POSIX + __asm __volatile__ ( + "fistpl %0;": "=m" (nResult): "t" (f) : "st" + ); + #endif + + return *((unsigned long*)nResult); +#endif // 
PLATFORM_WINDOWS_PC64 +#endif // !X360 +} + +FORCEINLINE bool IsIntegralValue( float flValue, float flTolerance = 0.001f ) +{ + return fabs( RoundFloatToInt( flValue ) - flValue ) < flTolerance; +} + +// Fast, accurate ftol: +FORCEINLINE int Float2Int( float a ) +{ +#if defined( _X360 ) + union + { + double flResult; + int pResult[2]; + }; + flResult = __fctiwz( a ); + return pResult[1]; +#else // !X360 + // Rely on compiler to generate CVTTSS2SI on x86 + return (int) a; +#endif +} + +// Over 15x faster than: (int)floor(value) +inline int Floor2Int( float a ) +{ + int RetVal; +#if defined( __i386__ ) + // Convert to int and back, compare, subtract one if too big + __m128 a128 = _mm_set_ss(a); + RetVal = _mm_cvtss_si32(a128); + __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); + RetVal -= _mm_comigt_ss( rounded128, a128 ); +#else + RetVal = static_cast<int>( floor(a) ); +#endif + return RetVal; +} + +//----------------------------------------------------------------------------- +// Fast color conversion from float to unsigned char +//----------------------------------------------------------------------------- +FORCEINLINE unsigned int FastFToC( float c ) +{ +#if defined( __i386__ ) + // IEEE float bit manipulation works for values between [0, 1<<23) + union { float f; int i; } convert = { c*255.0f + (float)(1<<23) }; + return convert.i & 255; +#else + // consoles CPUs suffer from load-hit-store penalty + return Float2Int( c * 255.0f ); +#endif +} + +//----------------------------------------------------------------------------- +// Fast conversion from float to integer with magnitude less than 2**22 +//----------------------------------------------------------------------------- +FORCEINLINE int FastFloatToSmallInt( float c ) +{ +#if defined( __i386__ ) + // IEEE float bit manipulation works for values between [-1<<22, 1<<22) + union { float f; int i; } convert = { c + (float)(3<<22) }; + return (convert.i & ((1<<23)-1)) - (1<<22); +#else + // consoles 
CPUs suffer from load-hit-store penalty + return Float2Int( c ); +#endif +} + +//----------------------------------------------------------------------------- +// Purpose: Bound input float to .001 (millisecond) boundary +// Input : in - +// Output : inline float +//----------------------------------------------------------------------------- +inline float ClampToMsec( float in ) +{ + int msec = Floor2Int( in * 1000.0f + 0.5f ); + return 0.001f * msec; +} + +// Over 15x faster than: (int)ceil(value) +inline int Ceil2Int( float a ) +{ + int RetVal; +#if defined( __i386__ ) + // Convert to int and back, compare, add one if too small + __m128 a128 = _mm_load_ss(&a); + RetVal = _mm_cvtss_si32(a128); + __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal); + RetVal += _mm_comilt_ss( rounded128, a128 ); +#else + RetVal = static_cast<int>( ceil(a) ); +#endif + return RetVal; +} + + +// Regular signed area of triangle +#define TriArea2D( A, B, C ) \ + ( 0.5f * ( ( B.x - A.x ) * ( C.y - A.y ) - ( B.y - A.y ) * ( C.x - A.x ) ) ) + +// This version doesn't premultiply by 0.5f, so it's the area of the rectangle instead +#define TriArea2DTimesTwo( A, B, C ) \ + ( ( ( B.x - A.x ) * ( C.y - A.y ) - ( B.y - A.y ) * ( C.x - A.x ) ) ) + + +// Get the barycentric coordinates of "pt" in triangle [A,B,C]. +inline void GetBarycentricCoords2D( + Vector2D const &A, + Vector2D const &B, + Vector2D const &C, + Vector2D const &pt, + float bcCoords[3] ) +{ + // Note, because to top and bottom are both x2, the issue washes out in the composite + float invTriArea = 1.0f / TriArea2DTimesTwo( A, B, C ); + + // NOTE: We assume here that the lightmap coordinate vertices go counterclockwise. + // If not, TriArea2D() is negated so this works out right. 
+ bcCoords[0] = TriArea2DTimesTwo( B, C, pt ) * invTriArea; + bcCoords[1] = TriArea2DTimesTwo( C, A, pt ) * invTriArea; + bcCoords[2] = TriArea2DTimesTwo( A, B, pt ) * invTriArea; +} + + +// Return true of the sphere might touch the box (the sphere is actually treated +// like a box itself, so this may return true if the sphere's bounding box touches +// a corner of the box but the sphere itself doesn't). +inline bool QuickBoxSphereTest( + const Vector& vOrigin, + float flRadius, + const Vector& bbMin, + const Vector& bbMax ) +{ + return vOrigin.x - flRadius < bbMax.x && vOrigin.x + flRadius > bbMin.x && + vOrigin.y - flRadius < bbMax.y && vOrigin.y + flRadius > bbMin.y && + vOrigin.z - flRadius < bbMax.z && vOrigin.z + flRadius > bbMin.z; +} + + +// Return true of the boxes intersect (but not if they just touch). +inline bool QuickBoxIntersectTest( + const Vector& vBox1Min, + const Vector& vBox1Max, + const Vector& vBox2Min, + const Vector& vBox2Max ) +{ + return + vBox1Min.x < vBox2Max.x && vBox1Max.x > vBox2Min.x && + vBox1Min.y < vBox2Max.y && vBox1Max.y > vBox2Min.y && + vBox1Min.z < vBox2Max.z && vBox1Max.z > vBox2Min.z; +} + + +extern float GammaToLinearFullRange( float gamma ); +extern float LinearToGammaFullRange( float linear ); +extern float GammaToLinear( float gamma ); +extern float LinearToGamma( float linear ); + +extern float SrgbGammaToLinear( float flSrgbGammaValue ); +extern float SrgbLinearToGamma( float flLinearValue ); +extern float X360GammaToLinear( float fl360GammaValue ); +extern float X360LinearToGamma( float flLinearValue ); +extern float SrgbGammaTo360Gamma( float flSrgbGammaValue ); + +// linear (0..4) to screen corrected vertex space (0..1?) +FORCEINLINE float LinearToVertexLight( float f ) +{ + extern float lineartovertex[4096]; + + // Gotta clamp before the multiply; could overflow... 
+ // assume 0..4 range + int i = RoundFloatToInt( f * 1024.f ); + + // Presumably the comman case will be not to clamp, so check that first: + if( (unsigned)i > 4095 ) + { + if ( i < 0 ) + i = 0; // Compare to zero instead of 4095 to save 4 bytes in the instruction stream + else + i = 4095; + } + + return lineartovertex[i]; +} + + +FORCEINLINE unsigned char LinearToLightmap( float f ) +{ + extern unsigned char lineartolightmap[4096]; + + // Gotta clamp before the multiply; could overflow... + int i = RoundFloatToInt( f * 1024.f ); // assume 0..4 range + + // Presumably the comman case will be not to clamp, so check that first: + if ( (unsigned)i > 4095 ) + { + if ( i < 0 ) + i = 0; // Compare to zero instead of 4095 to save 4 bytes in the instruction stream + else + i = 4095; + } + + return lineartolightmap[i]; +} + +FORCEINLINE void ColorClamp( Vector& color ) +{ + float maxc = max( color.x, max( color.y, color.z ) ); + if ( maxc > 1.0f ) + { + float ooMax = 1.0f / maxc; + color.x *= ooMax; + color.y *= ooMax; + color.z *= ooMax; + } + + if ( color[0] < 0.f ) color[0] = 0.f; + if ( color[1] < 0.f ) color[1] = 0.f; + if ( color[2] < 0.f ) color[2] = 0.f; +} + +inline void ColorClampTruncate( Vector& color ) +{ + if (color[0] > 1.0f) color[0] = 1.0f; else if (color[0] < 0.0f) color[0] = 0.0f; + if (color[1] > 1.0f) color[1] = 1.0f; else if (color[1] < 0.0f) color[1] = 0.0f; + if (color[2] > 1.0f) color[2] = 1.0f; else if (color[2] < 0.0f) color[2] = 0.0f; +} + +// Interpolate a Catmull-Rom spline. +// t is a [0,1] value and interpolates a curve between p2 and p3. +void Catmull_Rom_Spline( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector &output ); + +// Interpolate a Catmull-Rom spline. 
+// Returns the tangent of the point at t of the spline +void Catmull_Rom_Spline_Tangent( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector &output ); + +// area under the curve [0..t] +void Catmull_Rom_Spline_Integral( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +// area under the curve [0..1] +void Catmull_Rom_Spline_Integral( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + Vector& output ); + +// Interpolate a Catmull-Rom spline. +// Normalize p2->p1 and p3->p4 to be the same length as p2->p3 +void Catmull_Rom_Spline_Normalize( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector &output ); + +// area under the curve [0..t] +// Normalize p2->p1 and p3->p4 to be the same length as p2->p3 +void Catmull_Rom_Spline_Integral_Normalize( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +// Interpolate a Catmull-Rom spline. +// Normalize p2.x->p1.x and p3.x->p4.x to be the same length as p2.x->p3.x +void Catmull_Rom_Spline_NormalizeX( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector &output ); + +// area under the curve [0..t] +void Catmull_Rom_Spline_NormalizeX( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +// Interpolate a Hermite spline. +// t is a [0,1] value and interpolates a curve between p1 and p2 with the deltas d1 and d2. 
+void Hermite_Spline( + const Vector &p1, + const Vector &p2, + const Vector &d1, + const Vector &d2, + float t, + Vector& output ); + +float Hermite_Spline( + float p1, + float p2, + float d1, + float d2, + float t ); + +// t is a [0,1] value and interpolates a curve between p1 and p2 with the slopes p0->p1 and p1->p2 +void Hermite_Spline( + const Vector &p0, + const Vector &p1, + const Vector &p2, + float t, + Vector& output ); + +float Hermite_Spline( + float p0, + float p1, + float p2, + float t ); + + +void Hermite_SplineBasis( float t, float basis[] ); + +void Hermite_Spline( + const Quaternion &q0, + const Quaternion &q1, + const Quaternion &q2, + float t, + Quaternion &output ); + + +// See http://en.wikipedia.org/wiki/Kochanek-Bartels_curves +// +// Tension: -1 = Round -> 1 = Tight +// Bias: -1 = Pre-shoot (bias left) -> 1 = Post-shoot (bias right) +// Continuity: -1 = Box corners -> 1 = Inverted corners +// +// If T=B=C=0 it's the same matrix as Catmull-Rom. +// If T=1 & B=C=0 it's the same as Cubic. +// If T=B=0 & C=-1 it's just linear interpolation +// +// See http://news.povray.org/povray.binaries.tutorials/attachment/%[email protected]%3E/Splines.bas.txt +// for example code and descriptions of various spline types... 
+// +void Kochanek_Bartels_Spline( + float tension, + float bias, + float continuity, + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +void Kochanek_Bartels_Spline_NormalizeX( + float tension, + float bias, + float continuity, + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +// See link at Kochanek_Bartels_Spline for info on the basis matrix used +void Cubic_Spline( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +void Cubic_Spline_NormalizeX( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +// See link at Kochanek_Bartels_Spline for info on the basis matrix used +void BSpline( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +void BSpline_NormalizeX( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +// See link at Kochanek_Bartels_Spline for info on the basis matrix used +void Parabolic_Spline( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +void Parabolic_Spline_NormalizeX( + const Vector &p1, + const Vector &p2, + const Vector &p3, + const Vector &p4, + float t, + Vector& output ); + +// quintic interpolating polynomial from Perlin. +// 0->0, 1->1, smooth-in between with smooth tangents +FORCEINLINE float QuinticInterpolatingPolynomial(float t) +{ + // 6t^5-15t^4+10t^3 + return t * t * t *( t * ( t* 6.0 - 15.0 ) + 10.0 ); +} + +// given a table of sorted tabulated positions, return the two indices and blendfactor to linear +// interpolate. Does a search. Can be used to find the blend value to interpolate between +// keyframes. 
+void GetInterpolationData( float const *pKnotPositions, + float const *pKnotValues, + int nNumValuesinList, + int nInterpolationRange, + float flPositionToInterpolateAt, + bool bWrap, + float *pValueA, + float *pValueB, + float *pInterpolationValue); + +float RangeCompressor( float flValue, float flMin, float flMax, float flBase ); + +// Get the minimum distance from vOrigin to the bounding box defined by [mins,maxs] +// using voronoi regions. +// 0 is returned if the origin is inside the box. +float CalcSqrDistanceToAABB( const Vector &mins, const Vector &maxs, const Vector &point ); +void CalcClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut ); +void CalcSqrDistAndClosestPointOnAABB( const Vector &mins, const Vector &maxs, const Vector &point, Vector &closestOut, float &distSqrOut ); + +inline float CalcDistanceToAABB( const Vector &mins, const Vector &maxs, const Vector &point ) +{ + float flDistSqr = CalcSqrDistanceToAABB( mins, maxs, point ); + return sqrt(flDistSqr); +} + +// Get the closest point from P to the (infinite) line through vLineA and vLineB and +// calculate the shortest distance from P to the line. +// If you pass in a value for t, it will tell you the t for (A + (B-A)t) to get the closest point. +// If the closest point lies on the segment between A and B, then 0 <= t <= 1. +void CalcClosestPointOnLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *t=0 ); +float CalcDistanceToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 ); +float CalcDistanceSqrToLine( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 ); + +// The same three functions as above, except now the line is closed between A and B. 
+void CalcClosestPointOnLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, Vector &vClosest, float *t=0 ); +float CalcDistanceToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 ); +float CalcDistanceSqrToLineSegment( const Vector &P, const Vector &vLineA, const Vector &vLineB, float *t=0 ); + +// A function to compute the closes line segment connnection two lines (or false if the lines are parallel, etc.) +bool CalcLineToLineIntersectionSegment( + const Vector& p1,const Vector& p2,const Vector& p3,const Vector& p4,Vector *s1,Vector *s2, + float *t1, float *t2 ); + +// The above functions in 2D +void CalcClosestPointOnLine2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, Vector2D &vClosest, float *t=0 ); +float CalcDistanceToLine2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 ); +float CalcDistanceSqrToLine2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 ); +void CalcClosestPointOnLineSegment2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, Vector2D &vClosest, float *t=0 ); +float CalcDistanceToLineSegment2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 ); +float CalcDistanceSqrToLineSegment2D( Vector2D const &P, Vector2D const &vLineA, Vector2D const &vLineB, float *t=0 ); + +// Init the mathlib +void MathLib_Init( float gamma = 2.2f, float texGamma = 2.2f, float brightness = 0.0f, int overbright = 2.0f, bool bAllow3DNow = true, bool bAllowSSE = true, bool bAllowSSE2 = true, bool bAllowMMX = true ); +bool MathLib_3DNowEnabled( void ); +bool MathLib_MMXEnabled( void ); +bool MathLib_SSEEnabled( void ); +bool MathLib_SSE2Enabled( void ); + +float Approach( float target, float value, float speed ); +float ApproachAngle( float target, float value, float speed ); +float AngleDiff( float destAngle, float srcAngle ); +float AngleDistance( float next, float cur ); +float 
AngleNormalize( float angle ); + +// ensure that 0 <= angle <= 360 +float AngleNormalizePositive( float angle ); + +bool AnglesAreEqual( float a, float b, float tolerance = 0.0f ); + + +void RotationDeltaAxisAngle( const QAngle &srcAngles, const QAngle &destAngles, Vector &deltaAxis, float &deltaAngle ); +void RotationDelta( const QAngle &srcAngles, const QAngle &destAngles, QAngle *out ); + +void ComputeTrianglePlane( const Vector& v1, const Vector& v2, const Vector& v3, Vector& normal, float& intercept ); +int PolyFromPlane( Vector *outVerts, const Vector& normal, float dist, float fHalfScale = 9000.0f ); +int ClipPolyToPlane( Vector *inVerts, int vertCount, Vector *outVerts, const Vector& normal, float dist, float fOnPlaneEpsilon = 0.1f ); +int ClipPolyToPlane_Precise( double *inVerts, int vertCount, double *outVerts, const double *normal, double dist, double fOnPlaneEpsilon = 0.1 ); + +//----------------------------------------------------------------------------- +// Computes a reasonable tangent space for a triangle +//----------------------------------------------------------------------------- +void CalcTriangleTangentSpace( const Vector &p0, const Vector &p1, const Vector &p2, + const Vector2D &t0, const Vector2D &t1, const Vector2D& t2, + Vector &sVect, Vector &tVect ); + +//----------------------------------------------------------------------------- +// Transforms a AABB into another space; which will inherently grow the box. 
+//----------------------------------------------------------------------------- +void TransformAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ); + +//----------------------------------------------------------------------------- +// Uses the inverse transform of in1 +//----------------------------------------------------------------------------- +void ITransformAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ); + +//----------------------------------------------------------------------------- +// Rotates a AABB into another space; which will inherently grow the box. +// (same as TransformAABB, but doesn't take the translation into account) +//----------------------------------------------------------------------------- +void RotateAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ); + +//----------------------------------------------------------------------------- +// Uses the inverse transform of in1 +//----------------------------------------------------------------------------- +void IRotateAABB( const matrix3x4_t &in1, const Vector &vecMinsIn, const Vector &vecMaxsIn, Vector &vecMinsOut, Vector &vecMaxsOut ); + +//----------------------------------------------------------------------------- +// Transform a plane +//----------------------------------------------------------------------------- +inline void MatrixTransformPlane( const matrix3x4_t &src, const cplane_t &inPlane, cplane_t &outPlane ) +{ + // What we want to do is the following: + // 1) transform the normal into the new space. + // 2) Determine a point on the old plane given by plane dist * plane normal + // 3) Transform that point into the new space + // 4) Plane dist = DotProduct( new normal, new point ) + + // An optimized version, which works if the plane is orthogonal. 
+ // 1) Transform the normal into the new space + // 2) Realize that transforming the old plane point into the new space + // is given by [ d * n'x + Tx, d * n'y + Ty, d * n'z + Tz ] + // where d = old plane dist, n' = transformed normal, Tn = translational component of transform + // 3) Compute the new plane dist using the dot product of the normal result of #2 + + // For a correct result, this should be an inverse-transpose matrix + // but that only matters if there are nonuniform scale or skew factors in this matrix. + VectorRotate( inPlane.normal, src, outPlane.normal ); + outPlane.dist = inPlane.dist * DotProduct( outPlane.normal, outPlane.normal ); + outPlane.dist += outPlane.normal.x * src[0][3] + outPlane.normal.y * src[1][3] + outPlane.normal.z * src[2][3]; +} + +inline void MatrixITransformPlane( const matrix3x4_t &src, const cplane_t &inPlane, cplane_t &outPlane ) +{ + // The trick here is that Tn = translational component of transform, + // but for an inverse transform, Tn = - R^-1 * T + Vector vecTranslation; + MatrixGetColumn( src, 3, vecTranslation ); + + Vector vecInvTranslation; + VectorIRotate( vecTranslation, src, vecInvTranslation ); + + VectorIRotate( inPlane.normal, src, outPlane.normal ); + outPlane.dist = inPlane.dist * DotProduct( outPlane.normal, outPlane.normal ); + outPlane.dist -= outPlane.normal.x * vecInvTranslation[0] + outPlane.normal.y * vecInvTranslation[1] + outPlane.normal.z * vecInvTranslation[2]; +} + +int CeilPow2( int in ); +int FloorPow2( int in ); + +FORCEINLINE float * UnpackNormal_HEND3N( const unsigned int *pPackedNormal, float *pNormal ) +{ + int temp[3]; + temp[0] = ((*pPackedNormal >> 0L) & 0x7ff); + if ( temp[0] & 0x400 ) + { + temp[0] = 2048 - temp[0]; + } + temp[1] = ((*pPackedNormal >> 11L) & 0x7ff); + if ( temp[1] & 0x400 ) + { + temp[1] = 2048 - temp[1]; + } + temp[2] = ((*pPackedNormal >> 22L) & 0x3ff); + if ( temp[2] & 0x200 ) + { + temp[2] = 1024 - temp[2]; + } + pNormal[0] = (float)temp[0] * 1.0f/1023.0f; + 
pNormal[1] = (float)temp[1] * 1.0f/1023.0f; + pNormal[2] = (float)temp[2] * 1.0f/511.0f; + return pNormal; +} + +FORCEINLINE unsigned int * PackNormal_HEND3N( const float *pNormal, unsigned int *pPackedNormal ) +{ + int temp[3]; + + temp[0] = Float2Int( pNormal[0] * 1023.0f ); + temp[1] = Float2Int( pNormal[1] * 1023.0f ); + temp[2] = Float2Int( pNormal[2] * 511.0f ); + + // the normal is out of bounds, determine the source and fix + // clamping would be even more of a slowdown here + Assert( temp[0] >= -1023 && temp[0] <= 1023 ); + Assert( temp[1] >= -1023 && temp[1] <= 1023 ); + Assert( temp[2] >= -511 && temp[2] <= 511 ); + + *pPackedNormal = ( ( temp[2] & 0x3ff ) << 22L ) | + ( ( temp[1] & 0x7ff ) << 11L ) | + ( ( temp[0] & 0x7ff ) << 0L ); + return pPackedNormal; +} + +FORCEINLINE unsigned int * PackNormal_HEND3N( float nx, float ny, float nz, unsigned int *pPackedNormal ) +{ + int temp[3]; + + temp[0] = Float2Int( nx * 1023.0f ); + temp[1] = Float2Int( ny * 1023.0f ); + temp[2] = Float2Int( nz * 511.0f ); + + // the normal is out of bounds, determine the source and fix + // clamping would be even more of a slowdown here + Assert( temp[0] >= -1023 && temp[0] <= 1023 ); + Assert( temp[1] >= -1023 && temp[1] <= 1023 ); + Assert( temp[2] >= -511 && temp[2] <= 511 ); + + *pPackedNormal = ( ( temp[2] & 0x3ff ) << 22L ) | + ( ( temp[1] & 0x7ff ) << 11L ) | + ( ( temp[0] & 0x7ff ) << 0L ); + return pPackedNormal; +} + +FORCEINLINE float * UnpackNormal_SHORT2( const unsigned int *pPackedNormal, float *pNormal, bool bIsTangent = FALSE ) +{ + // Unpacks from Jason's 2-short format (fills in a 4th binormal-sign (+1/-1) value, if this is a tangent vector) + + // FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits) + short iX = (*pPackedNormal & 0x0000FFFF); + short iY = (*pPackedNormal & 0xFFFF0000) >> 16; + + float zSign = +1; + if ( iX < 0 ) + { + zSign = -1; + iX = -iX; + } + float tSign = +1; + if ( iY < 0 ) + { + tSign = -1; 
+ iY = -iY; + } + + pNormal[0] = ( iX - 16384.0f ) / 16384.0f; + pNormal[1] = ( iY - 16384.0f ) / 16384.0f; + pNormal[2] = zSign*sqrtf( 1.0f - ( pNormal[0]*pNormal[0] + pNormal[1]*pNormal[1] ) ); + if ( bIsTangent ) + { + pNormal[3] = tSign; + } + + return pNormal; +} + +FORCEINLINE unsigned int * PackNormal_SHORT2( float nx, float ny, float nz, unsigned int *pPackedNormal, float binormalSign = +1.0f ) +{ + // Pack a vector (ASSUMED TO BE NORMALIZED) into Jason's 4-byte (SHORT2) format. + // This simply reconstructs Z from X & Y. It uses the sign bits of the X & Y coords + // to reconstruct the sign of Z and, if this is a tangent vector, the sign of the + // binormal (this is needed because tangent/binormal vectors are supposed to follow + // UV gradients, but shaders reconstruct the binormal from the tangent and normal + // assuming that they form a right-handed basis). + + nx += 1; // [-1,+1] -> [0,2] + ny += 1; + nx *= 16384.0f; // [ 0, 2] -> [0,32768] + ny *= 16384.0f; + + // '0' and '32768' values are invalid encodings + nx = max( nx, 1.0f ); // Make sure there are no zero values + ny = max( ny, 1.0f ); + nx = min( nx, 32767.0f ); // Make sure there are no 32768 values + ny = min( ny, 32767.0f ); + + if ( nz < 0.0f ) + nx = -nx; // Set the sign bit for z + + ny *= binormalSign; // Set the sign bit for the binormal (use when encoding a tangent vector) + + // FIXME: short math is slow on 360 - use ints here instead (bit-twiddle to deal w/ the sign bits), also use Float2Int() + short sX = (short)nx; // signed short [1,32767] + short sY = (short)ny; + + *pPackedNormal = ( sX & 0x0000FFFF ) | ( sY << 16 ); // NOTE: The mask is necessary (if sX is negative and cast to an int...) 
+ + return pPackedNormal; +} + +FORCEINLINE unsigned int * PackNormal_SHORT2( const float *pNormal, unsigned int *pPackedNormal, float binormalSign = +1.0f ) +{ + return PackNormal_SHORT2( pNormal[0], pNormal[1], pNormal[2], pPackedNormal, binormalSign ); +} + +// Unpacks a UBYTE4 normal (for a tangent, the result's fourth component receives the binormal 'sign') +FORCEINLINE float * UnpackNormal_UBYTE4( const unsigned int *pPackedNormal, float *pNormal, bool bIsTangent = FALSE ) +{ + unsigned char cX, cY; + if ( bIsTangent ) + { + cX = *pPackedNormal >> 16; // Unpack Z + cY = *pPackedNormal >> 24; // Unpack W + } + else + { + cX = *pPackedNormal >> 0; // Unpack X + cY = *pPackedNormal >> 8; // Unpack Y + } + + float x = cX - 128.0f; + float y = cY - 128.0f; + float z; + + float zSignBit = x < 0 ? 1.0f : 0.0f; // z and t negative bits (like slt asm instruction) + float tSignBit = y < 0 ? 1.0f : 0.0f; + float zSign = -( 2*zSignBit - 1 ); // z and t signs + float tSign = -( 2*tSignBit - 1 ); + + x = x*zSign - zSignBit; // 0..127 + y = y*tSign - tSignBit; + x = x - 64; // -64..63 + y = y - 64; + + float xSignBit = x < 0 ? 1.0f : 0.0f; // x and y negative bits (like slt asm instruction) + float ySignBit = y < 0 ? 
1.0f : 0.0f; + float xSign = -( 2*xSignBit - 1 ); // x and y signs + float ySign = -( 2*ySignBit - 1 ); + + x = ( x*xSign - xSignBit ) / 63.0f; // 0..1 range + y = ( y*ySign - ySignBit ) / 63.0f; + z = 1.0f - x - y; + + float oolen = 1.0f / sqrt( x*x + y*y + z*z ); // Normalize and + x *= oolen * xSign; // Recover signs + y *= oolen * ySign; + z *= oolen * zSign; + + pNormal[0] = x; + pNormal[1] = y; + pNormal[2] = z; + if ( bIsTangent ) + { + pNormal[3] = tSign; + } + + return pNormal; +} + +////////////////////////////////////////////////////////////////////////////// +// See: http://www.oroboro.com/rafael/docserv.php/index/programming/article/unitv2 +// +// UBYTE4 encoding, using per-octant projection onto x+y+z=1 +// Assume input vector is already unit length +// +// binormalSign specifies 'sign' of binormal, stored in t sign bit of tangent +// (lets the shader know whether norm/tan/bin form a right-handed basis) +// +// bIsTangent is used to specify which WORD of the output to store the data +// The expected usage is to call once with the normal and once with +// the tangent and binormal sign flag, bitwise OR'ing the returned DWORDs +FORCEINLINE unsigned int * PackNormal_UBYTE4( float nx, float ny, float nz, unsigned int *pPackedNormal, bool bIsTangent = false, float binormalSign = +1.0f ) +{ + float xSign = nx < 0.0f ? -1.0f : 1.0f; // -1 or 1 sign + float ySign = ny < 0.0f ? -1.0f : 1.0f; + float zSign = nz < 0.0f ? 
-1.0f : 1.0f; + float tSign = binormalSign; + Assert( ( binormalSign == +1.0f ) || ( binormalSign == -1.0f ) ); + + float xSignBit = 0.5f*( 1 - xSign ); // [-1,+1] -> [1,0] + float ySignBit = 0.5f*( 1 - ySign ); // 1 is negative bit (like slt instruction) + float zSignBit = 0.5f*( 1 - zSign ); + float tSignBit = 0.5f*( 1 - binormalSign ); + + float absX = xSign*nx; // 0..1 range (abs) + float absY = ySign*ny; + float absZ = zSign*nz; + + float xbits = absX / ( absX + absY + absZ ); // Project onto x+y+z=1 plane + float ybits = absY / ( absX + absY + absZ ); + + xbits *= 63; // 0..63 + ybits *= 63; + + xbits = xbits * xSign - xSignBit; // -64..63 range + ybits = ybits * ySign - ySignBit; + xbits += 64.0f; // 0..127 range + ybits += 64.0f; + + xbits = xbits * zSign - zSignBit; // Negate based on z and t + ybits = ybits * tSign - tSignBit; // -128..127 range + + xbits += 128.0f; // 0..255 range + ybits += 128.0f; + + unsigned char cX = (unsigned char) xbits; + unsigned char cY = (unsigned char) ybits; + + if ( !bIsTangent ) + *pPackedNormal = (cX << 0) | (cY << 8); // xy for normal + else + *pPackedNormal = (cX << 16) | (cY << 24); // zw for tangent + + return pPackedNormal; +} + +FORCEINLINE unsigned int * PackNormal_UBYTE4( const float *pNormal, unsigned int *pPackedNormal, bool bIsTangent = false, float binormalSign = +1.0f ) +{ + return PackNormal_UBYTE4( pNormal[0], pNormal[1], pNormal[2], pPackedNormal, bIsTangent, binormalSign ); +} + + +//----------------------------------------------------------------------------- +// Convert RGB to HSV +//----------------------------------------------------------------------------- +void RGBtoHSV( const Vector &rgb, Vector &hsv ); + + +//----------------------------------------------------------------------------- +// Convert HSV to RGB +//----------------------------------------------------------------------------- +void HSVtoRGB( const Vector &hsv, Vector &rgb ); + + 
+//----------------------------------------------------------------------------- +// Fast version of pow and log +//----------------------------------------------------------------------------- + +float FastLog2(float i); // log2( i ) +float FastPow2(float i); // 2^i +float FastPow(float a, float b); // a^b +float FastPow10( float i ); // 10^i + +//----------------------------------------------------------------------------- +// For testing float equality +//----------------------------------------------------------------------------- + +inline bool CloseEnough( float a, float b, float epsilon = EQUAL_EPSILON ) +{ + return fabs( a - b ) <= epsilon; +} + +inline bool CloseEnough( const Vector &a, const Vector &b, float epsilon = EQUAL_EPSILON ) +{ + return fabs( a.x - b.x ) <= epsilon && + fabs( a.y - b.y ) <= epsilon && + fabs( a.z - b.z ) <= epsilon; +} + +// Fast compare +// maxUlps is the maximum error in terms of Units in the Last Place. This +// specifies how big an error we are willing to accept in terms of the value +// of the least significant digit of the floating point number's +// representation. maxUlps can also be interpreted in terms of how many +// representable floats we are willing to accept between A and B. +// This function will allow maxUlps-1 floats between A and B. +bool AlmostEqual(float a, float b, int maxUlps = 10); + +inline bool AlmostEqual( const Vector &a, const Vector &b, int maxUlps = 10) +{ + return AlmostEqual( a.x, b.x, maxUlps ) && + AlmostEqual( a.y, b.y, maxUlps ) && + AlmostEqual( a.z, b.z, maxUlps ); +} + + +#endif // MATH_BASE_H + diff --git a/public/mathlib/matrixmath.h b/public/mathlib/matrixmath.h new file mode 100644 index 0000000..9c7f207 --- /dev/null +++ b/public/mathlib/matrixmath.h @@ -0,0 +1,385 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +// A set of generic, template-based matrix functions. 
+//===========================================================================// + +#ifndef MATRIXMATH_H +#define MATRIXMATH_H + +#include <stdarg.h> + +// The operations in this file can perform basic matrix operations on matrices represented +// using any class that supports the necessary operations: +// +// .Element( row, col ) - return the element at a given matrox position +// .SetElement( row, col, val ) - modify an element +// .Width(), .Height() - get dimensions +// .SetDimensions( nrows, ncols) - set a matrix to be un-initted and the appropriate size +// +// Generally, vectors can be used with these functions by using N x 1 matrices to represent them. +// Matrices are addressed as row, column, and indices are 0-based +// +// +// Note that the template versions of these routines are defined for generality - it is expected +// that template specialization is used for common high performance cases. + +namespace MatrixMath +{ + /// M *= flScaleValue + template<class MATRIXCLASS> + void ScaleMatrix( MATRIXCLASS &matrix, float flScaleValue ) + { + for( int i = 0; i < matrix.Height(); i++ ) + { + for( int j = 0; j < matrix.Width(); j++ ) + { + matrix.SetElement( i, j, flScaleValue * matrix.Element( i, j ) ); + } + } + } + + /// AppendElementToMatrix - same as setting the element, except only works when all calls + /// happen in top to bottom left to right order, end you have to call FinishedAppending when + /// done. For normal matrix classes this is not different then SetElement, but for + /// CSparseMatrix, it is an accelerated way to fill a matrix from scratch. 
+ template<class MATRIXCLASS> + FORCEINLINE void AppendElement( MATRIXCLASS &matrix, int nRow, int nCol, float flValue ) + { + matrix.SetElement( nRow, nCol, flValue ); // default implementation + } + + template<class MATRIXCLASS> + FORCEINLINE void FinishedAppending( MATRIXCLASS &matrix ) {} // default implementation + + /// M += fl + template<class MATRIXCLASS> + void AddToMatrix( MATRIXCLASS &matrix, float flAddend ) + { + for( int i = 0; i < matrix.Height(); i++ ) + { + for( int j = 0; j < matrix.Width(); j++ ) + { + matrix.SetElement( i, j, flAddend + matrix.Element( i, j ) ); + } + } + } + + /// transpose + template<class MATRIXCLASSIN, class MATRIXCLASSOUT> + void TransposeMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut ) + { + pMatrixOut->SetDimensions( matrixIn.Width(), matrixIn.Height() ); + for( int i = 0; i < pMatrixOut->Height(); i++ ) + { + for( int j = 0; j < pMatrixOut->Width(); j++ ) + { + AppendElement( *pMatrixOut, i, j, matrixIn.Element( j, i ) ); + } + } + FinishedAppending( *pMatrixOut ); + } + + /// copy + template<class MATRIXCLASSIN, class MATRIXCLASSOUT> + void CopyMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut ) + { + pMatrixOut->SetDimensions( matrixIn.Height(), matrixIn.Width() ); + for( int i = 0; i < matrixIn.Height(); i++ ) + { + for( int j = 0; j < matrixIn.Width(); j++ ) + { + AppendElement( *pMatrixOut, i, j, matrixIn.Element( i, j ) ); + } + } + FinishedAppending( *pMatrixOut ); + } + + + + /// M+=M + template<class MATRIXCLASSIN, class MATRIXCLASSOUT> + void AddMatrixToMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut ) + { + for( int i = 0; i < matrixIn.Height(); i++ ) + { + for( int j = 0; j < matrixIn.Width(); j++ ) + { + pMatrixOut->SetElement( i, j, pMatrixOut->Element( i, j ) + matrixIn.Element( i, j ) ); + } + } + } + + // M += scale * M + template<class MATRIXCLASSIN, class MATRIXCLASSOUT> + void AddScaledMatrixToMatrix( float flScale, MATRIXCLASSIN const &matrixIn, 
MATRIXCLASSOUT *pMatrixOut ) + { + for( int i = 0; i < matrixIn.Height(); i++ ) + { + for( int j = 0; j < matrixIn.Width(); j++ ) + { + pMatrixOut->SetElement( i, j, pMatrixOut->Element( i, j ) + flScale * matrixIn.Element( i, j ) ); + } + } + } + + + // simple way to initialize a matrix with constants from code. + template<class MATRIXCLASSOUT> + void SetMatrixToIdentity( MATRIXCLASSOUT *pMatrixOut, float flDiagonalValue = 1.0 ) + { + for( int i = 0; i < pMatrixOut->Height(); i++ ) + { + for( int j = 0; j < pMatrixOut->Width(); j++ ) + { + AppendElement( *pMatrixOut, i, j, ( i == j ) ? flDiagonalValue : 0 ); + } + } + FinishedAppending( *pMatrixOut ); + } + + //// simple way to initialize a matrix with constants from code + template<class MATRIXCLASSOUT> + void SetMatrixValues( MATRIXCLASSOUT *pMatrix, int nRows, int nCols, ... ) + { + va_list argPtr; + va_start( argPtr, nCols ); + + pMatrix->SetDimensions( nRows, nCols ); + for( int nRow = 0; nRow < nRows; nRow++ ) + { + for( int nCol = 0; nCol < nCols; nCol++ ) + { + double flNewValue = va_arg( argPtr, double ); + pMatrix->SetElement( nRow, nCol, flNewValue ); + } + } + va_end( argPtr ); + } + + + /// row and colum accessors. 
treat a row or a column as a column vector + template<class MATRIXTYPE> class MatrixRowAccessor + { + public: + FORCEINLINE MatrixRowAccessor( MATRIXTYPE const &matrix, int nRow ) + { + m_pMatrix = &matrix; + m_nRow = nRow; + } + + FORCEINLINE float Element( int nRow, int nCol ) const + { + Assert( nCol == 0 ); + return m_pMatrix->Element( m_nRow, nRow ); + } + + FORCEINLINE int Width( void ) const { return 1; }; + FORCEINLINE int Height( void ) const { return m_pMatrix->Width(); } + + private: + MATRIXTYPE const *m_pMatrix; + int m_nRow; + }; + + template<class MATRIXTYPE> class MatrixColumnAccessor + { + public: + FORCEINLINE MatrixColumnAccessor( MATRIXTYPE const &matrix, int nColumn ) + { + m_pMatrix = &matrix; + m_nColumn = nColumn; + } + + FORCEINLINE float Element( int nRow, int nColumn ) const + { + Assert( nColumn == 0 ); + return m_pMatrix->Element( nRow, m_nColumn ); + } + + FORCEINLINE int Width( void ) const { return 1; } + FORCEINLINE int Height( void ) const { return m_pMatrix->Height(); } + private: + MATRIXTYPE const *m_pMatrix; + int m_nColumn; + }; + + /// this translator acts as a proxy for the transposed matrix + template<class MATRIXTYPE> class MatrixTransposeAccessor + { + public: + FORCEINLINE MatrixTransposeAccessor( MATRIXTYPE const & matrix ) + { + m_pMatrix = &matrix; + } + + FORCEINLINE float Element( int nRow, int nColumn ) const + { + return m_pMatrix->Element( nColumn, nRow ); + } + + FORCEINLINE int Width( void ) const { return m_pMatrix->Height(); } + FORCEINLINE int Height( void ) const { return m_pMatrix->Width(); } + private: + MATRIXTYPE const *m_pMatrix; + }; + + /// this tranpose returns a wrapper around it's argument, allowing things like AddMatrixToMatrix( Transpose( matA ), &matB ) without an extra copy + template<class MATRIXCLASSIN> + MatrixTransposeAccessor<MATRIXCLASSIN> TransposeMatrix( MATRIXCLASSIN const &matrixIn ) + { + return MatrixTransposeAccessor<MATRIXCLASSIN>( matrixIn ); + } + + + /// retrieve rows and 
columns + template<class MATRIXTYPE> + FORCEINLINE MatrixColumnAccessor<MATRIXTYPE> MatrixColumn( MATRIXTYPE const &matrix, int nColumn ) + { + return MatrixColumnAccessor<MATRIXTYPE>( matrix, nColumn ); + } + + template<class MATRIXTYPE> + FORCEINLINE MatrixRowAccessor<MATRIXTYPE> MatrixRow( MATRIXTYPE const &matrix, int nRow ) + { + return MatrixRowAccessor<MATRIXTYPE>( matrix, nRow ); + } + + //// dot product between vectors (or rows and/or columns via accessors) + template<class MATRIXACCESSORATYPE, class MATRIXACCESSORBTYPE > + float InnerProduct( MATRIXACCESSORATYPE const &vecA, MATRIXACCESSORBTYPE const &vecB ) + { + Assert( vecA.Width() == 1 ); + Assert( vecB.Width() == 1 ); + Assert( vecA.Height() == vecB.Height() ); + double flResult = 0; + for( int i = 0; i < vecA.Height(); i++ ) + { + flResult += vecA.Element( i, 0 ) * vecB.Element( i, 0 ); + } + return flResult; + } + + + + /// matrix x matrix multiplication + template<class MATRIXATYPE, class MATRIXBTYPE, class MATRIXOUTTYPE> + void MatrixMultiply( MATRIXATYPE const &matA, MATRIXBTYPE const &matB, MATRIXOUTTYPE *pMatrixOut ) + { + Assert( matA.Width() == matB.Height() ); + pMatrixOut->SetDimensions( matA.Height(), matB.Width() ); + for( int i = 0; i < matA.Height(); i++ ) + { + for( int j = 0; j < matB.Width(); j++ ) + { + pMatrixOut->SetElement( i, j, InnerProduct( MatrixRow( matA, i ), MatrixColumn( matB, j ) ) ); + } + } + } + + /// solve Ax=B via the conjugate graident method. Code and naming conventions based on the + /// wikipedia article. 
+ template<class ATYPE, class XTYPE, class BTYPE> + void ConjugateGradient( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 ) + { + XTYPE vecR; + vecR.SetDimensions( vecX.Height(), 1 ); + MatrixMultiply( matA, vecX, &vecR ); + ScaleMatrix( vecR, -1 ); + AddMatrixToMatrix( vecB, &vecR ); + XTYPE vecP; + CopyMatrix( vecR, &vecP ); + float flRsOld = InnerProduct( vecR, vecR ); + for( int nIter = 0; nIter < 100; nIter++ ) + { + XTYPE vecAp; + MatrixMultiply( matA, vecP, &vecAp ); + float flDivisor = InnerProduct( vecAp, vecP ); + float flAlpha = flRsOld / flDivisor; + AddScaledMatrixToMatrix( flAlpha, vecP, &vecX ); + AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR ); + float flRsNew = InnerProduct( vecR, vecR ); + if ( flRsNew < flTolerance ) + { + break; + } + ScaleMatrix( vecP, flRsNew / flRsOld ); + AddMatrixToMatrix( vecR, &vecP ); + flRsOld = flRsNew; + } + } + + /// solve (A'*A) x=B via the conjugate gradient method. Code and naming conventions based on + /// the wikipedia article. 
Same as Conjugate gradient but allows passing in two matrices whose + /// product is used as the A matrix (in order to preserve sparsity) + template<class ATYPE, class APRIMETYPE, class XTYPE, class BTYPE> + void ConjugateGradient( ATYPE const &matA, APRIMETYPE const &matAPrime, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 ) + { + XTYPE vecR1; + vecR1.SetDimensions( vecX.Height(), 1 ); + MatrixMultiply( matA, vecX, &vecR1 ); + XTYPE vecR; + vecR.SetDimensions( vecR1.Height(), 1 ); + MatrixMultiply( matAPrime, vecR1, &vecR ); + ScaleMatrix( vecR, -1 ); + AddMatrixToMatrix( vecB, &vecR ); + XTYPE vecP; + CopyMatrix( vecR, &vecP ); + float flRsOld = InnerProduct( vecR, vecR ); + for( int nIter = 0; nIter < 100; nIter++ ) + { + XTYPE vecAp1; + MatrixMultiply( matA, vecP, &vecAp1 ); + XTYPE vecAp; + MatrixMultiply( matAPrime, vecAp1, &vecAp ); + float flDivisor = InnerProduct( vecAp, vecP ); + float flAlpha = flRsOld / flDivisor; + AddScaledMatrixToMatrix( flAlpha, vecP, &vecX ); + AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR ); + float flRsNew = InnerProduct( vecR, vecR ); + if ( flRsNew < flTolerance ) + { + break; + } + ScaleMatrix( vecP, flRsNew / flRsOld ); + AddMatrixToMatrix( vecR, &vecP ); + flRsOld = flRsNew; + } + } + + + template<class ATYPE, class XTYPE, class BTYPE> + void LeastSquaresFit( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX ) + { + // now, generate the normal equations + BTYPE vecBeta; + MatrixMath::MatrixMultiply( MatrixMath::TransposeMatrix( matA ), vecB, &vecBeta ); + + vecX.SetDimensions( matA.Width(), 1 ); + MatrixMath::SetMatrixToIdentity( &vecX ); + + ATYPE matATransposed; + TransposeMatrix( matA, &matATransposed ); + ConjugateGradient( matA, matATransposed, vecBeta, vecX, 1.0e-20 ); + } + +}; + +/// a simple fixed-size matrix class +template<int NUMROWS, int NUMCOLS> class CFixedMatrix +{ +public: + FORCEINLINE int Width( void ) const { return NUMCOLS; } + FORCEINLINE int Height( void ) const { return NUMROWS; } + 
FORCEINLINE float Element( int nRow, int nCol ) const { return m_flValues[nRow][nCol]; } + FORCEINLINE void SetElement( int nRow, int nCol, float flValue ) { m_flValues[nRow][nCol] = flValue; } + FORCEINLINE void SetDimensions( int nNumRows, int nNumCols ) { Assert( ( nNumRows == NUMROWS ) && ( nNumCols == NUMCOLS ) ); } + +private: + float m_flValues[NUMROWS][NUMCOLS]; +}; + + + +#endif //matrixmath_h diff --git a/public/mathlib/noise.h b/public/mathlib/noise.h new file mode 100644 index 0000000..19d3f72 --- /dev/null +++ b/public/mathlib/noise.h @@ -0,0 +1,35 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +//=====================================================================================// + +#ifndef NOISE_H +#define NOISE_H + +#include <math.h> +#include "basetypes.h" +#include "mathlib/vector.h" +#include "tier0/dbg.h" + + +// The following code is the c-ification of Ken Perlin's new noise algorithm +// "JAVA REFERENCE IMPLEMENTATION OF IMPROVED NOISE - COPYRIGHT 2002 KEN PERLIN" +// as available here: http://mrl.nyu.edu/~perlin/noise/ +// it generates a single octave of noise in the -1..1 range +// this should at some point probably replace SparseConvolutionNoise - jd +float ImprovedPerlinNoise( Vector const &pnt ); + +// get the noise value at a point. Output range is 0..1. +float SparseConvolutionNoise( Vector const &pnt ); + +// get the noise value at a point, passing a custom noise shaping function. The noise shaping +// function should map the domain 0..1 to 0..1. +float SparseConvolutionNoise(Vector const &pnt, float (*pNoiseShapeFunction)(float) ); + +// returns a 1/f noise. more octaves take longer +float FractalNoise( Vector const &pnt, int n_octaves ); + +// returns a abs(f)*1/f noise i.e. 
turbulence +float Turbulence( Vector const &pnt, int n_octaves ); +#endif // NOISE_H diff --git a/public/mathlib/polyhedron.h b/public/mathlib/polyhedron.h new file mode 100644 index 0000000..38b465c --- /dev/null +++ b/public/mathlib/polyhedron.h @@ -0,0 +1,73 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +// $NoKeywords: $ +// +//=============================================================================// + +#ifndef POLYHEDRON_H_ +#define POLYHEDRON_H_ + +#ifdef _WIN32 +#pragma once +#endif + +#include "mathlib/mathlib.h" + + + +struct Polyhedron_IndexedLine_t +{ + unsigned short iPointIndices[2]; +}; + +struct Polyhedron_IndexedLineReference_t +{ + unsigned short iLineIndex; + unsigned char iEndPointIndex; //since two polygons reference any one line, one needs to traverse the line backwards, this flags that behavior +}; + +struct Polyhedron_IndexedPolygon_t +{ + unsigned short iFirstIndex; + unsigned short iIndexCount; + Vector polyNormal; +}; + +class CPolyhedron //made into a class because it's going virtual to support distinctions between temp and permanent versions +{ +public: + Vector *pVertices; + Polyhedron_IndexedLine_t *pLines; + Polyhedron_IndexedLineReference_t *pIndices; + Polyhedron_IndexedPolygon_t *pPolygons; + + unsigned short iVertexCount; + unsigned short iLineCount; + unsigned short iIndexCount; + unsigned short iPolygonCount; + + virtual ~CPolyhedron( void ) {}; + virtual void Release( void ) = 0; + Vector Center( void ); +}; + +class CPolyhedron_AllocByNew : public CPolyhedron +{ +public: + virtual void Release( void ); + static CPolyhedron_AllocByNew *Allocate( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ); //creates the polyhedron along with enough memory to hold all it's data in a single allocation + +private: + CPolyhedron_AllocByNew( void ) { }; //CPolyhedron_AllocByNew::Allocate() is the only way to create one of these. 
+}; + +CPolyhedron *GeneratePolyhedronFromPlanes( const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory = false ); //be sure to polyhedron->Release() +CPolyhedron *ClipPolyhedron( const CPolyhedron *pExistingPolyhedron, const float *pOutwardFacingPlanes, int iPlaneCount, float fOnPlaneEpsilon, bool bUseTemporaryMemory = false ); //this does NOT modify/delete the existing polyhedron + +CPolyhedron *GetTempPolyhedron( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ); //grab the temporary polyhedron. Avoids new/delete for quick work. Can only be in use by one chunk of code at a time + + +#endif //#ifndef POLYHEDRON_H_ + diff --git a/public/mathlib/quantize.h b/public/mathlib/quantize.h new file mode 100644 index 0000000..5e5b742 --- /dev/null +++ b/public/mathlib/quantize.h @@ -0,0 +1,141 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +// $NoKeywords: $ +// +//=============================================================================// +#ifndef QUANTIZE_H +#define QUANTIZE_H + +#ifndef STRING_H +#include <string.h> +#endif + +#define MAXDIMS 768 +#define MAXQUANT 16000 + + +#include <tier0/platform.h> + +struct Sample; + +struct QuantizedValue { + double MinError; // minimum possible error. used + // for neighbor searches. + struct QuantizedValue *Children[2]; // splits + int32 value; // only exists for leaf nodes + struct Sample *Samples; // every sample quantized into this + // entry + int32 NSamples; // how many were quantized to this. 
+ int32 TotSamples; + double *ErrorMeasure; // variance measure for each dimension + double TotalError; // sum of errors + uint8 *Mean; // average value of each dimension + uint8 *Mins; // min box for children and this + uint8 *Maxs; // max box for children and this + int NQuant; // the number of samples which were + // quantzied to this node since the + // last time OptimizeQuantizer() + // was called. + int *Sums; // sum used by OptimizeQuantizer + int sortdim; // dimension currently sorted along. +}; + +struct Sample { + int32 ID; // identifier of this sample. can + // be used for any purpose. + int32 Count; // number of samples this sample + // represents + int32 QNum; // what value this sample ended up quantized + // to. + struct QuantizedValue *qptr; // ptr to what this was quantized to. + uint8 Value[1]; // array of values for multi-dimensional + // variables. +}; + +void FreeQuantization(struct QuantizedValue *t); + +struct QuantizedValue *Quantize(struct Sample *s, int nsamples, int ndims, + int nvalues, uint8 *weights, int value0=0); + +int CompressSamples(struct Sample *s, int nsamples, int ndims); + +struct QuantizedValue *FindMatch(uint8 const *sample, + int ndims,uint8 *weights, + struct QuantizedValue *QTable); +void PrintSamples(struct Sample const *s, int nsamples, int ndims); + +struct QuantizedValue *FindQNode(struct QuantizedValue const *q, int32 code); + +inline struct Sample *NthSample(struct Sample *s, int i, int nd) +{ + uint8 *r=(uint8 *) s; + r+=i*(sizeof(*s)+(nd-1)); + return (struct Sample *) r; +} + +inline struct Sample *AllocSamples(int ns, int nd) +{ + size_t size5=(sizeof(struct Sample)+(nd-1))*ns; + void *ret=new uint8[size5]; + memset(ret,0,size5); + for(int i=0;i<ns;i++) + NthSample((struct Sample *)ret,i,nd)->Count=1; + return (struct Sample *) ret; +} + + +// MinimumError: what is the min error which will occur if quantizing +// a sample to the given qnode? This is just the error if the qnode +// is a leaf. 
+double MinimumError(struct QuantizedValue const *q, uint8 const *sample, + int ndims, uint8 const *weights); +double MaximumError(struct QuantizedValue const *q, uint8 const *sample, + int ndims, uint8 const *weights); + +void PrintQTree(struct QuantizedValue const *p,int idlevel=0); +void OptimizeQuantizer(struct QuantizedValue *q, int ndims); + +// RecalculateVelues: update the means in a sample tree, based upon +// the samples. can be used to reoptimize when samples are deleted, +// for instance. + +void RecalculateValues(struct QuantizedValue *q, int ndims); + +extern double SquaredError; // may be reset and examined. updated by + // FindMatch() + + + + +// the routines below can be used for uniform quantization via dart-throwing. +typedef void (*GENERATOR)(void *); // generate a random sample +typedef double (*COMPARER)(void const *a, void const *b); + +void *DartThrow(int NResults, int NTries, size_t itemsize, GENERATOR gen, + COMPARER cmp); +void *FindClosestDart(void *items,int NResults, size_t itemsize, + COMPARER cmp, void *lookfor, int *idx); + + + + +// color quantization of 24 bit images +#define QUANTFLAGS_NODITHER 1 // don't do Floyd-steinberg dither + +extern void ColorQuantize( +uint8 const *pImage, // 4 byte pixels ARGB +int nWidth, +int nHeight, +int nFlags, // QUANTFLAGS_xxx +int nColors, // # of colors to fill in in palette +uint8 *pOutPixels, // where to store resulting 8 bit pixels +uint8 *pOutPalette, // where to store resulting 768-byte palette +int nFirstColor); // first color to use in mapping + + + + + +#endif diff --git a/public/mathlib/simdvectormatrix.h b/public/mathlib/simdvectormatrix.h new file mode 100644 index 0000000..f88cd32 --- /dev/null +++ b/public/mathlib/simdvectormatrix.h @@ -0,0 +1,142 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: Provide a class (SSE/SIMD only) holding a 2d matrix of class FourVectors, +// for high speed processing in tools. 
+// +// $NoKeywords: $ +// +//=============================================================================// + +#ifndef SIMDVECTORMATRIX_H +#define SIMDVECTORMATRIX_H + +#ifdef _WIN32 +#pragma once +#endif + + +#include <string.h> +#include "tier0/platform.h" +#include "tier0/dbg.h" +#include "tier1/utlsoacontainer.h" +#include "mathlib/ssemath.h" + +class CSIMDVectorMatrix +{ +public: + int m_nWidth; // in actual vectors + int m_nHeight; + + int m_nPaddedWidth; // # of 4x wide elements + + FourVectors *m_pData; + +protected: + void Init( void ) + { + m_pData = NULL; + m_nWidth = 0; + m_nHeight = 0; + m_nPaddedWidth = 0; + } + + int NVectors( void ) const + { + return m_nHeight * m_nPaddedWidth; + } + +public: + // constructors and destructors + CSIMDVectorMatrix( void ) + { + Init(); + } + + ~CSIMDVectorMatrix( void ) + { + if ( m_pData ) + delete[] m_pData; + } + + // set up storage and fields for m x n matrix. destroys old data + void SetSize( int width, int height ) + { + if ( ( ! m_pData ) || ( width != m_nWidth ) || ( height != m_nHeight ) ) + { + if ( m_pData ) + delete[] m_pData; + + m_nWidth = width; + m_nHeight = height; + + m_nPaddedWidth = ( m_nWidth + 3) >> 2; + m_pData = NULL; + if ( width && height ) + m_pData = new FourVectors[ m_nPaddedWidth * m_nHeight ]; + } + } + + CSIMDVectorMatrix( int width, int height ) + { + Init(); + SetSize( width, height ); + } + + CSIMDVectorMatrix &operator=( CSIMDVectorMatrix const &src ) + { + SetSize( src.m_nWidth, src.m_nHeight ); + if ( m_pData ) + memcpy( m_pData, src.m_pData, m_nHeight*m_nPaddedWidth*sizeof(m_pData[0]) ); + return *this; + } + + CSIMDVectorMatrix &operator+=( CSIMDVectorMatrix const &src ); + + CSIMDVectorMatrix &operator*=( Vector const &src ); + + // create from an RGBA float bitmap. alpha ignored. 
+ void CreateFromRGBA_FloatImageData(int srcwidth, int srcheight, float const *srcdata ); + + // create from 3 fields in a csoa + void CreateFromCSOAAttributes( CSOAContainer const *pSrc, + int nAttrIdx0, int nAttrIdx1, int nAttrIdx2 ); + + // Element access. If you are calling this a lot, you don't want to use this class, because + // you're not getting the sse advantage + Vector Element(int x, int y) const + { + Assert( m_pData ); + Assert( x < m_nWidth ); + Assert( y < m_nHeight ); + Vector ret; + FourVectors const *pData=m_pData+y*m_nPaddedWidth+(x >> 2); + + int xo=(x & 3); + ret.x=pData->X( xo ); + ret.y=pData->Y( xo ); + ret.z=pData->Z( xo ); + return ret; + } + + //addressing the individual fourvectors elements + FourVectors &CompoundElement(int x, int y) + { + Assert( m_pData ); + Assert( y < m_nHeight ); + Assert( x < m_nPaddedWidth ); + return m_pData[x + m_nPaddedWidth*y ]; + } + + // math operations on the whole image + void Clear( void ) + { + Assert( m_pData ); + memset( m_pData, 0, m_nHeight*m_nPaddedWidth*sizeof(m_pData[0]) ); + } + + void RaiseToPower( float power ); +}; + + + +#endif diff --git a/public/mathlib/spherical_geometry.h b/public/mathlib/spherical_geometry.h new file mode 100644 index 0000000..04310f4 --- /dev/null +++ b/public/mathlib/spherical_geometry.h @@ -0,0 +1,73 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: Functions for spherical geometry. +// +// $NoKeywords: $ +// +//=============================================================================// + +#ifndef SPHERICAL_GEOMETRY_H +#define SPHERICAL_GEOMETRY_H + +#ifdef _WIN32 +#pragma once +#endif + +#include <math.h> +#include <float.h> + +// see http://mathworld.wolfram.com/SphericalTrigonometry.html + +// return the spherical distance, in radians, between 2 points on the unit sphere. 
+FORCEINLINE float UnitSphereLineSegmentLength( Vector const &a, Vector const &b ) +{ + // check unit length + Assert( fabs( VectorLength( a ) - 1.0 ) < 1.0e-3 ); + Assert( fabs( VectorLength( b ) - 1.0 ) < 1.0e-3 ); + return acos( DotProduct( a, b ) ); +} + + +// given 3 points on the unit sphere, return the spherical area (in radians) of the triangle they form. +// valid for "small" triangles. +FORCEINLINE float UnitSphereTriangleArea( Vector const &a, Vector const &b , Vector const &c ) +{ + float flLengthA = UnitSphereLineSegmentLength( b, c ); + float flLengthB = UnitSphereLineSegmentLength( c, a ); + float flLengthC = UnitSphereLineSegmentLength( a, b ); + + if ( ( flLengthA == 0. ) || ( flLengthB == 0. ) || ( flLengthC == 0. ) ) + return 0.; // zero area triangle + + // now, find the 3 incribed angles for the triangle + float flHalfSumLens = 0.5 * ( flLengthA + flLengthB + flLengthC ); + float flSinSums = sin( flHalfSumLens ); + float flSinSMinusA= sin( flHalfSumLens - flLengthA ); + float flSinSMinusB= sin( flHalfSumLens - flLengthB ); + float flSinSMinusC= sin( flHalfSumLens - flLengthC ); + + float flTanAOver2 = sqrt ( ( flSinSMinusB * flSinSMinusC ) / ( flSinSums * flSinSMinusA ) ); + float flTanBOver2 = sqrt ( ( flSinSMinusA * flSinSMinusC ) / ( flSinSums * flSinSMinusB ) ); + float flTanCOver2 = sqrt ( ( flSinSMinusA * flSinSMinusB ) / ( flSinSums * flSinSMinusC ) ); + + // Girards formula : area = sum of angles - pi. + return 2.0 * ( atan( flTanAOver2 ) + atan( flTanBOver2 ) + atan( flTanCOver2 ) ) - M_PI; +} + +// spherical harmonics-related functions. Best explanation at http://www.research.scea.com/gdc2003/spherical-harmonic-lighting.pdf + +// Evaluate associated legendre polynomial P( l, m ) at flX, using recurrence relation +float AssociatedLegendrePolynomial( int nL, int nM, float flX ); + +// Evaluate order N spherical harmonic with spherical coordinates +// nL = band, 0..N +// nM = -nL .. nL +// theta = 0..M_PI +// phi = 0.. 
2 * M_PHI +float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi ); + +// evaluate spherical harmonic with normalized vector direction +float SphericalHarmonic( int nL, int nM, Vector const &vecDirection ); + + +#endif // SPHERICAL_GEOMETRY_H diff --git a/public/mathlib/ssemath.h b/public/mathlib/ssemath.h new file mode 100644 index 0000000..c2ff48d --- /dev/null +++ b/public/mathlib/ssemath.h @@ -0,0 +1,3107 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: - defines SIMD "structure of arrays" classes and functions. +// +//===========================================================================// +#ifndef SSEMATH_H +#define SSEMATH_H + +#if defined( _X360 ) +#include <xboxmath.h> +#else +#include <xmmintrin.h> +#endif + +#include <mathlib/vector.h> +#include <mathlib/mathlib.h> + +#if defined(GNUC) +#define USE_STDC_FOR_SIMD 0 +#else +#define USE_STDC_FOR_SIMD 0 +#endif + +#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0)) +#define _SSE1 1 +#endif + +// I thought about defining a class/union for the SIMD packed floats instead of using fltx4, +// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur +// the relationship between packed floats and packed integer types and (b) not sure that the +// compiler would handle generating good code for the intrinsics. + +#if USE_STDC_FOR_SIMD + +typedef union +{ + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4; + +typedef fltx4 i32x4; +typedef fltx4 u32x4; + +#elif ( defined( _X360 ) ) + +typedef union +{ + // This union allows float/int access (which generally shouldn't be done in inner loops) + __vector4 vmx; + float m128_f32[4]; + uint32 m128_u32[4]; +} fltx4_union; + +typedef __vector4 fltx4; +typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops. +typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops. 
+ +#else + +typedef __m128 fltx4; +typedef __m128 i32x4; +typedef __m128 u32x4; + +#endif + +// The FLTX4 type is a fltx4 used as a parameter to a function. +// On the 360, the best way to do this is pass-by-copy on the registers. +// On the PC, the best way is to pass by const reference. +// The compiler will sometimes, but not always, replace a pass-by-const-ref +// with a pass-in-reg on the 360; to avoid this confusion, you can +// explicitly use a FLTX4 as the parameter type. +#ifdef _X360 +typedef __vector4 FLTX4; +#else +typedef const fltx4 & FLTX4; +#endif + +// A 16-byte aligned int32 datastructure +// (for use when writing out fltx4's as SIGNED +// ints). +struct ALIGN16 intx4 +{ + int32 m_i32[4]; + + inline int & operator[](int which) + { + return m_i32[which]; + } + + inline const int & operator[](int which) const + { + return m_i32[which]; + } + + inline int32 *Base() { + return m_i32; + } + + inline const int32 *Base() const + { + return m_i32; + } + + inline const bool operator==(const intx4 &other) const + { + return m_i32[0] == other.m_i32[0] && + m_i32[1] == other.m_i32[1] && + m_i32[2] == other.m_i32[2] && + m_i32[3] == other.m_i32[3] ; + } +} ALIGN16_POST; + + +#if defined( _DEBUG ) && defined( _X360 ) +FORCEINLINE void TestVPUFlags() +{ + // Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com) + __vector4 a; + __asm + { + mfvscr a; + } + unsigned int * flags = (unsigned int *)&a; + unsigned int controlWord = flags[3]; + Assert(controlWord == 0); +} +#else // _DEBUG +FORCEINLINE void TestVPUFlags() {} +#endif // _DEBUG + + +// useful constants in SIMD packed float format: +// (note: some of these aren't stored on the 360, +// but are manufactured directly in one or two +// instructions, saving a load and possible L2 +// miss.) 
+#ifndef _X360 +extern const fltx4 Four_Zeros; // 0 0 0 0 +extern const fltx4 Four_Ones; // 1 1 1 1 +extern const fltx4 Four_Twos; // 2 2 2 2 +extern const fltx4 Four_Threes; // 3 3 3 3 +extern const fltx4 Four_Fours; // guess. +extern const fltx4 Four_Point225s; // .225 .225 .225 .225 +extern const fltx4 Four_PointFives; // .5 .5 .5 .5 +extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON +extern const fltx4 Four_2ToThe21s; // (1<<21).. +extern const fltx4 Four_2ToThe22s; // (1<<22).. +extern const fltx4 Four_2ToThe23s; // (1<<23).. +extern const fltx4 Four_2ToThe24s; // (1<<24).. +extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) +extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +#else +#define Four_Zeros XMVectorZero() // 0 0 0 0 +#define Four_Ones XMVectorSplatOne() // 1 1 1 1 +extern const fltx4 Four_Twos; // 2 2 2 2 +extern const fltx4 Four_Threes; // 3 3 3 3 +extern const fltx4 Four_Fours; // guess. +extern const fltx4 Four_Point225s; // .225 .225 .225 .225 +extern const fltx4 Four_PointFives; // .5 .5 .5 .5 +extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON +extern const fltx4 Four_2ToThe21s; // (1<<21).. +extern const fltx4 Four_2ToThe22s; // (1<<22).. +extern const fltx4 Four_2ToThe23s; // (1<<23).. +extern const fltx4 Four_2ToThe24s; // (1<<24).. 
+extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2) +extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1 +#endif +extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX +extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX +extern const fltx4 g_SIMD_0123; // 0 1 2 3 as float + +// external aligned integer constants +extern const ALIGN16 uint32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4 +extern const ALIGN16 uint32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4 +extern const ALIGN16 uint32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4 +extern const ALIGN16 uint32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0 +extern const ALIGN16 uint32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF] +extern const ALIGN16 uint32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0 +extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4 + +// this mask is used for skipping the tail of things. If you have N elements in an array, and wish +// to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you want to use for the last iteration. +extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST; + +// Define prefetch macros. +// The characteristics of cache and prefetch are completely +// different between the different platforms, so you DO NOT +// want to just define one macro that maps to every platform +// intrinsic under the hood -- you need to prefetch at different +// intervals between x86 and PPC, for example, and that is +// a higher level code change. +// On the other hand, I'm tired of typing #ifdef _X360 +// all over the place, so this is just a nop on Intel, PS3. 
+#ifdef _X360 +#define PREFETCH360(address, offset) __dcbt(offset,address) +#else +#define PREFETCH360(x,y) // nothing +#endif + +#if USE_STDC_FOR_SIMD + +//--------------------------------------------------------------------- +// Standard C (fallback/Linux) implementation (only there for compat - slow) +//--------------------------------------------------------------------- + +FORCEINLINE float SubFloat( const fltx4 & a, int idx ) +{ + return a.m128_f32[ idx ]; +} + +FORCEINLINE float & SubFloat( fltx4 & a, int idx ) +{ + return a.m128_f32[idx]; +} + +FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) +{ + return a.m128_u32[idx]; +} + +FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) +{ + return a.m128_u32[idx]; +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadZeroSIMD( void ) +{ + return Four_Zeros; +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadOneSIMD( void ) +{ + return Four_Ones; +} + +FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 0 ); + SubFloat( retVal, 1 ) = SubFloat( a, 0 ); + SubFloat( retVal, 2 ) = SubFloat( a, 0 ); + SubFloat( retVal, 3 ) = SubFloat( a, 0 ); + return retVal; +} + +FORCEINLINE fltx4 SplatYSIMD( fltx4 a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 1 ); + SubFloat( retVal, 1 ) = SubFloat( a, 1 ); + SubFloat( retVal, 2 ) = SubFloat( a, 1 ); + SubFloat( retVal, 3 ) = SubFloat( a, 1 ); + return retVal; +} + +FORCEINLINE fltx4 SplatZSIMD( fltx4 a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 2 ); + SubFloat( retVal, 1 ) = SubFloat( a, 2 ); + SubFloat( retVal, 2 ) = SubFloat( a, 2 ); + SubFloat( retVal, 3 ) = SubFloat( a, 2 ); + return retVal; +} + +FORCEINLINE fltx4 SplatWSIMD( fltx4 a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 3 ); + SubFloat( retVal, 1 ) = SubFloat( a, 3 ); + SubFloat( retVal, 2 ) = SubFloat( a, 3 ); + 
SubFloat( retVal, 3 ) = SubFloat( a, 3 ); + return retVal; +} + +FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x ) +{ + fltx4 result = a; + SubFloat( result, 0 ) = SubFloat( x, 0 ); + return result; +} + +FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y ) +{ + fltx4 result = a; + SubFloat( result, 1 ) = SubFloat( y, 1 ); + return result; +} + +FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z ) +{ + fltx4 result = a; + SubFloat( result, 2 ) = SubFloat( z, 2 ); + return result; +} + +FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w ) +{ + fltx4 result = a; + SubFloat( result, 3 ) = SubFloat( w, 3 ); + return result; +} + +FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue ) +{ + fltx4 result = a; + SubFloat( result, nComponent ) = flValue; + return result; +} + +// a b c d -> b c d a +FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 1 ); + SubFloat( retVal, 1 ) = SubFloat( a, 2 ); + SubFloat( retVal, 2 ) = SubFloat( a, 3 ); + SubFloat( retVal, 3 ) = SubFloat( a, 0 ); + return retVal; +} + +// a b c d -> c d a b +FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = SubFloat( a, 2 ); + SubFloat( retVal, 1 ) = SubFloat( a, 3 ); + SubFloat( retVal, 2 ) = SubFloat( a, 0 ); + SubFloat( retVal, 3 ) = SubFloat( a, 1 ); + return retVal; +} + +#define BINOP(op) \ + fltx4 retVal; \ + SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) ); \ + SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) ); \ + SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) ); \ + SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) ); \ + return retVal; + +#define IBINOP(op) \ + fltx4 retVal; \ + SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) ); \ + SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) ); \ + SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) ); \ + 
SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) ); \ + return retVal; + +FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) +{ + BINOP(+); +} + +FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b +{ + BINOP(-); +}; + +FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b +{ + BINOP(*); +} + +FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b +{ + BINOP(/); +} + + +FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c +{ + return AddSIMD( MulSIMD(a,b), c ); +} + +FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b +{ + return SubSIMD( c, MulSIMD(a,b) ); +}; + + +FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) +{ + fltx4 result; + SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); + SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); + SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); + SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); + return result; +} + +FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); +} + +FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); + SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) ); +} + +FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) +{ + fltx4 result; + SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); + SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) ); + SubFloat( result, 2 ) = asin( 
SubFloat( sine, 2 ) ); + SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); + return result; +} + +FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) +{ + fltx4 result; + SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); + SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); + SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); + SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); + return result; +} + +// tan^1(a/b) .. ie, pass sin in as a and cos in as b +FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 result; + SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return result; +} + +FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b +{ + IBINOP(&); +} + +FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ~SubInt( a, 0 ) & SubInt( b, 0 ); + SubInt( retVal, 1 ) = ~SubInt( a, 1 ) & SubInt( b, 1 ); + SubInt( retVal, 2 ) = ~SubInt( a, 2 ) & SubInt( b, 2 ); + 
SubInt( retVal, 3 ) = ~SubInt( a, 3 ) & SubInt( b, 3 ); + return retVal; +} + +FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b +{ + IBINOP(^); +} + +FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b +{ + IBINOP(|); +} + +FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a +{ + fltx4 retval; + SubFloat( retval, 0 ) = -SubFloat( a, 0 ); + SubFloat( retval, 1 ) = -SubFloat( a, 1 ); + SubFloat( retval, 2 ) = -SubFloat( a, 2 ); + SubFloat( retval, 3 ) = -SubFloat( a, 3 ); + + return retval; +} + +FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero? +{ + return ( SubFloat( a, 0 ) == 0.0 ) && + ( SubFloat( a, 1 ) == 0.0 ) && + ( SubFloat( a, 2 ) == 0.0 ) && + ( SubFloat( a, 3 ) == 0.0 ) ; +} + + +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) +{ + return SubFloat(a,0) > SubFloat(b,0) && + SubFloat(a,1) > SubFloat(b,1) && + SubFloat(a,2) > SubFloat(b,2) && + SubFloat(a,3) > SubFloat(b,3); +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) +{ + return SubFloat(a,0) >= SubFloat(b,0) && + SubFloat(a,1) >= SubFloat(b,1) && + SubFloat(a,2) >= SubFloat(b,2) && + SubFloat(a,3) >= SubFloat(b,3); +} + +// For branching if all a.xyzw == b.xyzw +FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) +{ + return SubFloat(a,0) == SubFloat(b,0) && + SubFloat(a,1) == SubFloat(b,1) && + SubFloat(a,2) == SubFloat(b,2) && + SubFloat(a,3) == SubFloat(b,3); +} + +FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set +{ + int nRet = 0; + + nRet |= ( SubInt( a, 0 ) & 0x80000000 ) >> 31; // sign(x) -> bit 0 + nRet |= ( SubInt( a, 1 ) & 0x80000000 ) >> 30; // sign(y) -> bit 1 + nRet |= ( SubInt( a, 2 ) & 0x80000000 ) >> 29; // sign(z) -> bit 2 + nRet |= ( SubInt( a, 3 ) & 0x80000000 ) >> 28; // sign(w) -> bit 3 + + return nRet; +} + +FORCEINLINE bool 
IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD( a )); +} + +FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? 
~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? ~0 : 0; + return retVal; +} + +FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0 +{ + fltx4 retVal; + SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? ~0 : 0; + SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? ~0 : 0; + SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? ~0 : 0; + SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? ~0 : 0; + return retVal; +} + + +FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) +{ + return OrSIMD( + AndSIMD( ReplacementMask, NewValue ), + AndNotSIMD( ReplacementMask, OldValue ) ); +} + +FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = flValue; + SubFloat( retVal, 1 ) = flValue; + SubFloat( retVal, 2 ) = flValue; + SubFloat( retVal, 3 ) = flValue; + return retVal; +} + +/// replicate a single 32 bit integer value to all 4 components of an m128 +FORCEINLINE fltx4 ReplicateIX4( int nValue ) +{ + fltx4 retVal; + SubInt( retVal, 0 ) = nValue; + SubInt( retVal, 1 ) = nValue; + SubInt( retVal, 2 ) = nValue; + SubInt( retVal, 3 ) = nValue; + return retVal; + +} + +// Round towards positive infinity +FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); + return retVal; + +} + +// Round towards negative infinity +FORCEINLINE 
fltx4 FloorSIMD( const fltx4 &a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) ); + return retVal; + +} + +FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON ); + SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON ); + SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON ); + SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? 
SubFloat( a, 3 ) : FLT_EPSILON ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 ); + SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 ); + SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 ); + SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 ); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 ); + SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 ); + SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 ); + SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 ); + return retVal; +} + +/// 1/x for all 4 values. +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 )); + SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 )); + SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 )); + SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 )); + return retVal; +} + +FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 )); + SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 )); + SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 )); + SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? 
FLT_EPSILON : SubFloat( a, 3 )); + return retVal; +} + +// 2^x for all values (the antilog) +FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) ); + SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) ); + SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) ); + SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) ); + + return retVal; +} + +FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) +{ + float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) + + SubFloat( a, 1 ) * SubFloat( b, 1 ) + + SubFloat( a, 2 ) * SubFloat( b, 2 ); + return ReplicateX4( flDot ); +} + +FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) +{ + float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) + + SubFloat( a, 1 ) * SubFloat( b, 1 ) + + SubFloat( a, 2 ) * SubFloat( b, 2 ) + + SubFloat( a, 3 ) * SubFloat( b, 3 ); + return ReplicateX4( flDot ); +} + +// Clamps the components of a vector to a specified minimum and maximum range. +FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) +{ + return MaxSIMD( min, MinSIMD( max, in ) ); +} + +// Squelch the w component of a vector to +0.0. 
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) +FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) +{ + fltx4 retval; + retval = a; + SubFloat( retval, 0 ) = 0; + return retval; +} + +FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD ) +{ + return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); +} + +FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) +{ + return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); +} + +FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD ) +{ + return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) +{ + fltx4 retval = LoadAlignedSIMD(pSIMD.Base()); + // squelch w + SubInt( retval, 3 ) = 0; + return retval; +} + +FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a ) +{ + *pSIMD = SubFloat(a, 0); + *(pSIMD+1) = SubFloat(a, 1); + *(pSIMD+2) = SubFloat(a, 2); +} + +// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD +FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) +{ + StoreAlignedSIMD(pSIMD->Base(),a); +} + +FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w ) +{ +#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; } + SWAP_FLOATS( x, 1, y, 0 ); + SWAP_FLOATS( x, 2, z, 0 ); + SWAP_FLOATS( x, 3, w, 0 ); + SWAP_FLOATS( y, 2, z, 1 ); + SWAP_FLOATS( y, 3, w, 1 ); + SWAP_FLOATS( z, 3, w, 2 ); +} + +// find the lowest component of a.x, a.y, a.z, +// and replicate it to the whole return 
value. +FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a ) +{ + float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2)); + return ReplicateX4(lowest); +} + +// find the highest component of a.x, a.y, a.z, +// and replicate it to the whole return value. +FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a ) +{ + float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2)); + return ReplicateX4(highest); +} + +// Fixed-point conversion and save as SIGNED INTS. +// pDest->x = Int (vSrc.x) +// note: some architectures have means of doing +// fixed point conversion when the fix depth is +// specified as an immediate.. but there is no way +// to guarantee an immediate as a parameter to function +// like this. +FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) +{ + (*pDest)[0] = SubFloat(vSrc, 0); + (*pDest)[1] = SubFloat(vSrc, 1); + (*pDest)[2] = SubFloat(vSrc, 2); + (*pDest)[3] = SubFloat(vSrc, 3); +} + +// ------------------------------------ +// INTEGER SIMD OPERATIONS. +// ------------------------------------ +// splat all components of a vector to a signed immediate int number. 
+FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue ) +{ + fltx4 retval; + SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue; + return retval; +} + +// Load 4 aligned words into a SIMD register +FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD) +{ + return *( reinterpret_cast< const i32x4 *> ( pSIMD ) ); +} + +// Load 4 unaligned words into a SIMD register +FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD) +{ + return *( reinterpret_cast< const i32x4 *> ( pSIMD ) ); +} + +// save into four words, 16-byte aligned +FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a; +} + +FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; +} + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) +{ + Assert(0); /* pc has no such operation */ + fltx4 retval; + SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); + SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); + SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); + SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); + return retval; +} + + +#if 0 /* pc has no such op */ +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. 
+FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) +{ + fltx4 retval; + SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[0])) ); + SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[1])) ); + SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[2])) ); + SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[3])) ); + return retval; +} + + +/* + works on fltx4's as if they are four uints. + the first parameter contains the words to be shifted, + the second contains the amount to shift by AS INTS + + for i = 0 to 3 + shift = vSrcB_i*32:(i*32)+4 + vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift +*/ +FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB) +{ + i32x4 retval; + SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0); + SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1); + SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2); + SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3); + + + return retval; +} +#endif + +#elif ( defined( _X360 ) ) + +//--------------------------------------------------------------------- +// X360 implementation +//--------------------------------------------------------------------- + +FORCEINLINE float & FloatSIMD( fltx4 & a, int idx ) +{ + fltx4_union & a_union = (fltx4_union &)a; + return a_union.m128_f32[idx]; +} + +FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx ) +{ + fltx4_union & a_union = (fltx4_union &)a; + return a_union.m128_u32[idx]; +} + +FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) +{ + return __vaddfp( a, b ); +} + +FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b +{ + return __vsubfp( a, b ); +} + +FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b +{ + return __vmulfp( a, b ); +} + +FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c +{ + return 
__vmaddfp( a, b, c ); +} + +FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b +{ + return __vnmsubfp( a, b, c ); +}; + +FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) +{ + return __vmsum3fp( a, b ); +} + +FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) +{ + return __vmsum4fp( a, b ); +} + +FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) +{ + return XMVectorSin( radians ); +} + +FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + XMVectorSinCos( &sine, &cosine, radians ); +} + +FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + XMVectorSinCos( &sine, &cosine, radians ); +} + +FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians ) +{ + cosine = XMVectorCos( radians ); +} + +FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) +{ + return XMVectorASin( sine ); +} + +FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) +{ + return XMVectorACos( cs ); +} + +// tan^1(a/b) .. 
ie, pass sin in as a and cos in as b +FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) +{ + return XMVectorATan2( a, b ); +} + +// DivSIMD defined further down, since it uses ReciprocalSIMD + +FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) +{ + return __vmaxfp( a, b ); +} + +FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b) +{ + return __vminfp( a, b ); +} + +FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b +{ + return __vand( a, b ); +} + +FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b +{ + // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second + return __vandc( b, a ); +} + +FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b +{ + return __vxor( a, b ); +} + +FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b +{ + return __vor( a, b ); +} + +FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a +{ + return XMVectorNegate(a); +} + +FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero? +{ + unsigned int equalFlags = 0; + __vcmpeqfpR( a, Four_Zeros, &equalFlags ); + return XMComparisonAllTrue( equalFlags ); +} + +FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero? +{ + unsigned int conditionregister; + XMVectorEqualR(&conditionregister, a, XMVectorZero()); + return XMComparisonAnyTrue(conditionregister); +} + +FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero? +{ + // copy a's x component into w, in case w was zero. 
+ fltx4 temp = __vrlimi(a, a, 1, 1); + unsigned int conditionregister; + XMVectorEqualR(&conditionregister, temp, XMVectorZero()); + return XMComparisonAnyTrue(conditionregister); +} + +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) +{ + unsigned int cr; + XMVectorGreaterR(&cr,a,b); + return XMComparisonAllTrue(cr); +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) +{ + unsigned int cr; + XMVectorGreaterOrEqualR(&cr,a,b); + return XMComparisonAllTrue(cr); +} + +// For branching if all a.xyzw == b.xyzw +FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) +{ + unsigned int cr; + XMVectorEqualR(&cr,a,b); + return XMComparisonAllTrue(cr); +} + + +FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set +{ + // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though) + int nRet = 0; + + const fltx4_union & a_union = (const fltx4_union &)a; + nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0 + nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1 + nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2 + nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3 + + return nRet; +} + +// Squelch the w component of a vector to +0.0. 
+// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) +FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) +{ + return __vrlimi( a, __vzero(), 1, 0 ); +} + +FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + // NOTE: this tests the top bits of each vector element using integer math + // (so it ignores NaNs - it will return true for "-NaN") + unsigned int equalFlags = 0; + fltx4 signMask = __vspltisw( -1 ); // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31) + signMask = __vslw( signMask, signMask ); // 0x80000000 0x80000000 0x80000000 0x80000000 + __vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags ); + return !XMComparisonAllTrue( equalFlags ); +} + +FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0 +{ + return __vcmpeqfp( a, b ); +} + + +FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0 +{ + return __vcmpgtfp( a, b ); +} + +FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0 +{ + return __vcmpgefp( a, b ); +} + +FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0 +{ + return __vcmpgtfp( b, a ); +} + +FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0 +{ + return __vcmpgefp( b, a ); +} + +FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0 +{ + return XMVectorInBounds( a, b ); +} + +// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue +FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) +{ + return __vsel( OldValue, NewValue, ReplacementMask ); +} + +// AKA "Broadcast", "Splat" +FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a +{ + // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
+ float * pValue = &flValue; + Assert( pValue ); + Assert( ((unsigned int)pValue & 3) == 0); + return __vspltw( __lvlx( pValue, 0 ), 0 ); +} + +FORCEINLINE fltx4 ReplicateX4( const float *pValue ) // a,a,a,a +{ + Assert( pValue ); + return __vspltw( __lvlx( pValue, 0 ), 0 ); +} + +/// replicate a single 32 bit integer value to all 4 components of an m128 +FORCEINLINE fltx4 ReplicateIX4( int nValue ) +{ + // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!) + int * pValue = &nValue; + Assert( pValue ); + Assert( ((unsigned int)pValue & 3) == 0); + return __vspltw( __lvlx( pValue, 0 ), 0 ); +} + +// Round towards positive infinity +FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) +{ + return __vrfip(a); +} + +// Round towards nearest integer +FORCEINLINE fltx4 RoundSIMD( const fltx4 &a ) +{ + return __vrfin(a); +} + +// Round towards negative infinity +FORCEINLINE fltx4 FloorSIMD( const fltx4 &a ) +{ + return __vrfim(a); +} + +FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less +{ + // This is emulated from rsqrt + return XMVectorSqrtEst( a ); +} + +FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a) +{ + // This is emulated from rsqrt + return XMVectorSqrt( a ); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less +{ + return __vrsqrtefp( a ); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) +{ + // Convert zeros to epsilons + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + return ReciprocalSqrtEstSIMD( a_safe ); +} + +FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) +{ + // This uses Newton-Raphson to improve the HW result + return XMVectorReciprocalSqrt( a ); +} + +FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less +{ + return __vrefp( a ); +} + +/// 1/x for all 4 values. 
uses reciprocal approximation instruction plus newton iteration. +/// No error checking! +FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a +{ + // This uses Newton-Raphson to improve the HW result + return XMVectorReciprocal( a ); +} + +// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?) +FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b +{ + return MulSIMD( ReciprocalSIMD( b ), a ); +} + +/// 1/x for all 4 values. +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) +{ + // Convert zeros to epsilons + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + return ReciprocalEstSIMD( a_safe ); +} + +FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) +{ + // Convert zeros to epsilons + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + return ReciprocalSIMD( a_safe ); + + // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does) + // fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a ); + // fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask ); + // return ReciprocalSIMD( a_safe ); +} + +// CHRISG: is it worth doing integer bitfiddling for this? +// 2^x for all values (the antilog) +FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) +{ + return XMVectorExp(toPower); +} + +// Clamps the components of a vector to a specified minimum and maximum range. +FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) +{ + return XMVectorClamp(in, min, max); +} + +FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD ) +{ + return XMLoadVector4( pSIMD ); +} + +// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec). 
+FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) +{ + return XMLoadVector3( pSIMD ); +} + +FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD ) +{ + return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) +{ + fltx4 out = XMLoadVector3A(pSIMD.Base()); + // squelch w + return __vrlimi( out, __vzero(), 1, 0 ); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD ) +{ + fltx4 out = XMLoadVector3A(pSIMD); + // squelch w + return __vrlimi( out, __vzero(), 1, 0 ); +} + +FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a ) +{ + XMStoreVector4( pSIMD, a ); +} + +FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a ) +{ + XMStoreVector3( pSIMD, a ); +} + + +// strongly typed -- for typechecking as we transition to SIMD +FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) +{ + XMStoreVector3A(pSIMD->Base(),a); +} + + +// Fixed-point conversion and save as SIGNED INTS. +// pDest->x = Int (vSrc.x) +// note: some architectures have means of doing +// fixed point conversion when the fix depth is +// specified as an immediate.. but there is no way +// to guarantee an immediate as a parameter to function +// like this. 
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
{
	// convert with a fixed-point shift of 0, then do an aligned 4-word store
	fltx4 asInt = __vctsxs( vSrc, 0 );
	XMStoreVector4A(pDest->Base(), asInt);
}

// In-place 4x4 transpose of the rows x, y, z, w via XMMatrixTranspose.
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
	XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
	xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
	x = xyzwMatrix.r[0];
	y = xyzwMatrix.r[1];
	z = xyzwMatrix.r[2];
	w = xyzwMatrix.r[3];
}

// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
	return XMVectorZero();
}

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
	return XMVectorSplatOne();
}

// Replicate a.x to all four components.
FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
{
	return XMVectorSplatX( a );
}

// Replicate a.y to all four components.
FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
	return XMVectorSplatY( a );
}

// Replicate a.z to all four components.
FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
	return XMVectorSplatZ( a );
}

// Replicate a.w to all four components.
FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
	return XMVectorSplatW( a );
}

// SetXSIMD..SetWSIMD: return a with one component taken from the second
// argument. The __vrlimi lane masks used are 8 = x, 4 = y, 2 = z, 1 = w.
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	fltx4 result = __vrlimi(a, x, 8, 0);
	return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	fltx4 result = __vrlimi(a, y, 4, 0);
	return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	fltx4 result = __vrlimi(a, z, 2, 0);
	return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	fltx4 result = __vrlimi(a, w, 1, 0);
	return result;
}

// Return a with component nComponent (0 = x .. 3 = w) replaced by flValue.
// nComponent indexes a 4-entry table and is not range-checked here.
// NOTE(review): the mask below is a runtime table lookup, while the other
// __vrlimi calls nearby pass a literal -- confirm the intrinsic accepts a
// non-immediate mask.
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
	// same lane masks as SetXSIMD..SetWSIMD, indexed by component
	static int s_nVrlimiMask[4] = { 8, 4, 2, 1 };
	fltx4 val = ReplicateX4( flValue );
	fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0);
	return result;
}

// Insert into all four lanes with a rotate count of 1. Going by the lane
// comments in FindLowestSIMD3 below, that yields [y,z,w,x].
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
}

// Same as RotateLeft but with a rotate count of 2.
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}



// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a ;
	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MinSIMD( a, compareOne );
	// retVal is [min(x,y), min(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8 , 2);
	// compareOne is [z, G, G, G]
	retval = MinSIMD( retval, compareOne );
	// retVal = [ min(min(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a ;
	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MaxSIMD( a, compareOne );
	// retVal is [max(x,y), max(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8 , 2);
	// compareOne is [z, G, G, G]
	retval = MaxSIMD( retval, compareOne );
	// retVal = [ max(max(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}


// Transform many (horizontal) points in-place by a 3x4 matrix,
// here already loaded onto three fltx4 registers.
// The points must be stored as 16-byte aligned. 
They are points +// and not vectors because we assume the w-component to be 1. +// To spare yourself the annoyance of loading the matrix yourself, +// use one of the overloads below. +void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3); + +// Transform many (horizontal) points in-place by a 3x4 matrix. +// The points must be stored as 16-byte aligned. They are points +// and not vectors because we assume the w-component to be 1. +// In this function, the matrix need not be aligned. +FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix) +{ + return TransformManyPointsBy(pVectors, numVectors, + LoadUnalignedSIMD( pMatrix[0] ), LoadUnalignedSIMD( pMatrix[1] ), LoadUnalignedSIMD( pMatrix[2] ) ); +} + +// Transform many (horizontal) points in-place by a 3x4 matrix. +// The points must be stored as 16-byte aligned. They are points +// and not vectors because we assume the w-component to be 1. +// In this function, the matrix must itself be aligned on a 16-byte +// boundary. +FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix) +{ + return TransformManyPointsBy(pVectors, numVectors, + LoadAlignedSIMD( pMatrix[0] ), LoadAlignedSIMD( pMatrix[1] ), LoadAlignedSIMD( pMatrix[2] ) ); +} + +// ------------------------------------ +// INTEGER SIMD OPERATIONS. 
+// ------------------------------------ + +// Load 4 aligned words into a SIMD register +FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD) +{ + return XMLoadVector4A(pSIMD); +} + +// Load 4 unaligned words into a SIMD register +FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD) +{ + return XMLoadVector4( pSIMD ); +} + +// save into four words, 16-byte aligned +FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; +} + +FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a ) +{ + *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a; +} + +FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a ) +{ + XMStoreVector4(pSIMD, a); +} + + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA ) +{ + return __vcfux( vSrcA, 0 ); +} + + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) +{ + return __vcfsx( vSrcA, 0 ); +} + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. Each uint +// will be divided by 2^immed after conversion +// (eg, this is fixed point math). +/* as if: + FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) + { + return __vcfux( vSrcA, uImmed ); + } +*/ +#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) )) + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. Each int +// will be divided by 2^immed (eg, this is fixed point +// math). 
+/* as if: + FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) + { + return __vcfsx( vSrcA, uImmed ); + } +*/ +#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) )) + +// set all components of a vector to a signed immediate int number. +/* as if: + FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate) + { + return __vspltisw( toImmediate ); + } +*/ +#define IntSetImmediateSIMD(x) (__vspltisw(x)) + +/* + works on fltx4's as if they are four uints. + the first parameter contains the words to be shifted, + the second contains the amount to shift by AS INTS + + for i = 0 to 3 + shift = vSrcB_i*32:(i*32)+4 + vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift +*/ +FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB) +{ + return __vslw(vSrcA, vSrcB); +} + +FORCEINLINE float SubFloat( const fltx4 & a, int idx ) +{ + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
+ const fltx4_union & a_union = (const fltx4_union &)a; + return a_union.m128_f32[ idx ]; +} + +FORCEINLINE float & SubFloat( fltx4 & a, int idx ) +{ + fltx4_union & a_union = (fltx4_union &)a; + return a_union.m128_f32[idx]; +} + +FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx ) +{ + fltx4 t = __vctuxs( a, 0 ); + const fltx4_union & a_union = (const fltx4_union &)t; + return a_union.m128_u32[idx]; +} + + +FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) +{ + const fltx4_union & a_union = (const fltx4_union &)a; + return a_union.m128_u32[idx]; +} + +FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) +{ + fltx4_union & a_union = (fltx4_union &)a; + return a_union.m128_u32[idx]; +} + +#else + +//--------------------------------------------------------------------- +// Intel/SSE implementation +//--------------------------------------------------------------------- + +FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_store_ps( pSIMD, a ); +} + +FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_storeu_ps( pSIMD, a ); +} + + +FORCEINLINE fltx4 RotateLeft( const fltx4 & a ); +FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ); + +FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a ) +{ + _mm_store_ss(pSIMD, a); + _mm_store_ss(pSIMD+1, RotateLeft(a)); + _mm_store_ss(pSIMD+2, RotateLeft2(a)); +} + +// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD +FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) +{ + StoreAlignedSIMD( pSIMD->Base(),a ); +} + +FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD ) +{ + return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) ); +} + +FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b +{ + return _mm_and_ps( a, b ); +} + +FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b +{ + return _mm_andnot_ps( a, b ); 
+} + +FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b +{ + return _mm_xor_ps( a, b ); +} + +FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b +{ + return _mm_or_ps( a, b ); +} + +// Squelch the w component of a vector to +0.0. +// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) +FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) +{ + return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) ); +} + +// for the transitional class -- load a 3-by VectorAligned and squash its w component +FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) +{ + return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) ); +} + +FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD ) +{ + return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) ); +} + +FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) +{ + return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) ); +} + +/// replicate a single 32 bit integer value to all 4 components of an m128 +FORCEINLINE fltx4 ReplicateIX4( int i ) +{ + fltx4 value = _mm_set_ss( * ( ( float *) &i ) );; + return _mm_shuffle_ps( value, value, 0); +} + + +FORCEINLINE fltx4 ReplicateX4( float flValue ) +{ + __m128 value = _mm_set_ss( flValue ); + return _mm_shuffle_ps( value, value, 0 ); +} + + +FORCEINLINE float SubFloat( const fltx4 & a, int idx ) +{ + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
+#ifndef POSIX + return a.m128_f32[ idx ]; +#else + return (reinterpret_cast<float const *>(&a))[idx]; +#endif +} + +FORCEINLINE float & SubFloat( fltx4 & a, int idx ) +{ +#ifndef POSIX + return a.m128_f32[ idx ]; +#else + return (reinterpret_cast<float *>(&a))[idx]; +#endif +} + +FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx ) +{ + return (uint32)SubFloat(a,idx); +} + +FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) +{ +#ifndef POSIX + return a.m128_u32[idx]; +#else + return (reinterpret_cast<uint32 const *>(&a))[idx]; +#endif +} + +FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) +{ +#ifndef POSIX + return a.m128_u32[idx]; +#else + return (reinterpret_cast<uint32 *>(&a))[idx]; +#endif +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadZeroSIMD( void ) +{ + return Four_Zeros; +} + +// Return one in the fastest way -- on the x360, faster even than loading. +FORCEINLINE fltx4 LoadOneSIMD( void ) +{ + return Four_Ones; +} + +FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) +{ + return OrSIMD( + AndSIMD( ReplacementMask, NewValue ), + AndNotSIMD( ReplacementMask, OldValue ) ); +} + +// remember, the SSE numbers its words 3 2 1 0 +// The way we want to specify shuffles is backwards from the default +// MM_SHUFFLE_REV is in array index order (default is reversed) +#define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a) + +FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) ); +} + +FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) ); +} + +FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) ); +} + +FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a ) +{ + return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 3, 3, 3 ) ); +} + +FORCEINLINE fltx4 SetXSIMD( const fltx4& a, 
const fltx4& x ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[0] ), x, a ); + return result; +} + +FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[1] ), y, a ); + return result; +} + +FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[2] ), z, a ); + return result; +} + +FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w ) +{ + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), w, a ); + return result; +} + +FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue ) +{ + fltx4 val = ReplicateX4( flValue ); + fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] ), val, a ); + return result; +} + +// a b c d -> b c d a +FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) ); +} + +// a b c d -> c d a b +FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) ); +} + +// a b c d -> d a b c +FORCEINLINE fltx4 RotateRight( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 0, 3, 2, 1) ); +} + +// a b c d -> c d a b +FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) +{ + return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 1, 0, 3, 2 ) ); +} + + +FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b +{ + return _mm_add_ps( a, b ); +}; + +FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b +{ + return _mm_sub_ps( a, b ); +}; + +FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b +{ + return _mm_mul_ps( a, b ); +}; + +FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b +{ + return _mm_div_ps( a, b ); +}; + +FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c +{ + return AddSIMD( 
MulSIMD(a,b), c ); +} + +FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b +{ + return SubSIMD( c, MulSIMD(a,b) ); +}; + +FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 m = MulSIMD( a, b ); + float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ); + return ReplicateX4( flDot ); +} + +FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 m = MulSIMD( a, b ); + float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ) + SubFloat( m, 3 ); + return ReplicateX4( flDot ); +} + +//TODO: implement as four-way Taylor series (see xbox implementation) +FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) +{ + fltx4 result; + SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); + SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); + SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); + SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); + return result; +} + +FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) +{ + // FIXME: Make a fast SSE version + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); +} + +FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) // a*b + c +{ + // FIXME: Make a fast SSE version + SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); + SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); + SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); + SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) ); +} + +//TODO: implement as four-way Taylor series (see xbox implementation) +FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) +{ + // FIXME: Make a fast SSE version + fltx4 result; + 
SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); + SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) ); + SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) ); + SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); + return result; +} + +FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) +{ + fltx4 result; + SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); + SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); + SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); + SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); + return result; +} + +// tan^1(a/b) .. ie, pass sin in as a and cos in as b +FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) +{ + fltx4 result; + SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); + SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); + SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); + SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); + return result; +} + +FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a +{ + return SubSIMD(LoadZeroSIMD(),a); +} + +FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set +{ + return _mm_movemask_ps( a ); +} + +FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) +{ + return (0 != TestSignSIMD( a )); +} + +FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0 +{ + return _mm_cmpeq_ps( a, b ); +} + +FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0 +{ + return _mm_cmpgt_ps( a, b ); +} + +FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0 +{ + return _mm_cmpge_ps( a, b ); +} + +FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0 +{ + return _mm_cmplt_ps( a, b ); +} + +FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? 
~0:0 +{ + return _mm_cmple_ps( a, b ); +} + +// for branching when a.xyzw > b.xyzw +FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) +{ + return TestSignSIMD( CmpLeSIMD( a, b ) ) == 0; +} + +// for branching when a.xyzw >= b.xyzw +FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) +{ + return TestSignSIMD( CmpLtSIMD( a, b ) ) == 0; +} + +// For branching if all a.xyzw == b.xyzw +FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) +{ + return TestSignSIMD( CmpEqSIMD( a, b ) ) == 0xf; +} + +FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0 +{ + return AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) ); +} + +FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b) +{ + return _mm_min_ps( a, b ); +} + +FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) +{ + return _mm_max_ps( a, b ); +} + + + +// SSE lacks rounding operations. +// Really. +// You can emulate them by setting the rounding mode for the +// whole processor and then converting to int, and then back again. +// But every time you set the rounding mode, you clear out the +// entire pipeline. So, I can't do them per operation. You +// have to do it once, before the loop that would call these. +// Round towards positive infinity +FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) +{ + fltx4 retVal; + SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); + SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); + SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); + SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); + return retVal; + +} + +fltx4 fabs( const fltx4 & x ); +// Round towards negative infinity +// This is the implementation that was here before; it assumes +// you are in round-to-floor mode, which I guess is usually the +// case for us vis-a-vis SSE. It's totally unnecessary on +// VMX, which has a native floor op. 
+FORCEINLINE fltx4 FloorSIMD( const fltx4 &val ) +{ + fltx4 fl4Abs = fabs( val ); + fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s ); + ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival ); + return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits +} + + + +inline bool IsAllZeros( const fltx4 & var ) +{ + return TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) ) == 0xF; +} + +FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less +{ + return _mm_sqrt_ps( a ); +} + +FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a) +{ + return _mm_sqrt_ps( a ); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less +{ + return _mm_rsqrt_ps( a ); +} + +FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + ret = ReciprocalSqrtEstSIMD( ret ); + return ret; +} + +/// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD +FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) +{ + fltx4 guess = ReciprocalSqrtEstSIMD( a ); + // newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2)); + guess = MulSIMD( guess, SubSIMD( Four_Threes, MulSIMD( a, MulSIMD( guess, guess )))); + guess = MulSIMD( Four_PointFives, guess); + return guess; +} + +FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less +{ + return _mm_rcp_ps( a ); +} + +/// 1/x for all 4 values, more or less +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) +{ + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + ret = ReciprocalEstSIMD( ret ); + return ret; +} + +/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. +/// No error checking! 
+FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a +{ + fltx4 ret = ReciprocalEstSIMD( a ); + // newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2 + ret = SubSIMD( AddSIMD( ret, ret ), MulSIMD( a, MulSIMD( ret, ret ) ) ); + return ret; +} + +/// 1/x for all 4 values. +/// 1/0 will result in a big but NOT infinite result +FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) +{ + fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); + fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); + ret = ReciprocalSIMD( ret ); + return ret; +} + +// CHRISG: is it worth doing integer bitfiddling for this? +// 2^x for all values (the antilog) +FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) +{ + fltx4 retval; + SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) ); + SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) ); + SubFloat( retval, 2 ) = powf( 2, SubFloat(toPower, 2) ); + SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) ); + + return retval; +} + +// Clamps the components of a vector to a specified minimum and maximum range. +FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) +{ + return MaxSIMD( min, MinSIMD( max, in ) ); +} + +FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w) +{ + _MM_TRANSPOSE4_PS( x, y, z, w ); +} + +FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a ) +{ + // a is [x,y,z,G] (where G is garbage) + // rotate left by one + fltx4 compareOne = RotateLeft( a ); + // compareOne is [y,z,G,x] + fltx4 retval = MinSIMD( a, compareOne ); + // retVal is [min(x,y), ... ] + compareOne = RotateLeft2( a ); + // compareOne is [z, G, x, y] + retval = MinSIMD( retval, compareOne ); + // retVal = [ min(min(x,y),z)..] 
+ // splat the x component out to the whole vector and return + return SplatXSIMD( retval ); + +} + +FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a ) +{ + // a is [x,y,z,G] (where G is garbage) + // rotate left by one + fltx4 compareOne = RotateLeft( a ); + // compareOne is [y,z,G,x] + fltx4 retval = MaxSIMD( a, compareOne ); + // retVal is [max(x,y), ... ] + compareOne = RotateLeft2( a ); + // compareOne is [z, G, x, y] + retval = MaxSIMD( retval, compareOne ); + // retVal = [ max(max(x,y),z)..] + // splat the x component out to the whole vector and return + return SplatXSIMD( retval ); + +} + +// ------------------------------------ +// INTEGER SIMD OPERATIONS. +// ------------------------------------ + + +#if 0 /* pc does not have these ops */ +// splat all components of a vector to a signed immediate int number. +FORCEINLINE fltx4 IntSetImmediateSIMD(int to) +{ + //CHRISG: SSE2 has this, but not SSE1. What to do? + fltx4 retval; + SubInt( retval, 0 ) = to; + SubInt( retval, 1 ) = to; + SubInt( retval, 2 ) = to; + SubInt( retval, 3 ) = to; + return retval; +} +#endif + +// Load 4 aligned words into a SIMD register +FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD) +{ + return _mm_load_ps( reinterpret_cast<const float *>(pSIMD) ); +} + +// Load 4 unaligned words into a SIMD register +FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD) +{ + return _mm_loadu_ps( reinterpret_cast<const float *>(pSIMD) ); +} + +// save into four words, 16-byte aligned +FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_store_ps( reinterpret_cast<float *>(pSIMD), a ); +} + +FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a ) +{ + _mm_store_ps( reinterpret_cast<float *>(pSIMD.Base()), a ); +} + +FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a ) +{ + _mm_storeu_ps( reinterpret_cast<float *>(pSIMD), a ); +} + + +// CHRISG: the conversion functions all seem to 
operate on m64's only... +// how do we make them work here? + +// Take a fltx4 containing fixed-point uints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA ) +{ + fltx4 retval; + SubFloat( retval, 0 ) = ( (float) SubInt( retval, 0 ) ); + SubFloat( retval, 1 ) = ( (float) SubInt( retval, 1 ) ); + SubFloat( retval, 2 ) = ( (float) SubInt( retval, 2 ) ); + SubFloat( retval, 3 ) = ( (float) SubInt( retval, 3 ) ); + return retval; +} + + +// Take a fltx4 containing fixed-point sints and +// return them as single precision floats. No +// fixed point conversion is done. +FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) +{ + fltx4 retval; + SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[0])); + SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[1])); + SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[2])); + SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[3])); + return retval; +} + +/* + works on fltx4's as if they are four uints. + the first parameter contains the words to be shifted, + the second contains the amount to shift by AS INTS + + for i = 0 to 3 + shift = vSrcB_i*32:(i*32)+4 + vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift +*/ +FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB) +{ + i32x4 retval; + SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0); + SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1); + SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2); + SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3); + + + return retval; +} + + +// Fixed-point conversion and save as SIGNED INTS. +// pDest->x = Int (vSrc.x) +// note: some architectures have means of doing +// fixed point conversion when the fix depth is +// specified as an immediate.. 
but there is no way +// to guarantee an immediate as a parameter to function +// like this. +FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) +{ +#if defined( COMPILER_MSVC64 ) + + (*pDest)[0] = SubFloat( vSrc, 0 ); + (*pDest)[1] = SubFloat( vSrc, 1 ); + (*pDest)[2] = SubFloat( vSrc, 2 ); + (*pDest)[3] = SubFloat( vSrc, 3 ); + +#else + __m64 bottom = _mm_cvttps_pi32( vSrc ); + __m64 top = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) ); + + *reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom; + *reinterpret_cast<__m64 *>(&(*pDest)[2]) = top; + + _mm_empty(); +#endif +} + + + +#endif + + + +/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are +/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. +class ALIGN16 FourVectors +{ +public: + fltx4 x, y, z; + + FORCEINLINE void DuplicateVector(Vector const &v) //< set all 4 vectors to the same vector value + { + x=ReplicateX4(v.x); + y=ReplicateX4(v.y); + z=ReplicateX4(v.z); + } + + FORCEINLINE fltx4 const & operator[](int idx) const + { + return *((&x)+idx); + } + + FORCEINLINE fltx4 & operator[](int idx) + { + return *((&x)+idx); + } + + FORCEINLINE void operator+=(FourVectors const &b) //< add 4 vectors to another 4 vectors + { + x=AddSIMD(x,b.x); + y=AddSIMD(y,b.y); + z=AddSIMD(z,b.z); + } + + FORCEINLINE void operator-=(FourVectors const &b) //< subtract 4 vectors from another 4 + { + x=SubSIMD(x,b.x); + y=SubSIMD(y,b.y); + z=SubSIMD(z,b.z); + } + + FORCEINLINE void operator*=(FourVectors const &b) //< scale all four vectors per component scale + { + x=MulSIMD(x,b.x); + y=MulSIMD(y,b.y); + z=MulSIMD(z,b.z); + } + + FORCEINLINE void operator*=(const fltx4 & scale) //< scale + { + x=MulSIMD(x,scale); + y=MulSIMD(y,scale); + z=MulSIMD(z,scale); + } + + FORCEINLINE void operator*=(float scale) //< uniformly scale all 4 vectors + { + fltx4 scalepacked = ReplicateX4(scale); + *this *= scalepacked; + } + + 
FORCEINLINE fltx4 operator*(FourVectors const &b) const //< 4 dot products + { + fltx4 dot=MulSIMD(x,b.x); + dot=MaddSIMD(y,b.y,dot); + dot=MaddSIMD(z,b.z,dot); + return dot; + } + + FORCEINLINE fltx4 operator*(Vector const &b) const //< dot product all 4 vectors with 1 vector + { + fltx4 dot=MulSIMD(x,ReplicateX4(b.x)); + dot=MaddSIMD(y,ReplicateX4(b.y), dot); + dot=MaddSIMD(z,ReplicateX4(b.z), dot); + return dot; + } + + FORCEINLINE void VProduct(FourVectors const &b) //< component by component mul + { + x=MulSIMD(x,b.x); + y=MulSIMD(y,b.y); + z=MulSIMD(z,b.z); + } + FORCEINLINE void MakeReciprocal(void) //< (x,y,z)=(1/x,1/y,1/z) + { + x=ReciprocalSIMD(x); + y=ReciprocalSIMD(y); + z=ReciprocalSIMD(z); + } + + FORCEINLINE void MakeReciprocalSaturate(void) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23 + { + x=ReciprocalSaturateSIMD(x); + y=ReciprocalSaturateSIMD(y); + z=ReciprocalSaturateSIMD(z); + } + + // Assume the given matrix is a rotation, and rotate these vectors by it. + // If you have a long list of FourVectors structures that you all want + // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. + inline void RotateBy(const matrix3x4_t& matrix); + + /// You can use this to rotate a long array of FourVectors all by the same + /// matrix. The first parameter is the head of the array. The second is the + /// number of vectors to rotate. The third is the matrix. + static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); + + /// Assume the vectors are points, and transform them in place by the matrix. + inline void TransformBy(const matrix3x4_t& matrix); + + /// You can use this to Transform a long array of FourVectors all by the same + /// matrix. The first parameter is the head of the array. The second is the + /// number of vectors to rotate. The third is the matrix. The fourth is the + /// output buffer, which must not overlap the pVectors buffer. 
This is not + /// an in-place transformation. + static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ); + + /// You can use this to Transform a long array of FourVectors all by the same + /// matrix. The first parameter is the head of the array. The second is the + /// number of vectors to rotate. The third is the matrix. The fourth is the + /// output buffer, which must not overlap the pVectors buffer. + /// This is an in-place transformation. + static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); + + // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector. + FORCEINLINE const float & X(int idx) const + { + // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) + return SubFloat( (fltx4 &)x, idx ); + } + + FORCEINLINE const float & Y(int idx) const + { + return SubFloat( (fltx4 &)y, idx ); + } + + FORCEINLINE const float & Z(int idx) const + { + return SubFloat( (fltx4 &)z, idx ); + } + + FORCEINLINE float & X(int idx) + { + return SubFloat( x, idx ); + } + + FORCEINLINE float & Y(int idx) + { + return SubFloat( y, idx ); + } + + FORCEINLINE float & Z(int idx) + { + return SubFloat( z, idx ); + } + + FORCEINLINE Vector Vec(int idx) const //< unpack one of the vectors + { + return Vector( X(idx), Y(idx), Z(idx) ); + } + + FourVectors(void) + { + } + + FourVectors( FourVectors const &src ) + { + x=src.x; + y=src.y; + z=src.z; + } + + FORCEINLINE void operator=( FourVectors const &src ) + { + x=src.x; + y=src.y; + z=src.z; + } + + /// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op + FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d) + { + // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360 + // use an unfolded implementation here 
+#if _X360 + fltx4 tx = LoadUnalignedSIMD( &a.x ); + fltx4 ty = LoadUnalignedSIMD( &b.x ); + fltx4 tz = LoadUnalignedSIMD( &c.x ); + fltx4 tw = LoadUnalignedSIMD( &d.x ); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); +#else + x = LoadUnalignedSIMD( &( a.x )); + y = LoadUnalignedSIMD( &( b.x )); + z = LoadUnalignedSIMD( &( c.x )); + fltx4 w = LoadUnalignedSIMD( &( d.x )); + // now, matrix is: + // x y z ? + // x y z ? + // x y z ? + // x y z ? + TransposeSIMD(x, y, z, w); +#endif + } + + /// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op. + /// all 4 vectors must be 128 bit boundary + FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d) + { +#if _X360 + fltx4 tx = LoadAlignedSIMD(a); + fltx4 ty = LoadAlignedSIMD(b); + fltx4 tz = LoadAlignedSIMD(c); + fltx4 tw = LoadAlignedSIMD(d); + fltx4 r0 = __vmrghw(tx, tz); + fltx4 r1 = __vmrghw(ty, tw); + fltx4 r2 = __vmrglw(tx, tz); + fltx4 r3 = __vmrglw(ty, tw); + + x = __vmrghw(r0, r1); + y = __vmrglw(r0, r1); + z = __vmrghw(r2, r3); +#else + x = LoadAlignedSIMD( a ); + y = LoadAlignedSIMD( b ); + z = LoadAlignedSIMD( c ); + fltx4 w = LoadAlignedSIMD( d ); + // now, matrix is: + // x y z ? + // x y z ? + // x y z ? + // x y z ? + TransposeSIMD( x, y, z, w ); +#endif + } + + FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d) + { + LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x ); + } + + /// return the squared length of all 4 vectors + FORCEINLINE fltx4 length2(void) const + { + return (*this)*(*this); + } + + /// return the approximate length of all 4 vectors. 
uses the sqrt approximation instruction + FORCEINLINE fltx4 length(void) const + { + return SqrtEstSIMD(length2()); + } + + /// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction) + FORCEINLINE void VectorNormalizeFast(void) + { + fltx4 mag_sq=(*this)*(*this); // length^2 + (*this) *= ReciprocalSqrtEstSIMD(mag_sq); // *(1.0/sqrt(length^2)) + } + + /// normalize all 4 vectors in place. + FORCEINLINE void VectorNormalize(void) + { + fltx4 mag_sq=(*this)*(*this); // length^2 + (*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2)) + } + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d) + { + LoadAndSwizzle(a,b,c,d); + } + + /// construct a FourVectors from 4 separate Vectors + FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d) + { + LoadAndSwizzleAligned(a,b,c,d); + } + + FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt ) + { + fltx4 fl4dX = SubSIMD( pnt.x, x ); + fltx4 fl4dY = SubSIMD( pnt.y, y ); + fltx4 fl4dZ = SubSIMD( pnt.z, z ); + return AddSIMD( MulSIMD( fl4dX, fl4dX), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) ); + + } + + FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const + { + FourVectors lineDelta = p1; + lineDelta -= p0; + fltx4 OOlineDirDotlineDir = ReciprocalSIMD( p1 * p1 ); + FourVectors v4OurPnt = *this; + v4OurPnt -= p0; + return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta ); + } + + FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const + { + FourVectors lineDelta = p1; + FourVectors v4OurPnt = *this; + v4OurPnt -= p0; + lineDelta -= p0; + + fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta ); + + fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta ); + + fl4T = MinSIMD( fl4T, Four_Ones ); + fl4T 
= MaxSIMD( fl4T, Four_Zeros ); + lineDelta *= fl4T; + return v4OurPnt.DistToSqr( lineDelta ); + } + +}; + +/// form 4 cross products +inline FourVectors operator ^(const FourVectors &a, const FourVectors &b) +{ + FourVectors ret; + ret.x=SubSIMD(MulSIMD(a.y,b.z),MulSIMD(a.z,b.y)); + ret.y=SubSIMD(MulSIMD(a.z,b.x),MulSIMD(a.x,b.z)); + ret.z=SubSIMD(MulSIMD(a.x,b.y),MulSIMD(a.y,b.x)); + return ret; +} + +/// component-by-componentwise MAX operator +inline FourVectors maximum(const FourVectors &a, const FourVectors &b) +{ + FourVectors ret; + ret.x=MaxSIMD(a.x,b.x); + ret.y=MaxSIMD(a.y,b.y); + ret.z=MaxSIMD(a.z,b.z); + return ret; +} + +/// component-by-componentwise MIN operator +inline FourVectors minimum(const FourVectors &a, const FourVectors &b) +{ + FourVectors ret; + ret.x=MinSIMD(a.x,b.x); + ret.y=MinSIMD(a.y,b.y); + ret.z=MinSIMD(a.z,b.z); + return ret; +} + +/// calculate reflection vector. incident and normal dir assumed normalized +FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal ) +{ + FourVectors ret = incident; + fltx4 iDotNx2 = incident * normal; + iDotNx2 = AddSIMD( iDotNx2, iDotNx2 ); + FourVectors nPart = normal; + nPart *= iDotNx2; + ret -= nPart; // i-2(n*i)n + return ret; +} + +/// calculate slide vector. removes all components of a vector which are perpendicular to a normal vector. +FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal ) +{ + FourVectors ret = incident; + fltx4 iDotN = incident * normal; + FourVectors nPart = normal; + nPart *= iDotN; + ret -= nPart; // i-(n*i)n + return ret; +} + + +// Assume the given matrix is a rotation, and rotate these vectors by it. +// If you have a long list of FourVectors structures that you all want +// to rotate by the same matrix, use FourVectors::RotateManyBy() instead. +void FourVectors::RotateBy(const matrix3x4_t& matrix) +{ + // Splat out each of the entries in the matrix to a fltx4. 
Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. + fltx4 matSplat00, matSplat01, matSplat02, + matSplat10, matSplat11, matSplat12, + matSplat20, matSplat21, matSplat22; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] ); + fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] ); + fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] ); + + matSplat00 = SplatXSIMD( matCol0 ); + matSplat01 = SplatYSIMD( matCol0 ); + matSplat02 = SplatZSIMD( matCol0 ); + + matSplat10 = SplatXSIMD( matCol1 ); + matSplat11 = SplatYSIMD( matCol1 ); + matSplat12 = SplatZSIMD( matCol1 ); + + matSplat20 = SplatXSIMD( matCol2 ); + matSplat21 = SplatYSIMD( matCol2 ); + matSplat22 = SplatZSIMD( matCol2 ); + } + + // Trust in the compiler to schedule these operations correctly: + fltx4 outX, outY, outZ; + outX = AddSIMD( AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ), MulSIMD( z, matSplat02 ) ); + outY = AddSIMD( AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ), MulSIMD( z, matSplat12 ) ); + outZ = AddSIMD( AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ), MulSIMD( z, matSplat22 ) ); + + x = outX; + y = outY; + z = outZ; +} + +// Assume the given matrix is a rotation, and rotate these vectors by it. +// If you have a long list of FourVectors structures that you all want +// to rotate by the same matrix, use FourVectors::RotateManyBy() instead. +void FourVectors::TransformBy(const matrix3x4_t& matrix) +{ + // Splat out each of the entries in the matrix to a fltx4. Do this + // in the order that we will need them, to hide latency. I'm + // avoiding making an array of them, so that they'll remain in + // registers. 
+ fltx4 matSplat00, matSplat01, matSplat02, + matSplat10, matSplat11, matSplat12, + matSplat20, matSplat21, matSplat22; + + { + // Load the matrix into local vectors. Sadly, matrix3x4_ts are + // often unaligned. The w components will be the tranpose row of + // the matrix, but we don't really care about that. + fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] ); + fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] ); + fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] ); + + matSplat00 = SplatXSIMD( matCol0 ); + matSplat01 = SplatYSIMD( matCol0 ); + matSplat02 = SplatZSIMD( matCol0 ); + + matSplat10 = SplatXSIMD( matCol1 ); + matSplat11 = SplatYSIMD( matCol1 ); + matSplat12 = SplatZSIMD( matCol1 ); + + matSplat20 = SplatXSIMD( matCol2 ); + matSplat21 = SplatYSIMD( matCol2 ); + matSplat22 = SplatZSIMD( matCol2 ); + } + + // Trust in the compiler to schedule these operations correctly: + fltx4 outX, outY, outZ; + + outX = MaddSIMD( z, matSplat02, AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ) ); + outY = MaddSIMD( z, matSplat12, AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ) ); + outZ = MaddSIMD( z, matSplat22, AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ) ); + + x = AddSIMD( outX, ReplicateX4( matrix[0][3] )); + y = AddSIMD( outY, ReplicateX4( matrix[1][3] )); + z = AddSIMD( outZ, ReplicateX4( matrix[2][3] )); +} + + + +/// quick, low quality perlin-style noise() function suitable for real time use. +/// return value is -1..1. Only reliable around +/- 1 million or so. +fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z ); +fltx4 NoiseSIMD( FourVectors const &v ); + +// vector valued noise direction +FourVectors DNoiseSIMD( FourVectors const &v ); + +// vector value "curl" noise function. 
see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html +FourVectors CurlNoiseSIMD( FourVectors const &v ); + + +/// calculate the absolute value of a packed single +inline fltx4 fabs( const fltx4 & x ) +{ + return AndSIMD( x, LoadAlignedSIMD( g_SIMD_clear_signmask ) ); +} + +/// negate all four components of a SIMD packed single +inline fltx4 fnegate( const fltx4 & x ) +{ + return XorSIMD( x, LoadAlignedSIMD( g_SIMD_signmask ) ); +} + + +fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent); + +// PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some +// restictions: fractional exponents are only handled with 2 bits of precision. Basically, +// fractions of 0,.25,.5, and .75 are handled. PowSIMD(x,.30) will be the same as PowSIMD(x,.25). +// negative and fractional powers are handled by the SIMD reciprocal and square root approximation +// instructions and so are not especially accurate ----Note that this routine does not raise +// numeric exceptions because it uses SIMD--- This routine is O(log2(exponent)). +inline fltx4 PowSIMD( const fltx4 & x, float exponent ) +{ + return Pow_FixedPoint_Exponent_SIMD(x,(int) (4.0*exponent)); +} + + + +// random number generation - generate 4 random numbers quickly. 
// SIMD random number interface (implemented elsewhere).
void SeedRandSIMD(uint32 seed);								// seed the random # generator
fltx4 RandSIMD( int nContext = 0 );							// return 4 numbers in the 0..1 range

// for multithreaded, you need to use these and use the argument form of RandSIMD:
int GetSIMDRandContext( void );
void ReleaseSIMDRandContext( int nContext );

// Remaps RandSIMD()'s 0..1 output to -1..1 (2*r - 1). Uses the default context (0).
FORCEINLINE fltx4 RandSignedSIMD( void )					// -1..1
{
	return SubSIMD( MulSIMD( Four_Twos, RandSIMD() ), Four_Ones );
}


// SIMD versions of mathlib simplespline functions
// hermite basis function for smooth interpolation
// Similar to Gain() above, but very cheap to call
// value should be between 0 & 1 inclusive
// Computes 3*v^2 - 2*v^3 per lane.
inline fltx4 SimpleSpline( const fltx4 & value )
{
	// Arranged to avoid a data dependency between these two MULs:
	fltx4 valueDoubled = MulSIMD( value, Four_Twos );
	fltx4 valueSquared = MulSIMD( value, value );

	// Nice little ease-in, ease-out spline-like curve
	return SubSIMD(
		MulSIMD( Four_Threes, valueSquared ),
		MulSIMD( valueDoubled, valueSquared ) );
}

// remaps a value in [startInterval, startInterval+rangeInterval] from linear to
// spline using SimpleSpline
// Returns C + DMinusC * SimpleSpline( (val - A) * OneOverBMinusA ).
// BMinusA is accepted but not read by the body; the caller supplies its reciprocal.
inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val,
											 const fltx4 & A, const fltx4 & BMinusA,
											 const fltx4 & OneOverBMinusA, const fltx4 & C,
											 const fltx4 & DMinusC )
{
//	if ( A == B )
//		return val >= B ? D : C;
	fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
	return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) );
}

// Same as SimpleSplineRemapValWithDeltas, but the normalized parameter is clamped
// to [0,1] before the spline is applied.
inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val,
													const fltx4 & A, const fltx4 & BMinusA,
													const fltx4 & OneOverBMinusA, const fltx4 & C,
													const fltx4 & DMinusC )
{
//	if ( A == B )
//		return val >= B ? D : C;
	fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
	cVal = MinSIMD( Four_Ones, MaxSIMD( Four_Zeros, cVal ) );
	return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) );
}

// Fractional part of each lane, keeping the sign of the input.
// Works on |val|: adding and then subtracting 2^23 rounds to an integer, which is
// stepped down by one wherever the rounding went above |val|.
FORCEINLINE fltx4 FracSIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
	// val ^ |val| is the original sign bit (fabs only clears the sign)
	return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) );			// restore sign bits
}

// Like FracSIMD, but the integer part is masked with g_SIMD_lsbmask and stepped by two,
// so the remainder is taken modulo 2 rather than 1.
// NOTE(review): presumably the mask clears the lowest bit so ival is even - confirm
// against the definition of g_SIMD_lsbmask.
FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( (float *) g_SIMD_lsbmask ), AddSIMD( fl4Abs, Four_2ToThe23s ) ), Four_2ToThe23s );
	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Twos ), ival );
	return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) );			// restore sign bits
}

// Mod2SIMD without the fabs / sign-restore steps; the caller guarantees val >= 0.
FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val )
{
	fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), AddSIMD( val, Four_2ToThe23s ) ), Four_2ToThe23s );
	ival = MaskedAssign( CmpGtSIMD( ival, val ), SubSIMD( ival, Four_Twos ), ival );
	return SubSIMD( val, ival );
}


// approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi.
// no range reduction is done - for values outside of 0..1 you won't like the results
FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val )
{
	// really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between.
	// sufficient for simple oscillation.
	return MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
}

// Better approximation over the same 0..1 input range as _SinEst01SIMD (no range reduction).
FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val )
{
	// not a bad approximation : parabola always over-estimates. Squared parabola always
	// underestimates. So lets blend between them: goodsin = badsin + .225*( badsin^2-badsin)
	fltx4 fl4BadEst = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
	return AddSIMD( MulSIMD( Four_Point225s, SubSIMD( MulSIMD( fl4BadEst, fl4BadEst ), fl4BadEst ) ), fl4BadEst );
}

// full range useable implementations
// Reduces |val| modulo 2, folds the second half-period (reduced >= 1) back onto 0..1,
// evaluates the 0..1 estimate, then flips the sign when exactly one of
// "input was negative" / "second half-period" holds.
FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs );
	fltx4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones );
	fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) );
	fltx4 fl4Sin = _SinEst01SIMD( fl4val );
	fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) );
	return fl4Sin;

}

// Same range handling as SinEst01SIMD, using the more accurate _Sin01SIMD core.
FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs );
	fltx4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones );
	fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) );
	fltx4 fl4Sin = _Sin01SIMD( fl4val );
	fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) );
	return fl4Sin;

}

// Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1)

// Returns (1/bias_parameter) - 2, the constant factor in the formula above.
FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter )
{
	// convert perlin-style-bias parameter to the value right for the approximation
	return SubSIMD( ReciprocalSIMD( bias_parameter ), Four_Twos );
}

// Returns val / ( precalc_param * (1 - val) + 1 ).
FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param )
{
	// similar to bias function except pass precalced bias value from calling PreCalcBiasParameter.

	//!!speed!! use reciprocal est?
	//!!speed!! could save one op by precalcing _2_ values
	return DivSIMD( val, AddSIMD( MulSIMD( precalc_param, SubSIMD( Four_Ones, val ) ), Four_Ones ) );
}

//-----------------------------------------------------------------------------
// Box/plane test
// NOTE: The w component of emins + emaxs must be 1 for this to work
//
// The plane is packed as (normal.xyz, -dist), so a 4-wide dot with a corner whose
// w is 1 gives normal.corner - dist.
// Return value: 1 is added if the corner furthest along the normal is at or beyond
// +tolerance, 2 is added if the opposite corner is below -tolerance (so 3 = both).
//-----------------------------------------------------------------------------
FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f )
{
	fltx4 corners[2];
	fltx4 normal = LoadUnalignedSIMD( p->normal.Base() );
	fltx4 dist = ReplicateX4( -p->dist );
	normal = SetWSIMD( normal, dist );
	fltx4 t4 = ReplicateX4( tolerance );
	fltx4 negt4 = ReplicateX4( -tolerance );
	// per component, pick the box extent that maximizes (corners[0]) or minimizes (corners[1]) the dot
	fltx4 cmp = CmpGeSIMD( normal, Four_Zeros );
	corners[0] = MaskedAssign( cmp, emaxs, emins );
	corners[1] = MaskedAssign( cmp, emins, emaxs );
	fltx4 dot1 = Dot4SIMD( normal, corners[0] );
	fltx4 dot2 = Dot4SIMD( normal, corners[1] );
	cmp = CmpGeSIMD( dot1, t4 );
	fltx4 cmp2 = CmpGtSIMD( negt4, dot2 );
	fltx4 result = MaskedAssign( cmp, Four_Ones, Four_Zeros );
	fltx4 result2 = MaskedAssign( cmp2, Four_Twos, Four_Zeros );
	result = AddSIMD( result, result2 );
	intx4 sides;
	ConvertStoreAsIntsSIMD( &sides, result );
	return sides[0];
}

#endif // _ssemath_h
diff --git a/public/mathlib/ssequaternion.h b/public/mathlib/ssequaternion.h
new file mode 100644
index 0000000..825a9e4
--- /dev/null
+++ b/public/mathlib/ssequaternion.h
@@ -0,0 +1,367 @@
//========= Copyright Valve Corporation, All rights reserved. ============//
//
// Purpose: - defines SIMD "structure of arrays" classes and functions.
//
//===========================================================================//
#ifndef SSEQUATMATH_H
#define SSEQUATMATH_H

#ifdef _WIN32
#pragma once
#endif


#include "mathlib/ssemath.h"

// Use this #define to allow SSE versions of Quaternion math
// to exist on PC.
// On PC, certain horizontal vector operations are not supported.
// This causes the SSE implementation of quaternion math to mix the
// vector and scalar floating point units, which is extremely
// performance negative if you don't compile to native SSE2 (which
// we don't as of Sept 1, 2007). So, it's best not to allow these
// functions to exist at all. It's not good enough to simply replace
// the contents of the functions with scalar math, because each call
// to LoadAligned and StoreAligned will result in an unnecessary copy
// of the quaternion, and several moves to and from the XMM registers.
//
// Basically, the problem you run into is that for efficient SIMD code,
// you need to load the quaternions and vectors into SIMD registers and
// keep them there as long as possible while doing only SIMD math,
// whereas for efficient scalar code, each time you copy onto or ever
// use a fltx4, it hoses your pipeline. So the difference has to be
// in the management of temporary variables in the calling function,
// not inside the math functions.
//
// If you compile assuming the presence of SSE2, the MSVC will abandon
// the traditional x87 FPU operations altogether and make everything use
// the SSE2 registers, which lessens this problem a little.

// permitted only on 360, as we've done careful tuning on its Altivec math:
// (when the macro is left undefined, the "#if ALLOW_SIMD_QUATERNION_MATH" tests
// below evaluate it as 0 and compile the quaternion SIMD code out)
#ifdef _X360
#define ALLOW_SIMD_QUATERNION_MATH 1  // not on PC!
+#endif + + + +//--------------------------------------------------------------------- +// Load/store quaternions +//--------------------------------------------------------------------- +#ifndef _X360 +#if ALLOW_SIMD_QUATERNION_MATH +// Using STDC or SSE +FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) +{ + fltx4 retval = LoadAlignedSIMD( pSIMD.Base() ); + return retval; +} + +FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) +{ + fltx4 retval = LoadAlignedSIMD( pSIMD ); + return retval; +} + +FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a ) +{ + StoreAlignedSIMD( pSIMD->Base(), a ); +} +#endif +#else + +// for the transitional class -- load a QuaternionAligned +FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD ) +{ + fltx4 retval = XMLoadVector4A( pSIMD.Base() ); + return retval; +} + +FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD ) +{ + fltx4 retval = XMLoadVector4A( pSIMD ); + return retval; +} + +FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a ) +{ + XMStoreVector4A( pSIMD->Base(), a ); +} + +#endif + + +#if ALLOW_SIMD_QUATERNION_MATH +//--------------------------------------------------------------------- +// Make sure quaternions are within 180 degrees of one another, if not, reverse q +//--------------------------------------------------------------------- +FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q ) +{ + // decide if one of the quaternions is backwards + fltx4 a = SubSIMD( p, q ); + fltx4 b = AddSIMD( p, q ); + a = Dot4SIMD( a, a ); + b = Dot4SIMD( b, b ); + fltx4 cmp = CmpGtSIMD( a, b ); + fltx4 result = MaskedAssign( cmp, NegSIMD(q), q ); + return result; +} + +//--------------------------------------------------------------------- +// Normalize Quaternion +//--------------------------------------------------------------------- +#if USE_STDC_FOR_SIMD + 
// Scalar (STDC) implementation: scales q by 1/sqrt(q.q). A zero-length q is returned unchanged.
FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
{
	fltx4 radius, result;
	radius = Dot4SIMD( q, q );

	if ( SubFloat( radius, 0 ) ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON))
	{
		float iradius = 1.0f / sqrt( SubFloat( radius, 0 ) );
		result = ReplicateX4( iradius );
		result = MulSIMD( result, q );
		return result;
	}
	return q;
}

#else

// SSE + X360 implementation
// Branch-free: computes q * rsqrt(q.q), then selects the original q where q.q == 0.
FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
{
	fltx4 radius, result, mask;
	radius = Dot4SIMD( q, q );
	mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0
	result = ReciprocalSqrtSIMD( radius );
	result = MulSIMD( result, q );
	return MaskedAssign( mask, q, result );	// if radius was 0, just return q
}

#endif


//---------------------------------------------------------------------
// 0.0 returns p, 1.0 return q.
// Normalized linear blend: normalize( (1-t)*p + t*q ). Does not align q to p first.
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	fltx4 sclp, sclq, result;
	sclq = ReplicateX4( t );
	sclp = SubSIMD( Four_Ones, sclq );
	result = MulSIMD( sclp, p );
	result = MaddSIMD( sclq, q, result );
	return QuaternionNormalizeSIMD( result );
}


//---------------------------------------------------------------------
// Blend Quaternions
// Same as QuaternionBlendNoAlignSIMD, after passing q through QuaternionAlignSIMD( p, q ).
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	// decide if one of the quaternions is backwards
	fltx4 q2, result;
	q2 = QuaternionAlignSIMD( p, q );
	result = QuaternionBlendNoAlignSIMD( p, q2, t );
	return result;
}


//---------------------------------------------------------------------
// Multiply Quaternions
// Components are stored (x, y, z, w) in lanes 0..3. q is aligned to p before multiplying.
//---------------------------------------------------------------------
#ifndef _X360

// SSE and STDC
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
{
	// decide if one of the quaternions is backwards
	fltx4 q2, result;
	q2 = QuaternionAlignSIMD( p, q );
	SubFloat( result, 0 ) =  SubFloat( p, 0 ) * SubFloat( q2, 3 ) + SubFloat( p, 1 ) * SubFloat( q2, 2 ) - SubFloat( p, 2 ) * SubFloat( q2, 1 ) + SubFloat( p, 3 ) * SubFloat( q2, 0 );
	SubFloat( result, 1 ) = -SubFloat( p, 0 ) * SubFloat( q2, 2 ) + SubFloat( p, 1 ) * SubFloat( q2, 3 ) + SubFloat( p, 2 ) * SubFloat( q2, 0 ) + SubFloat( p, 3 ) * SubFloat( q2, 1 );
	SubFloat( result, 2 ) =  SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 );
	SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 );
	return result;
}

#else

// X360
// Each output component is a 4-wide dot of p with a swizzled, sign-adjusted copy of q2;
// the per-row signs come from g_QuatMultRowSign (defined elsewhere).
extern const fltx4 g_QuatMultRowSign[4];
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
{
	fltx4 q2, row, result;
	q2 = QuaternionAlignSIMD( p, q );

	row = XMVectorSwizzle( q2, 3, 2, 1, 0 );
	row = MulSIMD( row, g_QuatMultRowSign[0] );
	result = Dot4SIMD( row, p );

	row = XMVectorSwizzle( q2, 2, 3, 0, 1 );
	row = MulSIMD( row, g_QuatMultRowSign[1] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 4, 0 );

	row = XMVectorSwizzle( q2, 1, 0, 3, 2 );
	row = MulSIMD( row, g_QuatMultRowSign[2] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 2, 0 );

	row = MulSIMD( q2, g_QuatMultRowSign[3] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 1, 0 );
	return result;
}

#endif


//---------------------------------------------------------------------
// Quaternion scale
// Scales the rotation angle of p by t: the xyz part is rescaled from sin(omega) to
// sin(omega*t), and w is rebuilt as +/- sqrt(1 - sin^2), keeping the sign of p's w.
//---------------------------------------------------------------------
#ifndef _X360

// SSE and STDC
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
{
	float r;
	fltx4 q;

	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to
	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
	float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) );
	sinom = min( sinom, 1.f );

	float sinsom = sin( asin( sinom ) * t );

	// t is reused from here on as the xyz scale factor; FLT_EPSILON guards the divide by zero
	t = sinsom / (sinom + FLT_EPSILON);
	SubFloat( q, 0 ) = t * SubFloat( p, 0 );
	SubFloat( q, 1 ) = t * SubFloat( p, 1 );
	SubFloat( q, 2 ) = t * SubFloat( p, 2 );

	// rescale rotation
	r = 1.0f - sinsom * sinsom;

	// Assert( r >= 0 );
	if (r < 0.0f)
		r = 0.0f;
	r = sqrt( r );

	// keep sign of rotation
	SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r );
	return q;
}

#else

// X360
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
{
	fltx4 sinom = Dot3SIMD( p, p );
	sinom = SqrtSIMD( sinom );
	sinom = MinSIMD( sinom, Four_Ones );
	fltx4 sinsom = ArcSinSIMD( sinom );
	fltx4 t4 = ReplicateX4( t );
	sinsom = MulSIMD( sinsom, t4 );
	sinsom = SinSIMD( sinsom );
	sinom = AddSIMD( sinom, Four_Epsilons );
	sinom = ReciprocalSIMD( sinom );
	t4 = MulSIMD( sinsom, sinom );
	fltx4 result = MulSIMD( p, t4 );

	// rescale rotation
	sinsom = MulSIMD( sinsom, sinsom );
	fltx4 r = SubSIMD( Four_Ones, sinsom );
	r = MaxSIMD( r, Four_Zeros );
	r = SqrtSIMD( r );

	// keep sign of rotation
	fltx4 cmp = CmpGeSIMD( p, Four_Zeros );
	r = MaskedAssign( cmp, r, NegSIMD( r ) );

	// merge r into result through __vrlimi (same mask value as the last row of
	// QuaternionMultSIMD above); the xyz lanes keep the scaled vector part
	result = __vrlimi(result, r, 1, 0);
	return result;
}

#endif


//-----------------------------------------------------------------------------
// Quaternion spherical linear interpolation
//-----------------------------------------------------------------------------
#ifndef _X360

// SSE and STDC
// Scalar slerp between p (t=0) and q (t=1); q is NOT aligned to p first.
// Falls back to a plain linear blend when the inputs are nearly identical, and to a
// blend with a quaternion built from q's components when they are nearly opposite.
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	float omega, cosom, sinom, sclp, sclq;

	fltx4 result;

	// 0.0 returns p, 1.0 return q.
	cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) +
		SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 );

	if ( (1.0f + cosom ) > 0.000001f )
	{
		if ( (1.0f - cosom ) > 0.000001f )
		{
			omega = acos( cosom );
			sinom = sin( omega );
			sclp = sin( (1.0f - t)*omega) / sinom;
			sclq = sin( t*omega ) / sinom;
		}
		else
		{
			// TODO: add short circuit for cosom == 1.0f?
			sclp = 1.0f - t;
			sclq = t;
		}
		SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( q, 0 );
		SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( q, 1 );
		SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( q, 2 );
		SubFloat( result, 3 ) = sclp * SubFloat( p, 3 ) + sclq * SubFloat( q, 3 );
	}
	else
	{
		// p and q are (nearly) opposite: blend p toward (-q.y, q.x, -q.w, q.z) instead.
		SubFloat( result, 0 ) = -SubFloat( q, 1 );
		SubFloat( result, 1 ) = SubFloat( q, 0 );
		SubFloat( result, 2 ) = -SubFloat( q, 3 );
		SubFloat( result, 3 ) = SubFloat( q, 2 );
		sclp = sin( (1.0f - t) * (0.5f * M_PI));
		sclq = sin( t * (0.5f * M_PI));
		// only lanes 0..2 are blended; lane 3 keeps the q[2] value assigned above
		SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( result, 0 );
		SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( result, 1 );
		SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( result, 2 );
	}

	return result;
}

#else

// X360
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	return XMQuaternionSlerp( p, q, t );
}

#endif


// Slerp between p and q after passing q through QuaternionAlignSIMD( p, q ).
FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	fltx4 q2, result;
	q2 = QuaternionAlignSIMD( p, q );
	result = QuaternionSlerpNoAlignSIMD( p, q2, t );
	return result;
}


#endif // ALLOW_SIMD_QUATERNION_MATH

#endif // SSEQUATMATH_H

diff --git a/public/mathlib/vector.h b/public/mathlib/vector.h
new file mode 100644
index 0000000..c7654ba
--- /dev/null
+++ b/public/mathlib/vector.h
@@ -0,0 +1,2311 @@
//========= Copyright Valve Corporation, All rights
reserved. ============// +// +// Purpose: +// +// $NoKeywords: $ +// +//=============================================================================// + +#ifndef VECTOR_H +#define VECTOR_H + +#ifdef _WIN32 +#pragma once +#endif + +#include <math.h> +#include <float.h> + +// For vec_t, put this somewhere else? +#include "tier0/basetypes.h" + +// For rand(). We really need a library! +#include <stdlib.h> + +#ifndef _X360 +// For MMX intrinsics +#include <xmmintrin.h> +#endif + +#include "tier0/dbg.h" +#include "tier0/threadtools.h" +#include "mathlib/vector2d.h" +#include "mathlib/math_pfns.h" + +// Uncomment this to add extra Asserts to check for NANs, uninitialized vecs, etc. +//#define VECTOR_PARANOIA 1 + +// Uncomment this to make sure we don't do anything slow with our vectors +//#define VECTOR_NO_SLOW_OPERATIONS 1 + + +// Used to make certain code easier to read. +#define X_INDEX 0 +#define Y_INDEX 1 +#define Z_INDEX 2 + + +#ifdef VECTOR_PARANOIA +#define CHECK_VALID( _v) Assert( (_v).IsValid() ) +#else +#ifdef GNUC +#define CHECK_VALID( _v) +#else +#define CHECK_VALID( _v) 0 +#endif +#endif + +#define VecToString(v) (static_cast<const char *>(CFmtStr("(%f, %f, %f)", (v).x, (v).y, (v).z))) // ** Note: this generates a temporary, don't hold reference! + +class VectorByValue; + +//========================================================= +// 3D Vector +//========================================================= +class Vector +{ +public: + // Members + vec_t x, y, z; + + // Construction/destruction: + Vector(void); + Vector(vec_t X, vec_t Y, vec_t Z); + explicit Vector(vec_t XYZ); ///< broadcast initialize + + // Initialization + void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f); + // TODO (Ilya): Should there be an init that takes a single float for consistency? + + // Got any nasty NAN's? + bool IsValid() const; + void Invalidate(); + + // array access... + vec_t operator[](int i) const; + vec_t& operator[](int i); + + // Base address... 
+ vec_t* Base(); + vec_t const* Base() const; + + // Cast to Vector2D... + Vector2D& AsVector2D(); + const Vector2D& AsVector2D() const; + + // Initialization methods + void Random( vec_t minVal, vec_t maxVal ); + inline void Zero(); ///< zero out a vector + + // equality + bool operator==(const Vector& v) const; + bool operator!=(const Vector& v) const; + + // arithmetic operations + FORCEINLINE Vector& operator+=(const Vector &v); + FORCEINLINE Vector& operator-=(const Vector &v); + FORCEINLINE Vector& operator*=(const Vector &v); + FORCEINLINE Vector& operator*=(float s); + FORCEINLINE Vector& operator/=(const Vector &v); + FORCEINLINE Vector& operator/=(float s); + FORCEINLINE Vector& operator+=(float fl) ; ///< broadcast add + FORCEINLINE Vector& operator-=(float fl) ; ///< broadcast sub + +// negate the vector components + void Negate(); + + // Get the vector's magnitude. + inline vec_t Length() const; + + // Get the vector's magnitude squared. + FORCEINLINE vec_t LengthSqr(void) const + { + CHECK_VALID(*this); + return (x*x + y*y + z*z); + } + + // return true if this vector is (0,0,0) within tolerance + bool IsZero( float tolerance = 0.01f ) const + { + return (x > -tolerance && x < tolerance && + y > -tolerance && y < tolerance && + z > -tolerance && z < tolerance); + } + + vec_t NormalizeInPlace(); + Vector Normalized() const; + bool IsLengthGreaterThan( float val ) const; + bool IsLengthLessThan( float val ) const; + + // check if a vector is within the box defined by two other vectors + FORCEINLINE bool WithinAABox( Vector const &boxmin, Vector const &boxmax); + + // Get the distance from this vector to the other one. + vec_t DistTo(const Vector &vOther) const; + + // Get the distance from this vector to the other one squared. + // NJS: note, VC wasn't inlining it correctly in several deeply nested inlines due to being an 'out of line' inline. 
+ // may be able to tidy this up after switching to VC7 + FORCEINLINE vec_t DistToSqr(const Vector &vOther) const + { + Vector delta; + + delta.x = x - vOther.x; + delta.y = y - vOther.y; + delta.z = z - vOther.z; + + return delta.LengthSqr(); + } + + // Copy + void CopyToArray(float* rgfl) const; + + // Multiply, add, and assign to this (ie: *this = a + b * scalar). This + // is about 12% faster than the actual vector equation (because it's done per-component + // rather than per-vector). + void MulAdd(const Vector& a, const Vector& b, float scalar); + + // Dot product. + vec_t Dot(const Vector& vOther) const; + + // assignment + Vector& operator=(const Vector &vOther); + + // 2d + vec_t Length2D(void) const; + vec_t Length2DSqr(void) const; + + operator VectorByValue &() { return *((VectorByValue *)(this)); } + operator const VectorByValue &() const { return *((const VectorByValue *)(this)); } + +#ifndef VECTOR_NO_SLOW_OPERATIONS + // copy constructors +// Vector(const Vector &vOther); + + // arithmetic operations + Vector operator-(void) const; + + Vector operator+(const Vector& v) const; + Vector operator-(const Vector& v) const; + Vector operator*(const Vector& v) const; + Vector operator/(const Vector& v) const; + Vector operator*(float fl) const; + Vector operator/(float fl) const; + + // Cross product between two vectors. + Vector Cross(const Vector &vOther) const; + + // Returns a vector with the min or max in X, Y, and Z. 
+ Vector Min(const Vector &vOther) const; + Vector Max(const Vector &vOther) const; + +#else + +private: + // No copy constructors allowed if we're in optimal mode + Vector(const Vector& vOther); +#endif +}; + +FORCEINLINE void NetworkVarConstruct( Vector &v ) { v.Zero(); } + + +#define USE_M64S ( ( !defined( _X360 ) ) ) + + + +//========================================================= +// 4D Short Vector (aligned on 8-byte boundary) +//========================================================= +class ALIGN8 ShortVector +{ +public: + + short x, y, z, w; + + // Initialization + void Init(short ix = 0, short iy = 0, short iz = 0, short iw = 0 ); + + +#ifdef USE_M64S + __m64 &AsM64() { return *(__m64*)&x; } + const __m64 &AsM64() const { return *(const __m64*)&x; } +#endif + + // Setter + void Set( const ShortVector& vOther ); + void Set( const short ix, const short iy, const short iz, const short iw ); + + // array access... + short operator[](int i) const; + short& operator[](int i); + + // Base address... + short* Base(); + short const* Base() const; + + // equality + bool operator==(const ShortVector& v) const; + bool operator!=(const ShortVector& v) const; + + // Arithmetic operations + FORCEINLINE ShortVector& operator+=(const ShortVector &v); + FORCEINLINE ShortVector& operator-=(const ShortVector &v); + FORCEINLINE ShortVector& operator*=(const ShortVector &v); + FORCEINLINE ShortVector& operator*=(float s); + FORCEINLINE ShortVector& operator/=(const ShortVector &v); + FORCEINLINE ShortVector& operator/=(float s); + FORCEINLINE ShortVector operator*(float fl) const; + +private: + + // No copy constructors allowed if we're in optimal mode +// ShortVector(ShortVector const& vOther); + + // No assignment operators either... 
+// ShortVector& operator=( ShortVector const& src ); + +} ALIGN8_POST; + + + + + + +//========================================================= +// 4D Integer Vector +//========================================================= +class IntVector4D +{ +public: + + int x, y, z, w; + + // Initialization + void Init(int ix = 0, int iy = 0, int iz = 0, int iw = 0 ); + +#ifdef USE_M64S + __m64 &AsM64() { return *(__m64*)&x; } + const __m64 &AsM64() const { return *(const __m64*)&x; } +#endif + + // Setter + void Set( const IntVector4D& vOther ); + void Set( const int ix, const int iy, const int iz, const int iw ); + + // array access... + int operator[](int i) const; + int& operator[](int i); + + // Base address... + int* Base(); + int const* Base() const; + + // equality + bool operator==(const IntVector4D& v) const; + bool operator!=(const IntVector4D& v) const; + + // Arithmetic operations + FORCEINLINE IntVector4D& operator+=(const IntVector4D &v); + FORCEINLINE IntVector4D& operator-=(const IntVector4D &v); + FORCEINLINE IntVector4D& operator*=(const IntVector4D &v); + FORCEINLINE IntVector4D& operator*=(float s); + FORCEINLINE IntVector4D& operator/=(const IntVector4D &v); + FORCEINLINE IntVector4D& operator/=(float s); + FORCEINLINE IntVector4D operator*(float fl) const; + +private: + + // No copy constructors allowed if we're in optimal mode + // IntVector4D(IntVector4D const& vOther); + + // No assignment operators either... 
+ // IntVector4D& operator=( IntVector4D const& src ); + +}; + + + +//----------------------------------------------------------------------------- +// Allows us to specifically pass the vector by value when we need to +//----------------------------------------------------------------------------- +class VectorByValue : public Vector +{ +public: + // Construction/destruction: + VectorByValue(void) : Vector() {} + VectorByValue(vec_t X, vec_t Y, vec_t Z) : Vector( X, Y, Z ) {} + VectorByValue(const VectorByValue& vOther) { *this = vOther; } +}; + + +//----------------------------------------------------------------------------- +// Utility to simplify table construction. No constructor means can use +// traditional C-style initialization +//----------------------------------------------------------------------------- +class TableVector +{ +public: + vec_t x, y, z; + + operator Vector &() { return *((Vector *)(this)); } + operator const Vector &() const { return *((const Vector *)(this)); } + + // array access... 
+ inline vec_t& operator[](int i) + { + Assert( (i >= 0) && (i < 3) ); + return ((vec_t*)this)[i]; + } + + inline vec_t operator[](int i) const + { + Assert( (i >= 0) && (i < 3) ); + return ((vec_t*)this)[i]; + } +}; + + +//----------------------------------------------------------------------------- +// Here's where we add all those lovely SSE optimized routines +//----------------------------------------------------------------------------- + +class ALIGN16 VectorAligned : public Vector +{ +public: + inline VectorAligned(void) {}; + inline VectorAligned(vec_t X, vec_t Y, vec_t Z) + { + Init(X,Y,Z); + } + +#ifdef VECTOR_NO_SLOW_OPERATIONS + +private: + // No copy constructors allowed if we're in optimal mode + VectorAligned(const VectorAligned& vOther); + VectorAligned(const Vector &vOther); + +#else +public: + explicit VectorAligned(const Vector &vOther) + { + Init(vOther.x, vOther.y, vOther.z); + } + + VectorAligned& operator=(const Vector &vOther) + { + Init(vOther.x, vOther.y, vOther.z); + return *this; + } + +#endif + float w; // this space is used anyway +} ALIGN16_POST; + +//----------------------------------------------------------------------------- +// Vector related operations +//----------------------------------------------------------------------------- + +// Vector clear +FORCEINLINE void VectorClear( Vector& a ); + +// Copy +FORCEINLINE void VectorCopy( const Vector& src, Vector& dst ); + +// Vector arithmetic +FORCEINLINE void VectorAdd( const Vector& a, const Vector& b, Vector& result ); +FORCEINLINE void VectorSubtract( const Vector& a, const Vector& b, Vector& result ); +FORCEINLINE void VectorMultiply( const Vector& a, vec_t b, Vector& result ); +FORCEINLINE void VectorMultiply( const Vector& a, const Vector& b, Vector& result ); +FORCEINLINE void VectorDivide( const Vector& a, vec_t b, Vector& result ); +FORCEINLINE void VectorDivide( const Vector& a, const Vector& b, Vector& result ); +inline void VectorScale ( const Vector& in, vec_t scale, 
Vector& result ); +// Don't mark this as inline in its function declaration. That's only necessary on its +// definition, and 'inline' here leads to gcc warnings. +void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest ); + +// Vector equality with tolerance +bool VectorsAreEqual( const Vector& src1, const Vector& src2, float tolerance = 0.0f ); + +#define VectorExpand(v) (v).x, (v).y, (v).z + + +// Normalization +// FIXME: Can't use quite yet +//vec_t VectorNormalize( Vector& v ); + +// Length +inline vec_t VectorLength( const Vector& v ); + +// Dot Product +FORCEINLINE vec_t DotProduct(const Vector& a, const Vector& b); + +// Cross product +void CrossProduct(const Vector& a, const Vector& b, Vector& result ); + +// Store the min or max of each of x, y, and z into the result. +void VectorMin( const Vector &a, const Vector &b, Vector &result ); +void VectorMax( const Vector &a, const Vector &b, Vector &result ); + +// Linearly interpolate between two vectors +void VectorLerp(const Vector& src1, const Vector& src2, vec_t t, Vector& dest ); +Vector VectorLerp(const Vector& src1, const Vector& src2, vec_t t ); + +FORCEINLINE Vector ReplicateToVector( float x ) +{ + return Vector( x, x, x ); +} + +// check if a point is in the field of a view of an object. supports up to 180 degree fov. 
+FORCEINLINE bool PointWithinViewAngle( Vector const &vecSrcPosition, + Vector const &vecTargetPosition, + Vector const &vecLookDirection, float flCosHalfFOV ) +{ + Vector vecDelta = vecTargetPosition - vecSrcPosition; + float cosDiff = DotProduct( vecLookDirection, vecDelta ); + + if ( cosDiff < 0 ) + return false; + + float flLen2 = vecDelta.LengthSqr(); + + // a/sqrt(b) > c == a^2 > b * c ^2 + return ( cosDiff * cosDiff > flLen2 * flCosHalfFOV * flCosHalfFOV ); + +} + + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +// Cross product +Vector CrossProduct( const Vector& a, const Vector& b ); + +// Random vector creation +Vector RandomVector( vec_t minVal, vec_t maxVal ); + +#endif + +float RandomVectorInUnitSphere( Vector *pVector ); +float RandomVectorInUnitCircle( Vector2D *pVector ); + + +//----------------------------------------------------------------------------- +// +// Inlined Vector methods +// +//----------------------------------------------------------------------------- + + +//----------------------------------------------------------------------------- +// constructors +//----------------------------------------------------------------------------- +inline Vector::Vector(void) +{ +#ifdef _DEBUG +#ifdef VECTOR_PARANOIA + // Initialize to NAN to catch errors + x = y = z = VEC_T_NAN; +#endif +#endif +} + +inline Vector::Vector(vec_t X, vec_t Y, vec_t Z) +{ + x = X; y = Y; z = Z; + CHECK_VALID(*this); +} + +inline Vector::Vector(vec_t XYZ) +{ + x = y = z = XYZ; + CHECK_VALID(*this); +} + +//inline Vector::Vector(const float *pFloat) +//{ +// Assert( pFloat ); +// x = pFloat[0]; y = pFloat[1]; z = pFloat[2]; +// CHECK_VALID(*this); +//} + +#if 0 +//----------------------------------------------------------------------------- +// copy constructor +//----------------------------------------------------------------------------- + +inline Vector::Vector(const Vector &vOther) +{ + CHECK_VALID(vOther); + x = vOther.x; y = vOther.y; z = vOther.z; +} +#endif + 
+//----------------------------------------------------------------------------- +// initialization +//----------------------------------------------------------------------------- + +inline void Vector::Init( vec_t ix, vec_t iy, vec_t iz ) +{ + x = ix; y = iy; z = iz; + CHECK_VALID(*this); +} + +inline void Vector::Random( vec_t minVal, vec_t maxVal ) +{ + x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + z = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + CHECK_VALID(*this); +} + +// This should really be a single opcode on the PowerPC (move r0 onto the vec reg) +inline void Vector::Zero() +{ + x = y = z = 0.0f; +} + +inline void VectorClear( Vector& a ) +{ + a.x = a.y = a.z = 0.0f; +} + +//----------------------------------------------------------------------------- +// assignment +//----------------------------------------------------------------------------- + +inline Vector& Vector::operator=(const Vector &vOther) +{ + CHECK_VALID(vOther); + x=vOther.x; y=vOther.y; z=vOther.z; + return *this; +} + + +//----------------------------------------------------------------------------- +// Array access +//----------------------------------------------------------------------------- +inline vec_t& Vector::operator[](int i) +{ + Assert( (i >= 0) && (i < 3) ); + return ((vec_t*)this)[i]; +} + +inline vec_t Vector::operator[](int i) const +{ + Assert( (i >= 0) && (i < 3) ); + return ((vec_t*)this)[i]; +} + + +//----------------------------------------------------------------------------- +// Base address... +//----------------------------------------------------------------------------- +inline vec_t* Vector::Base() +{ + return (vec_t*)this; +} + +inline vec_t const* Vector::Base() const +{ + return (vec_t const*)this; +} + +//----------------------------------------------------------------------------- +// Cast to Vector2D... 
+//----------------------------------------------------------------------------- + +inline Vector2D& Vector::AsVector2D() +{ + return *(Vector2D*)this; +} + +inline const Vector2D& Vector::AsVector2D() const +{ + return *(const Vector2D*)this; +} + +//----------------------------------------------------------------------------- +// IsValid? +//----------------------------------------------------------------------------- + +inline bool Vector::IsValid() const +{ + return IsFinite(x) && IsFinite(y) && IsFinite(z); +} + +//----------------------------------------------------------------------------- +// Invalidate +//----------------------------------------------------------------------------- + +inline void Vector::Invalidate() +{ +//#ifdef _DEBUG +//#ifdef VECTOR_PARANOIA + x = y = z = VEC_T_NAN; +//#endif +//#endif +} + +//----------------------------------------------------------------------------- +// comparison +//----------------------------------------------------------------------------- + +inline bool Vector::operator==( const Vector& src ) const +{ + CHECK_VALID(src); + CHECK_VALID(*this); + return (src.x == x) && (src.y == y) && (src.z == z); +} + +inline bool Vector::operator!=( const Vector& src ) const +{ + CHECK_VALID(src); + CHECK_VALID(*this); + return (src.x != x) || (src.y != y) || (src.z != z); +} + + +//----------------------------------------------------------------------------- +// Copy +//----------------------------------------------------------------------------- + +FORCEINLINE void VectorCopy( const Vector& src, Vector& dst ) +{ + CHECK_VALID(src); + dst.x = src.x; + dst.y = src.y; + dst.z = src.z; +} + +inline void Vector::CopyToArray(float* rgfl) const +{ + Assert( rgfl ); + CHECK_VALID(*this); + rgfl[0] = x, rgfl[1] = y, rgfl[2] = z; +} + +//----------------------------------------------------------------------------- +// standard math operations +//----------------------------------------------------------------------------- +// #pragma 
message("TODO: these should be SSE") + +inline void Vector::Negate() +{ + CHECK_VALID(*this); + x = -x; y = -y; z = -z; +} + +FORCEINLINE Vector& Vector::operator+=(const Vector& v) +{ + CHECK_VALID(*this); + CHECK_VALID(v); + x+=v.x; y+=v.y; z += v.z; + return *this; +} + +FORCEINLINE Vector& Vector::operator-=(const Vector& v) +{ + CHECK_VALID(*this); + CHECK_VALID(v); + x-=v.x; y-=v.y; z -= v.z; + return *this; +} + +FORCEINLINE Vector& Vector::operator*=(float fl) +{ + x *= fl; + y *= fl; + z *= fl; + CHECK_VALID(*this); + return *this; +} + +FORCEINLINE Vector& Vector::operator*=(const Vector& v) +{ + CHECK_VALID(v); + x *= v.x; + y *= v.y; + z *= v.z; + CHECK_VALID(*this); + return *this; +} + +// this ought to be an opcode. +FORCEINLINE Vector& Vector::operator+=(float fl) +{ + x += fl; + y += fl; + z += fl; + CHECK_VALID(*this); + return *this; +} + +FORCEINLINE Vector& Vector::operator-=(float fl) +{ + x -= fl; + y -= fl; + z -= fl; + CHECK_VALID(*this); + return *this; +} + + + +FORCEINLINE Vector& Vector::operator/=(float fl) +{ + Assert( fl != 0.0f ); + float oofl = 1.0f / fl; + x *= oofl; + y *= oofl; + z *= oofl; + CHECK_VALID(*this); + return *this; +} + +FORCEINLINE Vector& Vector::operator/=(const Vector& v) +{ + CHECK_VALID(v); + Assert( v.x != 0.0f && v.y != 0.0f && v.z != 0.0f ); + x /= v.x; + y /= v.y; + z /= v.z; + CHECK_VALID(*this); + return *this; +} + + + +//----------------------------------------------------------------------------- +// +// Inlined Short Vector methods +// +//----------------------------------------------------------------------------- + + +inline void ShortVector::Init( short ix, short iy, short iz, short iw ) +{ + x = ix; y = iy; z = iz; w = iw; +} + +FORCEINLINE void ShortVector::Set( const ShortVector& vOther ) +{ + x = vOther.x; + y = vOther.y; + z = vOther.z; + w = vOther.w; +} + +FORCEINLINE void ShortVector::Set( const short ix, const short iy, const short iz, const short iw ) +{ + x = ix; + y = iy; + z = iz; + w 
= iw; +} + + +//----------------------------------------------------------------------------- +// Array access +//----------------------------------------------------------------------------- +inline short ShortVector::operator[](int i) const +{ + Assert( (i >= 0) && (i < 4) ); + return ((short*)this)[i]; +} + +inline short& ShortVector::operator[](int i) +{ + Assert( (i >= 0) && (i < 4) ); + return ((short*)this)[i]; +} + +//----------------------------------------------------------------------------- +// Base address... +//----------------------------------------------------------------------------- +inline short* ShortVector::Base() +{ + return (short*)this; +} + +inline short const* ShortVector::Base() const +{ + return (short const*)this; +} + + +//----------------------------------------------------------------------------- +// comparison +//----------------------------------------------------------------------------- + +inline bool ShortVector::operator==( const ShortVector& src ) const +{ + return (src.x == x) && (src.y == y) && (src.z == z) && (src.w == w); +} + +inline bool ShortVector::operator!=( const ShortVector& src ) const +{ + return (src.x != x) || (src.y != y) || (src.z != z) || (src.w != w); +} + + + +//----------------------------------------------------------------------------- +// standard math operations +//----------------------------------------------------------------------------- + +FORCEINLINE ShortVector& ShortVector::operator+=(const ShortVector& v) +{ + x+=v.x; y+=v.y; z += v.z; w += v.w; + return *this; +} + +FORCEINLINE ShortVector& ShortVector::operator-=(const ShortVector& v) +{ + x-=v.x; y-=v.y; z -= v.z; w -= v.w; + return *this; +} + +FORCEINLINE ShortVector& ShortVector::operator*=(float fl) +{ + x *= fl; + y *= fl; + z *= fl; + w *= fl; + return *this; +} + +FORCEINLINE ShortVector& ShortVector::operator*=(const ShortVector& v) +{ + x *= v.x; + y *= v.y; + z *= v.z; + w *= v.w; + return *this; +} + +FORCEINLINE ShortVector& 
ShortVector::operator/=(float fl) +{ + Assert( fl != 0.0f ); + float oofl = 1.0f / fl; + x *= oofl; + y *= oofl; + z *= oofl; + w *= oofl; + return *this; +} + +FORCEINLINE ShortVector& ShortVector::operator/=(const ShortVector& v) +{ + Assert( v.x != 0 && v.y != 0 && v.z != 0 && v.w != 0 ); + x /= v.x; + y /= v.y; + z /= v.z; + w /= v.w; + return *this; +} + +FORCEINLINE void ShortVectorMultiply( const ShortVector& src, float fl, ShortVector& res ) +{ + Assert( IsFinite(fl) ); + res.x = src.x * fl; + res.y = src.y * fl; + res.z = src.z * fl; + res.w = src.w * fl; +} + +FORCEINLINE ShortVector ShortVector::operator*(float fl) const +{ + ShortVector res; + ShortVectorMultiply( *this, fl, res ); + return res; +} + + + + + + +//----------------------------------------------------------------------------- +// +// Inlined Integer Vector methods +// +//----------------------------------------------------------------------------- + + +inline void IntVector4D::Init( int ix, int iy, int iz, int iw ) +{ + x = ix; y = iy; z = iz; w = iw; +} + +FORCEINLINE void IntVector4D::Set( const IntVector4D& vOther ) +{ + x = vOther.x; + y = vOther.y; + z = vOther.z; + w = vOther.w; +} + +FORCEINLINE void IntVector4D::Set( const int ix, const int iy, const int iz, const int iw ) +{ + x = ix; + y = iy; + z = iz; + w = iw; +} + + +//----------------------------------------------------------------------------- +// Array access +//----------------------------------------------------------------------------- +inline int IntVector4D::operator[](int i) const +{ + Assert( (i >= 0) && (i < 4) ); + return ((int*)this)[i]; +} + +inline int& IntVector4D::operator[](int i) +{ + Assert( (i >= 0) && (i < 4) ); + return ((int*)this)[i]; +} + +//----------------------------------------------------------------------------- +// Base address... 
+//----------------------------------------------------------------------------- +inline int* IntVector4D::Base() +{ + return (int*)this; +} + +inline int const* IntVector4D::Base() const +{ + return (int const*)this; +} + + +//----------------------------------------------------------------------------- +// comparison +//----------------------------------------------------------------------------- + +inline bool IntVector4D::operator==( const IntVector4D& src ) const +{ + return (src.x == x) && (src.y == y) && (src.z == z) && (src.w == w); +} + +inline bool IntVector4D::operator!=( const IntVector4D& src ) const +{ + return (src.x != x) || (src.y != y) || (src.z != z) || (src.w != w); +} + + + +//----------------------------------------------------------------------------- +// standard math operations +//----------------------------------------------------------------------------- + +FORCEINLINE IntVector4D& IntVector4D::operator+=(const IntVector4D& v) +{ + x+=v.x; y+=v.y; z += v.z; w += v.w; + return *this; +} + +FORCEINLINE IntVector4D& IntVector4D::operator-=(const IntVector4D& v) +{ + x-=v.x; y-=v.y; z -= v.z; w -= v.w; + return *this; +} + +FORCEINLINE IntVector4D& IntVector4D::operator*=(float fl) +{ + x *= fl; + y *= fl; + z *= fl; + w *= fl; + return *this; +} + +FORCEINLINE IntVector4D& IntVector4D::operator*=(const IntVector4D& v) +{ + x *= v.x; + y *= v.y; + z *= v.z; + w *= v.w; + return *this; +} + +FORCEINLINE IntVector4D& IntVector4D::operator/=(float fl) +{ + Assert( fl != 0.0f ); + float oofl = 1.0f / fl; + x *= oofl; + y *= oofl; + z *= oofl; + w *= oofl; + return *this; +} + +FORCEINLINE IntVector4D& IntVector4D::operator/=(const IntVector4D& v) +{ + Assert( v.x != 0 && v.y != 0 && v.z != 0 && v.w != 0 ); + x /= v.x; + y /= v.y; + z /= v.z; + w /= v.w; + return *this; +} + +FORCEINLINE void IntVector4DMultiply( const IntVector4D& src, float fl, IntVector4D& res ) +{ + Assert( IsFinite(fl) ); + res.x = src.x * fl; + res.y = src.y * fl; + res.z = 
src.z * fl; + res.w = src.w * fl; +} + +FORCEINLINE IntVector4D IntVector4D::operator*(float fl) const +{ + IntVector4D res; + IntVector4DMultiply( *this, fl, res ); + return res; +} + + + +// ======================= + + +FORCEINLINE void VectorAdd( const Vector& a, const Vector& b, Vector& c ) +{ + CHECK_VALID(a); + CHECK_VALID(b); + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; +} + +FORCEINLINE void VectorSubtract( const Vector& a, const Vector& b, Vector& c ) +{ + CHECK_VALID(a); + CHECK_VALID(b); + c.x = a.x - b.x; + c.y = a.y - b.y; + c.z = a.z - b.z; +} + +FORCEINLINE void VectorMultiply( const Vector& a, vec_t b, Vector& c ) +{ + CHECK_VALID(a); + Assert( IsFinite(b) ); + c.x = a.x * b; + c.y = a.y * b; + c.z = a.z * b; +} + +FORCEINLINE void VectorMultiply( const Vector& a, const Vector& b, Vector& c ) +{ + CHECK_VALID(a); + CHECK_VALID(b); + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; +} + +// for backwards compatability +inline void VectorScale ( const Vector& in, vec_t scale, Vector& result ) +{ + VectorMultiply( in, scale, result ); +} + + +FORCEINLINE void VectorDivide( const Vector& a, vec_t b, Vector& c ) +{ + CHECK_VALID(a); + Assert( b != 0.0f ); + vec_t oob = 1.0f / b; + c.x = a.x * oob; + c.y = a.y * oob; + c.z = a.z * oob; +} + +FORCEINLINE void VectorDivide( const Vector& a, const Vector& b, Vector& c ) +{ + CHECK_VALID(a); + CHECK_VALID(b); + Assert( (b.x != 0.0f) && (b.y != 0.0f) && (b.z != 0.0f) ); + c.x = a.x / b.x; + c.y = a.y / b.y; + c.z = a.z / b.z; +} + +// FIXME: Remove +// For backwards compatability +inline void Vector::MulAdd(const Vector& a, const Vector& b, float scalar) +{ + CHECK_VALID(a); + CHECK_VALID(b); + x = a.x + b.x * scalar; + y = a.y + b.y * scalar; + z = a.z + b.z * scalar; +} + +inline void VectorLerp(const Vector& src1, const Vector& src2, vec_t t, Vector& dest ) +{ + CHECK_VALID(src1); + CHECK_VALID(src2); + dest.x = src1.x + (src2.x - src1.x) * t; + dest.y = src1.y + (src2.y - src1.y) * t; + 
dest.z = src1.z + (src2.z - src1.z) * t; +} + +inline Vector VectorLerp(const Vector& src1, const Vector& src2, vec_t t ) +{ + Vector result; + VectorLerp( src1, src2, t, result ); + return result; +} + +//----------------------------------------------------------------------------- +// Temporary storage for vector results so const Vector& results can be returned +//----------------------------------------------------------------------------- +inline Vector &AllocTempVector() +{ + static Vector s_vecTemp[128]; + static CInterlockedInt s_nIndex; + + int nIndex; + for (;;) + { + int nOldIndex = s_nIndex; + nIndex = ( (nOldIndex + 0x10001) & 0x7F ); + + if ( s_nIndex.AssignIf( nOldIndex, nIndex ) ) + { + break; + } + ThreadPause(); + } + return s_vecTemp[nIndex]; +} + + + +//----------------------------------------------------------------------------- +// dot, cross +//----------------------------------------------------------------------------- +FORCEINLINE vec_t DotProduct(const Vector& a, const Vector& b) +{ + CHECK_VALID(a); + CHECK_VALID(b); + return( a.x*b.x + a.y*b.y + a.z*b.z ); +} + +// for backwards compatability +inline vec_t Vector::Dot( const Vector& vOther ) const +{ + CHECK_VALID(vOther); + return DotProduct( *this, vOther ); +} + +inline void CrossProduct(const Vector& a, const Vector& b, Vector& result ) +{ + CHECK_VALID(a); + CHECK_VALID(b); + Assert( &a != &result ); + Assert( &b != &result ); + result.x = a.y*b.z - a.z*b.y; + result.y = a.z*b.x - a.x*b.z; + result.z = a.x*b.y - a.y*b.x; +} + +inline vec_t DotProductAbs( const Vector &v0, const Vector &v1 ) +{ + CHECK_VALID(v0); + CHECK_VALID(v1); + return FloatMakePositive(v0.x*v1.x) + FloatMakePositive(v0.y*v1.y) + FloatMakePositive(v0.z*v1.z); +} + +inline vec_t DotProductAbs( const Vector &v0, const float *v1 ) +{ + return FloatMakePositive(v0.x * v1[0]) + FloatMakePositive(v0.y * v1[1]) + FloatMakePositive(v0.z * v1[2]); +} + 
+//----------------------------------------------------------------------------- +// length +//----------------------------------------------------------------------------- + +inline vec_t VectorLength( const Vector& v ) +{ + CHECK_VALID(v); + return (vec_t)FastSqrt(v.x*v.x + v.y*v.y + v.z*v.z); +} + + +inline vec_t Vector::Length(void) const +{ + CHECK_VALID(*this); + return VectorLength( *this ); +} + + +//----------------------------------------------------------------------------- +// Normalization +//----------------------------------------------------------------------------- + +/* +// FIXME: Can't use until we're un-macroed in mathlib.h +inline vec_t VectorNormalize( Vector& v ) +{ + Assert( v.IsValid() ); + vec_t l = v.Length(); + if (l != 0.0f) + { + v /= l; + } + else + { + // FIXME: + // Just copying the existing implemenation; shouldn't res.z == 0? + v.x = v.y = 0.0f; v.z = 1.0f; + } + return l; +} +*/ + + +// check a point against a box +bool Vector::WithinAABox( Vector const &boxmin, Vector const &boxmax) +{ + return ( + ( x >= boxmin.x ) && ( x <= boxmax.x) && + ( y >= boxmin.y ) && ( y <= boxmax.y) && + ( z >= boxmin.z ) && ( z <= boxmax.z) + ); +} + +//----------------------------------------------------------------------------- +// Get the distance from this vector to the other one +//----------------------------------------------------------------------------- +inline vec_t Vector::DistTo(const Vector &vOther) const +{ + Vector delta; + VectorSubtract( *this, vOther, delta ); + return delta.Length(); +} + + +//----------------------------------------------------------------------------- +// Vector equality with tolerance +//----------------------------------------------------------------------------- +inline bool VectorsAreEqual( const Vector& src1, const Vector& src2, float tolerance ) +{ + if (FloatMakePositive(src1.x - src2.x) > tolerance) + return false; + if (FloatMakePositive(src1.y - src2.y) > tolerance) + return false; + return 
(FloatMakePositive(src1.z - src2.z) <= tolerance); +} + + +//----------------------------------------------------------------------------- +// Computes the closest point to vecTarget no farther than flMaxDist from vecStart +//----------------------------------------------------------------------------- +inline void ComputeClosestPoint( const Vector& vecStart, float flMaxDist, const Vector& vecTarget, Vector *pResult ) +{ + Vector vecDelta; + VectorSubtract( vecTarget, vecStart, vecDelta ); + float flDistSqr = vecDelta.LengthSqr(); + if ( flDistSqr <= flMaxDist * flMaxDist ) + { + *pResult = vecTarget; + } + else + { + vecDelta /= FastSqrt( flDistSqr ); + VectorMA( vecStart, flMaxDist, vecDelta, *pResult ); + } +} + + +//----------------------------------------------------------------------------- +// Takes the absolute value of a vector +//----------------------------------------------------------------------------- +inline void VectorAbs( const Vector& src, Vector& dst ) +{ + dst.x = FloatMakePositive(src.x); + dst.y = FloatMakePositive(src.y); + dst.z = FloatMakePositive(src.z); +} + + +//----------------------------------------------------------------------------- +// +// Slow methods +// +//----------------------------------------------------------------------------- + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +//----------------------------------------------------------------------------- +// Returns a vector with the min or max in X, Y, and Z. +//----------------------------------------------------------------------------- +inline Vector Vector::Min(const Vector &vOther) const +{ + return Vector(x < vOther.x ? x : vOther.x, + y < vOther.y ? y : vOther.y, + z < vOther.z ? z : vOther.z); +} + +inline Vector Vector::Max(const Vector &vOther) const +{ + return Vector(x > vOther.x ? x : vOther.x, + y > vOther.y ? y : vOther.y, + z > vOther.z ? 
z : vOther.z); +} + + +//----------------------------------------------------------------------------- +// arithmetic operations +//----------------------------------------------------------------------------- + +inline Vector Vector::operator-(void) const +{ + return Vector(-x,-y,-z); +} + +inline Vector Vector::operator+(const Vector& v) const +{ + Vector res; + VectorAdd( *this, v, res ); + return res; +} + +inline Vector Vector::operator-(const Vector& v) const +{ + Vector res; + VectorSubtract( *this, v, res ); + return res; +} + +inline Vector Vector::operator*(float fl) const +{ + Vector res; + VectorMultiply( *this, fl, res ); + return res; +} + +inline Vector Vector::operator*(const Vector& v) const +{ + Vector res; + VectorMultiply( *this, v, res ); + return res; +} + +inline Vector Vector::operator/(float fl) const +{ + Vector res; + VectorDivide( *this, fl, res ); + return res; +} + +inline Vector Vector::operator/(const Vector& v) const +{ + Vector res; + VectorDivide( *this, v, res ); + return res; +} + +inline Vector operator*(float fl, const Vector& v) +{ + return v * fl; +} + +//----------------------------------------------------------------------------- +// cross product +//----------------------------------------------------------------------------- + +inline Vector Vector::Cross(const Vector& vOther) const +{ + Vector res; + CrossProduct( *this, vOther, res ); + return res; +} + +//----------------------------------------------------------------------------- +// 2D +//----------------------------------------------------------------------------- + +inline vec_t Vector::Length2D(void) const +{ + return (vec_t)FastSqrt(x*x + y*y); +} + +inline vec_t Vector::Length2DSqr(void) const +{ + return (x*x + y*y); +} + +inline Vector CrossProduct(const Vector& a, const Vector& b) +{ + return Vector( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x ); +} + +inline void VectorMin( const Vector &a, const Vector &b, Vector &result ) +{ + result.x = 
fpmin(a.x, b.x);
+	result.y = fpmin(a.y, b.y);
+	result.z = fpmin(a.z, b.z);
+}
+
+// Component-wise maximum of a and b, written to result.
+inline void VectorMax( const Vector &a, const Vector &b, Vector &result )
+{
+	result.x = fpmax(a.x, b.x);
+	result.y = fpmax(a.y, b.y);
+	result.z = fpmax(a.z, b.z);
+}
+
+// NOTE(review): despite the name, this returns the dot product of the box
+// extents with themselves (the squared length of the diagonal), not the
+// product of the three extents. Confirm callers only use it for relative
+// comparison before changing it.
+inline float ComputeVolume( const Vector &vecMins, const Vector &vecMaxs )
+{
+	Vector vecDelta;
+	VectorSubtract( vecMaxs, vecMins, vecDelta );
+	return DotProduct( vecDelta, vecDelta );
+}
+
+// Get a random vector.
+inline Vector RandomVector( float minVal, float maxVal )
+{
+	Vector vRandom;
+	vRandom.Random( minVal, maxVal );
+	return vRandom;
+}
+
+#endif //slow
+
+//-----------------------------------------------------------------------------
+// Helper debugging stuff....
+//-----------------------------------------------------------------------------
+
+// Comparing a raw float pointer with a Vector is treated as a programming
+// error: each of these four overloads asserts and then returns false
+// (the != overloads return false as well).
+inline bool operator==( float const* f, const Vector& v )
+{
+	// AIIIEEEE!!!!
+	Assert(0);
+	return false;
+}
+
+inline bool operator==( const Vector& v, float const* f )
+{
+	// AIIIEEEE!!!!
+	Assert(0);
+	return false;
+}
+
+inline bool operator!=( float const* f, const Vector& v )
+{
+	// AIIIEEEE!!!!
+	Assert(0);
+	return false;
+}
+
+inline bool operator!=( const Vector& v, float const* f )
+{
+	// AIIIEEEE!!!!
+	Assert(0);
+	return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// AngularImpulse
+//-----------------------------------------------------------------------------
+// AngularImpulse are exponential maps (an axis scaled by a "twist" angle in degrees)
+typedef Vector AngularImpulse;
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns an AngularImpulse filled by Vector::Random( minVal, maxVal ).
+inline AngularImpulse RandomAngularImpulse( float minVal, float maxVal )
+{
+	AngularImpulse	angImp;
+	angImp.Random( minVal, maxVal );
+	return angImp;
+}
+
+#endif
+
+
+//-----------------------------------------------------------------------------
+// Quaternion
+//-----------------------------------------------------------------------------
+
+class RadianEuler;
+
+class Quaternion				// same data-layout as engine's vec4_t,
+{								//		which is a vec_t[4]
+public:
+	// Default construction leaves the components unset, except when both
+	// _DEBUG and VECTOR_PARANOIA are defined (then they are set to NAN).
+	inline Quaternion(void)	{
+
+	// Initialize to NAN to catch errors
+#ifdef _DEBUG
+#ifdef VECTOR_PARANOIA
+		x = y = z = w = VEC_T_NAN;
+#endif
+#endif
+	}
+	inline Quaternion(vec_t ix, vec_t iy, vec_t iz, vec_t iw) : x(ix), y(iy), z(iz), w(iw) { }
+	inline Quaternion(RadianEuler const &angle);	// evil auto type promotion!!!
+
+	inline void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f, vec_t iw=0.0f)	{ x = ix; y = iy; z = iz; w = iw; }
+
+	// IsValid: every component is finite. Invalidate: set every component to NAN.
+	bool IsValid() const;
+	void Invalidate();
+
+	// Exact (bitwise-value) component comparison; see QuaternionsAreEqual for a tolerance test.
+	bool operator==( const Quaternion &src ) const;
+	bool operator!=( const Quaternion &src ) const;
+
+	// Pointer to the first component (x); relies on x, y, z, w being laid out contiguously.
+	vec_t* Base() { return (vec_t*)this; }
+	const vec_t* Base() const { return (vec_t*)this; }
+
+	// array access...
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	vec_t x, y, z, w;
+};
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Index 0..3 maps to x, y, z, w; out-of-range indices only trip the Assert.
+inline vec_t& Quaternion::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 4) );
+	return ((vec_t*)this)[i];
+}
+
+inline vec_t Quaternion::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 4) );
+	return ((vec_t*)this)[i];
+}
+
+
+//-----------------------------------------------------------------------------
+// Equality test
+//-----------------------------------------------------------------------------
+inline bool Quaternion::operator==( const Quaternion &src ) const
+{
+	return ( x == src.x ) && ( y == src.y ) && ( z == src.z ) && ( w == src.w );
+}
+
+inline bool Quaternion::operator!=( const Quaternion &src ) const
+{
+	return !operator==( src );
+}
+
+
+//-----------------------------------------------------------------------------
+// Quaternion equality with tolerance
+//-----------------------------------------------------------------------------
+// True when every component differs by at most 'tolerance'. This is a plain
+// component test: q and -q are reported as different.
+inline bool QuaternionsAreEqual( const Quaternion& src1, const Quaternion& src2, float tolerance )
+{
+	if (FloatMakePositive(src1.x - src2.x) > tolerance)
+		return false;
+	if (FloatMakePositive(src1.y - src2.y) > tolerance)
+		return false;
+	if (FloatMakePositive(src1.z - src2.z) > tolerance)
+		return false;
+	return (FloatMakePositive(src1.w - src2.w) <= tolerance);
+}
+
+
+//-----------------------------------------------------------------------------
+// Here's where we add all those lovely SSE optimized routines
+//-----------------------------------------------------------------------------
+class ALIGN16 QuaternionAligned : public Quaternion
+{
+public:
+	inline QuaternionAligned(void) {};
+	inline QuaternionAligned(vec_t X, vec_t Y, vec_t Z, vec_t W)
+	{
+		Init(X,Y,Z,W);
+	}
+
+#ifdef VECTOR_NO_SLOW_OPERATIONS
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	QuaternionAligned(const QuaternionAligned& vOther);
+	QuaternionAligned(const Quaternion &vOther);
+
+#else
+public:
+	// Conversion from an unaligned Quaternion must be requested explicitly.
+	explicit QuaternionAligned(const Quaternion &vOther)
+	{
+		Init(vOther.x, vOther.y, vOther.z, vOther.w);
+	}
+
+	QuaternionAligned& operator=(const Quaternion &vOther)
+	{
+		Init(vOther.x, vOther.y, vOther.z, vOther.w);
+		return *this;
+	}
+
+#endif
+} ALIGN16_POST;
+
+
+//-----------------------------------------------------------------------------
+// Radian Euler angle aligned to axis (NOT ROLL/PITCH/YAW)
+//-----------------------------------------------------------------------------
+class QAngle;
+class RadianEuler
+{
+public:
+	// The default constructor leaves x, y, z unset.
+	inline RadianEuler(void)							{ }
+	inline RadianEuler(vec_t X, vec_t Y, vec_t Z)		{ x = X; y = Y; z = Z; }
+	inline RadianEuler(Quaternion const &q);	// evil auto type promotion!!!
+	inline RadianEuler(QAngle const &angles);	// evil auto type promotion!!!
+
+	// Initialization
+	inline void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f)	{ x = ix; y = iy; z = iz; }
+
+	//	conversion to qangle
+	QAngle ToQAngle( void ) const;
+	bool IsValid() const;
+	void Invalidate();
+
+	// array access...
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	vec_t x, y, z;
+};
+
+
+extern void AngleQuaternion( RadianEuler const &angles, Quaternion &qt );
+extern void QuaternionAngles( Quaternion const &q, RadianEuler &angles );
+
+// Networked-variable construction hook: zero-fills the quaternion.
+FORCEINLINE void NetworkVarConstruct( Quaternion &q ) { q.x = q.y = q.z = q.w = 0.0f; }
+
+// Implicit conversion from Euler angles, implemented with AngleQuaternion().
+inline Quaternion::Quaternion(RadianEuler const &angle)
+{
+	AngleQuaternion( angle, *this );
+}
+
+inline bool Quaternion::IsValid() const
+{
+	return IsFinite(x) && IsFinite(y) && IsFinite(z) && IsFinite(w);
+}
+
+// Unlike the default constructor, this poisons the value in every build
+// configuration (the preprocessor guards below are commented out).
+inline void Quaternion::Invalidate()
+{
+//#ifdef _DEBUG
+//#ifdef VECTOR_PARANOIA
+	x = y = z = w = VEC_T_NAN;
+//#endif
+//#endif
+}
+
+// Implicit conversion from a quaternion, implemented with QuaternionAngles().
+inline RadianEuler::RadianEuler(Quaternion const &q)
+{
+	QuaternionAngles( q, *this );
+}
+
+inline void VectorCopy( RadianEuler const& src, RadianEuler &dst )
+{
+	CHECK_VALID(src);
+	dst.x = src.x;
+	dst.y = src.y;
+	dst.z = src.z;
+}
+
+// dst = src * b, component-wise.
+inline void VectorScale( RadianEuler const& src, float b, RadianEuler &dst )
+{
+	CHECK_VALID(src);
+	Assert( IsFinite(b) );
+	dst.x = src.x * b;
+	dst.y = src.y * b;
+	dst.z = src.z * b;
+}
+
+inline bool RadianEuler::IsValid() const
+{
+	return IsFinite(x) && IsFinite(y) && IsFinite(z);
+}
+
+// Sets every component to NAN in all build configurations.
+inline void RadianEuler::Invalidate()
+{
+//#ifdef _DEBUG
+//#ifdef VECTOR_PARANOIA
+	x = y = z = VEC_T_NAN;
+//#endif
+//#endif
+}
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Index 0..2 maps to x, y, z; out-of-range indices only trip the Assert.
+inline vec_t& RadianEuler::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 3) );
+	return ((vec_t*)this)[i];
+}
+
+inline vec_t RadianEuler::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 3) );
+	return ((vec_t*)this)[i];
+}
+
+
+//-----------------------------------------------------------------------------
+// Degree Euler QAngle pitch, yaw, roll
+//-----------------------------------------------------------------------------
+class QAngleByValue;
+
+// Euler angles in degrees (pitch, yaw, roll stored as x, y, z).
+class QAngle
+{
+public:
+	// Members
+	vec_t x, y, z;
+
+	// Construction/destruction
+	// The default constructor leaves the members unset unless both _DEBUG
+	// and VECTOR_PARANOIA are defined (then they are set to NAN).
+	QAngle(void);
+	QAngle(vec_t X, vec_t Y, vec_t Z);
+//	QAngle(RadianEuler const &angles);	// evil auto type promotion!!!
+
+	// Allow pass-by-value
+	// (reinterprets this object as the derived QAngleByValue wrapper)
+	operator QAngleByValue &()				{ return *((QAngleByValue *)(this)); }
+	operator const QAngleByValue &() const	{ return *((const QAngleByValue *)(this)); }
+
+	// Initialization
+	void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f);
+	void Random( vec_t minVal, vec_t maxVal );
+
+	// Got any nasty NAN's?
+	bool IsValid() const;
+	void Invalidate();
+
+	// array access...
+	vec_t operator[](int i) const;
+	vec_t& operator[](int i);
+
+	// Base address...
+	vec_t* Base();
+	vec_t const* Base() const;
+
+	// equality
+	bool operator==(const QAngle& v) const;
+	bool operator!=(const QAngle& v) const;
+
+	// arithmetic operations
+	QAngle&	operator+=(const QAngle &v);
+	QAngle&	operator-=(const QAngle &v);
+	QAngle&	operator*=(float s);
+	QAngle&	operator/=(float s);
+
+	// Get the vector's magnitude.
+	vec_t	Length() const;
+	vec_t	LengthSqr() const;
+
+	// negate the QAngle components
+	//void	Negate();
+
+	// Assignment is always available, even when the slow operations
+	// (and the copy constructor) are compiled out below.
+	QAngle& operator=( const QAngle& src );
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// No copy constructor is declared in this mode, so the
+	// compiler-generated one is used.
+
+	// arithmetic operations
+	QAngle	operator-(void) const;
+
+	QAngle	operator+(const QAngle& v) const;
+	QAngle	operator-(const QAngle& v) const;
+	QAngle	operator*(float fl) const;
+	QAngle	operator/(float fl) const;
+#else
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	QAngle(const QAngle& vOther);
+
+#endif
+};
+
+// Networked-variable construction hook: zero-fills the angle.
+FORCEINLINE void NetworkVarConstruct( QAngle &q ) { q.x = q.y = q.z = 0.0f; }
+
+//-----------------------------------------------------------------------------
+// Allows us to specifically pass the vector by value when we need to
+//-----------------------------------------------------------------------------
+class QAngleByValue : public QAngle
+{
+public:
+	// Construction/destruction:
+	QAngleByValue(void) : QAngle() {}
+	QAngleByValue(vec_t X, vec_t Y, vec_t Z) : QAngle( X, Y, Z ) {}
+	// Copies through QAngle::operator=, which stays public in every mode.
+	QAngleByValue(const QAngleByValue& vOther) { *this = vOther; }
+};
+
+
+// result = a + b, component-wise.
+inline void VectorAdd( const QAngle& a, const QAngle& b, QAngle& result )
+{
+	CHECK_VALID(a);
+	CHECK_VALID(b);
+	result.x = a.x + b.x;
+	result.y = a.y + b.y;
+	result.z = a.z + b.z;
+}
+
+// dest = start + scale * direction, component-wise.
+inline void VectorMA( const QAngle &start, float scale, const QAngle &direction, QAngle &dest )
+{
+	CHECK_VALID(start);
+	CHECK_VALID(direction);
+	dest.x = start.x + scale * direction.x;
+	dest.y = start.y + scale * direction.y;
+	dest.z = start.z + scale * direction.z;
+}
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+inline QAngle::QAngle(void)
+{
+#ifdef _DEBUG
+#ifdef VECTOR_PARANOIA
+	// Initialize to NAN to catch errors
+	x = y = z = VEC_T_NAN;
+#endif
+#endif
+}
+
+inline QAngle::QAngle(vec_t X, vec_t Y, vec_t Z)
+{
+	x = X; y = Y; z = Z;
+	CHECK_VALID(*this);
+}
+
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+inline void QAngle::Init( vec_t ix, vec_t iy, vec_t iz )
+{
+	x = ix; y = iy; z = iz;
+	CHECK_VALID(*this);
+}
+
+// Fills each component independently with minVal + (rand()/VALVE_RAND_MAX) * (maxVal - minVal).
+// Three rand() calls are made, in x, y, z order.
+inline void QAngle::Random( vec_t minVal, vec_t maxVal )
+{
+	x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	z = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	CHECK_VALID(*this);
+}
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// Returns a QAngle whose components come from Vector::Random( minVal, maxVal ).
+inline QAngle RandomAngle( float minVal, float maxVal )
+{
+	Vector vRandom;
+	vRandom.Random( minVal, maxVal );
+	QAngle ret( vRandom.x, vRandom.y, vRandom.z );
+	return ret;
+}
+
+#endif
+
+
+// Degrees -> radians, with the components re-ordered:
+// RadianEuler (x, y, z) = QAngle (z, x, y).
+inline RadianEuler::RadianEuler(QAngle const &angles)
+{
+	Init(
+		angles.z * 3.14159265358979323846f / 180.f,
+		angles.x * 3.14159265358979323846f / 180.f,
+		angles.y * 3.14159265358979323846f / 180.f );
+}
+
+
+
+
+// Radians -> degrees; inverse re-ordering of the constructor above:
+// QAngle (x, y, z) = RadianEuler (y, z, x).
+inline QAngle RadianEuler::ToQAngle( void) const
+{
+	return QAngle(
+		y * 180.f / 3.14159265358979323846f,
+		z * 180.f / 3.14159265358979323846f,
+		x * 180.f / 3.14159265358979323846f );
+}
+
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+inline QAngle& QAngle::operator=(const QAngle &vOther)
+{
+	CHECK_VALID(vOther);
+	x=vOther.x; y=vOther.y; z=vOther.z;
+	return *this;
+}
+
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+// Index 0..2 maps to x, y, z; out-of-range indices only trip the Assert.
+inline vec_t& QAngle::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 3) );
+	return ((vec_t*)this)[i];
+}
+
+inline vec_t QAngle::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 3) );
+	return ((vec_t*)this)[i];
+}
+
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+// Pointer to the first component (x); relies on x, y, z being contiguous.
+inline vec_t* QAngle::Base()
+{
+	return (vec_t*)this;
+}
+
+inline vec_t const* QAngle::Base() const
+{
+	return (vec_t const*)this;
+}
+
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+inline bool QAngle::IsValid() const
+{
+	return IsFinite(x) && IsFinite(y) && IsFinite(z);
+}
+
+//-----------------------------------------------------------------------------
+// Invalidate
+//-----------------------------------------------------------------------------
+
+// Sets every component to NAN in all build configurations (the guards are commented out).
+inline void QAngle::Invalidate()
+{
+//#ifdef _DEBUG
+//#ifdef VECTOR_PARANOIA
+	x = y = z = VEC_T_NAN;
+//#endif
+//#endif
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+// Exact component comparison; use QAnglesAreEqual for a tolerance test.
+inline bool QAngle::operator==( const QAngle& src ) const
+{
+	CHECK_VALID(src);
+	CHECK_VALID(*this);
+	return (src.x == x) && (src.y == y) && (src.z == z);
+}
+
+inline bool QAngle::operator!=( const QAngle& src ) const
+{
+	CHECK_VALID(src);
+	CHECK_VALID(*this);
+	return (src.x != x) || (src.y != y) || (src.z != z);
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+inline void VectorCopy( const QAngle& src, QAngle& dst )
+{
+	CHECK_VALID(src);
+	dst.x = src.x;
+	dst.y = src.y;
+	dst.z = src.z;
+}
+
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+inline QAngle& QAngle::operator+=(const QAngle& v)
+{
+	CHECK_VALID(*this);
+	CHECK_VALID(v);
+	x+=v.x; y+=v.y; z += v.z;
+	return *this;
+}
+
+inline QAngle& QAngle::operator-=(const QAngle& v)
+{
+	CHECK_VALID(*this);
+	CHECK_VALID(v);
+	x-=v.x; y-=v.y; z -= v.z;
+	return *this;
+}
+
+inline QAngle& QAngle::operator*=(float fl)
+{
+	x *= fl;
+	y *= fl;
+	z *= fl;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+// Divides by multiplying with the reciprocal (one divide, three multiplies);
+// a zero divisor only trips the Assert.
+inline QAngle& QAngle::operator/=(float fl)
+{
+	Assert( fl != 0.0f );
+	float oofl = 1.0f / fl;
+	x *= oofl;
+	y *= oofl;
+	z *= oofl;
+	CHECK_VALID(*this);
+	return *this;
+}
+
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+// Euclidean norm of (x, y, z), computed with FastSqrt.
+inline vec_t QAngle::Length( ) const
+{
+	CHECK_VALID(*this);
+	return (vec_t)FastSqrt( LengthSqr( ) );
+}
+
+
+inline vec_t QAngle::LengthSqr( ) const
+{
+	CHECK_VALID(*this);
+	return x * x + y * y + z * z;
+}
+
+
+//-----------------------------------------------------------------------------
+// Vector equality with tolerance
+//-----------------------------------------------------------------------------
+// True when every component differs by at most 'tolerance'. Angles are
+// compared as plain numbers: there is no wrap-around (0 and 360 differ).
+inline bool QAnglesAreEqual( const QAngle& src1, const QAngle& src2, float tolerance = 0.0f )
+{
+	if (FloatMakePositive(src1.x - src2.x) > tolerance)
+		return false;
+	if (FloatMakePositive(src1.y - src2.y) > tolerance)
+		return false;
+	return (FloatMakePositive(src1.z - src2.z) <= tolerance);
+}
+
+
+//-----------------------------------------------------------------------------
+// arithmetic operations (SLOW!!) 
+//-----------------------------------------------------------------------------
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+// These operators return a new QAngle by value, which is why they are
+// compiled out when VECTOR_NO_SLOW_OPERATIONS is defined.
+
+// Unary negation of every component.
+inline QAngle QAngle::operator-(void) const
+{
+	QAngle ret(-x,-y,-z);
+	return ret;
+}
+
+// Component-wise sum.
+inline QAngle QAngle::operator+(const QAngle& v) const
+{
+	QAngle res;
+	res.x = x + v.x;
+	res.y = y + v.y;
+	res.z = z + v.z;
+	return res;
+}
+
+// Component-wise difference.
+inline QAngle QAngle::operator-(const QAngle& v) const
+{
+	QAngle res;
+	res.x = x - v.x;
+	res.y = y - v.y;
+	res.z = z - v.z;
+	return res;
+}
+
+// Scales every component by fl.
+inline QAngle QAngle::operator*(float fl) const
+{
+	QAngle res;
+	res.x = x * fl;
+	res.y = y * fl;
+	res.z = z * fl;
+	return res;
+}
+
+// Divides every component by fl. A zero divisor is a caller error and trips
+// the Assert, matching QAngle::operator/=. The per-component division is
+// kept (rather than a reciprocal multiply) so results are unchanged.
+inline QAngle QAngle::operator/(float fl) const
+{
+	Assert( fl != 0.0f );
+	QAngle res;
+	res.x = x / fl;
+	res.y = y / fl;
+	res.z = z / fl;
+	return res;
+}
+
+// Scalar-on-the-left form of operator*.
+inline QAngle operator*(float fl, const QAngle& v)
+{
+	QAngle ret( v * fl );
+	return ret;
+}
+
+#endif // VECTOR_NO_SLOW_OPERATIONS
+
+
+//-----------------------------------------------------------------------------
+// NOTE: These are not completely correct. 
The representations are not equivalent
+// unless the QAngle represents a rotational impulse along a coordinate axis (x,y,z)
+// Component shuffle only: impulse (x, y, z) = angles (z, x, y).
+inline void QAngleToAngularImpulse( const QAngle &angles, AngularImpulse &impulse )
+{
+	impulse.x = angles.z;
+	impulse.y = angles.x;
+	impulse.z = angles.y;
+}
+
+// Inverse shuffle: angles (x, y, z) = impulse (y, z, x).
+inline void AngularImpulseToQAngle( const AngularImpulse &impulse, QAngle &angles )
+{
+	angles.x = impulse.y;
+	angles.y = impulse.z;
+	angles.z = impulse.x;
+}
+
+#if !defined( _X360 )
+
+// Returns 1 / max( 1, |v|^2 ) for the 3 floats at v, so the result never exceeds 1.
+// The x86 path adds a 1e-10 epsilon and uses the SSE reciprocal instruction
+// (presumably an approximation -- confirm against the intrinsic's documentation);
+// the other path uses an ordinary divide.
+FORCEINLINE vec_t InvRSquared( float const *v )
+{
+#if defined(__i386__) || defined(_M_IX86)
+	float sqrlen = v[0]*v[0]+v[1]*v[1]+v[2]*v[2] + 1.0e-10f, result;
+	_mm_store_ss(&result, _mm_rcp_ss( _mm_max_ss( _mm_set_ss(1.0f), _mm_load_ss(&sqrlen) ) ));
+	return result;
+#else
+	return 1.f/fpmax(1.f, v[0]*v[0]+v[1]*v[1]+v[2]*v[2]);
+#endif
+}
+
+FORCEINLINE vec_t InvRSquared( const Vector &v )
+{
+	return InvRSquared(&v.x);
+}
+
+#if defined(__i386__) || defined(_M_IX86)
+// Writes an estimate of 1/sqrt(a) to *out: the SSE rsqrt result xr is refined
+// once as xr * 0.5 * (3 - a * xr * xr).
+inline void _SSE_RSqrtInline( float a, float* out )
+{
+	__m128  xx = _mm_load_ss( &a );
+	__m128  xr = _mm_rsqrt_ss( xx );
+	__m128  xt;
+	xt = _mm_mul_ss( xr, xr );
+	xt = _mm_mul_ss( xt, xx );
+	xt = _mm_sub_ss( _mm_set_ss(3.f), xt );
+	xt = _mm_mul_ss( xt, _mm_set_ss(0.5f) );
+	xr = _mm_mul_ss( xr, xt );
+	_mm_store_ss( out, xr );
+}
+#endif
+
+// Normalizes vec in place and returns its previous length.
+// With DO_SSE_OPTIMIZATION (x86, DEBUG not defined) the length is
+// (|vec|^2 + 1e-10) * rsqrt estimate; otherwise the call is forwarded through
+// the pfVectorNormalize function pointer.
+// FIXME: Change this back to a #define once we get rid of the vec_t version
+FORCEINLINE float VectorNormalize( Vector& vec )
+{
+#ifndef DEBUG // stop crashing my edit-and-continue!
+	#if defined(__i386__) || defined(_M_IX86)
+		#define DO_SSE_OPTIMIZATION
+	#endif
+#endif
+
+#if defined( DO_SSE_OPTIMIZATION )
+	float sqrlen = vec.LengthSqr() + 1.0e-10f, invlen;
+	_SSE_RSqrtInline(sqrlen, &invlen);
+	vec.x *= invlen;
+	vec.y *= invlen;
+	vec.z *= invlen;
+	return sqrlen * invlen;
+#else
+	extern float (FASTCALL *pfVectorNormalize)(Vector& v);
+	return (*pfVectorNormalize)(vec);
+#endif
+}
+
+// Treats the 3 floats at v as a Vector (reinterpret_cast) and normalizes them in place.
+// FIXME: Obsolete version of VectorNormalize, once we remove all the friggin float*s
+FORCEINLINE float VectorNormalize( float * v )
+{
+	return VectorNormalize(*(reinterpret_cast<Vector *>(v)));
+}
+
+// On this platform the "fast" variant is simply VectorNormalize with the length discarded.
+FORCEINLINE void VectorNormalizeFast( Vector &vec )
+{
+	VectorNormalize(vec);
+}
+
+#else
+
+// NOTE(review): this takes the dot product of the reciprocal-length vector with
+// itself. If XMVector3ReciprocalLength replicates 1/|v| into every lane, the
+// result is 3/|v|^2 rather than 1/|v|^2, and there is no clamp to 1 as in the
+// non-360 InvRSquared -- confirm against the XNA Math documentation.
+FORCEINLINE float _VMX_InvRSquared( const Vector &v )
+{
+	XMVECTOR xmV = XMVector3ReciprocalLength( XMLoadVector3( v.Base() ) );
+	xmV = XMVector3Dot( xmV, xmV );
+	return xmV.x;
+}
+
+// Normalizes vec in place by 1 / (length + FLT_EPSILON) and returns the length.
+// call directly
+FORCEINLINE float _VMX_VectorNormalize( Vector &vec )
+{
+	float mag = XMVector3Length( XMLoadVector3( vec.Base() ) ).x;
+	float den = 1.f / (mag + FLT_EPSILON );
+	vec.x *= den;
+	vec.y *= den;
+	vec.z *= den;
+	return mag;
+}
+
+#define InvRSquared(x) _VMX_InvRSquared(x)
+
+// FIXME: Change this back to a #define once we get rid of the vec_t version
+FORCEINLINE float VectorNormalize( Vector& v )
+{
+	return _VMX_VectorNormalize( v );
+}
+// FIXME: Obsolete version of VectorNormalize, once we remove all the friggin float*s
+FORCEINLINE float VectorNormalize( float *pV )
+{
+	return _VMX_VectorNormalize(*(reinterpret_cast<Vector*>(pV)));
+}
+
+// Same as _VMX_VectorNormalize but uses the estimated length and returns nothing.
+// call directly
+FORCEINLINE void VectorNormalizeFast( Vector &vec )
+{
+	XMVECTOR xmV = XMVector3LengthEst( XMLoadVector3( vec.Base() ) );
+	float den = 1.f / (xmV.x + FLT_EPSILON);
+	vec.x *= den;
+	vec.y *= den;
+	vec.z *= den;
+}
+
+#endif // _X360
+
+
+// Normalizes this vector and returns the length it had before.
+inline vec_t Vector::NormalizeInPlace()
+{
+	return VectorNormalize( *this );
+}
+
+// Returns a normalized copy; this vector is left untouched.
+inline Vector Vector::Normalized() const
+{
+	Vector norm = *this;
+	VectorNormalize( norm );
+	return norm;
+}
+
+// Both length comparisons work on squared values, so no square root is taken.
+// Because val is squared, a negative val behaves like its absolute value.
+inline bool Vector::IsLengthGreaterThan( float val ) const
+{
+	return LengthSqr() > val*val;
+}
+
+inline bool Vector::IsLengthLessThan( float val ) const
+{
+	return LengthSqr() < val*val;
+}
+
+#endif
+
diff --git a/public/mathlib/vector2d.h b/public/mathlib/vector2d.h
new file mode 100644
index 0000000..4138558
--- /dev/null
+++ b/public/mathlib/vector2d.h
@@ -0,0 +1,670 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+// $NoKeywords: $
+//
+//=============================================================================//
+
+#ifndef VECTOR2D_H
+#define VECTOR2D_H
+
+#ifdef _WIN32
+#pragma once
+#endif
+
+#include <math.h>
+#include <float.h>
+
+// For vec_t, put this somewhere else?
+#include "tier0/basetypes.h"
+
+// For rand(). We really need a library!
+#include <stdlib.h>
+
+#include "tier0/dbg.h"
+#include "mathlib/math_pfns.h"
+
+//=========================================================
+// 2D Vector2D
+//=========================================================
+
+class Vector2D
+{
+public:
+	// Members
+	vec_t x, y;
+
+	// Construction/destruction
+	// (the default constructor only initializes the members, to NAN, in _DEBUG builds)
+	Vector2D(void);
+	Vector2D(vec_t X, vec_t Y);
+	Vector2D(const float *pFloat);
+
+	// Initialization
+	void Init(vec_t ix=0.0f, vec_t iy=0.0f);
+
+	// Got any nasty NAN's?
+	bool IsValid() const;
+
+	// array access...
+	vec_t  operator[](int i) const;
+	vec_t& operator[](int i);
+
+	// Base address...
+	vec_t* Base();
+	vec_t const* Base() const;
+
+	// Initialization methods
+	void Random( float minVal, float maxVal );
+
+	// equality
+	bool operator==(const Vector2D& v) const;
+	bool operator!=(const Vector2D& v) const;
+
+	// arithmetic operations
+	Vector2D&	operator+=(const Vector2D &v);
+	Vector2D&	operator-=(const Vector2D &v);
+	Vector2D&	operator*=(const Vector2D &v);
+	Vector2D&	operator*=(float s);
+	Vector2D&	operator/=(const Vector2D &v);
+	Vector2D&	operator/=(float s);
+
+	// negate the Vector2D components
+	void	Negate();
+
+	// Get the Vector2D's magnitude.
+	vec_t	Length() const;
+
+	// Get the Vector2D's magnitude squared.
+	vec_t	LengthSqr(void) const;
+
+	// return true if this vector is (0,0) within tolerance
+	// (both bounds are exclusive: a component equal to +/-tolerance is not "zero")
+	bool IsZero( float tolerance = 0.01f ) const
+	{
+		return (x > -tolerance && x < tolerance &&
+				y > -tolerance && y < tolerance);
+	}
+
+	// Normalize in place and return the old length.
+	vec_t	NormalizeInPlace();
+
+	// Compare length.
+	bool	IsLengthGreaterThan( float val ) const;
+	bool	IsLengthLessThan( float val ) const;
+
+	// Get the distance from this Vector2D to the other one.
+	vec_t	DistTo(const Vector2D &vOther) const;
+
+	// Get the distance from this Vector2D to the other one squared.
+	vec_t	DistToSqr(const Vector2D &vOther) const;
+
+	// Copy
+	void	CopyToArray(float* rgfl) const;
+
+	// Multiply, add, and assign to this (ie: *this = a + b * scalar). This
+	// is about 12% faster than the actual Vector2D equation (because it's done per-component
+	// rather than per-Vector2D).
+	void	MulAdd(const Vector2D& a, const Vector2D& b, float scalar);
+
+	// Dot product.
+	vec_t	Dot(const Vector2D& vOther) const;
+
+	// assignment
+	Vector2D& operator=(const Vector2D &vOther);
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+	// copy constructors
+	Vector2D(const Vector2D &vOther);
+
+	// arithmetic operations
+	Vector2D	operator-(void) const;
+
+	Vector2D	operator+(const Vector2D& v) const;
+	Vector2D	operator-(const Vector2D& v) const;
+	Vector2D	operator*(const Vector2D& v) const;
+	Vector2D	operator/(const Vector2D& v) const;
+	Vector2D	operator*(float fl) const;
+	Vector2D	operator/(float fl) const;
+
+	// Cross product between two vectors.
+	// NOTE(review): declared here, but this header provides no inline definition for it.
+	Vector2D	Cross(const Vector2D &vOther) const;
+
+	// Returns a Vector2D with the min or max in X and Y.
+	Vector2D	Min(const Vector2D &vOther) const;
+	Vector2D	Max(const Vector2D &vOther) const;
+
+#else
+
+private:
+	// No copy constructors allowed if we're in optimal mode
+	Vector2D(const Vector2D& vOther);
+#endif
+};
+
+//-----------------------------------------------------------------------------
+
+// Shared constants: the origin, and a sentinel with both components at FLT_MAX.
+const Vector2D vec2_origin(0,0);
+const Vector2D vec2_invalid( FLT_MAX, FLT_MAX );
+
+//-----------------------------------------------------------------------------
+// Vector2D related operations
+//-----------------------------------------------------------------------------
+
+// Vector2D clear
+void Vector2DClear( Vector2D& a );
+
+// Copy
+void Vector2DCopy( const Vector2D& src, Vector2D& dst );
+
+// Vector2D arithmetic
+void Vector2DAdd( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DSubtract( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DMultiply( const Vector2D& a, vec_t b, Vector2D& result );
+void Vector2DMultiply( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DDivide( const Vector2D& a, vec_t b, Vector2D& result );
+void Vector2DDivide( const Vector2D& a, const Vector2D& b, Vector2D& result );
+void Vector2DMA( const Vector2D& start, float s, const Vector2D& dir, Vector2D& result );
+
+// Store the min or 
max of each of x and y into the result.
+void Vector2DMin( const Vector2D &a, const Vector2D &b, Vector2D &result );
+void Vector2DMax( const Vector2D &a, const Vector2D &b, Vector2D &result );
+
+// Expands to the two components as a comma-separated list (e.g. for printf-style arguments).
+#define Vector2DExpand( v ) (v).x, (v).y
+
+// Normalization
+vec_t Vector2DNormalize( Vector2D& v );
+
+// Length
+vec_t Vector2DLength( const Vector2D& v );
+
+// Dot Product
+vec_t DotProduct2D(const Vector2D& a, const Vector2D& b);
+
+// Linearly interpolate between two vectors
+void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& dest );
+
+
+//-----------------------------------------------------------------------------
+//
+// Inlined Vector2D methods
+//
+//-----------------------------------------------------------------------------
+
+
+//-----------------------------------------------------------------------------
+// constructors
+//-----------------------------------------------------------------------------
+
+// In non-_DEBUG builds the members are left unset.
+inline Vector2D::Vector2D(void)
+{
+#ifdef _DEBUG
+	// Initialize to NAN to catch errors
+	x = y = VEC_T_NAN;
+#endif
+}
+
+inline Vector2D::Vector2D(vec_t X, vec_t Y)
+{
+	x = X; y = Y;
+	Assert( IsValid() );
+}
+
+// Reads two floats from pFloat; a null pointer only trips the Assert.
+inline Vector2D::Vector2D(const float *pFloat)
+{
+	Assert( pFloat );
+	x = pFloat[0]; y = pFloat[1];
+	Assert( IsValid() );
+}
+
+
+//-----------------------------------------------------------------------------
+// copy constructor
+//-----------------------------------------------------------------------------
+
+inline Vector2D::Vector2D(const Vector2D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x = vOther.x; y = vOther.y;
+}
+
+//-----------------------------------------------------------------------------
+// initialization
+//-----------------------------------------------------------------------------
+
+inline void Vector2D::Init( vec_t ix, vec_t iy )
+{
+	x = ix; y = iy;
+	Assert( IsValid() );
+}
+
+// Fills each component independently with minVal + (rand()/VALVE_RAND_MAX) * (maxVal - minVal).
+inline void Vector2D::Random( float minVal, float maxVal )
+{
+	x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+}
+
+inline void Vector2DClear( Vector2D& a )
+{
+	a.x = a.y = 0.0f;
+}
+
+//-----------------------------------------------------------------------------
+// assignment
+//-----------------------------------------------------------------------------
+
+inline Vector2D& Vector2D::operator=(const Vector2D &vOther)
+{
+	Assert( vOther.IsValid() );
+	x=vOther.x; y=vOther.y;
+	return *this;
+}
+
+//-----------------------------------------------------------------------------
+// Array access
+//-----------------------------------------------------------------------------
+
+// Index 0..1 maps to x, y; out-of-range indices only trip the Assert.
+inline vec_t& Vector2D::operator[](int i)
+{
+	Assert( (i >= 0) && (i < 2) );
+	return ((vec_t*)this)[i];
+}
+
+inline vec_t Vector2D::operator[](int i) const
+{
+	Assert( (i >= 0) && (i < 2) );
+	return ((vec_t*)this)[i];
+}
+
+//-----------------------------------------------------------------------------
+// Base address...
+//-----------------------------------------------------------------------------
+
+// Pointer to the first component (x); relies on x and y being contiguous.
+inline vec_t* Vector2D::Base()
+{
+	return (vec_t*)this;
+}
+
+inline vec_t const* Vector2D::Base() const
+{
+	return (vec_t const*)this;
+}
+
+//-----------------------------------------------------------------------------
+// IsValid?
+//-----------------------------------------------------------------------------
+
+inline bool Vector2D::IsValid() const
+{
+	return IsFinite(x) && IsFinite(y);
+}
+
+//-----------------------------------------------------------------------------
+// comparison
+//-----------------------------------------------------------------------------
+
+// Exact component comparison (no tolerance).
+inline bool Vector2D::operator==( const Vector2D& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	return (src.x == x) && (src.y == y);
+}
+
+inline bool Vector2D::operator!=( const Vector2D& src ) const
+{
+	Assert( src.IsValid() && IsValid() );
+	return (src.x != x) || (src.y != y);
+}
+
+
+//-----------------------------------------------------------------------------
+// Copy
+//-----------------------------------------------------------------------------
+
+inline void Vector2DCopy( const Vector2D& src, Vector2D& dst )
+{
+	Assert( src.IsValid() );
+	dst.x = src.x;
+	dst.y = src.y;
+}
+
+// Writes x and y to rgfl[0] and rgfl[1]; a null pointer only trips the Assert.
+inline void	Vector2D::CopyToArray(float* rgfl) const
+{
+	Assert( IsValid() );
+	Assert( rgfl );
+	rgfl[0] = x; rgfl[1] = y;
+}
+
+//-----------------------------------------------------------------------------
+// standard math operations
+//-----------------------------------------------------------------------------
+
+inline void Vector2D::Negate()
+{
+	Assert( IsValid() );
+	x = -x; y = -y;
+}
+
+inline Vector2D& Vector2D::operator+=(const Vector2D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x+=v.x; y+=v.y;
+	return *this;
+}
+
+inline Vector2D& Vector2D::operator-=(const Vector2D& v)
+{
+	Assert( IsValid() && v.IsValid() );
+	x-=v.x; y-=v.y;
+	return *this;
+}
+
+inline Vector2D& Vector2D::operator*=(float fl)
+{
+	x *= fl;
+	y *= fl;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Component-wise product.
+inline Vector2D& Vector2D::operator*=(const Vector2D& v)
+{
+	x *= v.x;
+	y *= v.y;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Divides by multiplying with the reciprocal; a zero divisor only trips the Assert.
+inline Vector2D& Vector2D::operator/=(float fl)
+{
+	Assert( fl != 0.0f );
+	float oofl = 1.0f / fl;
+	x *= oofl;
+	y *= oofl;
+	Assert( IsValid() );
+	return *this;
+}
+
+// Component-wise quotient; a zero in either component of v only trips the Assert.
+inline Vector2D& Vector2D::operator/=(const Vector2D& v)
+{
+	Assert( v.x != 0.0f && v.y != 0.0f );
+	x /= v.x;
+	y /= v.y;
+	Assert( IsValid() );
+	return *this;
+}
+
+// c = a + b
+inline void Vector2DAdd( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x + b.x;
+	c.y = a.y + b.y;
+}
+
+// c = a - b
+inline void Vector2DSubtract( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x - b.x;
+	c.y = a.y - b.y;
+}
+
+// c = a * b (scalar)
+inline void Vector2DMultiply( const Vector2D& a, vec_t b, Vector2D& c )
+{
+	Assert( a.IsValid() && IsFinite(b) );
+	c.x = a.x * b;
+	c.y = a.y * b;
+}
+
+// c = a * b (component-wise)
+inline void Vector2DMultiply( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() && b.IsValid() );
+	c.x = a.x * b.x;
+	c.y = a.y * b.y;
+}
+
+
+// c = a / b (scalar), computed as a * (1 / b)
+inline void Vector2DDivide( const Vector2D& a, vec_t b, Vector2D& c )
+{
+	Assert( a.IsValid() );
+	Assert( b != 0.0f );
+	vec_t oob = 1.0f / b;
+	c.x = a.x * oob;
+	c.y = a.y * oob;
+}
+
+// c = a / b (component-wise)
+inline void Vector2DDivide( const Vector2D& a, const Vector2D& b, Vector2D& c )
+{
+	Assert( a.IsValid() );
+	Assert( (b.x != 0.0f) && (b.y != 0.0f) );
+	c.x = a.x / b.x;
+	c.y = a.y / b.y;
+}
+
+// result = start + s * dir
+inline void Vector2DMA( const Vector2D& start, float s, const Vector2D& dir, Vector2D& result )
+{
+	Assert( start.IsValid() && IsFinite(s) && dir.IsValid() );
+	result.x = start.x + s*dir.x;
+	result.y = start.y + s*dir.y;
+}
+
+// FIXME: Remove
+// For backwards compatibility
+// *this = a + b * scalar (no validity asserts, unlike Vector2DMA)
+inline void	Vector2D::MulAdd(const Vector2D& a, const Vector2D& b, float scalar)
+{
+	x = a.x + b.x * scalar;
+	y = a.y + b.y * scalar;
+}
+
+// dest = src1 + (src2 - src1) * t; t is not clamped.
+inline void Vector2DLerp(const Vector2D& src1, const Vector2D& src2, vec_t t, Vector2D& dest )
+{
+	dest[0] = src1[0] + (src2[0] - src1[0]) * t;
+	dest[1] = src1[1] + (src2[1] - src1[1]) * t;
+}
+
+//-----------------------------------------------------------------------------
+// dot, cross
+//-----------------------------------------------------------------------------
+inline vec_t DotProduct2D(const Vector2D& a, const Vector2D& b)
+{
+	Assert( a.IsValid() && b.IsValid() );
+	return( a.x*b.x + a.y*b.y );
+}
+
+// for backwards compatibility
+inline vec_t Vector2D::Dot( const Vector2D& vOther ) const
+{
+	return DotProduct2D( *this, vOther );
+}
+
+
+//-----------------------------------------------------------------------------
+// length
+//-----------------------------------------------------------------------------
+// Euclidean length, computed with FastSqrt.
+inline vec_t Vector2DLength( const Vector2D& v )
+{
+	Assert( v.IsValid() );
+	return (vec_t)FastSqrt(v.x*v.x + v.y*v.y);
+}
+
+inline vec_t Vector2D::LengthSqr(void) const
+{
+	Assert( IsValid() );
+	return (x*x + y*y);
+}
+
+// Normalizes this vector and returns the length it had before.
+inline vec_t Vector2D::NormalizeInPlace()
+{
+	return Vector2DNormalize( *this );
+}
+
+// Both length comparisons work on squared values, so no square root is taken.
+// Because val is squared, a negative val behaves like its absolute value.
+inline bool Vector2D::IsLengthGreaterThan( float val ) const
+{
+	return LengthSqr() > val*val;
+}
+
+inline bool Vector2D::IsLengthLessThan( float val ) const
+{
+	return LengthSqr() < val*val;
+}
+
+inline vec_t Vector2D::Length(void) const
+{
+	return Vector2DLength( *this );
+}
+
+
+// result = component-wise minimum of a and b (b is chosen when the components are equal).
+inline void Vector2DMin( const Vector2D &a, const Vector2D &b, Vector2D &result )
+{
+	result.x = (a.x < b.x) ? a.x : b.x;
+	result.y = (a.y < b.y) ? a.y : b.y;
+}
+
+
+// result = component-wise maximum of a and b (b is chosen when the components are equal).
+inline void Vector2DMax( const Vector2D &a, const Vector2D &b, Vector2D &result )
+{
+	result.x = (a.x > b.x) ? a.x : b.x;
+	result.y = (a.y > b.y) ? a.y : b.y;
+}
+
+
+//-----------------------------------------------------------------------------
+// Normalization
+//-----------------------------------------------------------------------------
+// Scales v to unit length and returns the original length. A vector whose
+// length is exactly zero is set to (0,0) and 0 is returned, so there is no
+// division by zero.
+inline vec_t Vector2DNormalize( Vector2D& v )
+{
+	Assert( v.IsValid() );
+	vec_t l = v.Length();
+	if (l != 0.0f)
+	{
+		v /= l;
+	}
+	else
+	{
+		v.x = v.y = 0.0f;
+	}
+	return l;
+}
+
+
+//-----------------------------------------------------------------------------
+// Get the distance from this Vector2D to the other one 
+//-----------------------------------------------------------------------------
+inline vec_t Vector2D::DistTo(const Vector2D &vOther) const
+{
+	Vector2D delta;
+	Vector2DSubtract( *this, vOther, delta );
+	return delta.Length();
+}
+
+inline vec_t Vector2D::DistToSqr(const Vector2D &vOther) const
+{
+	Vector2D delta;
+	Vector2DSubtract( *this, vOther, delta );
+	return delta.LengthSqr();
+}
+
+
+//-----------------------------------------------------------------------------
+// Computes the closest point to vecTarget no farther than flMaxDist from vecStart
+//-----------------------------------------------------------------------------
+// If the target is within flMaxDist of the start it is returned unchanged;
+// otherwise the result lies on the start->target segment at distance flMaxDist.
+// The test uses flMaxDist squared, so the sign of flMaxDist is ignored there.
+// pResult is dereferenced without a null check.
+inline void ComputeClosestPoint2D( const Vector2D& vecStart, float flMaxDist, const Vector2D& vecTarget, Vector2D *pResult )
+{
+	Vector2D vecDelta;
+	Vector2DSubtract( vecTarget, vecStart, vecDelta );
+	float flDistSqr = vecDelta.LengthSqr();
+	if ( flDistSqr <= flMaxDist * flMaxDist )
+	{
+		*pResult = vecTarget;
+	}
+	else
+	{
+		// flDistSqr > flMaxDist^2 >= 0 here, so the divisor is non-zero.
+		vecDelta /= FastSqrt( flDistSqr );
+		Vector2DMA( vecStart, flMaxDist, vecDelta, *pResult );
+	}
+}
+
+
+
+//-----------------------------------------------------------------------------
+//
+// Slow methods
+//
+//-----------------------------------------------------------------------------
+
+#ifndef VECTOR_NO_SLOW_OPERATIONS
+
+//-----------------------------------------------------------------------------
+// Returns a Vector2D with the min or max in X and Y.
+//----------------------------------------------------------------------------- + +inline Vector2D Vector2D::Min(const Vector2D &vOther) const +{ + return Vector2D(x < vOther.x ? x : vOther.x, + y < vOther.y ? y : vOther.y); +} + +inline Vector2D Vector2D::Max(const Vector2D &vOther) const +{ + return Vector2D(x > vOther.x ? x : vOther.x, + y > vOther.y ? y : vOther.y); +} + + +//----------------------------------------------------------------------------- +// arithmetic operations +//----------------------------------------------------------------------------- + +inline Vector2D Vector2D::operator-(void) const +{ + return Vector2D(-x,-y); +} + +inline Vector2D Vector2D::operator+(const Vector2D& v) const +{ + Vector2D res; + Vector2DAdd( *this, v, res ); + return res; +} + +inline Vector2D Vector2D::operator-(const Vector2D& v) const +{ + Vector2D res; + Vector2DSubtract( *this, v, res ); + return res; +} + +inline Vector2D Vector2D::operator*(float fl) const +{ + Vector2D res; + Vector2DMultiply( *this, fl, res ); + return res; +} + +inline Vector2D Vector2D::operator*(const Vector2D& v) const +{ + Vector2D res; + Vector2DMultiply( *this, v, res ); + return res; +} + +inline Vector2D Vector2D::operator/(float fl) const +{ + Vector2D res; + Vector2DDivide( *this, fl, res ); + return res; +} + +inline Vector2D Vector2D::operator/(const Vector2D& v) const +{ + Vector2D res; + Vector2DDivide( *this, v, res ); + return res; +} + +inline Vector2D operator*(float fl, const Vector2D& v) +{ + return v * fl; +} + +#endif //slow + +#endif // VECTOR2D_H + diff --git a/public/mathlib/vector4d.h b/public/mathlib/vector4d.h new file mode 100644 index 0000000..2b20c88 --- /dev/null +++ b/public/mathlib/vector4d.h @@ -0,0 +1,686 @@ +//========= Copyright Valve Corporation, All rights reserved. 
============// +// +// Purpose: +// +// $NoKeywords: $ +// +//=============================================================================// + +#ifndef VECTOR4D_H +#define VECTOR4D_H + +#ifdef _WIN32 +#pragma once +#endif + +#include <math.h> +#include <stdlib.h> // For rand(). We really need a library! +#include <float.h> +#if !defined( _X360 ) +#include <xmmintrin.h> // For SSE +#endif +#include "basetypes.h" // For vec_t, put this somewhere else? +#include "tier0/dbg.h" +#include "mathlib/math_pfns.h" + +// forward declarations +class Vector; +class Vector2D; + +//========================================================= +// 4D Vector4D +//========================================================= + +class Vector4D +{ +public: + // Members + vec_t x, y, z, w; + + // Construction/destruction + Vector4D(void); + Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W); + Vector4D(const float *pFloat); + + // Initialization + void Init(vec_t ix=0.0f, vec_t iy=0.0f, vec_t iz=0.0f, vec_t iw=0.0f); + + // Got any nasty NAN's? + bool IsValid() const; + + // array access... + vec_t operator[](int i) const; + vec_t& operator[](int i); + + // Base address... + inline vec_t* Base(); + inline vec_t const* Base() const; + + // Cast to Vector and Vector2D... + Vector& AsVector3D(); + Vector const& AsVector3D() const; + + Vector2D& AsVector2D(); + Vector2D const& AsVector2D() const; + + // Initialization methods + void Random( vec_t minVal, vec_t maxVal ); + + // equality + bool operator==(const Vector4D& v) const; + bool operator!=(const Vector4D& v) const; + + // arithmetic operations + Vector4D& operator+=(const Vector4D &v); + Vector4D& operator-=(const Vector4D &v); + Vector4D& operator*=(const Vector4D &v); + Vector4D& operator*=(float s); + Vector4D& operator/=(const Vector4D &v); + Vector4D& operator/=(float s); + + // negate the Vector4D components + void Negate(); + + // Get the Vector4D's magnitude. + vec_t Length() const; + + // Get the Vector4D's magnitude squared. 
+ vec_t LengthSqr(void) const; + + // return true if this vector is (0,0,0,0) within tolerance + bool IsZero( float tolerance = 0.01f ) const + { + return (x > -tolerance && x < tolerance && + y > -tolerance && y < tolerance && + z > -tolerance && z < tolerance && + w > -tolerance && w < tolerance); + } + + // Get the distance from this Vector4D to the other one. + vec_t DistTo(const Vector4D &vOther) const; + + // Get the distance from this Vector4D to the other one squared. + vec_t DistToSqr(const Vector4D &vOther) const; + + // Copy + void CopyToArray(float* rgfl) const; + + // Multiply, add, and assign to this (ie: *this = a + b * scalar). This + // is about 12% faster than the actual Vector4D equation (because it's done per-component + // rather than per-Vector4D). + void MulAdd(Vector4D const& a, Vector4D const& b, float scalar); + + // Dot product. + vec_t Dot(Vector4D const& vOther) const; + + // No copy constructors allowed if we're in optimal mode +#ifdef VECTOR_NO_SLOW_OPERATIONS +private: +#else +public: +#endif + Vector4D(Vector4D const& vOther); + + // No assignment operators either... 
+ Vector4D& operator=( Vector4D const& src ); +}; + +const Vector4D vec4_origin( 0.0f, 0.0f, 0.0f, 0.0f ); +const Vector4D vec4_invalid( FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX ); + +//----------------------------------------------------------------------------- +// SSE optimized routines +//----------------------------------------------------------------------------- + +class ALIGN16 Vector4DAligned : public Vector4D +{ +public: + Vector4DAligned(void) {} + Vector4DAligned( vec_t X, vec_t Y, vec_t Z, vec_t W ); + + inline void Set( vec_t X, vec_t Y, vec_t Z, vec_t W ); + inline void InitZero( void ); + + inline __m128 &AsM128() { return *(__m128*)&x; } + inline const __m128 &AsM128() const { return *(const __m128*)&x; } + +private: + // No copy constructors allowed if we're in optimal mode + Vector4DAligned( Vector4DAligned const& vOther ); + + // No assignment operators either... + Vector4DAligned& operator=( Vector4DAligned const& src ); +} ALIGN16_POST; + +//----------------------------------------------------------------------------- +// Vector4D related operations +//----------------------------------------------------------------------------- + +// Vector4D clear +void Vector4DClear( Vector4D& a ); + +// Copy +void Vector4DCopy( Vector4D const& src, Vector4D& dst ); + +// Vector4D arithmetic +void Vector4DAdd( Vector4D const& a, Vector4D const& b, Vector4D& result ); +void Vector4DSubtract( Vector4D const& a, Vector4D const& b, Vector4D& result ); +void Vector4DMultiply( Vector4D const& a, vec_t b, Vector4D& result ); +void Vector4DMultiply( Vector4D const& a, Vector4D const& b, Vector4D& result ); +void Vector4DDivide( Vector4D const& a, vec_t b, Vector4D& result ); +void Vector4DDivide( Vector4D const& a, Vector4D const& b, Vector4D& result ); +void Vector4DMA( Vector4D const& start, float s, Vector4D const& dir, Vector4D& result ); + +// Vector4DAligned arithmetic +void Vector4DMultiplyAligned( Vector4DAligned const& a, vec_t b, Vector4DAligned& result ); + + 
+#define Vector4DExpand( v ) (v).x, (v).y, (v).z, (v).w + +// Normalization +vec_t Vector4DNormalize( Vector4D& v ); + +// Length +vec_t Vector4DLength( Vector4D const& v ); + +// Dot Product +vec_t DotProduct4D(Vector4D const& a, Vector4D const& b); + +// Linearly interpolate between two vectors +void Vector4DLerp(Vector4D const& src1, Vector4D const& src2, vec_t t, Vector4D& dest ); + + +//----------------------------------------------------------------------------- +// +// Inlined Vector4D methods +// +//----------------------------------------------------------------------------- + + +//----------------------------------------------------------------------------- +// constructors +//----------------------------------------------------------------------------- + +inline Vector4D::Vector4D(void) +{ +#ifdef _DEBUG + // Initialize to NAN to catch errors + x = y = z = w = VEC_T_NAN; +#endif +} + +inline Vector4D::Vector4D(vec_t X, vec_t Y, vec_t Z, vec_t W ) +{ + x = X; y = Y; z = Z; w = W; + Assert( IsValid() ); +} + +inline Vector4D::Vector4D(const float *pFloat) +{ + Assert( pFloat ); + x = pFloat[0]; y = pFloat[1]; z = pFloat[2]; w = pFloat[3]; + Assert( IsValid() ); +} + + +//----------------------------------------------------------------------------- +// copy constructor +//----------------------------------------------------------------------------- + +inline Vector4D::Vector4D(const Vector4D &vOther) +{ + Assert( vOther.IsValid() ); + x = vOther.x; y = vOther.y; z = vOther.z; w = vOther.w; +} + +//----------------------------------------------------------------------------- +// initialization +//----------------------------------------------------------------------------- + +inline void Vector4D::Init( vec_t ix, vec_t iy, vec_t iz, vec_t iw ) +{ + x = ix; y = iy; z = iz; w = iw; + Assert( IsValid() ); +} + +inline void Vector4D::Random( vec_t minVal, vec_t maxVal ) +{ + x = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + y = minVal + 
((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + z = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); + w = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal); +} + +inline void Vector4DClear( Vector4D& a ) +{ + a.x = a.y = a.z = a.w = 0.0f; +} + +//----------------------------------------------------------------------------- +// assignment +//----------------------------------------------------------------------------- + +inline Vector4D& Vector4D::operator=(const Vector4D &vOther) +{ + Assert( vOther.IsValid() ); + x=vOther.x; y=vOther.y; z=vOther.z; w=vOther.w; + return *this; +} + +//----------------------------------------------------------------------------- +// Array access +//----------------------------------------------------------------------------- + +inline vec_t& Vector4D::operator[](int i) +{ + Assert( (i >= 0) && (i < 4) ); + return ((vec_t*)this)[i]; +} + +inline vec_t Vector4D::operator[](int i) const +{ + Assert( (i >= 0) && (i < 4) ); + return ((vec_t*)this)[i]; +} + +//----------------------------------------------------------------------------- +// Cast to Vector and Vector2D... +//----------------------------------------------------------------------------- + +inline Vector& Vector4D::AsVector3D() +{ + return *(Vector*)this; +} + +inline Vector const& Vector4D::AsVector3D() const +{ + return *(Vector const*)this; +} + +inline Vector2D& Vector4D::AsVector2D() +{ + return *(Vector2D*)this; +} + +inline Vector2D const& Vector4D::AsVector2D() const +{ + return *(Vector2D const*)this; +} + +//----------------------------------------------------------------------------- +// Base address... +//----------------------------------------------------------------------------- + +inline vec_t* Vector4D::Base() +{ + return (vec_t*)this; +} + +inline vec_t const* Vector4D::Base() const +{ + return (vec_t const*)this; +} + +//----------------------------------------------------------------------------- +// IsValid? 
+//----------------------------------------------------------------------------- + +inline bool Vector4D::IsValid() const +{ + return IsFinite(x) && IsFinite(y) && IsFinite(z) && IsFinite(w); +} + +//----------------------------------------------------------------------------- +// comparison +//----------------------------------------------------------------------------- + +inline bool Vector4D::operator==( Vector4D const& src ) const +{ + Assert( src.IsValid() && IsValid() ); + return (src.x == x) && (src.y == y) && (src.z == z) && (src.w == w); +} + +inline bool Vector4D::operator!=( Vector4D const& src ) const +{ + Assert( src.IsValid() && IsValid() ); + return (src.x != x) || (src.y != y) || (src.z != z) || (src.w != w); +} + + +//----------------------------------------------------------------------------- +// Copy +//----------------------------------------------------------------------------- + +inline void Vector4DCopy( Vector4D const& src, Vector4D& dst ) +{ + Assert( src.IsValid() ); + dst.x = src.x; + dst.y = src.y; + dst.z = src.z; + dst.w = src.w; +} + +inline void Vector4D::CopyToArray(float* rgfl) const +{ + Assert( IsValid() ); + Assert( rgfl ); + rgfl[0] = x; rgfl[1] = y; rgfl[2] = z; rgfl[3] = w; +} + +//----------------------------------------------------------------------------- +// standard math operations +//----------------------------------------------------------------------------- + +inline void Vector4D::Negate() +{ + Assert( IsValid() ); + x = -x; y = -y; z = -z; w = -w; +} + +inline Vector4D& Vector4D::operator+=(const Vector4D& v) +{ + Assert( IsValid() && v.IsValid() ); + x+=v.x; y+=v.y; z += v.z; w += v.w; + return *this; +} + +inline Vector4D& Vector4D::operator-=(const Vector4D& v) +{ + Assert( IsValid() && v.IsValid() ); + x-=v.x; y-=v.y; z -= v.z; w -= v.w; + return *this; +} + +inline Vector4D& Vector4D::operator*=(float fl) +{ + x *= fl; + y *= fl; + z *= fl; + w *= fl; + Assert( IsValid() ); + return *this; +} + +inline 
Vector4D& Vector4D::operator*=(Vector4D const& v) +{ + x *= v.x; + y *= v.y; + z *= v.z; + w *= v.w; + Assert( IsValid() ); + return *this; +} + +inline Vector4D& Vector4D::operator/=(float fl) +{ + Assert( fl != 0.0f ); + float oofl = 1.0f / fl; + x *= oofl; + y *= oofl; + z *= oofl; + w *= oofl; + Assert( IsValid() ); + return *this; +} + +inline Vector4D& Vector4D::operator/=(Vector4D const& v) +{ + Assert( v.x != 0.0f && v.y != 0.0f && v.z != 0.0f && v.w != 0.0f ); + x /= v.x; + y /= v.y; + z /= v.z; + w /= v.w; + Assert( IsValid() ); + return *this; +} + +inline void Vector4DAdd( Vector4D const& a, Vector4D const& b, Vector4D& c ) +{ + Assert( a.IsValid() && b.IsValid() ); + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; +} + +inline void Vector4DSubtract( Vector4D const& a, Vector4D const& b, Vector4D& c ) +{ + Assert( a.IsValid() && b.IsValid() ); + c.x = a.x - b.x; + c.y = a.y - b.y; + c.z = a.z - b.z; + c.w = a.w - b.w; +} + +inline void Vector4DMultiply( Vector4D const& a, vec_t b, Vector4D& c ) +{ + Assert( a.IsValid() && IsFinite(b) ); + c.x = a.x * b; + c.y = a.y * b; + c.z = a.z * b; + c.w = a.w * b; +} + +inline void Vector4DMultiply( Vector4D const& a, Vector4D const& b, Vector4D& c ) +{ + Assert( a.IsValid() && b.IsValid() ); + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; +} + +inline void Vector4DDivide( Vector4D const& a, vec_t b, Vector4D& c ) +{ + Assert( a.IsValid() ); + Assert( b != 0.0f ); + vec_t oob = 1.0f / b; + c.x = a.x * oob; + c.y = a.y * oob; + c.z = a.z * oob; + c.w = a.w * oob; +} + +inline void Vector4DDivide( Vector4D const& a, Vector4D const& b, Vector4D& c ) +{ + Assert( a.IsValid() ); + Assert( (b.x != 0.0f) && (b.y != 0.0f) && (b.z != 0.0f) && (b.w != 0.0f) ); + c.x = a.x / b.x; + c.y = a.y / b.y; + c.z = a.z / b.z; + c.w = a.w / b.w; +} + +inline void Vector4DMA( Vector4D const& start, float s, Vector4D const& dir, Vector4D& result ) +{ + Assert( start.IsValid() && 
IsFinite(s) && dir.IsValid() ); + result.x = start.x + s*dir.x; + result.y = start.y + s*dir.y; + result.z = start.z + s*dir.z; + result.w = start.w + s*dir.w; +} + +// FIXME: Remove +// For backwards compatability +inline void Vector4D::MulAdd(Vector4D const& a, Vector4D const& b, float scalar) +{ + x = a.x + b.x * scalar; + y = a.y + b.y * scalar; + z = a.z + b.z * scalar; + w = a.w + b.w * scalar; +} + +inline void Vector4DLerp(const Vector4D& src1, const Vector4D& src2, vec_t t, Vector4D& dest ) +{ + dest[0] = src1[0] + (src2[0] - src1[0]) * t; + dest[1] = src1[1] + (src2[1] - src1[1]) * t; + dest[2] = src1[2] + (src2[2] - src1[2]) * t; + dest[3] = src1[3] + (src2[3] - src1[3]) * t; +} + +//----------------------------------------------------------------------------- +// dot, cross +//----------------------------------------------------------------------------- + +inline vec_t DotProduct4D(const Vector4D& a, const Vector4D& b) +{ + Assert( a.IsValid() && b.IsValid() ); + return( a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w ); +} + +// for backwards compatability +inline vec_t Vector4D::Dot( Vector4D const& vOther ) const +{ + return DotProduct4D( *this, vOther ); +} + + +//----------------------------------------------------------------------------- +// length +//----------------------------------------------------------------------------- + +inline vec_t Vector4DLength( Vector4D const& v ) +{ + Assert( v.IsValid() ); + return (vec_t)FastSqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); +} + +inline vec_t Vector4D::LengthSqr(void) const +{ + Assert( IsValid() ); + return (x*x + y*y + z*z + w*w); +} + +inline vec_t Vector4D::Length(void) const +{ + return Vector4DLength( *this ); +} + + +//----------------------------------------------------------------------------- +// Normalization +//----------------------------------------------------------------------------- + +// FIXME: Can't use until we're un-macroed in mathlib.h +inline vec_t Vector4DNormalize( Vector4D& v ) +{ + 
Assert( v.IsValid() ); + vec_t l = v.Length(); + if (l != 0.0f) + { + v /= l; + } + else + { + v.x = v.y = v.z = v.w = 0.0f; + } + return l; +} + +//----------------------------------------------------------------------------- +// Get the distance from this Vector4D to the other one +//----------------------------------------------------------------------------- + +inline vec_t Vector4D::DistTo(const Vector4D &vOther) const +{ + Vector4D delta; + Vector4DSubtract( *this, vOther, delta ); + return delta.Length(); +} + +inline vec_t Vector4D::DistToSqr(const Vector4D &vOther) const +{ + Vector4D delta; + Vector4DSubtract( *this, vOther, delta ); + return delta.LengthSqr(); +} + + +//----------------------------------------------------------------------------- +// Vector4DAligned routines +//----------------------------------------------------------------------------- + +inline Vector4DAligned::Vector4DAligned( vec_t X, vec_t Y, vec_t Z, vec_t W ) +{ + x = X; y = Y; z = Z; w = W; + Assert( IsValid() ); +} + +inline void Vector4DAligned::Set( vec_t X, vec_t Y, vec_t Z, vec_t W ) +{ + x = X; y = Y; z = Z; w = W; + Assert( IsValid() ); +} + +inline void Vector4DAligned::InitZero( void ) +{ +#if !defined( _X360 ) + this->AsM128() = _mm_set1_ps( 0.0f ); +#else + this->AsM128() = __vspltisw( 0 ); +#endif + Assert( IsValid() ); +} + +inline void Vector4DMultiplyAligned( Vector4DAligned const& a, Vector4DAligned const& b, Vector4DAligned& c ) +{ + Assert( a.IsValid() && b.IsValid() ); +#if !defined( _X360 ) + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; +#else + c.AsM128() = __vmulfp( a.AsM128(), b.AsM128() ); +#endif +} + +inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) +{ + Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); + +#if !defined( _X360 ) + vOutA.x += vInA.x * w; + vOutA.y += vInA.y * w; + vOutA.z += vInA.z * w; + vOutA.w += 
vInA.w * w; + + vOutB.x += vInB.x * w; + vOutB.y += vInB.y * w; + vOutB.z += vInB.z * w; + vOutB.w += vInB.w * w; +#else + __vector4 temp; + + temp = __lvlx( &w, 0 ); + temp = __vspltw( temp, 0 ); + + vOutA.AsM128() = __vmaddfp( vInA.AsM128(), temp, vOutA.AsM128() ); + vOutB.AsM128() = __vmaddfp( vInB.AsM128(), temp, vOutB.AsM128() ); +#endif +} + +inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) +{ + Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); + +#if !defined( _X360 ) + // Replicate scalar float out to 4 components + __m128 packed = _mm_set1_ps( w ); + + // 4D SSE Vector MAD + vOutA.AsM128() = _mm_add_ps( vOutA.AsM128(), _mm_mul_ps( vInA.AsM128(), packed ) ); + vOutB.AsM128() = _mm_add_ps( vOutB.AsM128(), _mm_mul_ps( vInB.AsM128(), packed ) ); +#else + __vector4 temp; + + temp = __lvlx( &w, 0 ); + temp = __vspltw( temp, 0 ); + + vOutA.AsM128() = __vmaddfp( vInA.AsM128(), temp, vOutA.AsM128() ); + vOutB.AsM128() = __vmaddfp( vInB.AsM128(), temp, vOutB.AsM128() ); +#endif +} + +#endif // VECTOR4D_H + diff --git a/public/mathlib/vmatrix.h b/public/mathlib/vmatrix.h new file mode 100644 index 0000000..e49a888 --- /dev/null +++ b/public/mathlib/vmatrix.h @@ -0,0 +1,947 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +// $NoKeywords: $ +// +//=============================================================================// +// +// VMatrix always postmultiply vectors as in Ax = b. +// Given a set of basis vectors ((F)orward, (L)eft, (U)p), and a (T)ranslation, +// a matrix to transform a vector into that space looks like this: +// Fx Lx Ux Tx +// Fy Ly Uy Ty +// Fz Lz Uz Tz +// 0 0 0 1 + +// Note that concatenating matrices needs to multiply them in reverse order. 
+// ie: if I want to apply matrix A, B, then C, the equation needs to look like this: +// C * B * A * v +// ie: +// v = A * v; +// v = B * v; +// v = C * v; +//============================================================================= + +#ifndef VMATRIX_H +#define VMATRIX_H + +#ifdef _WIN32 +#pragma once +#endif + +#include <string.h> +#include "mathlib/vector.h" +#include "mathlib/vplane.h" +#include "mathlib/vector4d.h" +#include "mathlib/mathlib.h" + +struct cplane_t; + + +class VMatrix +{ +public: + + VMatrix(); + VMatrix( + vec_t m00, vec_t m01, vec_t m02, vec_t m03, + vec_t m10, vec_t m11, vec_t m12, vec_t m13, + vec_t m20, vec_t m21, vec_t m22, vec_t m23, + vec_t m30, vec_t m31, vec_t m32, vec_t m33 + ); + + // Creates a matrix where the X axis = forward + // the Y axis = left, and the Z axis = up + VMatrix( const Vector& forward, const Vector& left, const Vector& up ); + VMatrix( const Vector& forward, const Vector& left, const Vector& up, const Vector& translation ); + + // Construct from a 3x4 matrix + VMatrix( const matrix3x4_t& matrix3x4 ); + + // Set the values in the matrix. + void Init( + vec_t m00, vec_t m01, vec_t m02, vec_t m03, + vec_t m10, vec_t m11, vec_t m12, vec_t m13, + vec_t m20, vec_t m21, vec_t m22, vec_t m23, + vec_t m30, vec_t m31, vec_t m32, vec_t m33 + ); + + + // Initialize from a 3x4 + void Init( const matrix3x4_t& matrix3x4 ); + + // array access + inline float* operator[](int i) + { + return m[i]; + } + + inline const float* operator[](int i) const + { + return m[i]; + } + + // Get a pointer to m[0][0] + inline float *Base() + { + return &m[0][0]; + } + + inline const float *Base() const + { + return &m[0][0]; + } + + void SetLeft(const Vector &vLeft); + void SetUp(const Vector &vUp); + void SetForward(const Vector &vForward); + + void GetBasisVectors(Vector &vForward, Vector &vLeft, Vector &vUp) const; + void SetBasisVectors(const Vector &vForward, const Vector &vLeft, const Vector &vUp); + + // Get/set the translation. 
+ Vector & GetTranslation( Vector &vTrans ) const; + void SetTranslation(const Vector &vTrans); + + void PreTranslate(const Vector &vTrans); + void PostTranslate(const Vector &vTrans); + + const matrix3x4_t& As3x4() const; + void CopyFrom3x4( const matrix3x4_t &m3x4 ); + void Set3x4( matrix3x4_t& matrix3x4 ) const; + + bool operator==( const VMatrix& src ) const; + bool operator!=( const VMatrix& src ) const { return !( *this == src ); } + +#ifndef VECTOR_NO_SLOW_OPERATIONS + // Access the basis vectors. + Vector GetLeft() const; + Vector GetUp() const; + Vector GetForward() const; + Vector GetTranslation() const; +#endif + + +// Matrix->vector operations. +public: + // Multiply by a 3D vector (same as operator*). + void V3Mul(const Vector &vIn, Vector &vOut) const; + + // Multiply by a 4D vector. + void V4Mul(const Vector4D &vIn, Vector4D &vOut) const; + +#ifndef VECTOR_NO_SLOW_OPERATIONS + // Applies the rotation (ignores translation in the matrix). (This just calls VMul3x3). + Vector ApplyRotation(const Vector &vVec) const; + + // Multiply by a vector (divides by w, assumes input w is 1). + Vector operator*(const Vector &vVec) const; + + // Multiply by the upper 3x3 part of the matrix (ie: only apply rotation). + Vector VMul3x3(const Vector &vVec) const; + + // Apply the inverse (transposed) rotation (only works on pure rotation matrix) + Vector VMul3x3Transpose(const Vector &vVec) const; + + // Multiply by the upper 3 rows. + Vector VMul4x3(const Vector &vVec) const; + + // Apply the inverse (transposed) transformation (only works on pure rotation/translation) + Vector VMul4x3Transpose(const Vector &vVec) const; +#endif + + +// Matrix->plane operations. +public: + // Transform the plane. The matrix can only contain translation and rotation. + void TransformPlane( const VPlane &inPlane, VPlane &outPlane ) const; + +#ifndef VECTOR_NO_SLOW_OPERATIONS + // Just calls TransformPlane and returns the result. 
+ VPlane operator*(const VPlane &thePlane) const; +#endif + +// Matrix->matrix operations. +public: + + VMatrix& operator=(const VMatrix &mOther); + + // Multiply two matrices (out = this * vm). + void MatrixMul( const VMatrix &vm, VMatrix &out ) const; + + // Add two matrices. + const VMatrix& operator+=(const VMatrix &other); + +#ifndef VECTOR_NO_SLOW_OPERATIONS + // Just calls MatrixMul and returns the result. + VMatrix operator*(const VMatrix &mOther) const; + + // Add/Subtract two matrices. + VMatrix operator+(const VMatrix &other) const; + VMatrix operator-(const VMatrix &other) const; + + // Negation. + VMatrix operator-() const; + + // Return inverse matrix. Be careful because the results are undefined + // if the matrix doesn't have an inverse (ie: InverseGeneral returns false). + VMatrix operator~() const; +#endif + +// Matrix operations. +public: + // Set to identity. + void Identity(); + + bool IsIdentity() const; + + // Setup a matrix for origin and angles. + void SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles ); + + // Setup a matrix for angles and no translation. + void SetupMatrixAngles( const QAngle &vAngles ); + + // General inverse. This may fail so check the return! + bool InverseGeneral(VMatrix &vInverse) const; + + // Does a fast inverse, assuming the matrix only contains translation and rotation. + void InverseTR( VMatrix &mRet ) const; + + // Usually used for debug checks. Returns true if the upper 3x3 contains + // unit vectors and they are all orthogonal. + bool IsRotationMatrix() const; + +#ifndef VECTOR_NO_SLOW_OPERATIONS + // This calls the other InverseTR and returns the result. + VMatrix InverseTR() const; + + // Get the scale of the matrix's basis vectors. + Vector GetScale() const; + + // (Fast) multiply by a scaling matrix setup from vScale. + VMatrix Scale(const Vector &vScale); + + // Normalize the basis vectors. + VMatrix NormalizeBasisVectors() const; + + // Transpose. 
+ VMatrix Transpose() const; + + // Transpose upper-left 3x3. + VMatrix Transpose3x3() const; +#endif + +public: + // The matrix. + vec_t m[4][4]; +}; + + + +//----------------------------------------------------------------------------- +// Helper functions. +//----------------------------------------------------------------------------- + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +// Setup an identity matrix. +VMatrix SetupMatrixIdentity(); + +// Setup as a scaling matrix. +VMatrix SetupMatrixScale(const Vector &vScale); + +// Setup a translation matrix. +VMatrix SetupMatrixTranslation(const Vector &vTranslation); + +// Setup a matrix to reflect around the plane. +VMatrix SetupMatrixReflection(const VPlane &thePlane); + +// Setup a matrix to project from vOrigin onto thePlane. +VMatrix SetupMatrixProjection(const Vector &vOrigin, const VPlane &thePlane); + +// Setup a matrix to rotate the specified amount around the specified axis. +VMatrix SetupMatrixAxisRot(const Vector &vAxis, vec_t fDegrees); + +// Setup a matrix from euler angles. Just sets identity and calls MatrixAngles. +VMatrix SetupMatrixAngles(const QAngle &vAngles); + +// Setup a matrix for origin and angles. +VMatrix SetupMatrixOrgAngles(const Vector &origin, const QAngle &vAngles); + +#endif + +#define VMatToString(mat) (static_cast<const char *>(CFmtStr("[ (%f, %f, %f), (%f, %f, %f), (%f, %f, %f), (%f, %f, %f) ]", mat.m[0][0], mat.m[0][1], mat.m[0][2], mat.m[0][3], mat.m[1][0], mat.m[1][1], mat.m[1][2], mat.m[1][3], mat.m[2][0], mat.m[2][1], mat.m[2][2], mat.m[2][3], mat.m[3][0], mat.m[3][1], mat.m[3][2], mat.m[3][3] ))) // ** Note: this generates a temporary, don't hold reference! + +//----------------------------------------------------------------------------- +// Returns the point at the intersection on the 3 planes. +// Returns false if it can't be solved (2 or more planes are parallel). 
+//----------------------------------------------------------------------------- +bool PlaneIntersection( const VPlane &vp1, const VPlane &vp2, const VPlane &vp3, Vector &vOut ); + + +//----------------------------------------------------------------------------- +// These methods are faster. Use them if you want faster code +//----------------------------------------------------------------------------- +void MatrixSetIdentity( VMatrix &dst ); +void MatrixTranspose( const VMatrix& src, VMatrix& dst ); +void MatrixCopy( const VMatrix& src, VMatrix& dst ); +void MatrixMultiply( const VMatrix& src1, const VMatrix& src2, VMatrix& dst ); + +// Accessors +void MatrixGetColumn( const VMatrix &src, int nCol, Vector *pColumn ); +void MatrixSetColumn( VMatrix &src, int nCol, const Vector &column ); +void MatrixGetRow( const VMatrix &src, int nCol, Vector *pColumn ); +void MatrixSetRow( VMatrix &src, int nCol, const Vector &column ); + +// Vector3DMultiply treats src2 as if it's a direction vector +void Vector3DMultiply( const VMatrix& src1, const Vector& src2, Vector& dst ); + +// Vector3DMultiplyPosition treats src2 as if it's a point (adds the translation) +inline void Vector3DMultiplyPosition( const VMatrix& src1, const VectorByValue src2, Vector& dst ); + +// Vector3DMultiplyPositionProjective treats src2 as if it's a point +// and does the perspective divide at the end +void Vector3DMultiplyPositionProjective( const VMatrix& src1, const Vector &src2, Vector& dst ); + +// Vector3DMultiplyPosition treats src2 as if it's a direction +// and does the perspective divide at the end +// NOTE: src1 had better be an inverse transpose to use this correctly +void Vector3DMultiplyProjective( const VMatrix& src1, const Vector &src2, Vector& dst ); + +void Vector4DMultiply( const VMatrix& src1, const Vector4D& src2, Vector4D& dst ); + +// Same as Vector4DMultiply except that src2 has an implicit W of 1 +void Vector4DMultiplyPosition( const VMatrix& src1, const Vector &src2, 
Vector4D& dst ); + +// Multiplies the vector by the transpose of the matrix +void Vector3DMultiplyTranspose( const VMatrix& src1, const Vector& src2, Vector& dst ); +void Vector4DMultiplyTranspose( const VMatrix& src1, const Vector4D& src2, Vector4D& dst ); + +// Transform a plane +void MatrixTransformPlane( const VMatrix &src, const cplane_t &inPlane, cplane_t &outPlane ); + +// Transform a plane that has an axis-aligned normal +void MatrixTransformAxisAlignedPlane( const VMatrix &src, int nDim, float flSign, float flDist, cplane_t &outPlane ); + +void MatrixBuildTranslation( VMatrix& dst, float x, float y, float z ); +void MatrixBuildTranslation( VMatrix& dst, const Vector &translation ); + +inline void MatrixTranslate( VMatrix& dst, const Vector &translation ) +{ + VMatrix matTranslation, temp; + MatrixBuildTranslation( matTranslation, translation ); + MatrixMultiply( dst, matTranslation, temp ); + dst = temp; +} + + +void MatrixBuildRotationAboutAxis( VMatrix& dst, const Vector& vAxisOfRot, float angleDegrees ); +void MatrixBuildRotateZ( VMatrix& dst, float angleDegrees ); + +inline void MatrixRotate( VMatrix& dst, const Vector& vAxisOfRot, float angleDegrees ) +{ + VMatrix rotation, temp; + MatrixBuildRotationAboutAxis( rotation, vAxisOfRot, angleDegrees ); + MatrixMultiply( dst, rotation, temp ); + dst = temp; +} + +// Builds a rotation matrix that rotates one direction vector into another +void MatrixBuildRotation( VMatrix &dst, const Vector& initialDirection, const Vector& finalDirection ); + +// Builds a scale matrix +void MatrixBuildScale( VMatrix &dst, float x, float y, float z ); +void MatrixBuildScale( VMatrix &dst, const Vector& scale ); + +// Build a perspective matrix. +// zNear and zFar are assumed to be positive. +// You end up looking down positive Z, X is to the right, Y is up. 
+// X range: [0..1] +// Y range: [0..1] +// Z range: [0..1] +void MatrixBuildPerspective( VMatrix &dst, float fovX, float fovY, float zNear, float zFar ); + +//----------------------------------------------------------------------------- +// Given a projection matrix, take the extremes of the space in transformed into world space and +// get a bounding box. +//----------------------------------------------------------------------------- +void CalculateAABBFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pMins, Vector *pMaxs ); + +//----------------------------------------------------------------------------- +// Given a projection matrix, take the extremes of the space in transformed into world space and +// get a bounding sphere. +//----------------------------------------------------------------------------- +void CalculateSphereFromProjectionMatrix( const VMatrix &worldToVolume, Vector *pCenter, float *pflRadius ); + +//----------------------------------------------------------------------------- +// Given an inverse projection matrix, take the extremes of the space in transformed into world space and +// get a bounding box. +//----------------------------------------------------------------------------- +void CalculateAABBFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pMins, Vector *pMaxs ); + +//----------------------------------------------------------------------------- +// Given an inverse projection matrix, take the extremes of the space in transformed into world space and +// get a bounding sphere. +//----------------------------------------------------------------------------- +void CalculateSphereFromProjectionMatrixInverse( const VMatrix &volumeToWorld, Vector *pCenter, float *pflRadius ); + +//----------------------------------------------------------------------------- +// Calculate frustum planes given a clip->world space transform. 
//-----------------------------------------------------------------------------
void FrustumPlanesFromMatrix( const VMatrix &clipToWorld, Frustum_t &frustum );

//-----------------------------------------------------------------------------
// Setup a matrix from euler angles.
//   vAngles - input angles
//   dst     - receives the resulting matrix
//-----------------------------------------------------------------------------
void MatrixFromAngles( const QAngle& vAngles, VMatrix& dst );

//-----------------------------------------------------------------------------
// Creates euler angles from a matrix
//   src     - input matrix
//   vAngles - receives the resulting angles
//-----------------------------------------------------------------------------
void MatrixToAngles( const VMatrix& src, QAngle& vAngles );

//-----------------------------------------------------------------------------
// Does a fast inverse, assuming the matrix only contains translation and rotation.
// NOTE(review): nothing in this header shows the assumption being checked, so
// callers presumably must guarantee it -- confirm against the .cpp.
//-----------------------------------------------------------------------------
void MatrixInverseTR( const VMatrix& src, VMatrix &dst );

//-----------------------------------------------------------------------------
// Inverts any matrix at all
// NOTE(review): the bool result presumably reports whether src was
// invertible -- confirm against the .cpp. VMatrix::operator~ ignores it.
//-----------------------------------------------------------------------------
bool MatrixInverseGeneral(const VMatrix& src, VMatrix& dst);

//-----------------------------------------------------------------------------
// Computes the inverse transpose
//-----------------------------------------------------------------------------
void MatrixInverseTranspose( const VMatrix& src, VMatrix& dst );



//-----------------------------------------------------------------------------
// VMatrix inlines.
+//----------------------------------------------------------------------------- +inline VMatrix::VMatrix() +{ +} + +inline VMatrix::VMatrix( + vec_t m00, vec_t m01, vec_t m02, vec_t m03, + vec_t m10, vec_t m11, vec_t m12, vec_t m13, + vec_t m20, vec_t m21, vec_t m22, vec_t m23, + vec_t m30, vec_t m31, vec_t m32, vec_t m33) +{ + Init( + m00, m01, m02, m03, + m10, m11, m12, m13, + m20, m21, m22, m23, + m30, m31, m32, m33 + ); +} + + +inline VMatrix::VMatrix( const matrix3x4_t& matrix3x4 ) +{ + Init( matrix3x4 ); +} + + +//----------------------------------------------------------------------------- +// Creates a matrix where the X axis = forward +// the Y axis = left, and the Z axis = up +//----------------------------------------------------------------------------- +inline VMatrix::VMatrix( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis ) +{ + Init( + xAxis.x, yAxis.x, zAxis.x, 0.0f, + xAxis.y, yAxis.y, zAxis.y, 0.0f, + xAxis.z, yAxis.z, zAxis.z, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + ); +} + +inline VMatrix::VMatrix( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector& translation ) +{ + Init( + xAxis.x, yAxis.x, zAxis.x, translation.x, + xAxis.y, yAxis.y, zAxis.y, translation.y, + xAxis.z, yAxis.z, zAxis.z, translation.z, + 0.0f, 0.0f, 0.0f, 1.0f + ); +} + + +inline void VMatrix::Init( + vec_t m00, vec_t m01, vec_t m02, vec_t m03, + vec_t m10, vec_t m11, vec_t m12, vec_t m13, + vec_t m20, vec_t m21, vec_t m22, vec_t m23, + vec_t m30, vec_t m31, vec_t m32, vec_t m33 + ) +{ + m[0][0] = m00; + m[0][1] = m01; + m[0][2] = m02; + m[0][3] = m03; + + m[1][0] = m10; + m[1][1] = m11; + m[1][2] = m12; + m[1][3] = m13; + + m[2][0] = m20; + m[2][1] = m21; + m[2][2] = m22; + m[2][3] = m23; + + m[3][0] = m30; + m[3][1] = m31; + m[3][2] = m32; + m[3][3] = m33; +} + + +//----------------------------------------------------------------------------- +// Initialize from a 3x4 
+//----------------------------------------------------------------------------- +inline void VMatrix::Init( const matrix3x4_t& matrix3x4 ) +{ + memcpy(m, matrix3x4.Base(), sizeof( matrix3x4_t ) ); + + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; +} + + +//----------------------------------------------------------------------------- +// Methods related to the basis vectors of the matrix +//----------------------------------------------------------------------------- + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +inline Vector VMatrix::GetForward() const +{ + return Vector(m[0][0], m[1][0], m[2][0]); +} + +inline Vector VMatrix::GetLeft() const +{ + return Vector(m[0][1], m[1][1], m[2][1]); +} + +inline Vector VMatrix::GetUp() const +{ + return Vector(m[0][2], m[1][2], m[2][2]); +} + +#endif + +inline void VMatrix::SetForward(const Vector &vForward) +{ + m[0][0] = vForward.x; + m[1][0] = vForward.y; + m[2][0] = vForward.z; +} + +inline void VMatrix::SetLeft(const Vector &vLeft) +{ + m[0][1] = vLeft.x; + m[1][1] = vLeft.y; + m[2][1] = vLeft.z; +} + +inline void VMatrix::SetUp(const Vector &vUp) +{ + m[0][2] = vUp.x; + m[1][2] = vUp.y; + m[2][2] = vUp.z; +} + +inline void VMatrix::GetBasisVectors(Vector &vForward, Vector &vLeft, Vector &vUp) const +{ + vForward.Init( m[0][0], m[1][0], m[2][0] ); + vLeft.Init( m[0][1], m[1][1], m[2][1] ); + vUp.Init( m[0][2], m[1][2], m[2][2] ); +} + +inline void VMatrix::SetBasisVectors(const Vector &vForward, const Vector &vLeft, const Vector &vUp) +{ + SetForward(vForward); + SetLeft(vLeft); + SetUp(vUp); +} + + +//----------------------------------------------------------------------------- +// Methods related to the translation component of the matrix +//----------------------------------------------------------------------------- +#ifndef VECTOR_NO_SLOW_OPERATIONS + +inline Vector VMatrix::GetTranslation() const +{ + return Vector(m[0][3], m[1][3], m[2][3]); +} + +#endif + +inline Vector& 
VMatrix::GetTranslation( Vector &vTrans ) const +{ + vTrans.x = m[0][3]; + vTrans.y = m[1][3]; + vTrans.z = m[2][3]; + return vTrans; +} + +inline void VMatrix::SetTranslation(const Vector &vTrans) +{ + m[0][3] = vTrans.x; + m[1][3] = vTrans.y; + m[2][3] = vTrans.z; +} + + +//----------------------------------------------------------------------------- +// appply translation to this matrix in the input space +//----------------------------------------------------------------------------- +inline void VMatrix::PreTranslate(const Vector &vTrans) +{ + Vector tmp; + Vector3DMultiplyPosition( *this, vTrans, tmp ); + m[0][3] = tmp.x; + m[1][3] = tmp.y; + m[2][3] = tmp.z; +} + + +//----------------------------------------------------------------------------- +// appply translation to this matrix in the output space +//----------------------------------------------------------------------------- +inline void VMatrix::PostTranslate(const Vector &vTrans) +{ + m[0][3] += vTrans.x; + m[1][3] += vTrans.y; + m[2][3] += vTrans.z; +} + +inline const matrix3x4_t& VMatrix::As3x4() const +{ + return *((const matrix3x4_t*)this); +} + +inline void VMatrix::CopyFrom3x4( const matrix3x4_t &m3x4 ) +{ + memcpy( m, m3x4.Base(), sizeof( matrix3x4_t ) ); + m[3][0] = m[3][1] = m[3][2] = 0; + m[3][3] = 1; +} + +inline void VMatrix::Set3x4( matrix3x4_t& matrix3x4 ) const +{ + memcpy(matrix3x4.Base(), m, sizeof( matrix3x4_t ) ); +} + + +//----------------------------------------------------------------------------- +// Matrix math operations +//----------------------------------------------------------------------------- +inline const VMatrix& VMatrix::operator+=(const VMatrix &other) +{ + for(int i=0; i < 4; i++) + { + for(int j=0; j < 4; j++) + { + m[i][j] += other.m[i][j]; + } + } + + return *this; +} + + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +inline VMatrix VMatrix::operator+(const VMatrix &other) const +{ + VMatrix ret; + for(int i=0; i < 16; i++) + { + ((float*)ret.m)[i] = ((float*)m)[i] + 
((float*)other.m)[i]; + } + return ret; +} + +inline VMatrix VMatrix::operator-(const VMatrix &other) const +{ + VMatrix ret; + + for(int i=0; i < 4; i++) + { + for(int j=0; j < 4; j++) + { + ret.m[i][j] = m[i][j] - other.m[i][j]; + } + } + + return ret; +} + +inline VMatrix VMatrix::operator-() const +{ + VMatrix ret; + for( int i=0; i < 16; i++ ) + { + ((float*)ret.m)[i] = ((float*)m)[i]; + } + return ret; +} + +#endif // VECTOR_NO_SLOW_OPERATIONS + + +//----------------------------------------------------------------------------- +// Vector transformation +//----------------------------------------------------------------------------- + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +inline Vector VMatrix::operator*(const Vector &vVec) const +{ + Vector vRet; + vRet.x = m[0][0]*vVec.x + m[0][1]*vVec.y + m[0][2]*vVec.z + m[0][3]; + vRet.y = m[1][0]*vVec.x + m[1][1]*vVec.y + m[1][2]*vVec.z + m[1][3]; + vRet.z = m[2][0]*vVec.x + m[2][1]*vVec.y + m[2][2]*vVec.z + m[2][3]; + + return vRet; +} + +inline Vector VMatrix::VMul4x3(const Vector &vVec) const +{ + Vector vResult; + Vector3DMultiplyPosition( *this, vVec, vResult ); + return vResult; +} + + +inline Vector VMatrix::VMul4x3Transpose(const Vector &vVec) const +{ + Vector tmp = vVec; + tmp.x -= m[0][3]; + tmp.y -= m[1][3]; + tmp.z -= m[2][3]; + + return Vector( + m[0][0]*tmp.x + m[1][0]*tmp.y + m[2][0]*tmp.z, + m[0][1]*tmp.x + m[1][1]*tmp.y + m[2][1]*tmp.z, + m[0][2]*tmp.x + m[1][2]*tmp.y + m[2][2]*tmp.z + ); +} + +inline Vector VMatrix::VMul3x3(const Vector &vVec) const +{ + return Vector( + m[0][0]*vVec.x + m[0][1]*vVec.y + m[0][2]*vVec.z, + m[1][0]*vVec.x + m[1][1]*vVec.y + m[1][2]*vVec.z, + m[2][0]*vVec.x + m[2][1]*vVec.y + m[2][2]*vVec.z + ); +} + +inline Vector VMatrix::VMul3x3Transpose(const Vector &vVec) const +{ + return Vector( + m[0][0]*vVec.x + m[1][0]*vVec.y + m[2][0]*vVec.z, + m[0][1]*vVec.x + m[1][1]*vVec.y + m[2][1]*vVec.z, + m[0][2]*vVec.x + m[1][2]*vVec.y + m[2][2]*vVec.z + ); +} + +#endif // 
VECTOR_NO_SLOW_OPERATIONS + + +inline void VMatrix::V3Mul(const Vector &vIn, Vector &vOut) const +{ + vec_t rw; + + rw = 1.0f / (m[3][0]*vIn.x + m[3][1]*vIn.y + m[3][2]*vIn.z + m[3][3]); + vOut.x = (m[0][0]*vIn.x + m[0][1]*vIn.y + m[0][2]*vIn.z + m[0][3]) * rw; + vOut.y = (m[1][0]*vIn.x + m[1][1]*vIn.y + m[1][2]*vIn.z + m[1][3]) * rw; + vOut.z = (m[2][0]*vIn.x + m[2][1]*vIn.y + m[2][2]*vIn.z + m[2][3]) * rw; +} + +inline void VMatrix::V4Mul(const Vector4D &vIn, Vector4D &vOut) const +{ + vOut[0] = m[0][0]*vIn[0] + m[0][1]*vIn[1] + m[0][2]*vIn[2] + m[0][3]*vIn[3]; + vOut[1] = m[1][0]*vIn[0] + m[1][1]*vIn[1] + m[1][2]*vIn[2] + m[1][3]*vIn[3]; + vOut[2] = m[2][0]*vIn[0] + m[2][1]*vIn[1] + m[2][2]*vIn[2] + m[2][3]*vIn[3]; + vOut[3] = m[3][0]*vIn[0] + m[3][1]*vIn[1] + m[3][2]*vIn[2] + m[3][3]*vIn[3]; +} + + +//----------------------------------------------------------------------------- +// Plane transformation +//----------------------------------------------------------------------------- +inline void VMatrix::TransformPlane( const VPlane &inPlane, VPlane &outPlane ) const +{ + Vector vTrans; + Vector3DMultiply( *this, inPlane.m_Normal, outPlane.m_Normal ); + outPlane.m_Dist = inPlane.m_Dist * DotProduct( outPlane.m_Normal, outPlane.m_Normal ); + outPlane.m_Dist += DotProduct( outPlane.m_Normal, GetTranslation( vTrans ) ); +} + + +//----------------------------------------------------------------------------- +// Other random stuff +//----------------------------------------------------------------------------- +inline void VMatrix::Identity() +{ + MatrixSetIdentity( *this ); +} + + +inline bool VMatrix::IsIdentity() const +{ + return + m[0][0] == 1.0f && m[0][1] == 0.0f && m[0][2] == 0.0f && m[0][3] == 0.0f && + m[1][0] == 0.0f && m[1][1] == 1.0f && m[1][2] == 0.0f && m[1][3] == 0.0f && + m[2][0] == 0.0f && m[2][1] == 0.0f && m[2][2] == 1.0f && m[2][3] == 0.0f && + m[3][0] == 0.0f && m[3][1] == 0.0f && m[3][2] == 0.0f && m[3][3] == 1.0f; +} + +#ifndef 
VECTOR_NO_SLOW_OPERATIONS + +inline Vector VMatrix::ApplyRotation(const Vector &vVec) const +{ + return VMul3x3(vVec); +} + +inline VMatrix VMatrix::operator~() const +{ + VMatrix mRet; + InverseGeneral(mRet); + return mRet; +} + +#endif + + +//----------------------------------------------------------------------------- +// Accessors +//----------------------------------------------------------------------------- +inline void MatrixGetColumn( const VMatrix &src, int nCol, Vector *pColumn ) +{ + Assert( (nCol >= 0) && (nCol <= 3) ); + + pColumn->x = src[0][nCol]; + pColumn->y = src[1][nCol]; + pColumn->z = src[2][nCol]; +} + +inline void MatrixSetColumn( VMatrix &src, int nCol, const Vector &column ) +{ + Assert( (nCol >= 0) && (nCol <= 3) ); + + src.m[0][nCol] = column.x; + src.m[1][nCol] = column.y; + src.m[2][nCol] = column.z; +} + +inline void MatrixGetRow( const VMatrix &src, int nRow, Vector *pRow ) +{ + Assert( (nRow >= 0) && (nRow <= 3) ); + *pRow = *(Vector*)src[nRow]; +} + +inline void MatrixSetRow( VMatrix &dst, int nRow, const Vector &row ) +{ + Assert( (nRow >= 0) && (nRow <= 3) ); + *(Vector*)dst[nRow] = row; +} + + +//----------------------------------------------------------------------------- +// Vector3DMultiplyPosition treats src2 as if it's a point (adds the translation) +//----------------------------------------------------------------------------- +// NJS: src2 is passed in as a full vector rather than a reference to prevent the need +// for 2 branches and a potential copy in the body. (ie, handling the case when the src2 +// reference is the same as the dst reference ). 
+inline void Vector3DMultiplyPosition( const VMatrix& src1, const VectorByValue src2, Vector& dst ) +{ + dst[0] = src1[0][0] * src2.x + src1[0][1] * src2.y + src1[0][2] * src2.z + src1[0][3]; + dst[1] = src1[1][0] * src2.x + src1[1][1] * src2.y + src1[1][2] * src2.z + src1[1][3]; + dst[2] = src1[2][0] * src2.x + src1[2][1] * src2.y + src1[2][2] * src2.z + src1[2][3]; +} + + +//----------------------------------------------------------------------------- +// Transform a plane that has an axis-aligned normal +//----------------------------------------------------------------------------- +inline void MatrixTransformAxisAlignedPlane( const VMatrix &src, int nDim, float flSign, float flDist, cplane_t &outPlane ) +{ + // See MatrixTransformPlane in the .cpp file for an explanation of the algorithm. + MatrixGetColumn( src, nDim, &outPlane.normal ); + outPlane.normal *= flSign; + outPlane.dist = flDist * DotProduct( outPlane.normal, outPlane.normal ); + + // NOTE: Writing this out by hand because it doesn't inline (inline depth isn't large enough) + // This should read outPlane.dist += DotProduct( outPlane.normal, src.GetTranslation ); + outPlane.dist += outPlane.normal.x * src.m[0][3] + outPlane.normal.y * src.m[1][3] + outPlane.normal.z * src.m[2][3]; +} + + +//----------------------------------------------------------------------------- +// Matrix equality test +//----------------------------------------------------------------------------- +inline bool MatricesAreEqual( const VMatrix &src1, const VMatrix &src2, float flTolerance ) +{ + for ( int i = 0; i < 3; ++i ) + { + for ( int j = 0; j < 3; ++j ) + { + if ( fabs( src1[i][j] - src2[i][j] ) > flTolerance ) + return false; + } + } + return true; +} + +//----------------------------------------------------------------------------- +// +//----------------------------------------------------------------------------- +void MatrixBuildOrtho( VMatrix& dst, double left, double top, double right, double bottom, double zNear, 
double zFar ); +void MatrixBuildPerspectiveX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar ); +void MatrixBuildPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right ); +void MatrixBuildPerspectiveZRange( VMatrix& dst, double flZNear, double flZFar ); + +inline void MatrixOrtho( VMatrix& dst, double left, double top, double right, double bottom, double zNear, double zFar ) +{ + VMatrix mat; + MatrixBuildOrtho( mat, left, top, right, bottom, zNear, zFar ); + + VMatrix temp; + MatrixMultiply( dst, mat, temp ); + dst = temp; +} + +inline void MatrixPerspectiveX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar ) +{ + VMatrix mat; + MatrixBuildPerspectiveX( mat, flFovX, flAspect, flZNear, flZFar ); + + VMatrix temp; + MatrixMultiply( dst, mat, temp ); + dst = temp; +} + +inline void MatrixPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right ) +{ + VMatrix mat; + MatrixBuildPerspectiveOffCenterX( mat, flFovX, flAspect, flZNear, flZFar, bottom, top, left, right ); + + VMatrix temp; + MatrixMultiply( dst, mat, temp ); + dst = temp; +} + +#endif + + diff --git a/public/mathlib/vplane.h b/public/mathlib/vplane.h new file mode 100644 index 0000000..dd3d4a9 --- /dev/null +++ b/public/mathlib/vplane.h @@ -0,0 +1,182 @@ +//========= Copyright Valve Corporation, All rights reserved. ============// +// +// Purpose: +// +// $Workfile: $ +// $Date: $ +// $NoKeywords: $ +//=============================================================================// + +#ifndef VPLANE_H +#define VPLANE_H + +#ifdef _WIN32 +#pragma once +#endif + +#include "mathlib/vector.h" + +typedef int SideType; + +// Used to represent sides of things like planes. 
#define	SIDE_FRONT	0
#define	SIDE_BACK	1
#define	SIDE_ON		2

// Default tolerance used by VPlane::GetPointSide when deciding SIDE_ON.
#define VP_EPSILON	0.01f


// A plane stored as a normal and a distance. DistTo() evaluates
// dot( point, m_Normal ) - m_Dist, so the plane is the set of points whose
// dot product with m_Normal equals m_Dist.
class VPlane
{
public:
				VPlane();
				VPlane(const Vector &vNormal, vec_t dist);

	void		Init(const Vector &vNormal, vec_t dist);

	// Return the distance from the point to the plane.
	// The value is signed; it is a true distance only when m_Normal is unit length.
	vec_t		DistTo(const Vector &vVec) const;

	// Copy.
	VPlane&		operator=(const VPlane &thePlane);

	// Returns SIDE_ON, SIDE_FRONT, or SIDE_BACK.
	// The epsilon for SIDE_ON can be passed in.
	SideType	GetPointSide(const Vector &vPoint, vec_t sideEpsilon=VP_EPSILON) const;

	// Returns SIDE_FRONT or SIDE_BACK.
	// A point whose distance is exactly zero is reported as SIDE_BACK.
	SideType	GetPointSideExact(const Vector &vPoint) const;

	// Classify the box with respect to the plane.
	// Returns SIDE_ON, SIDE_FRONT, or SIDE_BACK
	SideType	BoxOnPlaneSide(const Vector &vMin, const Vector &vMax) const;

#ifndef VECTOR_NO_SLOW_OPERATIONS
	// Flip the plane.
	// Returns a new plane with negated normal and distance; this plane itself
	// is not modified.
	VPlane		Flip();

	// Get a point on the plane (normal*dist).
	// The result lies on the plane when m_Normal is unit length.
	Vector		GetPointOnPlane() const;

	// Snap the specified point to the plane (along the plane's normal).
	Vector		SnapPointToPlane(const Vector &vPoint) const;
#endif

public:
	Vector		m_Normal;
	vec_t		m_Dist;

#ifdef VECTOR_NO_SLOW_OPERATIONS
private:
	// No copy constructors allowed if we're in optimal mode
	VPlane(const VPlane& vOther);
#endif
};


//-----------------------------------------------------------------------------
// Inlines.
+//----------------------------------------------------------------------------- +inline VPlane::VPlane() +{ +} + +inline VPlane::VPlane(const Vector &vNormal, vec_t dist) +{ + m_Normal = vNormal; + m_Dist = dist; +} + +inline void VPlane::Init(const Vector &vNormal, vec_t dist) +{ + m_Normal = vNormal; + m_Dist = dist; +} + +inline vec_t VPlane::DistTo(const Vector &vVec) const +{ + return vVec.Dot(m_Normal) - m_Dist; +} + +inline VPlane& VPlane::operator=(const VPlane &thePlane) +{ + m_Normal = thePlane.m_Normal; + m_Dist = thePlane.m_Dist; + return *this; +} + +#ifndef VECTOR_NO_SLOW_OPERATIONS + +inline VPlane VPlane::Flip() +{ + return VPlane(-m_Normal, -m_Dist); +} + +inline Vector VPlane::GetPointOnPlane() const +{ + return m_Normal * m_Dist; +} + +inline Vector VPlane::SnapPointToPlane(const Vector &vPoint) const +{ + return vPoint - m_Normal * DistTo(vPoint); +} + +#endif + +inline SideType VPlane::GetPointSide(const Vector &vPoint, vec_t sideEpsilon) const +{ + vec_t fDist; + + fDist = DistTo(vPoint); + if(fDist >= sideEpsilon) + return SIDE_FRONT; + else if(fDist <= -sideEpsilon) + return SIDE_BACK; + else + return SIDE_ON; +} + +inline SideType VPlane::GetPointSideExact(const Vector &vPoint) const +{ + return DistTo(vPoint) > 0.0f ? SIDE_FRONT : SIDE_BACK; +} + + +// BUGBUG: This should either simply use the implementation in mathlib or cease to exist. +// mathlib implementation is much more efficient. Check to see that VPlane isn't used in +// performance critical code. 
+inline SideType VPlane::BoxOnPlaneSide(const Vector &vMin, const Vector &vMax) const +{ + int i, firstSide, side; + TableVector vPoints[8] = + { + { vMin.x, vMin.y, vMin.z }, + { vMin.x, vMin.y, vMax.z }, + { vMin.x, vMax.y, vMax.z }, + { vMin.x, vMax.y, vMin.z }, + + { vMax.x, vMin.y, vMin.z }, + { vMax.x, vMin.y, vMax.z }, + { vMax.x, vMax.y, vMax.z }, + { vMax.x, vMax.y, vMin.z }, + }; + + firstSide = GetPointSideExact(vPoints[0]); + for(i=1; i < 8; i++) + { + side = GetPointSideExact(vPoints[i]); + + // Does the box cross the plane? + if(side != firstSide) + return SIDE_ON; + } + + // Ok, they're all on the same side, return that. + return firstSide; +} + + + + +#endif // VPLANE_H |