| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd | /PxShared/src/foundation/include |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'PxShared/src/foundation/include')
58 files changed, 22760 insertions, 0 deletions
diff --git a/PxShared/src/foundation/include/Ps.h b/PxShared/src/foundation/include/Ps.h
new file mode 100644
index 00000000..53c289db
--- /dev/null
+++ b/PxShared/src/foundation/include/Ps.h
@@ -0,0 +1,70 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS." NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#ifndef PSFOUNDATION_PS_H
#define PSFOUNDATION_PS_H

/*! \file top level include file for shared foundation */

#include "foundation/Px.h"

/**
Platform specific defines
*/
#if PX_WINDOWS_FAMILY || PX_XBOXONE
#pragma intrinsic(memcmp)
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#pragma intrinsic(abs)
#pragma intrinsic(labs)
#endif

// An expression that should expand to nothing in non PX_CHECKED builds.
// We currently use this only for tagging the purpose of containers for memory use tracking.
#if PX_CHECKED
#define PX_DEBUG_EXP(x) (x)
#else
#define PX_DEBUG_EXP(x)
#endif

#define PX_SIGN_BITMASK 0x80000000

namespace physx
{
namespace shdfnd
{
// Int-as-bool type - has some uses for efficiency and with SIMD
typedef int IntBool;
static const IntBool IntFalse = 0;
static const IntBool IntTrue = 1;
}

} // namespace physx

#endif // #ifndef PSFOUNDATION_PS_H
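PX_SIGN_BITMASK is reused further down: PsArray.h stashes its "user memory" flag in the top bit of the 32-bit capacity field. A standalone sketch of that trick, not part of the PhysX sources:

```cpp
#include <cstdint>
#include <cassert>

static const uint32_t SIGN_BITMASK = 0x80000000; // mirrors PX_SIGN_BITMASK

int main()
{
    uint32_t capacity = 1024;
    capacity |= SIGN_BITMASK;                   // mark as user-provided memory
    assert((capacity & ~SIGN_BITMASK) == 1024); // capacity still recoverable
    assert((capacity & SIGN_BITMASK) != 0);     // flag still set
    return 0;
}
```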
diff --git a/PxShared/src/foundation/include/PsAlignedMalloc.h b/PxShared/src/foundation/include/PsAlignedMalloc.h
new file mode 100644
index 00000000..5dd889c4
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAlignedMalloc.h
@@ -0,0 +1,88 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSALIGNEDMALLOC_H
#define PSFOUNDATION_PSALIGNEDMALLOC_H

#include "PsUserAllocated.h"

/*!
Allocate aligned memory.
Alignment must be a power of 2!
-- should be templated by a base allocator
*/

namespace physx
{
namespace shdfnd
{
/**
Allocator, which is used to access the global PxAllocatorCallback instance
(used for dynamic data types template instantiation), which can align memory
*/

// SCS: AlignedMalloc with 3 params not found, seems not used on PC either
// disabled for now to avoid GCC error

template <uint32_t N, typename BaseAllocator = NonTrackingAllocator>
class AlignedAllocator : public BaseAllocator
{
  public:
    AlignedAllocator(const BaseAllocator& base = BaseAllocator()) : BaseAllocator(base)
    {
    }

    void* allocate(size_t size, const char* file, int line)
    {
        size_t pad = N - 1 + sizeof(size_t); // store offset for delete.
        uint8_t* base = reinterpret_cast<uint8_t*>(BaseAllocator::allocate(size + pad, file, line));
        if(!base)
            return NULL;

        uint8_t* ptr = reinterpret_cast<uint8_t*>(size_t(base + pad) & ~(size_t(N) - 1)); // aligned pointer, ensuring N
                                                                                          // is a size_t
                                                                                          // wide mask
        reinterpret_cast<size_t*>(ptr)[-1] = size_t(ptr - base); // store offset

        return ptr;
    }
    void deallocate(void* ptr)
    {
        if(ptr == NULL)
            return;

        uint8_t* base = reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<size_t*>(ptr)[-1];
        BaseAllocator::deallocate(base);
    }
};

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSALIGNEDMALLOC_H
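The allocate/deallocate pair above is the classic over-allocate-then-align scheme: pad the request by `N - 1 + sizeof(size_t)`, round the result up to an N-byte boundary, and stash the offset back to the real block just before the returned pointer. A standalone sketch of the same technique on top of plain malloc/free (the names here are illustrative, not PhysX API):

```cpp
#include <cstdint>
#include <cstdlib>
#include <cassert>

void* alignedAlloc(size_t size, size_t N) // N must be a power of two
{
    size_t pad = N - 1 + sizeof(size_t); // room for alignment + stored offset
    uint8_t* base = static_cast<uint8_t*>(::malloc(size + pad));
    if(!base)
        return NULL;
    uint8_t* ptr = reinterpret_cast<uint8_t*>(size_t(base + pad) & ~(N - 1));
    reinterpret_cast<size_t*>(ptr)[-1] = size_t(ptr - base); // offset for free
    return ptr;
}

void alignedFree(void* ptr)
{
    if(!ptr)
        return;
    uint8_t* base = static_cast<uint8_t*>(ptr) - reinterpret_cast<size_t*>(ptr)[-1];
    ::free(base);
}

int main()
{
    void* p = alignedAlloc(100, 64);
    assert((reinterpret_cast<size_t>(p) & 63) == 0); // 64-byte aligned
    alignedFree(p);
    return 0;
}
```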
diff --git a/PxShared/src/foundation/include/PsAlloca.h b/PxShared/src/foundation/include/PsAlloca.h
new file mode 100644
index 00000000..f1f36732
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAlloca.h
@@ -0,0 +1,76 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSALLOCA_H
#define PSFOUNDATION_PSALLOCA_H

#include "PsTempAllocator.h"

namespace physx
{
namespace shdfnd
{
template <typename T, typename Alloc = TempAllocator>
class ScopedPointer : private Alloc
{
  public:
    ~ScopedPointer()
    {
        if(mOwned)
            Alloc::deallocate(mPointer);
    }

    operator T*() const
    {
        return mPointer;
    }

    T* mPointer;
    bool mOwned;
};

} // namespace shdfnd
} // namespace physx

/*! Stack allocation for \c count instances of \c type. Falling back to temp allocator if using more than 1kB. */
#ifdef __SPU__
#define PX_ALLOCA(var, type, count) type* var = reinterpret_cast<type*>(PxAlloca(sizeof(type) * (count)))
#else
#define PX_ALLOCA(var, type, count) \
    physx::shdfnd::ScopedPointer<type> var; \
    { \
        uint32_t size = sizeof(type) * (count); \
        var.mOwned = size > 1024; \
        if(var.mOwned) \
            var.mPointer = reinterpret_cast<type*>(physx::shdfnd::TempAllocator().allocate(size, __FILE__, __LINE__)); \
        else \
            var.mPointer = reinterpret_cast<type*>(PxAlloca(size)); \
    }
#endif

#endif // #ifndef PSFOUNDATION_PSALLOCA_H
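PX_ALLOCA keeps small buffers on the stack and falls back to the temp allocator past 1 kB. A standalone sketch of that policy, with malloc standing in for TempAllocator (`<alloca.h>` is the Unix spelling; Windows would use `_alloca` from `<malloc.h>`, as PsAllocator.h below sets up):

```cpp
#include <alloca.h>
#include <cstdio>
#include <cstdlib>

int main()
{
    const unsigned count = 100;
    const size_t size = sizeof(float) * count;
    const bool owned = size > 1024; // same 1 kB threshold as PX_ALLOCA

    float* buf;
    if(owned)
        buf = static_cast<float*>(::malloc(size));
    else
        buf = static_cast<float*>(alloca(size)); // released on function return

    for(unsigned i = 0; i < count; ++i)
        buf[i] = float(i);
    std::printf("%f\n", buf[count - 1]);

    if(owned)
        ::free(buf);
    return 0;
}
```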
diff --git a/PxShared/src/foundation/include/PsAllocator.h b/PxShared/src/foundation/include/PsAllocator.h
new file mode 100644
index 00000000..4fed4614
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAllocator.h
@@ -0,0 +1,364 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSALLOCATOR_H
#define PSFOUNDATION_PSALLOCATOR_H

#include "foundation/PxAllocatorCallback.h"
#include "foundation/PxFoundation.h"
#include "Ps.h"
#include "foundation/PxAssert.h"

#if(PX_WINDOWS_FAMILY || PX_XBOXONE)
#include <exception>
#include <typeinfo.h>
#endif
#if(PX_APPLE_FAMILY)
#include <typeinfo>
#endif

#include <new>

// Allocation macros going through user allocator
#if PX_CHECKED
#define PX_ALLOC(n, name) physx::shdfnd::NamedAllocator(name).allocate(n, __FILE__, __LINE__)
#else
#define PX_ALLOC(n, name) physx::shdfnd::NonTrackingAllocator().allocate(n, __FILE__, __LINE__)
#endif
#define PX_ALLOC_TEMP(n, name) PX_ALLOC(n, name)
#define PX_FREE(x) physx::shdfnd::NonTrackingAllocator().deallocate(x)
#define PX_FREE_AND_RESET(x) \
    { \
        PX_FREE(x); \
        x = 0; \
    }

// The following macros support plain-old-types and classes derived from UserAllocated.
#define PX_NEW(T) new (physx::shdfnd::ReflectionAllocator<T>(), __FILE__, __LINE__) T
#define PX_NEW_TEMP(T) PX_NEW(T)
#define PX_DELETE(x) delete x
#define PX_DELETE_AND_RESET(x) \
    { \
        PX_DELETE(x); \
        x = 0; \
    }
#define PX_DELETE_POD(x) \
    { \
        PX_FREE(x); \
        x = 0; \
    }
#define PX_DELETE_ARRAY(x) \
    { \
        PX_DELETE([] x); \
        x = 0; \
    }

// aligned allocation
#define PX_ALIGNED16_ALLOC(n) physx::shdfnd::AlignedAllocator<16>().allocate(n, __FILE__, __LINE__)
#define PX_ALIGNED16_FREE(x) physx::shdfnd::AlignedAllocator<16>().deallocate(x)

//! placement new macro to make it easy to spot bad use of 'new'
#define PX_PLACEMENT_NEW(p, T) new (p) T

#if PX_DEBUG || PX_CHECKED
#define PX_USE_NAMED_ALLOCATOR 1
#else
#define PX_USE_NAMED_ALLOCATOR 0
#endif

// Don't use inline for alloca !!!
#if PX_WINDOWS_FAMILY
#include <malloc.h>
#define PxAlloca(x) _alloca(x)
#elif PX_LINUX || PX_ANDROID
#include <malloc.h>
#define PxAlloca(x) alloca(x)
#elif PX_APPLE_FAMILY
#include <alloca.h>
#define PxAlloca(x) alloca(x)
#elif PX_PS4
#include <memory.h>
#define PxAlloca(x) alloca(x)
#elif PX_XBOXONE
#include <malloc.h>
#define PxAlloca(x) alloca(x)
#endif

#define PxAllocaAligned(x, alignment) ((size_t(PxAlloca(x + alignment)) + (alignment - 1)) & ~size_t(alignment - 1))

namespace physx
{
namespace shdfnd
{

PX_FOUNDATION_API PxAllocatorCallback& getAllocator();

/**
Allocator used to access the global PxAllocatorCallback instance without providing additional information.
*/

class PX_FOUNDATION_API Allocator
{
  public:
    Allocator(const char* = 0)
    {
    }
    void* allocate(size_t size, const char* file, int line);
    void deallocate(void* ptr);
};

/*
 * Bootstrap allocator using malloc/free.
 * Don't use unless your objects get allocated before foundation is initialized.
 */
class RawAllocator
{
  public:
    RawAllocator(const char* = 0)
    {
    }
    void* allocate(size_t size, const char*, int)
    {
        // malloc returns valid pointer for size==0, no need to check
        return ::malloc(size);
    }
    void deallocate(void* ptr)
    {
        // free(0) is guaranteed to have no side effect, no need to check
        ::free(ptr);
    }
};

/*
 * Allocator that simply calls straight back to the application without tracking.
 * This is used by the heap (Foundation::mNamedAllocMap) that tracks allocations
 * because it needs to be able to grow as a result of an allocation.
 * Making the hash table re-entrant to deal with this may not make sense.
 */
class NonTrackingAllocator
{
  public:
    PX_FORCE_INLINE NonTrackingAllocator(const char* = 0)
    {
    }
    PX_FORCE_INLINE void* allocate(size_t size, const char* file, int line)
    {
        return !size ? 0 : getAllocator().allocate(size, "NonTrackedAlloc", file, line);
    }
    PX_FORCE_INLINE void deallocate(void* ptr)
    {
        if(ptr)
            getAllocator().deallocate(ptr);
    }
};

/*
\brief Virtual allocator callback used to provide run-time defined allocators to foundation types like Array or Bitmap.
This is used by VirtualAllocator.
*/
class VirtualAllocatorCallback
{
  public:
    VirtualAllocatorCallback()
    {
    }
    virtual ~VirtualAllocatorCallback()
    {
    }
    virtual void* allocate(const size_t size, const char* file, const int line) = 0;
    virtual void deallocate(void* ptr) = 0;
};

/*
\brief Virtual allocator to be used by foundation types to provide run-time defined allocators.
Because Array extends its allocator rather than containing a reference/pointer to it, the
VirtualAllocator must be a concrete type containing a pointer to a virtual callback. The callback
may not be available at instantiation time, therefore methods are provided to set the callback later.
*/
class VirtualAllocator
{
  public:
    VirtualAllocator(VirtualAllocatorCallback* callback = NULL) : mCallback(callback)
    {
    }

    void* allocate(const size_t size, const char* file, const int line)
    {
        PX_ASSERT(mCallback);
        if(size)
            return mCallback->allocate(size, file, line);
        return NULL;
    }
    void deallocate(void* ptr)
    {
        PX_ASSERT(mCallback);
        if(ptr)
            mCallback->deallocate(ptr);
    }

    void setCallback(VirtualAllocatorCallback* callback)
    {
        mCallback = callback;
    }
    VirtualAllocatorCallback* getCallback()
    {
        return mCallback;
    }

  private:
    VirtualAllocatorCallback* mCallback;
    VirtualAllocator& operator=(const VirtualAllocator&);
};

#if PX_USE_NAMED_ALLOCATOR // can be slow, so only use in debug/checked
class PX_FOUNDATION_API NamedAllocator
{
  public:
    NamedAllocator(const PxEMPTY);
    NamedAllocator(const char* name = 0); // todo: should not have default argument!
    NamedAllocator(const NamedAllocator&);
    ~NamedAllocator();
    NamedAllocator& operator=(const NamedAllocator&);
    void* allocate(size_t size, const char* filename, int line);
    void deallocate(void* ptr);
};
#else
class NamedAllocator;
#endif // PX_DEBUG

/**
Allocator used to access the global PxAllocatorCallback instance using a static name derived from T.
*/

template <typename T>
class ReflectionAllocator
{
    static const char* getName()
    {
        if(!PxGetFoundation().getReportAllocationNames())
            return "<allocation names disabled>";
#if PX_GCC_FAMILY
        return __PRETTY_FUNCTION__;
#else
        // name() calls malloc(), raw_name() wouldn't
        return typeid(T).name();
#endif
    }

  public:
    ReflectionAllocator(const PxEMPTY)
    {
    }
    ReflectionAllocator(const char* = 0)
    {
    }
    inline ReflectionAllocator(const ReflectionAllocator&)
    {
    }
    void* allocate(size_t size, const char* filename, int line)
    {
        return size ? getAllocator().allocate(size, getName(), filename, line) : 0;
    }
    void deallocate(void* ptr)
    {
        if(ptr)
            getAllocator().deallocate(ptr);
    }
};

template <typename T>
struct AllocatorTraits
{
#if PX_USE_NAMED_ALLOCATOR
    typedef NamedAllocator Type;
#else
    typedef ReflectionAllocator<T> Type;
#endif
};

// if you get a build error here, you are trying to PX_NEW a class
// that is neither plain-old-type nor derived from UserAllocated
template <typename T, typename X>
union EnableIfPod
{
    int i;
    T t;
    typedef X Type;
};

} // namespace shdfnd
} // namespace physx

// Global placement new for ReflectionAllocator templated by
// plain-old-type. Allows using PX_NEW for pointers and built-in-types.
//
// ATTENTION: You need to use PX_DELETE_POD or PX_FREE to deallocate
// memory, not PX_DELETE. PX_DELETE_POD redirects to PX_FREE.
//
// Rationale: PX_DELETE uses global operator delete(void*), which we don't want to overload.
// Any other definition of PX_DELETE couldn't support array syntax 'PX_DELETE([]a);'.
// PX_DELETE_POD was preferred over PX_DELETE_ARRAY because it is used
// less often and applies to both single instances and arrays.
template <typename T>
PX_INLINE void* operator new(size_t size, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                             typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    return alloc.allocate(size, fileName, line);
}

template <typename T>
PX_INLINE void* operator new [](size_t size, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                                typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    return alloc.allocate(size, fileName, line);
}

// If construction after placement new throws, this placement delete is being called.
template <typename T>
PX_INLINE void operator delete(void* ptr, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                               typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    PX_UNUSED(fileName);
    PX_UNUSED(line);

    alloc.deallocate(ptr);
}

// If construction after placement new throws, this placement delete is being called.
template <typename T>
PX_INLINE void operator delete [](void* ptr, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                                  typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    PX_UNUSED(fileName);
    PX_UNUSED(line);

    alloc.deallocate(ptr);
}

#endif // #ifndef PSFOUNDATION_PSALLOCATOR_H
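A hedged usage sketch of the macros above. It assumes a foundation with a registered allocator callback is already up; `MyClass` is a hypothetical type derived from UserAllocated (declared in PsUserAllocated.h):

```cpp
#include "PsAllocator.h"
#include "PsUserAllocated.h"

// Hypothetical class; PX_NEW/PX_DELETE require UserAllocated (or a POD).
class MyClass : public physx::shdfnd::UserAllocated
{
  public:
    MyClass() : value(0)
    {
    }
    int value;
};

void allocatorExamples()
{
    // Raw bytes through the user allocator, name-tagged in checked builds.
    void* bytes = PX_ALLOC(256, "MyBuffer");
    PX_FREE(bytes);

    // Typed allocation routed through ReflectionAllocator/NamedAllocator.
    MyClass* obj = PX_NEW(MyClass);
    PX_DELETE(obj);

    // PODs created with PX_NEW must be released with PX_DELETE_POD (see the
    // rationale comment above), which redirects to PX_FREE.
    uint32_t* pod = PX_NEW(uint32_t);
    PX_DELETE_POD(pod);
}
```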
diff --git a/PxShared/src/foundation/include/PsAoS.h b/PxShared/src/foundation/include/PsAoS.h
new file mode 100644
index 00000000..f892a4d8
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAoS.h
@@ -0,0 +1,45 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSAOS_H
#define PSFOUNDATION_PSAOS_H

#include "foundation/Px.h"

#if PX_WINDOWS && !PX_NEON
#include "windows/PsWindowsAoS.h"
#elif(PX_UNIX_FAMILY || PX_PS4)
#include "unix/PsUnixAoS.h"
#elif PX_XBOXONE
#include "XboxOne/PsXboxOneAoS.h"
#else
#error "Platform not supported!"
#endif

#endif
diff --git a/PxShared/src/foundation/include/PsArray.h b/PxShared/src/foundation/include/PsArray.h
new file mode 100644
index 00000000..19ed9b05
--- /dev/null
+++ b/PxShared/src/foundation/include/PsArray.h
@@ -0,0 +1,802 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSARRAY_H
#define PSFOUNDATION_PSARRAY_H

#include "foundation/PxAssert.h"
#include "foundation/PxIntrinsics.h"
#include "PsAllocator.h"
#include "PsBasicTemplates.h"

#if PX_LIBCPP
#include <type_traits>
#else
#include <tr1/type_traits>
#endif

#if PX_VC == 9 || PX_VC == 10
#pragma warning(push)
#pragma warning(disable : 4347) // behavior change: 'function template' is called instead of 'function'
#endif

namespace physx
{
namespace shdfnd
{
template <class Serializer>
void exportArray(Serializer& stream, const void* data, uint32_t size, uint32_t sizeOfElement, uint32_t capacity);
char* importArray(char* address, void** data, uint32_t size, uint32_t sizeOfElement, uint32_t capacity);

/*!
An array is a sequential container.

Implementation note
* entries between 0 and size are valid objects
* we use inheritance to build this because the array is included inline in a lot
  of objects and we want the allocator to take no space if it's not stateful, which
  aggregation doesn't allow. Also, we want the metadata at the front for the inline
  case where the allocator contains some inline storage space
*/
template <class T, class Alloc = typename AllocatorTraits<T>::Type>
class Array : protected Alloc
{
  public:
    typedef T* Iterator;
    typedef const T* ConstIterator;

    explicit Array(const PxEMPTY v) : Alloc(v)
    {
        if(mData)
            mCapacity |= PX_SIGN_BITMASK;
    }

    /*!
    Default array constructor. Initialize an empty array
    */
    PX_INLINE explicit Array(const Alloc& alloc = Alloc()) : Alloc(alloc), mData(0), mSize(0), mCapacity(0)
    {
    }

    /*!
    Initialize array with given capacity
    */
    PX_INLINE explicit Array(uint32_t size, const T& a = T(), const Alloc& alloc = Alloc())
    : Alloc(alloc), mData(0), mSize(0), mCapacity(0)
    {
        resize(size, a);
    }

    /*!
    Copy-constructor. Copy all entries from other array
    */
    template <class A>
    PX_INLINE explicit Array(const Array<T, A>& other, const Alloc& alloc = Alloc())
    : Alloc(alloc)
    {
        copy(other);
    }

    // This is necessary else the basic default copy constructor is used in the case of both arrays being of the same
    // template instance
    // The C++ standard clearly states that a template constructor is never a copy constructor [2]. In other words,
    // the presence of a template constructor does not suppress the implicit declaration of the copy constructor.
    // Also never make a copy constructor explicit, or copy-initialization* will no longer work. This is because
    // 'binding an rvalue to a const reference requires an accessible copy constructor' (http://gcc.gnu.org/bugs/)
    // *http://stackoverflow.com/questions/1051379/is-there-a-difference-in-c-between-copy-initialization-and-assignment-initializ
    PX_INLINE Array(const Array& other, const Alloc& alloc = Alloc()) : Alloc(alloc)
    {
        copy(other);
    }

    /*!
    Initialize array with given length
    */
    PX_INLINE explicit Array(const T* first, const T* last, const Alloc& alloc = Alloc())
    : Alloc(alloc), mSize(last < first ? 0 : uint32_t(last - first)), mCapacity(mSize)
    {
        mData = allocate(mSize);
        copy(mData, mData + mSize, first);
    }

    /*!
    Destructor
    */
    PX_INLINE ~Array()
    {
        destroy(mData, mData + mSize);

        if(capacity() && !isInUserMemory())
            deallocate(mData);
    }

    /*!
    Assignment operator. Copy content (deep-copy)
    */
    template <class A>
    PX_INLINE Array& operator=(const Array<T, A>& rhs)
    {
        if(&rhs == this)
            return *this;

        clear();
        reserve(rhs.mSize);
        copy(mData, mData + rhs.mSize, rhs.mData);

        mSize = rhs.mSize;
        return *this;
    }

    PX_INLINE Array& operator=(const Array& t) // Needs to be declared, see comment at copy-constructor
    {
        return operator=<Alloc>(t);
    }

    PX_FORCE_INLINE static bool isArrayOfPOD()
    {
#if PX_LIBCPP
        return std::is_trivially_copyable<T>::value;
#else
        return std::tr1::is_pod<T>::value;
#endif
    }

    /*!
    Array indexing operator.
    \param i
    The index of the element that will be returned.
    \return
    The element i in the array.
    */
    PX_FORCE_INLINE const T& operator[](uint32_t i) const
    {
        PX_ASSERT(i < mSize);
        return mData[i];
    }

    /*!
    Array indexing operator.
    \param i
    The index of the element that will be returned.
    \return
    The element i in the array.
    */
    PX_FORCE_INLINE T& operator[](uint32_t i)
    {
        PX_ASSERT(i < mSize);
        return mData[i];
    }

    /*!
    Returns a pointer to the initial element of the array.
    \return
    a pointer to the initial element of the array.
    */
    PX_FORCE_INLINE ConstIterator begin() const
    {
        return mData;
    }

    PX_FORCE_INLINE Iterator begin()
    {
        return mData;
    }

    /*!
    Returns an iterator beyond the last element of the array. Do not dereference.
    \return
    a pointer to the element beyond the last element of the array.
    */
    PX_FORCE_INLINE ConstIterator end() const
    {
        return mData + mSize;
    }

    PX_FORCE_INLINE Iterator end()
    {
        return mData + mSize;
    }

    /*!
    Returns a reference to the first element of the array. Undefined if the array is empty.
    \return a reference to the first element of the array
    */
    PX_FORCE_INLINE const T& front() const
    {
        PX_ASSERT(mSize);
        return mData[0];
    }

    PX_FORCE_INLINE T& front()
    {
        PX_ASSERT(mSize);
        return mData[0];
    }

    /*!
    Returns a reference to the last element of the array. Undefined if the array is empty
    \return a reference to the last element of the array
    */
    PX_FORCE_INLINE const T& back() const
    {
        PX_ASSERT(mSize);
        return mData[mSize - 1];
    }

    PX_FORCE_INLINE T& back()
    {
        PX_ASSERT(mSize);
        return mData[mSize - 1];
    }

    /*!
    Returns the number of entries in the array. This can, and probably will,
    differ from the array capacity.
    \return
    The number of entries in the array.
    */
    PX_FORCE_INLINE uint32_t size() const
    {
        return mSize;
    }

    /*!
    Clears the array.
    */
    PX_INLINE void clear()
    {
        destroy(mData, mData + mSize);
        mSize = 0;
    }

    /*!
    Returns whether the array is empty (i.e. whether its size is 0).
    \return
    true if the array is empty
    */
    PX_FORCE_INLINE bool empty() const
    {
        return mSize == 0;
    }

    /*!
    Finds the first occurrence of an element in the array.
    \param a
    The element to find.
    */
    PX_INLINE Iterator find(const T& a)
    {
        uint32_t index;
        for(index = 0; index < mSize && mData[index] != a; index++)
            ;
        return mData + index;
    }

    PX_INLINE ConstIterator find(const T& a) const
    {
        uint32_t index;
        for(index = 0; index < mSize && mData[index] != a; index++)
            ;
        return mData + index;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Adds one element to the end of the array. Operation is O(1).
    \param a
    The element that will be added to this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_FORCE_INLINE T& pushBack(const T& a)
    {
        if(capacity() <= mSize)
            return growAndPushBack(a);

        PX_PLACEMENT_NEW(reinterpret_cast<void*>(mData + mSize), T)(a);

        return mData[mSize++];
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Returns the element at the end of the array. Only legal if the array is non-empty.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE T popBack()
    {
        PX_ASSERT(mSize);
        T t = mData[mSize - 1];

        if(!isArrayOfPOD())
        {
            mData[--mSize].~T();
        }
        else
        {
            --mSize;
        }

        return t;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Construct one element at the end of the array. Operation is O(1).
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE T& insert()
    {
        if(capacity() <= mSize)
            grow(capacityIncrement());

        T* ptr = mData + mSize++;
        new (ptr) T; // not 'T()' because PODs should not get default-initialized.
        return *ptr;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Removes the element at position i from the array and replaces it with
    the last element.
    Operation is O(1)
    \param i
    The position of the element that will be removed from this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE void replaceWithLast(uint32_t i)
    {
        PX_ASSERT(i < mSize);
        mData[i] = mData[--mSize];

        if(!isArrayOfPOD())
        {
            mData[mSize].~T();
        }
    }

    PX_INLINE void replaceWithLast(Iterator i)
    {
        replaceWithLast(static_cast<uint32_t>(i - mData));
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Replaces the first occurrence of the element a with the last element
    Operation is O(n)
    \param a
    The element that will be removed from this array.
    \return true if the element has been removed.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE bool findAndReplaceWithLast(const T& a)
    {
        uint32_t index = 0;
        while(index < mSize && mData[index] != a)
            ++index;
        if(index == mSize)
            return false;
        replaceWithLast(index);
        return true;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Removes the element at position i from the array, shifting the rest of
    the array down one step.
    Operation is O(n)
    \param i
    The position of the element that will be removed from this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE void remove(uint32_t i)
    {
        PX_ASSERT(i < mSize);

        if(isArrayOfPOD())
        {
            if(i + 1 != mSize)
            {
                physx::intrinsics::memMove(mData + i, mData + i + 1, (mSize - i - 1) * sizeof(T));
            }
        }
        else
        {
            T* it = mData + i++;
            it->~T();
            do
            {
                new (it) T(mData[i]);
                ++it;
                it->~T();
            } while(++i < mSize);
        }

        --mSize;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Removes a range from the array. Shifts the array so order is maintained.
    Operation is O(n)
    \param begin
    The starting position of the elements that will be removed from this array.
    \param count
    The number of elements that will be removed from this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE void removeRange(uint32_t begin, uint32_t count)
    {
        PX_ASSERT(begin < mSize);
        PX_ASSERT((begin + count) <= mSize);

        if(!isArrayOfPOD())
        {
            for(uint32_t i = 0; i < count; i++)
            {
                mData[begin + i].~T(); // call the destructor on the ones being removed first.
            }
        }

        T* dest = &mData[begin];                       // location we are copying the tail end objects to
        T* src = &mData[begin + count];                // start of tail objects
        uint32_t move_count = mSize - (begin + count); // compute remainder that needs to be copied down

        if(isArrayOfPOD())
        {
            physx::intrinsics::memMove(dest, src, move_count * sizeof(T));
        }
        else
        {
            for(uint32_t i = 0; i < move_count; i++)
            {
                new (dest) T(*src); // copy the old one to the new location
                src->~T();          // call the destructor on the old location
                dest++;
                src++;
            }
        }
        mSize -= count;
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Resize array
    */
    //////////////////////////////////////////////////////////////////////////
    PX_NOINLINE void resize(const uint32_t size, const T& a = T());

    PX_NOINLINE void resizeUninitialized(const uint32_t size);

    //////////////////////////////////////////////////////////////////////////
    /*!
    Resize array such that only as much memory is allocated to hold the
    existing elements
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void shrink()
    {
        recreate(mSize);
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Deletes all array elements and frees memory.
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void reset()
    {
        resize(0);
        shrink();
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Ensure that the array has at least size capacity.
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void reserve(const uint32_t capacity)
    {
        if(capacity > this->capacity())
            grow(capacity);
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Query the capacity (allocated memory) for the array.
    */
    //////////////////////////////////////////////////////////////////////////
    PX_FORCE_INLINE uint32_t capacity() const
    {
        return mCapacity & ~PX_SIGN_BITMASK;
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Unsafe function to force the size of the array
    */
    //////////////////////////////////////////////////////////////////////////
    PX_FORCE_INLINE void forceSize_Unsafe(uint32_t size)
    {
        PX_ASSERT(size <= mCapacity);
        mSize = size;
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Swap contents of an array without allocating temporary storage
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void swap(Array<T, Alloc>& other)
    {
        shdfnd::swap(mData, other.mData);
        shdfnd::swap(mSize, other.mSize);
        shdfnd::swap(mCapacity, other.mCapacity);
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Assign a range of values to this vector (resizes to length of range)
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void assign(const T* first, const T* last)
    {
        resizeUninitialized(uint32_t(last - first));
        copy(begin(), end(), first);
    }

    // We need one bit to mark arrays that have been deserialized from a user-provided memory block.
    // For alignment & memory saving purpose we store that bit in the rarely used capacity member.
    PX_FORCE_INLINE uint32_t isInUserMemory() const
    {
        return mCapacity & PX_SIGN_BITMASK;
    }

    /// return reference to allocator
    PX_INLINE Alloc& getAllocator()
    {
        return *this;
    }

  protected:
    // constructor for where we don't own the memory
    Array(T* memory, uint32_t size, uint32_t capacity, const Alloc& alloc = Alloc())
    : Alloc(alloc), mData(memory), mSize(size), mCapacity(capacity | PX_SIGN_BITMASK)
    {
    }

    template <class A>
    PX_NOINLINE void copy(const Array<T, A>& other);

    PX_INLINE T* allocate(uint32_t size)
    {
        if(size > 0)
        {
            T* p = reinterpret_cast<T*>(Alloc::allocate(sizeof(T) * size, __FILE__, __LINE__));
/**
Mark a specified amount of memory with 0xcd pattern. This is used to check that the meta data
definition for serialized classes is complete in checked builds.
*/
#if PX_CHECKED
            if(p)
            {
                for(uint32_t i = 0; i < (sizeof(T) * size); ++i)
                    reinterpret_cast<uint8_t*>(p)[i] = 0xcd;
            }
#endif
            return p;
        }
        return 0;
    }

    PX_INLINE void deallocate(void* mem)
    {
        Alloc::deallocate(mem);
    }

    static PX_INLINE bool isZeroInit(const T& object)
    {
        char ZeroBuffOnStack[sizeof(object)] = {};
        return memcmp(&object, ZeroBuffOnStack, sizeof(object)) == 0;
    }

    static PX_INLINE void create(T* first, T* last, const T& a)
    {
        if(isArrayOfPOD() && isZeroInit(a))
        {
            if(last > first)
                physx::intrinsics::memZero(first, uint32_t((last - first) * sizeof(T)));
        }
        else
        {
            for(; first < last; ++first)
                ::new (first) T(a);
        }
    }

    static PX_INLINE void copy(T* first, T* last, const T* src)
    {
        if(last <= first)
            return;

        if(isArrayOfPOD())
        {
            physx::intrinsics::memCopy(first, src, uint32_t((last - first) * sizeof(T)));
        }
        else
        {
            for(; first < last; ++first, ++src)
                ::new (first) T(*src);
        }
    }

    static PX_INLINE void destroy(T* first, T* last)
    {
        if(!isArrayOfPOD())
        {
            for(; first < last; ++first)
                first->~T();
        }
    }

    /*!
    Called when pushBack() needs to grow the array.
    \param a The element that will be added to this array.
    */
    PX_NOINLINE T& growAndPushBack(const T& a);

    /*!
    Resizes the available memory for the array.

    \param capacity
    The number of entries that the set should be able to hold.
    */
    PX_INLINE void grow(uint32_t capacity)
    {
        PX_ASSERT(this->capacity() < capacity);
        recreate(capacity);
    }

    /*!
    Creates a new memory block, copies all entries to the new block and destroys old entries.

    \param capacity
    The number of entries that the set should be able to hold.
    */
    PX_NOINLINE void recreate(uint32_t capacity);

    // The idea here is to prevent accidental bugs with pushBack or insert. Unfortunately
    // it interacts badly with InlineArrays with smaller inline allocations.
    // TODO(dsequeira): policy template arg, this is exactly what they're for.
    PX_INLINE uint32_t capacityIncrement() const
    {
        const uint32_t capacity = this->capacity();
        return capacity == 0 ? 1 : capacity * 2;
    }

    T* mData;
    uint32_t mSize;
    uint32_t mCapacity;
};

template <class T, class Alloc>
PX_NOINLINE void Array<T, Alloc>::resize(const uint32_t size, const T& a)
{
    reserve(size);
    create(mData + mSize, mData + size, a);
    destroy(mData + size, mData + mSize);
    mSize = size;
}

template <class T, class Alloc>
template <class A>
PX_NOINLINE void Array<T, Alloc>::copy(const Array<T, A>& other)
{
    if(!other.empty())
    {
        mData = allocate(mSize = mCapacity = other.size());
        copy(mData, mData + mSize, other.begin());
    }
    else
    {
        mData = NULL;
        mSize = 0;
        mCapacity = 0;
    }
}

template <class T, class Alloc>
PX_NOINLINE void Array<T, Alloc>::resizeUninitialized(const uint32_t size)
{
    reserve(size);
    mSize = size;
}

template <class T, class Alloc>
PX_NOINLINE T& Array<T, Alloc>::growAndPushBack(const T& a)
{
    uint32_t capacity = capacityIncrement();

    T* newData = allocate(capacity);
    PX_ASSERT((!capacity) || (newData && (newData != mData)));
    copy(newData, newData + mSize, mData);

    // inserting element before destroying old array
    // avoids referencing destroyed object when duplicating array element.
    PX_PLACEMENT_NEW(reinterpret_cast<void*>(newData + mSize), T)(a);

    destroy(mData, mData + mSize);
    if(!isInUserMemory())
        deallocate(mData);

    mData = newData;
    mCapacity = capacity;

    return mData[mSize++];
}

template <class T, class Alloc>
PX_NOINLINE void Array<T, Alloc>::recreate(uint32_t capacity)
{
    T* newData = allocate(capacity);
    PX_ASSERT((!capacity) || (newData && (newData != mData)));

    copy(newData, newData + mSize, mData);
    destroy(mData, mData + mSize);
    if(!isInUserMemory())
        deallocate(mData);

    mData = newData;
    mCapacity = capacity;
}

template <class T, class Alloc>
PX_INLINE void swap(Array<T, Alloc>& x, Array<T, Alloc>& y)
{
    x.swap(y);
}

} // namespace shdfnd
} // namespace physx

#if PX_VC == 9 || PX_VC == 10
#pragma warning(pop)
#endif

#endif // #ifndef PSFOUNDATION_PSARRAY_H
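A hedged usage sketch of the container above, assuming an initialized foundation so the default allocator can reach the user callback:

```cpp
#include "PsArray.h"

void arrayExample()
{
    physx::shdfnd::Array<uint32_t> ids;
    ids.reserve(4); // one grow() up front instead of doubling on each pushBack
    for(uint32_t i = 0; i < 4; ++i)
        ids.pushBack(i * 10); // ids is {0, 10, 20, 30}

    ids.replaceWithLast(1); // O(1) unordered removal: ids is now {0, 30, 20}
    ids.remove(0);          // O(n) ordered removal:   ids is now {30, 20}

    uint32_t last = ids.popBack(); // last == 20
    PX_UNUSED(last);
}
```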
diff --git a/PxShared/src/foundation/include/PsAtomic.h b/PxShared/src/foundation/include/PsAtomic.h
new file mode 100644
index 00000000..4e24c84c
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAtomic.h
@@ -0,0 +1,63 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSATOMIC_H
#define PSFOUNDATION_PSATOMIC_H

#include "Ps.h"

namespace physx
{
namespace shdfnd
{
/* set *dest equal to val. Return the old value of *dest */
PX_FOUNDATION_API int32_t atomicExchange(volatile int32_t* dest, int32_t val);

/* if *dest == comp, replace with exch. Return original value of *dest */
PX_FOUNDATION_API int32_t atomicCompareExchange(volatile int32_t* dest, int32_t exch, int32_t comp);

/* if *dest == comp, replace with exch. Return original value of *dest */
PX_FOUNDATION_API void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp);

/* increment the specified location. Return the incremented value */
PX_FOUNDATION_API int32_t atomicIncrement(volatile int32_t* val);

/* decrement the specified location. Return the decremented value */
PX_FOUNDATION_API int32_t atomicDecrement(volatile int32_t* val);

/* add delta to *val. Return the new value */
PX_FOUNDATION_API int32_t atomicAdd(volatile int32_t* val, int32_t delta);

/* compute the maximum of dest and val. Return the new value */
PX_FOUNDATION_API int32_t atomicMax(volatile int32_t* val, int32_t val2);

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSATOMIC_H
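A minimal sketch of a common use of this API: an intrusive, thread-safe reference count. The class itself is illustrative; only the two atomic calls come from the header above. Because atomicDecrement returns the new value, exactly one thread observes 0 and performs the delete:

```cpp
#include "PsAtomic.h"

class RefCounted
{
  public:
    RefCounted() : mRefCount(1)
    {
    }
    void acquire()
    {
        physx::shdfnd::atomicIncrement(&mRefCount);
    }
    void release()
    {
        // the returned value is the decremented count, so the race on "who
        // saw zero" is resolved inside the atomic itself
        if(physx::shdfnd::atomicDecrement(&mRefCount) == 0)
            delete this;
    }

  protected:
    virtual ~RefCounted()
    {
    }

  private:
    volatile int32_t mRefCount;
};
```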
diff --git a/PxShared/src/foundation/include/PsBasicTemplates.h b/PxShared/src/foundation/include/PsBasicTemplates.h
new file mode 100644
index 00000000..ec8503e4
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBasicTemplates.h
@@ -0,0 +1,146 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSBASICTEMPLATES_H
#define PSFOUNDATION_PSBASICTEMPLATES_H

#include "Ps.h"

namespace physx
{
namespace shdfnd
{
template <typename A>
struct Equal
{
    bool operator()(const A& a, const A& b) const
    {
        return a == b;
    }
};

template <typename A>
struct Less
{
    bool operator()(const A& a, const A& b) const
    {
        return a < b;
    }
};

template <typename A>
struct Greater
{
    bool operator()(const A& a, const A& b) const
    {
        return a > b;
    }
};

template <class F, class S>
class Pair
{
  public:
    F first;
    S second;
    Pair() : first(F()), second(S())
    {
    }
    Pair(const F& f, const S& s) : first(f), second(s)
    {
    }
    Pair(const Pair& p) : first(p.first), second(p.second)
    {
    }
    // CN - fix for /.../PsBasicTemplates.h(61) : warning C4512: 'physx::shdfnd::Pair<F,S>' : assignment operator could
    // not be generated
    Pair& operator=(const Pair& p)
    {
        first = p.first;
        second = p.second;
        return *this;
    }
    bool operator==(const Pair& p) const
    {
        return first == p.first && second == p.second;
    }
    bool operator<(const Pair& p) const
    {
        if(first < p.first)
            return true;
        else
            return !(p.first < first) && (second < p.second);
    }
};

template <unsigned int A>
struct LogTwo
{
    static const unsigned int value = LogTwo<(A >> 1)>::value + 1;
};
template <>
struct LogTwo<1>
{
    static const unsigned int value = 0;
};

template <typename T>
struct UnConst
{
    typedef T Type;
};
template <typename T>
struct UnConst<const T>
{
    typedef T Type;
};

template <typename T>
T pointerOffset(void* p, ptrdiff_t offset)
{
    return reinterpret_cast<T>(reinterpret_cast<char*>(p) + offset);
}
template <typename T>
T pointerOffset(const void* p, ptrdiff_t offset)
{
    return reinterpret_cast<T>(reinterpret_cast<const char*>(p) + offset);
}

template <class T>
PX_CUDA_CALLABLE PX_INLINE void swap(T& x, T& y)
{
    const T tmp = x;
    x = y;
    y = tmp;
}

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSBASICTEMPLATES_H
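Two small illustrative uses of the templates above (PX_COMPILE_TIME_ASSERT and PX_UNUSED are assumed to be available from the foundation headers):

```cpp
#include "PsBasicTemplates.h"

// LogTwo is a compile-time log2 for powers of two: LogTwo<256> unrolls
// through eight recursive instantiations down to the LogTwo<1> base case.
PX_COMPILE_TIME_ASSERT(physx::shdfnd::LogTwo<256>::value == 8);

void templatesExample()
{
    // Pair orders lexicographically: first, then second.
    physx::shdfnd::Pair<uint32_t, float> a(7, 2.5f);
    physx::shdfnd::Pair<uint32_t, float> b(7, 3.5f);
    bool less = a < b; // true: equal firsts, then 2.5f < 3.5f
    PX_UNUSED(less);

    uint32_t x = 1, y = 2;
    physx::shdfnd::swap(x, y); // x == 2, y == 1
}
```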
diff --git a/PxShared/src/foundation/include/PsBitUtils.h b/PxShared/src/foundation/include/PsBitUtils.h
new file mode 100644
index 00000000..7bfef7a7
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBitUtils.h
@@ -0,0 +1,109 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSBITUTILS_H
#define PSFOUNDATION_PSBITUTILS_H

#include "foundation/PxIntrinsics.h"
#include "foundation/PxAssert.h"
#include "PsIntrinsics.h"
#include "Ps.h"

namespace physx
{
namespace shdfnd
{
PX_INLINE uint32_t bitCount(uint32_t v)
{
    // from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
    uint32_t const w = v - ((v >> 1) & 0x55555555);
    uint32_t const x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return (((x + (x >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
}

PX_INLINE bool isPowerOfTwo(uint32_t x)
{
    return x != 0 && (x & (x - 1)) == 0;
}

// "Next Largest Power of 2
// Given a binary integer value x, the next largest power of 2 can be computed by a SWAR algorithm
// that recursively "folds" the upper bits into the lower bits. This process yields a bit vector with
// the same most significant 1 as x, but all 1's below it. Adding 1 to that value yields the next
// largest power of 2. For a 32-bit value:"
PX_INLINE uint32_t nextPowerOfTwo(uint32_t x)
{
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    return x + 1;
}

/*!
Return the index of the lowest set bit. Not valid for zero arg.
*/
PX_INLINE uint32_t lowestSetBit(uint32_t x)
{
    PX_ASSERT(x);
    return lowestSetBitUnsafe(x);
}

/*!
Return the index of the highest set bit. Not valid for zero arg.
*/
PX_INLINE uint32_t highestSetBit(uint32_t x)
{
    PX_ASSERT(x);
    return highestSetBitUnsafe(x);
}

// Helper function to approximate log2 of an integer value
// assumes that the input is actually power of two.
// todo: replace 2 usages with 'highestSetBit'
PX_INLINE uint32_t ilog2(uint32_t num)
{
    for(uint32_t i = 0; i < 32; i++)
    {
        num >>= 1;
        if(num == 0)
            return i;
    }

    PX_ASSERT(0);
    return uint32_t(-1);
}

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSBITUTILS_H
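A standalone check of the two SWAR tricks above; the bodies are copied from the header and renamed so as not to suggest this is the PhysX API. Note that for an input that is already an exact power of two, nextPowerOfTwo returns the next one up:

```cpp
#include <cassert>
#include <cstdint>

static uint32_t bitCountRef(uint32_t v)
{
    uint32_t const w = v - ((v >> 1) & 0x55555555);
    uint32_t const x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return (((x + (x >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
}

static uint32_t nextPowerOfTwoRef(uint32_t x)
{
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    return x + 1;
}

int main()
{
    assert(bitCountRef(0xF0F0) == 8);       // eight set bits
    assert(nextPowerOfTwoRef(4) == 8);      // exact power of two -> next one up
    assert(nextPowerOfTwoRef(5) == 8);
    assert(nextPowerOfTwoRef(0) == 1);
    return 0;
}
```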
diff --git a/PxShared/src/foundation/include/PsBroadcast.h b/PxShared/src/foundation/include/PsBroadcast.h
new file mode 100644
index 00000000..a1e04c38
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBroadcast.h
@@ -0,0 +1,277 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXBROADCAST_H
+#define PXPVDSDK_PXBROADCAST_H
+
+#include "Ps.h"
+#include "PsInlineArray.h"
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxErrorCallback.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+/**
+\brief Abstract listener class that listens to allocation and deallocation events from the
+	foundation memory system.
+
+<b>Threading:</b> All methods of this class should be thread safe as it can be called from the user thread
+or the physics processing thread(s).
+*/
+class AllocationListener
+{
+  public:
+	/**
+	\brief callback when memory is allocated.
+	\param size Size of the allocation in bytes.
+	\param typeName Type this data is being allocated for.
+	\param filename File the allocation came from.
+	\param line Line number the allocation came from.
+	\param allocatedMemory Memory that will be returned from the allocation.
+	*/
+	virtual void onAllocation(size_t size, const char* typeName, const char* filename, int line,
+	                          void* allocatedMemory) = 0;
+
+	/**
+	\brief callback when memory is deallocated.
+	\param allocatedMemory Memory block that is about to be deallocated.
+	*/
+	virtual void onDeallocation(void* allocatedMemory) = 0;
+
+  protected:
+	virtual ~AllocationListener()
+	{
+	}
+};
+
+/**
+\brief Broadcast class implementation, registering listeners.
+
+<b>Threading:</b> All methods of this class should be thread safe as it can be called from the user thread
+or the physics processing thread(s). There is no internal locking.
+*/
+template <class Listener, class Base>
+class Broadcast : public Base
+{
+  public:
+	static const uint32_t MAX_NB_LISTENERS = 16;
+
+	/**
+	\brief The default constructor.
+	*/
+	Broadcast()
+	{
+	}
+
+	/**
+	\brief Register new listener.
+
+	\note It is NOT SAFE to register and deregister listeners while allocations may be taking place.
+	Moreover, there is no thread safety to registration/deregistration.
+
+	\param listener Listener to register.
+	*/
+	void registerListener(Listener& listener)
+	{
+		if(mListeners.size() < MAX_NB_LISTENERS)
+			mListeners.pushBack(&listener);
+	}
+
+	/**
+	\brief Deregister an existing listener.
+
+	\note It is NOT SAFE to register and deregister listeners while allocations may be taking place.
+	Moreover, there is no thread safety to registration/deregistration.
+
+	\param listener Listener to deregister.
+	*/
+	void deregisterListener(Listener& listener)
+	{
+		mListeners.findAndReplaceWithLast(&listener);
+	}
+
+	/**
+	\brief Get number of registered listeners.
+
+	\return Number of listeners.
+	*/
+	uint32_t getNbListeners() const
+	{
+		return mListeners.size();
+	}
+
+	/**
+	\brief Get an existing listener from given index.
+
+	\param index Index of the listener.
+	\return Listener on given index.
+	*/
+	Listener& getListener(uint32_t index)
+	{
+		PX_ASSERT(index < mListeners.size());
+		return *mListeners[index];
+	}
+
+  protected:
+	virtual ~Broadcast()
+	{
+	}
+
+	physx::shdfnd::InlineArray<Listener*, MAX_NB_LISTENERS, physx::shdfnd::NonTrackingAllocator> mListeners;
+};
+
+/**
+\brief Abstract base class for an application defined memory allocator that allows an external listener
+to audit the memory allocations.
+*/
+class BroadcastingAllocator : public Broadcast<AllocationListener, PxAllocatorCallback>
+{
+	PX_NOCOPY(BroadcastingAllocator)
+
+  public:
+	/**
+	\brief The default constructor.
+	*/
+	BroadcastingAllocator(PxAllocatorCallback& allocator, PxErrorCallback& error) : mAllocator(allocator), mError(error)
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief The destructor.
+	*/
+	virtual ~BroadcastingAllocator()
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief Allocates size bytes of memory, which must be 16-byte aligned.
+
+	This method should never return NULL. If you run out of memory, then
+	you should terminate the app or take some other appropriate action.
+
+	<b>Threading:</b> This function should be thread safe as it can be called in the context of the user thread
+	and physics processing thread(s).
+
+	\param size Number of bytes to allocate.
+	\param typeName Name of the datatype that is being allocated
+	\param filename The source file which allocated the memory
+	\param line The source line which allocated the memory
+	\return The allocated block of memory.
+	*/
+	void* allocate(size_t size, const char* typeName, const char* filename, int line)
+	{
+		void* mem = mAllocator.allocate(size, typeName, filename, line);
+
+		if(!mem)
+		{
+			mError.reportError(PxErrorCode::eABORT, "User allocator returned NULL.", __FILE__, __LINE__);
+			return NULL;
+		}
+
+		if((reinterpret_cast<size_t>(mem) & 15))
+		{
+			mError.reportError(PxErrorCode::eABORT, "Allocations must be 16-byte aligned.", __FILE__, __LINE__);
+			return NULL;
+		}
+
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+			mListeners[i]->onAllocation(size, typeName, filename, line, mem);
+
+		return mem;
+	}
+
+	/**
+	\brief Frees memory previously allocated by allocate().
+
+	<b>Threading:</b> This function should be thread safe as it can be called in the context of the user thread
+	and physics processing thread(s).
+
+	\param ptr Memory to free.
+	*/
+	void deallocate(void* ptr)
+	{
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+		{
+			mListeners[i]->onDeallocation(ptr);
+		}
+		mAllocator.deallocate(ptr);
+	}
+
+  private:
+	PxAllocatorCallback& mAllocator;
+	PxErrorCallback& mError;
+};
+
+/**
+\brief Abstract base class for an application defined error callback that allows an external listener
+to report errors.
+*/
+class BroadcastingErrorCallback : public Broadcast<PxErrorCallback, PxErrorCallback>
+{
+	PX_NOCOPY(BroadcastingErrorCallback)
+  public:
+	/**
+	\brief The default constructor.
+	*/
+	BroadcastingErrorCallback(PxErrorCallback& errorCallback)
+	{
+		registerListener(errorCallback);
+	}
+
+	/**
+	\brief The default destructor.
+	*/
+	virtual ~BroadcastingErrorCallback()
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief Reports an error code.
+	\param code Error code, see #PxErrorCode
+	\param message Message to display.
+	\param file File the error occurred in.
+	\param line Line number the error occurred on.
+	*/
+	void reportError(PxErrorCode::Enum code, const char* message, const char* file, int line)
+	{
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+			mListeners[i]->reportError(code, message, file, line);
+	}
+};
+}
+} // namespace physx
+
+#endif // PXPVDSDK_PXBROADCAST_H
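To see how the pieces fit together: BroadcastingAllocator forwards each request to the user's PxAllocatorCallback, validates the result, and then fans the event out to every registered AllocationListener. A minimal sketch (hypothetical; userAllocator and userErrors stand in for application-supplied callbacks):

    #include "PsBroadcast.h"

    using namespace physx;

    // Counts live allocations reported through the broadcaster.
    class CountingListener : public shdfnd::AllocationListener
    {
      public:
        CountingListener() : mLive(0)
        {
        }
        virtual void onAllocation(size_t, const char*, const char*, int, void*)
        {
            mLive++;
        }
        virtual void onDeallocation(void*)
        {
            mLive--;
        }
        int mLive;
    };

    void auditAllocations(PxAllocatorCallback& userAllocator, PxErrorCallback& userErrors)
    {
        shdfnd::BroadcastingAllocator broadcaster(userAllocator, userErrors);
        CountingListener counter;
        broadcaster.registerListener(counter); // not safe while allocations are in flight

        void* block = broadcaster.allocate(256, "demo", __FILE__, __LINE__);
        broadcaster.deallocate(block);         // counter.mLive drops back to 0
        broadcaster.deregisterListener(counter);
    }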
diff --git a/PxShared/src/foundation/include/PsCpu.h b/PxShared/src/foundation/include/PsCpu.h
new file mode 100644
index 00000000..52aef045
--- /dev/null
+++ b/PxShared/src/foundation/include/PsCpu.h
@@ -0,0 +1,47 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSCPU_H
+#define PSFOUNDATION_PSCPU_H
+
+#include "Ps.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+class Cpu
+{
+  public:
+	static uint8_t getCpuId();
+};
+}
+}
+
+#endif // #ifndef PSFOUNDATION_PSCPU_H
diff --git a/PxShared/src/foundation/include/PsFPU.h b/PxShared/src/foundation/include/PsFPU.h
new file mode 100644
index 00000000..d8f0df75
--- /dev/null
+++ b/PxShared/src/foundation/include/PsFPU.h
@@ -0,0 +1,103 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSFPU_H
+#define PSFOUNDATION_PSFPU_H
+
+#include "Ps.h"
+#include "PsIntrinsics.h"
+
+// Integer representation of a floating-point value.
+#define PX_IR(x) ((uint32_t&)(x))
+
+// Signed integer representation of a floating-point value.
+#define PX_SIR(x) ((int32_t&)(x))
+
+// Floating-point representation of an integer value.
+#define PX_FR(x) ((float&)(x))
+
+#define PX_SIGN_BITMASK 0x80000000
+
+#define PX_FPU_GUARD shdfnd::FPUGuard scopedFpGuard;
+#define PX_SIMD_GUARD shdfnd::SIMDGuard scopedFpGuard;
+
+#define PX_SUPPORT_GUARDS (PX_WINDOWS_FAMILY || PX_XBOXONE || (PX_LINUX && (PX_X86 || PX_X64)) || PX_PS4 || PX_OSX)
+
+namespace physx
+{
+namespace shdfnd
+{
+// sets the default SDK state for scalar and SIMD units
+class PX_FOUNDATION_API FPUGuard
+{
+  public:
+	FPUGuard();  // set fpu control word for PhysX
+	~FPUGuard(); // restore fpu control word
+  private:
+	uint32_t mControlWords[8];
+};
+
+// sets default SDK state for simd unit only, lighter weight than FPUGuard
+class SIMDGuard
+{
+  public:
+	PX_INLINE SIMDGuard();  // set simd control word for PhysX
+	PX_INLINE ~SIMDGuard(); // restore simd control word
+  private:
+#if PX_SUPPORT_GUARDS
+	uint32_t mControlWord;
+#endif
+};
+
+/**
+\brief Enables floating point exceptions for the scalar and SIMD unit
+*/
+PX_FOUNDATION_API void enableFPExceptions();
+
+/**
+\brief Disables floating point exceptions for the scalar and SIMD unit
+*/
+PX_FOUNDATION_API void disableFPExceptions();
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_WINDOWS_FAMILY || PX_XBOXONE
+#include "windows/PsWindowsFPU.h"
+#elif PX_LINUX || PX_PS4 || PX_OSX
+#include "unix/PsUnixFPU.h"
+#else
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{
+}
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{
+}
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSFPU_H
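Both guards are RAII scopes: the constructor saves the current floating-point control state and installs the state PhysX expects, and the destructor restores the caller's state, with the platform-specific work delegated to the per-platform headers included above. A usage sketch (hypothetical caller, not part of the commit):

    #include "PsFPU.h"

    using namespace physx;

    void simdHeavyWork()
    {
        PX_SIMD_GUARD // expands to: shdfnd::SIMDGuard scopedFpGuard;

        // ... SIMD math here runs under PhysX's control-word settings ...

    } // scopedFpGuard's destructor restores the previous control word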
diff --git a/PxShared/src/foundation/include/PsFoundation.h b/PxShared/src/foundation/include/PsFoundation.h
new file mode 100644
index 00000000..27e94321
--- /dev/null
+++ b/PxShared/src/foundation/include/PsFoundation.h
@@ -0,0 +1,216 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PX_FOUNDATION_PSFOUNDATION_H
+#define PX_FOUNDATION_PSFOUNDATION_H
+
+#include "foundation/PxFoundation.h"
+#include "foundation/PxErrors.h"
+#include "foundation/PxProfiler.h"
+
+#include "PsBroadcast.h"
+#include "PsAllocator.h"
+#include "PsTempAllocator.h"
+#include "PsMutex.h"
+#include "PsHashMap.h"
+#include "PsUserAllocated.h"
+
+#include <stdarg.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4251) // class needs to have dll-interface to be used by clients of class
+#endif
+
+class PX_FOUNDATION_API Foundation : public PxFoundation, public UserAllocated
+{
+	PX_NOCOPY(Foundation)
+
+  public:
+	typedef MutexT<Allocator> Mutex;
+
+	typedef HashMap<const NamedAllocator*, const char*, Hash<const NamedAllocator*>, NonTrackingAllocator> AllocNameMap;
+	typedef Array<TempAllocatorChunk*, Allocator> AllocFreeTable;
+
+  public:
+	// factory
+	// note, you MUST eventually call release if createInstance returned a valid pointer!
+	static Foundation* createInstance(PxU32 version, PxErrorCallback& errc, PxAllocatorCallback& alloc);
+	static Foundation& getInstance();
+	void release();
+	static void incRefCount(); // this call requires a foundation object to exist already
+	static void decRefCount(); // this call requires a foundation object to exist already
+
+	// Begin Errors
+	virtual PxErrorCallback& getErrorCallback()
+	{
+		return mErrorCallback;
+	} // Return the user's error callback
+	PxErrorCallback& getInternalErrorCallback()
+	{
+		return mBroadcastingError;
+	} // Return the broadcasting error callback
+
+	void registerErrorCallback(PxErrorCallback& listener);
+	void deregisterErrorCallback(PxErrorCallback& listener);
+
+	virtual void setErrorLevel(PxErrorCode::Enum mask)
+	{
+		mErrorMask = mask;
+	}
+	virtual PxErrorCode::Enum getErrorLevel() const
+	{
+		return mErrorMask;
+	}
+
+	void error(PxErrorCode::Enum, const char* file, int line, const char* messageFmt, ...); // Report errors with the
+	// broadcasting
+	void errorImpl(PxErrorCode::Enum, const char* file, int line, const char* messageFmt, va_list); // error callback
+	static PxU32 getWarnOnceTimestamp();
+
+	// End errors
+
+	// Begin Allocations
+	virtual PxAllocatorCallback& getAllocatorCallback()
+	{
+		return mAllocatorCallback;
+	} // Return the user's allocator callback
+	PxAllocatorCallback& getAllocator()
+	{
+		return mBroadcastingAllocator;
+	} // Return the broadcasting allocator
+
+	void registerAllocationListener(physx::shdfnd::AllocationListener& listener);
+	void deregisterAllocationListener(physx::shdfnd::AllocationListener& listener);
+
+	virtual bool getReportAllocationNames() const
+	{
+		return mReportAllocationNames;
+	}
+	virtual void setReportAllocationNames(bool value)
+	{
+		mReportAllocationNames = value;
+	}
+
+	PX_INLINE AllocNameMap& getNamedAllocMap()
+	{
+		return mNamedAllocMap;
+	}
+	PX_INLINE Mutex& getNamedAllocMutex()
+	{
+		return mNamedAllocMutex;
+	}
+
+	PX_INLINE AllocFreeTable& getTempAllocFreeTable()
+	{
+		return mTempAllocFreeTable;
+	}
+	PX_INLINE Mutex& getTempAllocMutex()
+	{
+		return mTempAllocMutex;
+	}
+	// End allocations
+
+  private:
+	static void destroyInstance();
+
+	Foundation(PxErrorCallback& errc, PxAllocatorCallback& alloc);
+	~Foundation();
+
+	// init order is tricky here: the mutexes require the allocator, the allocator may require the error stream
+	PxAllocatorCallback& mAllocatorCallback;
+	PxErrorCallback& mErrorCallback;
+
+	BroadcastingAllocator mBroadcastingAllocator;
+	BroadcastingErrorCallback mBroadcastingError;
+
+	bool mReportAllocationNames;
+
+	PxErrorCode::Enum mErrorMask;
+	Mutex mErrorMutex;
+
+	AllocNameMap mNamedAllocMap;
+	Mutex mNamedAllocMutex;
+
+	AllocFreeTable mTempAllocFreeTable;
+	Mutex mTempAllocMutex;
+
+	Mutex mListenerMutex;
+
+	static Foundation* mInstance;
+	static PxU32 mRefCount;
+	static PxU32 mWarnOnceTimestap;
+};
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+PX_INLINE Foundation& getFoundation()
+{
+	return Foundation::getInstance();
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+// shortcut macros:
+// usage: Foundation::error(PX_WARN, "static friction %f is lower than dynamic friction %f", sfr, dfr);
+#define PX_WARN ::physx::PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__
+#define PX_INFO ::physx::PxErrorCode::eDEBUG_INFO, __FILE__, __LINE__
+
+#if PX_DEBUG || PX_CHECKED
+#define PX_WARN_ONCE(string)                                        \
+	{                                                               \
+		static PxU32 timestamp = 0;                                 \
+		if(timestamp != Ps::getFoundation().getWarnOnceTimestamp()) \
+		{                                                           \
+			timestamp = Ps::getFoundation().getWarnOnceTimestamp(); \
+			Ps::getFoundation().error(PX_WARN, string);             \
+		}                                                           \
+	}
+#define PX_WARN_ONCE_IF(condition, string)                          \
+	{                                                               \
+		if(condition)                                               \
+		{                                                           \
+			PX_WARN_ONCE(string)                                    \
+		}                                                           \
+	}
+#else
+#define PX_WARN_ONCE(string) ((void)0)
+#define PX_WARN_ONCE_IF(condition, string) ((void)0)
+#endif
+
+#endif
diff --git a/PxShared/src/foundation/include/PsHash.h b/PxShared/src/foundation/include/PsHash.h
new file mode 100644
index 00000000..0c7ab22a
--- /dev/null
+++ b/PxShared/src/foundation/include/PsHash.h
@@ -0,0 +1,162 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSHASH_H
+#define PSFOUNDATION_PSHASH_H
+
+#include "Ps.h"
+#include "PsBasicTemplates.h"
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4302)
+#endif
+
+#if PX_LINUX
+#include "foundation/PxSimpleTypes.h"
+#endif
+
+/*!
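A plausible bring-up and tear-down sequence for this class (hypothetical sketch; userErrors and userAllocator stand in for application-supplied callbacks, and PX_FOUNDATION_VERSION is assumed to be the SDK's version constant):

    #include "PsFoundation.h"

    using namespace physx;

    void foundationLifetimeSketch(PxErrorCallback& userErrors, PxAllocatorCallback& userAllocator)
    {
        shdfnd::Foundation* foundation =
            shdfnd::Foundation::createInstance(PX_FOUNDATION_VERSION, userErrors, userAllocator);
        if(!foundation)
            return;

        // PX_WARN expands to (eDEBUG_WARNING, __FILE__, __LINE__), so the
        // call site only supplies the printf-style message and arguments.
        foundation->error(PX_WARN, "demo warning: value %d out of range", 42);

        foundation->release(); // pairs with the successful createInstance
    }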
+Central definition of hash functions +*/ + +namespace physx +{ +namespace shdfnd +{ +// Hash functions + +// Thomas Wang's 32 bit mix +// http://www.cris.com/~Ttwang/tech/inthash.htm +PX_FORCE_INLINE uint32_t hash(const uint32_t key) +{ + uint32_t k = key; + k += ~(k << 15); + k ^= (k >> 10); + k += (k << 3); + k ^= (k >> 6); + k += ~(k << 11); + k ^= (k >> 16); + return uint32_t(k); +} + +PX_FORCE_INLINE uint32_t hash(const int32_t key) +{ + return hash(uint32_t(key)); +} + +// Thomas Wang's 64 bit mix +// http://www.cris.com/~Ttwang/tech/inthash.htm +PX_FORCE_INLINE uint32_t hash(const uint64_t key) +{ + uint64_t k = key; + k += ~(k << 32); + k ^= (k >> 22); + k += ~(k << 13); + k ^= (k >> 8); + k += (k << 3); + k ^= (k >> 15); + k += ~(k << 27); + k ^= (k >> 31); + return uint32_t(UINT32_MAX & k); +} + +#if PX_APPLE_FAMILY +// hash for size_t, to make gcc happy +PX_INLINE uint32_t hash(const size_t key) +{ +#if PX_P64_FAMILY + return hash(uint64_t(key)); +#else + return hash(uint32_t(key)); +#endif +} +#endif + +// Hash function for pointers +PX_INLINE uint32_t hash(const void* ptr) +{ +#if PX_P64_FAMILY + return hash(uint64_t(ptr)); +#else + return hash(uint32_t(UINT32_MAX & size_t(ptr))); +#endif +} + +// Hash function for pairs +template <typename F, typename S> +PX_INLINE uint32_t hash(const Pair<F, S>& p) +{ + uint32_t seed = 0x876543; + uint32_t m = 1000007; + return hash(p.second) ^ (m * (hash(p.first) ^ (m * seed))); +} + +// hash object for hash map template parameter +template <class Key> +struct Hash +{ + uint32_t operator()(const Key& k) const + { + return hash(k); + } + bool equal(const Key& k0, const Key& k1) const + { + return k0 == k1; + } +}; + +// specialization for strings +template <> +struct Hash<const char*> +{ + public: + uint32_t operator()(const char* _string) const + { + // "DJB" string hash + const uint8_t* string = reinterpret_cast<const uint8_t*>(_string); + uint32_t h = 5381; + for(const uint8_t* ptr = string; *ptr; ptr++) + h = ((h << 5) + h) ^ uint32_t(*ptr); + return h; + } + bool equal(const char* string0, const char* string1) const + { + return !strcmp(string0, string1); + } +}; + +} // namespace shdfnd +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif + +#endif // #ifndef PSFOUNDATION_PSHASH_H diff --git a/PxShared/src/foundation/include/PsHashInternals.h b/PxShared/src/foundation/include/PsHashInternals.h new file mode 100644 index 00000000..5aa7adcf --- /dev/null +++ b/PxShared/src/foundation/include/PsHashInternals.h @@ -0,0 +1,795 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSHASHINTERNALS_H +#define PSFOUNDATION_PSHASHINTERNALS_H + +#include "PsBasicTemplates.h" +#include "PsArray.h" +#include "PsBitUtils.h" +#include "PsHash.h" +#include "foundation/PxIntrinsics.h" + +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4127) // conditional expression is constant +#endif +namespace physx +{ +namespace shdfnd +{ +namespace internal +{ +template <class Entry, class Key, class HashFn, class GetKey, class Allocator, bool compacting> +class HashBase : private Allocator +{ + void init(uint32_t initialTableSize, float loadFactor) + { + mBuffer = NULL; + mEntries = NULL; + mEntriesNext = NULL; + mHash = NULL; + mEntriesCapacity = 0; + mHashSize = 0; + mLoadFactor = loadFactor; + mFreeList = uint32_t(EOL); + mTimestamp = 0; + mEntriesCount = 0; + + if(initialTableSize) + reserveInternal(initialTableSize); + } + + public: + typedef Entry EntryType; + + HashBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : Allocator(PX_DEBUG_EXP("hashBase")) + { + init(initialTableSize, loadFactor); + } + + HashBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) : Allocator(alloc) + { + init(initialTableSize, loadFactor); + } + + HashBase(const Allocator& alloc) : Allocator(alloc) + { + init(64, 0.75f); + } + + ~HashBase() + { + destroy(); // No need to clear() + + if(mBuffer) + Allocator::deallocate(mBuffer); + } + + static const uint32_t EOL = 0xffffffff; + + PX_INLINE Entry* create(const Key& k, bool& exists) + { + uint32_t h = 0; + if(mHashSize) + { + h = hash(k); + uint32_t index = mHash[h]; + while(index != EOL && !HashFn().equal(GetKey()(mEntries[index]), k)) + index = mEntriesNext[index]; + exists = index != EOL; + if(exists) + return mEntries + index; + } + else + exists = false; + + if(freeListEmpty()) + { + grow(); + h = hash(k); + } + + uint32_t entryIndex = freeListGetNext(); + + mEntriesNext[entryIndex] = mHash[h]; + mHash[h] = entryIndex; + + mEntriesCount++; + mTimestamp++; + + return mEntries + entryIndex; + } + + PX_INLINE const Entry* find(const Key& k) const + { + if(!mEntriesCount) + return NULL; + + const uint32_t h = hash(k); + uint32_t index = mHash[h]; + while(index != EOL && !HashFn().equal(GetKey()(mEntries[index]), k)) + index = mEntriesNext[index]; + return index != EOL ? 
mEntries + index : NULL; + } + + PX_INLINE bool erase(const Key& k, Entry& e) + { + if(!mEntriesCount) + return false; + + const uint32_t h = hash(k); + uint32_t* ptr = mHash + h; + while(*ptr != EOL && !HashFn().equal(GetKey()(mEntries[*ptr]), k)) + ptr = mEntriesNext + *ptr; + + if(*ptr == EOL) + return false; + + PX_PLACEMENT_NEW(&e, Entry)(mEntries[*ptr]); + + return eraseInternal(ptr); + } + + PX_INLINE bool erase(const Key& k) + { + if(!mEntriesCount) + return false; + + const uint32_t h = hash(k); + uint32_t* ptr = mHash + h; + while(*ptr != EOL && !HashFn().equal(GetKey()(mEntries[*ptr]), k)) + ptr = mEntriesNext + *ptr; + + if(*ptr == EOL) + return false; + + return eraseInternal(ptr); + } + + PX_INLINE uint32_t size() const + { + return mEntriesCount; + } + + PX_INLINE uint32_t capacity() const + { + return mHashSize; + } + + void clear() + { + if(!mHashSize || mEntriesCount == 0) + return; + + destroy(); + + intrinsics::memSet(mHash, EOL, mHashSize * sizeof(uint32_t)); + + const uint32_t sizeMinus1 = mEntriesCapacity - 1; + for(uint32_t i = 0; i < sizeMinus1; i++) + { + prefetchLine(mEntriesNext + i, 128); + mEntriesNext[i] = i + 1; + } + mEntriesNext[mEntriesCapacity - 1] = uint32_t(EOL); + mFreeList = 0; + mEntriesCount = 0; + } + + void reserve(uint32_t size) + { + if(size > mHashSize) + reserveInternal(size); + } + + PX_INLINE const Entry* getEntries() const + { + return mEntries; + } + + PX_INLINE Entry* insertUnique(const Key& k) + { + PX_ASSERT(find(k) == NULL); + uint32_t h = hash(k); + + uint32_t entryIndex = freeListGetNext(); + + mEntriesNext[entryIndex] = mHash[h]; + mHash[h] = entryIndex; + + mEntriesCount++; + mTimestamp++; + + return mEntries + entryIndex; + } + + private: + void destroy() + { + for(uint32_t i = 0; i < mHashSize; i++) + { + for(uint32_t j = mHash[i]; j != EOL; j = mEntriesNext[j]) + mEntries[j].~Entry(); + } + } + + template <typename HK, typename GK, class A, bool comp> + PX_NOINLINE void copy(const HashBase<Entry, Key, HK, GK, A, comp>& other); + + // free list management - if we're coalescing, then we use mFreeList to hold + // the top of the free list and it should always be equal to size(). Otherwise, + // we build a free list in the next() pointers. 
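+	// (illustrative note, not part of the original comment: with compacting == true
+	// the entries stay dense in [0, mEntriesCount), so freeListGetNext() is simply
+	// mFreeList++ and erase() swaps the last entry into the hole; with
+	// compacting == false, mEntriesNext doubles as a singly linked free list
+	// threaded through the unused slots.)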
+ + PX_INLINE void freeListAdd(uint32_t index) + { + if(compacting) + { + mFreeList--; + PX_ASSERT(mFreeList == mEntriesCount); + } + else + { + mEntriesNext[index] = mFreeList; + mFreeList = index; + } + } + + PX_INLINE void freeListAdd(uint32_t start, uint32_t end) + { + if(!compacting) + { + for(uint32_t i = start; i < end - 1; i++) // add the new entries to the free list + mEntriesNext[i] = i + 1; + + // link in old free list + mEntriesNext[end - 1] = mFreeList; + PX_ASSERT(mFreeList != end - 1); + mFreeList = start; + } + else if(mFreeList == EOL) // don't reset the free ptr for the compacting hash unless it's empty + mFreeList = start; + } + + PX_INLINE uint32_t freeListGetNext() + { + PX_ASSERT(!freeListEmpty()); + if(compacting) + { + PX_ASSERT(mFreeList == mEntriesCount); + return mFreeList++; + } + else + { + uint32_t entryIndex = mFreeList; + mFreeList = mEntriesNext[mFreeList]; + return entryIndex; + } + } + + PX_INLINE bool freeListEmpty() const + { + if(compacting) + return mEntriesCount == mEntriesCapacity; + else + return mFreeList == EOL; + } + + PX_INLINE void replaceWithLast(uint32_t index) + { + PX_PLACEMENT_NEW(mEntries + index, Entry)(mEntries[mEntriesCount]); + mEntries[mEntriesCount].~Entry(); + mEntriesNext[index] = mEntriesNext[mEntriesCount]; + + uint32_t h = hash(GetKey()(mEntries[index])); + uint32_t* ptr; + for(ptr = mHash + h; *ptr != mEntriesCount; ptr = mEntriesNext + *ptr) + PX_ASSERT(*ptr != EOL); + *ptr = index; + } + + PX_INLINE uint32_t hash(const Key& k, uint32_t hashSize) const + { + return HashFn()(k) & (hashSize - 1); + } + + PX_INLINE uint32_t hash(const Key& k) const + { + return hash(k, mHashSize); + } + + PX_INLINE bool eraseInternal(uint32_t* ptr) + { + const uint32_t index = *ptr; + + *ptr = mEntriesNext[index]; + + mEntries[index].~Entry(); + + mEntriesCount--; + mTimestamp++; + + if (compacting && index != mEntriesCount) + replaceWithLast(index); + + freeListAdd(index); + return true; + } + + void reserveInternal(uint32_t size) + { + if(!isPowerOfTwo(size)) + size = nextPowerOfTwo(size); + + PX_ASSERT(!(size & (size - 1))); + + // decide whether iteration can be done on the entries directly + bool resizeCompact = compacting || freeListEmpty(); + + // define new table sizes + uint32_t oldEntriesCapacity = mEntriesCapacity; + uint32_t newEntriesCapacity = uint32_t(float(size) * mLoadFactor); + uint32_t newHashSize = size; + + // allocate new common buffer and setup pointers to new tables + uint8_t* newBuffer; + uint32_t* newHash; + uint32_t* newEntriesNext; + Entry* newEntries; + { + uint32_t newHashByteOffset = 0; + uint32_t newEntriesNextBytesOffset = newHashByteOffset + newHashSize * sizeof(uint32_t); + uint32_t newEntriesByteOffset = newEntriesNextBytesOffset + newEntriesCapacity * sizeof(uint32_t); + newEntriesByteOffset += (16 - (newEntriesByteOffset & 15)) & 15; + uint32_t newBufferByteSize = newEntriesByteOffset + newEntriesCapacity * sizeof(Entry); + + newBuffer = reinterpret_cast<uint8_t*>(Allocator::allocate(newBufferByteSize, __FILE__, __LINE__)); + PX_ASSERT(newBuffer); + + newHash = reinterpret_cast<uint32_t*>(newBuffer + newHashByteOffset); + newEntriesNext = reinterpret_cast<uint32_t*>(newBuffer + newEntriesNextBytesOffset); + newEntries = reinterpret_cast<Entry*>(newBuffer + newEntriesByteOffset); + } + + // initialize new hash table + intrinsics::memSet(newHash, uint32_t(EOL), newHashSize * sizeof(uint32_t)); + + // iterate over old entries, re-hash and create new entries + if(resizeCompact) + { + // check that old free list 
is empty - we don't need to copy the next entries + PX_ASSERT(compacting || mFreeList == EOL); + + for(uint32_t index = 0; index < mEntriesCount; ++index) + { + uint32_t h = hash(GetKey()(mEntries[index]), newHashSize); + newEntriesNext[index] = newHash[h]; + newHash[h] = index; + + PX_PLACEMENT_NEW(newEntries + index, Entry)(mEntries[index]); + mEntries[index].~Entry(); + } + } + else + { + // copy old free list, only required for non compact resizing + intrinsics::memCopy(newEntriesNext, mEntriesNext, mEntriesCapacity * sizeof(uint32_t)); + + for(uint32_t bucket = 0; bucket < mHashSize; bucket++) + { + uint32_t index = mHash[bucket]; + while(index != EOL) + { + uint32_t h = hash(GetKey()(mEntries[index]), newHashSize); + newEntriesNext[index] = newHash[h]; + PX_ASSERT(index != newHash[h]); + + newHash[h] = index; + + PX_PLACEMENT_NEW(newEntries + index, Entry)(mEntries[index]); + mEntries[index].~Entry(); + + index = mEntriesNext[index]; + } + } + } + + // swap buffer and pointers + Allocator::deallocate(mBuffer); + mBuffer = newBuffer; + mHash = newHash; + mHashSize = newHashSize; + mEntriesNext = newEntriesNext; + mEntries = newEntries; + mEntriesCapacity = newEntriesCapacity; + + freeListAdd(oldEntriesCapacity, newEntriesCapacity); + } + + void grow() + { + PX_ASSERT((mFreeList == EOL) || (compacting && (mEntriesCount == mEntriesCapacity))); + + uint32_t size = mHashSize == 0 ? 16 : mHashSize * 2; + reserve(size); + } + + uint8_t* mBuffer; + Entry* mEntries; + uint32_t* mEntriesNext; // same size as mEntries + uint32_t* mHash; + uint32_t mEntriesCapacity; + uint32_t mHashSize; + float mLoadFactor; + uint32_t mFreeList; + uint32_t mTimestamp; + uint32_t mEntriesCount; // number of entries + + public: + class Iter + { + public: + PX_INLINE Iter(HashBase& b) : mBucket(0), mEntry(uint32_t(b.EOL)), mTimestamp(b.mTimestamp), mBase(b) + { + if(mBase.mEntriesCapacity > 0) + { + mEntry = mBase.mHash[0]; + skip(); + } + } + + PX_INLINE void check() const + { + PX_ASSERT(mTimestamp == mBase.mTimestamp); + } + PX_INLINE const Entry& operator*() const + { + check(); + return mBase.mEntries[mEntry]; + } + PX_INLINE Entry& operator*() + { + check(); + return mBase.mEntries[mEntry]; + } + PX_INLINE const Entry* operator->() const + { + check(); + return mBase.mEntries + mEntry; + } + PX_INLINE Entry* operator->() + { + check(); + return mBase.mEntries + mEntry; + } + PX_INLINE Iter operator++() + { + check(); + advance(); + return *this; + } + PX_INLINE Iter operator++(int) + { + check(); + Iter i = *this; + advance(); + return i; + } + PX_INLINE bool done() const + { + check(); + return mEntry == mBase.EOL; + } + + private: + PX_INLINE void advance() + { + mEntry = mBase.mEntriesNext[mEntry]; + skip(); + } + PX_INLINE void skip() + { + while(mEntry == mBase.EOL) + { + if(++mBucket == mBase.mHashSize) + break; + mEntry = mBase.mHash[mBucket]; + } + } + + Iter& operator=(const Iter&); + + uint32_t mBucket; + uint32_t mEntry; + uint32_t mTimestamp; + HashBase& mBase; + }; + + /*! 
+ Iterate over entries in a hash base and allow entry erase while iterating + */ + class EraseIterator + { + public: + PX_INLINE EraseIterator(HashBase& b): mBase(b) + { + reset(); + } + + PX_INLINE Entry* eraseCurrentGetNext(bool eraseCurrent) + { + if(eraseCurrent && mCurrentEntryIndexPtr) + { + mBase.eraseInternal(mCurrentEntryIndexPtr); + // if next was valid return the same ptr, if next was EOL search new hash entry + if(*mCurrentEntryIndexPtr != mBase.EOL) + return mBase.mEntries + *mCurrentEntryIndexPtr; + else + return traverseHashEntries(); + } + + // traverse mHash to find next entry + if(mCurrentEntryIndexPtr == NULL) + return traverseHashEntries(); + + const uint32_t index = *mCurrentEntryIndexPtr; + if(mBase.mEntriesNext[index] == mBase.EOL) + { + return traverseHashEntries(); + } + else + { + mCurrentEntryIndexPtr = mBase.mEntriesNext + index; + return mBase.mEntries + *mCurrentEntryIndexPtr; + } + } + + PX_INLINE void reset() + { + mCurrentHashIndex = 0; + mCurrentEntryIndexPtr = NULL; + } + + private: + PX_INLINE Entry* traverseHashEntries() + { + mCurrentEntryIndexPtr = NULL; + while (mCurrentEntryIndexPtr == NULL && mCurrentHashIndex < mBase.mHashSize) + { + if (mBase.mHash[mCurrentHashIndex] != mBase.EOL) + { + mCurrentEntryIndexPtr = mBase.mHash + mCurrentHashIndex; + mCurrentHashIndex++; + return mBase.mEntries + *mCurrentEntryIndexPtr; + } + else + { + mCurrentHashIndex++; + } + } + return NULL; + } + + EraseIterator& operator=(const EraseIterator&); + private: + uint32_t* mCurrentEntryIndexPtr; + uint32_t mCurrentHashIndex; + HashBase& mBase; + }; +}; + +template <class Entry, class Key, class HashFn, class GetKey, class Allocator, bool compacting> +template <typename HK, typename GK, class A, bool comp> +PX_NOINLINE void +HashBase<Entry, Key, HashFn, GetKey, Allocator, compacting>::copy(const HashBase<Entry, Key, HK, GK, A, comp>& other) +{ + reserve(other.mEntriesCount); + + for(uint32_t i = 0; i < other.mEntriesCount; i++) + { + for(uint32_t j = other.mHash[i]; j != EOL; j = other.mEntriesNext[j]) + { + const Entry& otherEntry = other.mEntries[j]; + + bool exists; + Entry* newEntry = create(GK()(otherEntry), exists); + PX_ASSERT(!exists); + + PX_PLACEMENT_NEW(newEntry, Entry)(otherEntry); + } + } +} + +template <class Key, class HashFn, class Allocator = typename AllocatorTraits<Key>::Type, bool Coalesced = false> +class HashSetBase +{ + PX_NOCOPY(HashSetBase) + public: + struct GetKey + { + PX_INLINE const Key& operator()(const Key& e) + { + return e; + } + }; + + typedef HashBase<Key, Key, HashFn, GetKey, Allocator, Coalesced> BaseMap; + typedef typename BaseMap::Iter Iterator; + + HashSetBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : mBase(initialTableSize, loadFactor, alloc) + { + } + + HashSetBase(const Allocator& alloc) : mBase(64, 0.75f, alloc) + { + } + + HashSetBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : mBase(initialTableSize, loadFactor) + { + } + + bool insert(const Key& k) + { + bool exists; + Key* e = mBase.create(k, exists); + if(!exists) + PX_PLACEMENT_NEW(e, Key)(k); + return !exists; + } + + PX_INLINE bool contains(const Key& k) const + { + return mBase.find(k) != 0; + } + PX_INLINE bool erase(const Key& k) + { + return mBase.erase(k); + } + PX_INLINE uint32_t size() const + { + return mBase.size(); + } + PX_INLINE uint32_t capacity() const + { + return mBase.capacity(); + } + PX_INLINE void reserve(uint32_t size) + { + mBase.reserve(size); + } + PX_INLINE void clear() + { + mBase.clear(); + } + 
+ protected: + BaseMap mBase; +}; + +template <class Key, class Value, class HashFn, class Allocator = typename AllocatorTraits<Pair<const Key, Value> >::Type> +class HashMapBase +{ + PX_NOCOPY(HashMapBase) + public: + typedef Pair<const Key, Value> Entry; + + struct GetKey + { + PX_INLINE const Key& operator()(const Entry& e) + { + return e.first; + } + }; + + typedef HashBase<Entry, Key, HashFn, GetKey, Allocator, true> BaseMap; + typedef typename BaseMap::Iter Iterator; + typedef typename BaseMap::EraseIterator EraseIterator; + + HashMapBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : mBase(initialTableSize, loadFactor, alloc) + { + } + + HashMapBase(const Allocator& alloc) : mBase(64, 0.75f, alloc) + { + } + + HashMapBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : mBase(initialTableSize, loadFactor) + { + } + + bool insert(const Key /*&*/ k, const Value /*&*/ v) + { + bool exists; + Entry* e = mBase.create(k, exists); + if(!exists) + PX_PLACEMENT_NEW(e, Entry)(k, v); + return !exists; + } + + Value& operator[](const Key& k) + { + bool exists; + Entry* e = mBase.create(k, exists); + if(!exists) + PX_PLACEMENT_NEW(e, Entry)(k, Value()); + + return e->second; + } + + PX_INLINE const Entry* find(const Key& k) const + { + return mBase.find(k); + } + PX_INLINE bool erase(const Key& k) + { + return mBase.erase(k); + } + PX_INLINE bool erase(const Key& k, Entry& e) + { + return mBase.erase(k, e); + } + PX_INLINE uint32_t size() const + { + return mBase.size(); + } + PX_INLINE uint32_t capacity() const + { + return mBase.capacity(); + } + PX_INLINE Iterator getIterator() + { + return Iterator(mBase); + } + PX_INLINE EraseIterator getEraseIterator() + { + return EraseIterator(mBase); + } + PX_INLINE void reserve(uint32_t size) + { + mBase.reserve(size); + } + PX_INLINE void clear() + { + mBase.clear(); + } + + protected: + BaseMap mBase; +}; +} + +} // namespace shdfnd +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif +#endif // #ifndef PSFOUNDATION_PSHASHINTERNALS_H diff --git a/PxShared/src/foundation/include/PsHashMap.h b/PxShared/src/foundation/include/PsHashMap.h new file mode 100644 index 00000000..8af35174 --- /dev/null +++ b/PxShared/src/foundation/include/PsHashMap.h @@ -0,0 +1,118 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. 
Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSHASHMAP_H +#define PSFOUNDATION_PSHASHMAP_H + +#include "PsHashInternals.h" + +// TODO: make this doxy-format +// +// This header defines two hash maps. Hash maps +// * support custom initial table sizes (rounded up internally to power-of-2) +// * support custom static allocator objects +// * auto-resize, based on a load factor (i.e. a 64-entry .75 load factor hash will resize +// when the 49th element is inserted) +// * are based on open hashing +// * have O(1) contains, erase +// +// Maps have STL-like copying semantics, and properly initialize and destruct copies of objects +// +// There are two forms of map: coalesced and uncoalesced. Coalesced maps keep the entries in the +// initial segment of an array, so are fast to iterate over; however deletion is approximately +// twice as expensive. +// +// HashMap<T>: +// bool insert(const Key& k, const Value& v) O(1) amortized (exponential resize policy) +// Value & operator[](const Key& k) O(1) for existing objects, else O(1) amortized +// const Entry * find(const Key& k); O(1) +// bool erase(const T& k); O(1) +// uint32_t size(); constant +// void reserve(uint32_t size); O(MAX(currentOccupancy,size)) +// void clear(); O(currentOccupancy) (with zero constant for objects +// without +// destructors) +// Iterator getIterator(); +// +// operator[] creates an entry if one does not exist, initializing with the default constructor. 
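+//
+// A compact usage sketch (hypothetical, using the default NonTrackingAllocator):
+//
+//   shdfnd::HashMap<uint32_t, float> map;
+//   map.insert(7, 1.5f);    // true: key 7 was absent
+//   map[7] = 2.5f;          // operator[] reuses the existing entry
+//   if(map.find(7))         // find returns a const Pair<const Key, Value>*
+//       map.erase(7);       // true: the entry existed
+//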
+// CoalescedHashMap<T> does not support getIterator, but instead supports +// const Key *getEntries(); +// +// Use of iterators: +// +// for(HashMap::Iterator iter = test.getIterator(); !iter.done(); ++iter) +// myFunction(iter->first, iter->second); + +namespace physx +{ +namespace shdfnd +{ +template <class Key, class Value, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class HashMap : public internal::HashMapBase<Key, Value, HashFn, Allocator> +{ + public: + typedef internal::HashMapBase<Key, Value, HashFn, Allocator> HashMapBase; + typedef typename HashMapBase::Iterator Iterator; + + HashMap(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : HashMapBase(initialTableSize, loadFactor) + { + } + HashMap(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : HashMapBase(initialTableSize, loadFactor, alloc) + { + } + HashMap(const Allocator& alloc) : HashMapBase(64, 0.75f, alloc) + { + } + Iterator getIterator() + { + return Iterator(HashMapBase::mBase); + } +}; + +template <class Key, class Value, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class CoalescedHashMap : public internal::HashMapBase<Key, Value, HashFn, Allocator> +{ + public: + typedef internal::HashMapBase<Key, Value, HashFn, Allocator> HashMapBase; + + CoalescedHashMap(uint32_t initialTableSize = 64, float loadFactor = 0.75f) + : HashMapBase(initialTableSize, loadFactor) + { + } + const Pair<const Key, Value>* getEntries() const + { + return HashMapBase::mBase.getEntries(); + } +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSHASHMAP_H diff --git a/PxShared/src/foundation/include/PsHashSet.h b/PxShared/src/foundation/include/PsHashSet.h new file mode 100644 index 00000000..73fb7502 --- /dev/null +++ b/PxShared/src/foundation/include/PsHashSet.h @@ -0,0 +1,127 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. 
All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSHASHSET_H +#define PSFOUNDATION_PSHASHSET_H + +#include "PsHashInternals.h" + +// TODO: make this doxy-format + +// This header defines two hash sets. Hash sets +// * support custom initial table sizes (rounded up internally to power-of-2) +// * support custom static allocator objects +// * auto-resize, based on a load factor (i.e. a 64-entry .75 load factor hash will resize +// when the 49th element is inserted) +// * are based on open hashing +// +// Sets have STL-like copying semantics, and properly initialize and destruct copies of objects +// +// There are two forms of set: coalesced and uncoalesced. Coalesced sets keep the entries in the +// initial segment of an array, so are fast to iterate over; however deletion is approximately +// twice as expensive. +// +// HashSet<T>: +// bool insert(const T& k) amortized O(1) (exponential resize policy) +// bool contains(const T& k) const; O(1) +// bool erase(const T& k); O(1) +// uint32_t size() const; constant +// void reserve(uint32_t size); O(MAX(size, currentOccupancy)) +// void clear(); O(currentOccupancy) (with zero constant for objects without +// destructors) +// Iterator getIterator(); +// +// Use of iterators: +// +// for(HashSet::Iterator iter = test.getIterator(); !iter.done(); ++iter) +// myFunction(*iter); +// +// CoalescedHashSet<T> does not support getIterator, but instead supports +// const Key *getEntries(); +// +// insertion into a set already containing the element fails returning false, as does +// erasure of an element not in the set +// + +namespace physx +{ +namespace shdfnd +{ +template <class Key, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class HashSet : public internal::HashSetBase<Key, HashFn, Allocator, false> +{ + public: + typedef internal::HashSetBase<Key, HashFn, Allocator, false> HashSetBase; + typedef typename HashSetBase::Iterator Iterator; + + HashSet(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : HashSetBase(initialTableSize, loadFactor) + { + } + HashSet(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : HashSetBase(initialTableSize, loadFactor, alloc) + { + } + HashSet(const Allocator& alloc) : HashSetBase(64, 0.75f, alloc) + { + } + Iterator getIterator() + { + return Iterator(HashSetBase::mBase); + } +}; + +template <class Key, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class CoalescedHashSet : public internal::HashSetBase<Key, HashFn, Allocator, true> +{ + public: + typedef typename internal::HashSetBase<Key, HashFn, Allocator, true> HashSetBase; + + CoalescedHashSet(uint32_t initialTableSize = 64, float loadFactor = 0.75f) + : HashSetBase(initialTableSize, loadFactor) + { + } + + CoalescedHashSet(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : HashSetBase(initialTableSize, loadFactor, alloc) + { + } + CoalescedHashSet(const Allocator& alloc) : HashSetBase(64, 0.75f, alloc) + { + } + + const Key* getEntries() const + { + return HashSetBase::mBase.getEntries(); + } +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSHASHSET_H diff --git a/PxShared/src/foundation/include/PsInlineAllocator.h b/PxShared/src/foundation/include/PsInlineAllocator.h new file mode 100644 index 00000000..9e129f37 --- /dev/null +++ b/PxShared/src/foundation/include/PsInlineAllocator.h @@ -0,0 +1,91 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you 
+// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINLINEALLOCATOR_H +#define PSFOUNDATION_PSINLINEALLOCATOR_H + +#include "PsUserAllocated.h" + +namespace physx +{ +namespace shdfnd +{ +// this is used by the array class to allocate some space for a small number +// of objects along with the metadata +template <uint32_t N, typename BaseAllocator> +class InlineAllocator : private BaseAllocator +{ + public: + InlineAllocator(const PxEMPTY v) : BaseAllocator(v) + { + } + + InlineAllocator(const BaseAllocator& alloc = BaseAllocator()) : BaseAllocator(alloc), mBufferUsed(false) + { + } + + InlineAllocator(const InlineAllocator& aloc) : BaseAllocator(aloc), mBufferUsed(false) + { + } + + void* allocate(uint32_t size, const char* filename, int line) + { + if(!mBufferUsed && size <= N) + { + mBufferUsed = true; + return mBuffer; + } + return BaseAllocator::allocate(size, filename, line); + } + + void deallocate(void* ptr) + { + if(ptr == mBuffer) + mBufferUsed = false; + else + BaseAllocator::deallocate(ptr); + } + + PX_FORCE_INLINE uint8_t* getInlineBuffer() + { + return mBuffer; + } + PX_FORCE_INLINE bool isBufferUsed() const + { + return mBufferUsed; + } + + protected: + uint8_t mBuffer[N]; + bool mBufferUsed; +}; +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSINLINEALLOCATOR_H diff --git a/PxShared/src/foundation/include/PsInlineAoS.h b/PxShared/src/foundation/include/PsInlineAoS.h new file mode 100644 index 00000000..cd051180 --- /dev/null +++ b/PxShared/src/foundation/include/PsInlineAoS.h @@ -0,0 +1,48 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINLINEAOS_H +#define PSFOUNDATION_PSINLINEAOS_H + +#include "foundation/PxPreprocessor.h" + +#if PX_WINDOWS +#include "windows/PsWindowsTrigConstants.h" +#include "windows/PsWindowsInlineAoS.h" +#elif(PX_UNIX_FAMILY || PX_PS4) +#include "unix/PsUnixTrigConstants.h" +#include "unix/PsUnixInlineAoS.h" +#elif PX_XBOXONE +#include "XboxOne/PsXboxOneTrigConstants.h" +#include "XboxOne/PsXboxOneInlineAoS.h" +#else +#error "Platform not supported!" +#endif + +#endif diff --git a/PxShared/src/foundation/include/PsInlineArray.h b/PxShared/src/foundation/include/PsInlineArray.h new file mode 100644 index 00000000..10434f91 --- /dev/null +++ b/PxShared/src/foundation/include/PsInlineArray.h @@ -0,0 +1,68 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINLINEARRAY_H +#define PSFOUNDATION_PSINLINEARRAY_H + +#include "PsArray.h" +#include "PsInlineAllocator.h" + +namespace physx +{ +namespace shdfnd +{ + +// array that pre-allocates for N elements +template <typename T, uint32_t N, typename Alloc = typename AllocatorTraits<T>::Type> +class InlineArray : public Array<T, InlineAllocator<N * sizeof(T), Alloc> > +{ + typedef InlineAllocator<N * sizeof(T), Alloc> Allocator; + + public: + InlineArray(const PxEMPTY v) : Array<T, Allocator>(v) + { + if(isInlined()) + this->mData = reinterpret_cast<T*>(Array<T, Allocator>::getInlineBuffer()); + } + + PX_INLINE bool isInlined() const + { + return Allocator::isBufferUsed(); + } + + PX_INLINE explicit InlineArray(const Alloc& alloc = Alloc()) : Array<T, Allocator>(alloc) + { + this->mData = this->allocate(N); + this->mCapacity = N; + } +}; +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSINLINEARRAY_H diff --git a/PxShared/src/foundation/include/PsIntrinsics.h b/PxShared/src/foundation/include/PsIntrinsics.h new file mode 100644 index 00000000..2657d2a6 --- /dev/null +++ b/PxShared/src/foundation/include/PsIntrinsics.h @@ -0,0 +1,45 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINTRINSICS_H +#define PSFOUNDATION_PSINTRINSICS_H + +#include "foundation/PxPreprocessor.h" + +#if PX_WINDOWS_FAMILY +#include "windows/PsWindowsIntrinsics.h" +#elif(PX_LINUX || PX_ANDROID || PX_APPLE_FAMILY || PX_PS4) +#include "unix/PsUnixIntrinsics.h" +#elif PX_XBOXONE +#include "XboxOne/PsXboxOneIntrinsics.h" +#else +#error "Platform not supported!" +#endif + +#endif // #ifndef PSFOUNDATION_PSINTRINSICS_H diff --git a/PxShared/src/foundation/include/PsMathUtils.h b/PxShared/src/foundation/include/PsMathUtils.h new file mode 100644 index 00000000..e3c6e8c9 --- /dev/null +++ b/PxShared/src/foundation/include/PsMathUtils.h @@ -0,0 +1,692 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSMATHUTILS_H +#define PSFOUNDATION_PSMATHUTILS_H + +#include "foundation/PxPreprocessor.h" +#include "foundation/PxTransform.h" +#include "foundation/PxMat33.h" +#include "Ps.h" +#include "PsIntrinsics.h" + +// General guideline is: if it's an abstract math function, it belongs here. +// If it's a math function where the inputs have specific semantics (e.g. +// separateSwingTwist) it doesn't. + +namespace physx +{ +namespace shdfnd +{ +/** +\brief sign returns the sign of its argument. The sign of zero is undefined. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 sign(const PxF32 a) +{ + return intrinsics::sign(a); +} + +/** +\brief sign returns the sign of its argument. The sign of zero is undefined. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 sign(const PxF64 a) +{ + return (a >= 0.0) ? 1.0 : -1.0; +} + +/** +\brief sign returns the sign of its argument. The sign of zero is undefined. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxI32 sign(const PxI32 a) +{ + return (a >= 0) ? 
1 : -1; +} + +/** +\brief Returns true if the two numbers are within eps of each other. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE bool equals(const PxF32 a, const PxF32 b, const PxF32 eps) +{ + return (PxAbs(a - b) < eps); +} + +/** +\brief Returns true if the two numbers are within eps of each other. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE bool equals(const PxF64 a, const PxF64 b, const PxF64 eps) +{ + return (PxAbs(a - b) < eps); +} + +/** +\brief The floor function returns a floating-point value representing the largest integer that is less than or equal to +x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 floor(const PxF32 a) +{ + return floatFloor(a); +} + +/** +\brief The floor function returns a floating-point value representing the largest integer that is less than or equal to +x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 floor(const PxF64 a) +{ + return ::floor(a); +} + +/** +\brief The ceil function returns a single-precision value representing the smallest integer that is greater than or equal to x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 ceil(const PxF32 a) +{ + return ::ceilf(a); +} + +/** +\brief The ceil function returns a double value representing the smallest integer that is greater than or equal to x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 ceil(const PxF64 a) +{ + return ::ceil(a); +} + +/** +\brief mod returns the floating-point remainder of x / y. + +If the value of y is 0.0, mod returns a quiet NaN. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 mod(const PxF32 x, const PxF32 y) +{ + return PxF32(::fmodf(x, y)); +} + +/** +\brief mod returns the floating-point remainder of x / y. + +If the value of y is 0.0, mod returns a quiet NaN. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 mod(const PxF64 x, const PxF64 y) +{ + return ::fmod(x, y); +} + +/** +\brief Square. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 sqr(const PxF32 a) +{ + return a * a; +} + +/** +\brief Square. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 sqr(const PxF64 a) +{ + return a * a; +} + +/** +\brief Calculates x raised to the power of y. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 pow(const PxF32 x, const PxF32 y) +{ + return ::powf(x, y); +} + +/** +\brief Calculates x raised to the power of y. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 pow(const PxF64 x, const PxF64 y) +{ + return ::pow(x, y); +} + +/** +\brief Calculates e^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 exp(const PxF32 a) +{ + return ::expf(a); +} +/** +\brief Calculates e^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 exp(const PxF64 a) +{ + return ::exp(a); +} + +/** +\brief Calculates 2^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 exp2(const PxF32 a) +{ + return ::expf(a * 0.693147180559945309417f); +} +/** +\brief Calculates 2^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 exp2(const PxF64 a) +{ + return ::exp(a * 0.693147180559945309417); +} + +/** +\brief Calculates the natural (base-e) logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 logE(const PxF32 a) +{ + return ::logf(a); +} + +/** +\brief Calculates the natural (base-e) logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 logE(const PxF64 a) +{ + return ::log(a); +} + +/** +\brief Calculates the base-2 logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 log2(const PxF32 a) +{ + return ::logf(a) / 0.693147180559945309417f; +} + +/** +\brief Calculates the base-2 logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 log2(const PxF64 a) +{ + return ::log(a) / 0.693147180559945309417; +} + +/** +\brief Calculates the base-10 logarithm.
+*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 log10(const PxF32 a) +{ + return ::log10f(a); +} + +/** +\brief Calculates the base-10 logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 log10(const PxF64 a) +{ + return ::log10(a); +} + +/** +\brief Converts degrees to radians. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 degToRad(const PxF32 a) +{ + return 0.01745329251994329547f * a; +} + +/** +\brief Converts degrees to radians. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 degToRad(const PxF64 a) +{ + return 0.01745329251994329547 * a; +} + +/** +\brief Converts radians to degrees. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 radToDeg(const PxF32 a) +{ + return 57.29577951308232286465f * a; +} + +/** +\brief Converts radians to degrees. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 radToDeg(const PxF64 a) +{ + return 57.29577951308232286465 * a; +} + +//! \brief compute sine and cosine at the same time. There is a 'fsincos' on PC that we probably want to use here +PX_CUDA_CALLABLE PX_FORCE_INLINE void sincos(const PxF32 radians, PxF32& sin, PxF32& cos) +{ + /* something like: + _asm fld Local + _asm fsincos + _asm fstp LocalCos + _asm fstp LocalSin + */ + sin = PxSin(radians); + cos = PxCos(radians); +} + +/** +\brief uniform random number in [a,b] +*/ +PX_FORCE_INLINE PxI32 rand(const PxI32 a, const PxI32 b) +{ + return a + PxI32(::rand() % (b - a + 1)); +} + +/** +\brief uniform random number in [a,b] +*/ +PX_FORCE_INLINE PxF32 rand(const PxF32 a, const PxF32 b) +{ + return a + (b - a) * ::rand() / RAND_MAX; +} + +//! \brief return angle between two vectors in radians +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 angle(const PxVec3& v0, const PxVec3& v1) +{ + const PxF32 cos = v0.dot(v1); // |v0|*|v1|*Cos(Angle) + const PxF32 sin = (v0.cross(v1)).magnitude(); // |v0|*|v1|*Sin(Angle) + return PxAtan2(sin, cos); +} + +//! If possible use fsel on the dot product instead /*fsel(d.dot(p),onething,anotherthing);*/ +//! Compares orientations (more readable, user-friendly function) +PX_CUDA_CALLABLE PX_FORCE_INLINE bool sameDirection(const PxVec3& d, const PxVec3& p) +{ + return d.dot(p) >= 0.0f; +} + +//! Checks whether 2 values have different signs +PX_CUDA_CALLABLE PX_FORCE_INLINE IntBool differentSign(PxReal f0, PxReal f1) +{ + union + { + PxU32 u; + PxReal f; + } u1, u2; + u1.f = f0; + u2.f = f1; + return IntBool((u1.u ^ u2.u) & PX_SIGN_BITMASK); +} + +PX_CUDA_CALLABLE PX_FORCE_INLINE PxMat33 star(const PxVec3& v) +{ + return PxMat33(PxVec3(0, v.z, -v.y), PxVec3(-v.z, 0, v.x), PxVec3(v.y, -v.x, 0)); +} + +PX_CUDA_CALLABLE PX_INLINE PxVec3 log(const PxQuat& q) +{ + const PxReal s = q.getImaginaryPart().magnitude(); + if(s < 1e-12f) + return PxVec3(0.0f); + // force the half-angle to have magnitude <= pi/2 + PxReal halfAngle = q.w < 0 ? PxAtan2(-s, -q.w) : PxAtan2(s, q.w); + PX_ASSERT(halfAngle >= -PxPi / 2 && halfAngle <= PxPi / 2); + + return q.getImaginaryPart().getNormalized() * 2.f * halfAngle; +} + +PX_CUDA_CALLABLE PX_INLINE PxQuat exp(const PxVec3& v) +{ + const PxReal m = v.magnitudeSquared(); + return m < 1e-24f ? PxQuat(PxIdentity) : PxQuat(PxSqrt(m), v * PxRecipSqrt(m)); +} + +// quat to rotate v0 to v1 +PX_CUDA_CALLABLE PX_INLINE PxQuat rotationArc(const PxVec3& v0, const PxVec3& v1) +{ + const PxVec3 cross = v0.cross(v1); + const PxReal d = v0.dot(v1); + if(d <= -0.99999f) + return (PxAbs(v0.x) < 0.1f ?
PxQuat(0.0f, v0.z, -v0.y, 0.0f) : PxQuat(v0.y, -v0.x, 0.0, 0.0)).getNormalized(); + + const PxReal s = PxSqrt((1 + d) * 2), r = 1 / s; + + return PxQuat(cross.x * r, cross.y * r, cross.z * r, s * 0.5f).getNormalized(); +} + +/** +\brief returns the index of the largest axis +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 largestAxis(const PxVec3& v) +{ + PxU32 m = PxU32(v.y > v.x ? 1 : 0); + return v.z > v[m] ? 2 : m; +} + +/** +\brief returns the index of the largest axis and the indices of the two other axes +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 largestAxis(const PxVec3& v, PxU32& other1, PxU32& other2) +{ + if(v.x >= PxMax(v.y, v.z)) + { + other1 = 1; + other2 = 2; + return 0; + } + else if(v.y >= v.z) + { + other1 = 0; + other2 = 2; + return 1; + } + else + { + other1 = 0; + other2 = 1; + return 2; + } +} + +/** +\brief returns the index of the axis with the smallest absolute value +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 closestAxis(const PxVec3& v) +{ + PxU32 m = PxU32(PxAbs(v.y) > PxAbs(v.x) ? 1 : 0); + return PxAbs(v.z) > PxAbs(v[m]) ? 2 : m; +} + +PX_CUDA_CALLABLE PX_INLINE PxU32 closestAxis(const PxVec3& v, PxU32& j, PxU32& k) +{ + // find largest 2D plane projection + const PxF32 absPx = PxAbs(v.x); + const PxF32 absNy = PxAbs(v.y); + const PxF32 absNz = PxAbs(v.z); + + PxU32 m = 0; // x biggest axis + j = 1; + k = 2; + if(absNy > absPx && absNy > absNz) + { + // y biggest + j = 2; + k = 0; + m = 1; + } + else if(absNz > absPx) + { + // z biggest + j = 0; + k = 1; + m = 2; + } + return m; +} + +/*! +Extend an edge along its length by a factor +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE void makeFatEdge(PxVec3& p0, PxVec3& p1, PxReal fatCoeff) +{ + PxVec3 delta = p1 - p0; + + const PxReal m = delta.magnitude(); + if(m > 0.0f) + { + delta *= fatCoeff / m; + p0 -= delta; + p1 += delta; + } +} + +//! Compute point as combination of barycentric coordinates +PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 +computeBarycentricPoint(const PxVec3& p0, const PxVec3& p1, const PxVec3& p2, PxReal u, PxReal v) +{ + // This seems to confuse the compiler... + // return (1.0f - u - v)*p0 + u*p1 + v*p2; + const PxF32 w = 1.0f - u - v; + return PxVec3(w * p0.x + u * p1.x + v * p2.x, w * p0.y + u * p1.y + v * p2.y, w * p0.z + u * p1.z + v * p2.z); +} + +// generates a pair of quaternions (swing, twist) such that in = swing * twist, with +// swing.x = 0 +// twist.y = twist.z = 0, and twist is a unit quat +PX_FORCE_INLINE void separateSwingTwist(const PxQuat& q, PxQuat& swing, PxQuat& twist) +{ + twist = q.x != 0.0f ? PxQuat(q.x, 0, 0, q.w).getNormalized() : PxQuat(PxIdentity); + swing = q * twist.getConjugate(); +} + +// generate two tangent vectors to a given normal +PX_FORCE_INLINE void normalToTangents(const PxVec3& normal, PxVec3& tangent0, PxVec3& tangent1) +{ + tangent0 = PxAbs(normal.x) < 0.70710678f ? PxVec3(0, -normal.z, normal.y) : PxVec3(-normal.y, normal.x, 0); + tangent0.normalize(); + tangent1 = normal.cross(tangent0); +} + +// todo: what is this function doing? +PX_FOUNDATION_API PxQuat computeQuatFromNormal(const PxVec3& n); + +/** +\brief computes an oriented bounding box around the scaled basis. +\param basis Input = skewed basis, Output = (normalized) orthogonal basis. +\return Bounding box extent.
+*/ +PX_FOUNDATION_API PxVec3 optimizeBoundingBox(PxMat33& basis); + +PX_FOUNDATION_API PxQuat slerp(const PxReal t, const PxQuat& left, const PxQuat& right); + +PX_CUDA_CALLABLE PX_INLINE PxVec3 ellipseClamp(const PxVec3& point, const PxVec3& radii) +{ + // This function needs to be implemented in the header file because + // it is included in an SPU shader program. + + // finds the closest point on the ellipse to a given point + + // (p.y, p.z) is the input point + // (e.y, e.z) are the radii of the ellipse + + // Lagrange multiplier method with Newton/Halley hybrid root-finder. + // see http://www.geometrictools.com/Documentation/DistancePointToEllipse2.pdf + // for proof of Newton step robustness and initial estimate. + // Halley converges much faster but sometimes overshoots - when that happens we take + // a Newton step instead + + // converges in 1-2 iterations where D&C works well, and within 4 iterations + // for any ellipse that isn't extremely degenerate + + const PxU32 MAX_ITERATIONS = 20; + const PxReal convergenceThreshold = 1e-4f; + + // iteration requires first quadrant but we recover generality later + + PxVec3 q(0, PxAbs(point.y), PxAbs(point.z)); + const PxReal tinyEps = 1e-6f; // very close to minor axis is numerically problematic but trivial + if(radii.y >= radii.z) + { + if(q.z < tinyEps) + return PxVec3(0, point.y > 0 ? radii.y : -radii.y, 0); + } + else + { + if(q.y < tinyEps) + return PxVec3(0, 0, point.z > 0 ? radii.z : -radii.z); + } + + PxVec3 denom, e2 = radii.multiply(radii), eq = radii.multiply(q); + + // we can use any initial guess which is > maximum(-e.y^2,-e.z^2) and for which f(t) is > 0. + // this guess works well near the axes, but is weak along the diagonals. + + PxReal t = PxMax(eq.y - e2.y, eq.z - e2.z); + + for(PxU32 i = 0; i < MAX_ITERATIONS; i++) + { + denom = PxVec3(0, 1 / (t + e2.y), 1 / (t + e2.z)); + PxVec3 denom2 = eq.multiply(denom); + + PxVec3 fv = denom2.multiply(denom2); + PxReal f = fv.y + fv.z - 1; + + // although in exact arithmetic we are guaranteed f>0, we can get here + // on the first iteration via catastrophic cancellation if the point is + // very close to the origin. In that case we just behave as if f=0 + + if(f < convergenceThreshold) + return e2.multiply(point).multiply(denom); + + PxReal df = fv.dot(denom) * -2.0f; + t = t - f / df; + } + + // we didn't converge, so clamp what we have + PxVec3 r = e2.multiply(point).multiply(denom); + return r * PxRecipSqrt(sqr(r.y / radii.y) + sqr(r.z / radii.z)); +} + +PX_CUDA_CALLABLE PX_INLINE PxReal tanHalf(PxReal sin, PxReal cos) +{ + return sin / (1 + cos); +} + +PX_INLINE PxQuat quatFromTanQVector(const PxVec3& v) +{ + PxReal v2 = v.dot(v); + if(v2 < 1e-12f) + return PxQuat(PxIdentity); + PxReal d = 1 / (1 + v2); + return PxQuat(v.x * 2, v.y * 2, v.z * 2, 1 - v2) * d; +} + +PX_FORCE_INLINE PxVec3 cross100(const PxVec3& b) +{ + return PxVec3(0.0f, -b.z, b.y); +} +PX_FORCE_INLINE PxVec3 cross010(const PxVec3& b) +{ + return PxVec3(b.z, 0.0f, -b.x); +} +PX_FORCE_INLINE PxVec3 cross001(const PxVec3& b) +{ + return PxVec3(-b.y, b.x, 0.0f); +} + +PX_INLINE void decomposeVector(PxVec3& normalCompo, PxVec3& tangentCompo, const PxVec3& outwardDir, + const PxVec3& outwardNormal) +{ + normalCompo = outwardNormal * (outwardDir.dot(outwardNormal)); + tangentCompo = outwardDir - normalCompo; +} + +//!
\brief Return (i+1)%3 +// Avoid variable shift for XBox: +// PX_INLINE PxU32 Ps::getNextIndex3(PxU32 i) { return (1<<i) & 3; } +PX_INLINE PxU32 getNextIndex3(PxU32 i) +{ + return (i + 1 + (i >> 1)) & 3; +} + +PX_INLINE PxMat33 rotFrom2Vectors(const PxVec3& from, const PxVec3& to) +{ + // See bottom of http://www.euclideanspace.com/maths/algebra/matrix/orthogonal/rotation/index.htm + + // Early exit if to = from + if((from - to).magnitudeSquared() < 1e-4f) + return PxMat33(PxIdentity); + + // Early exit if to = -from + if((from + to).magnitudeSquared() < 1e-4f) + return PxMat33::createDiagonal(PxVec3(1.0f, -1.0f, -1.0f)); + + PxVec3 n = from.cross(to); + + PxReal C = from.dot(to), S = PxSqrt(1 - C * C), CC = 1 - C; + + PxReal xx = n.x * n.x, yy = n.y * n.y, zz = n.z * n.z, xy = n.x * n.y, yz = n.y * n.z, xz = n.x * n.z; + + PxMat33 R; + + R(0, 0) = 1 + CC * (xx - 1); + R(0, 1) = -n.z * S + CC * xy; + R(0, 2) = n.y * S + CC * xz; + + R(1, 0) = n.z * S + CC * xy; + R(1, 1) = 1 + CC * (yy - 1); + R(1, 2) = -n.x * S + CC * yz; + + R(2, 0) = -n.y * S + CC * xz; + R(2, 1) = n.x * S + CC * yz; + R(2, 2) = 1 + CC * (zz - 1); + + return R; +} + +PX_FOUNDATION_API void integrateTransform(const PxTransform& curTrans, const PxVec3& linvel, const PxVec3& angvel, + PxReal timeStep, PxTransform& result); + +PX_INLINE void computeBasis(const PxVec3& dir, PxVec3& right, PxVec3& up) +{ + // Derive two remaining vectors + if(PxAbs(dir.y) <= 0.9999f) + { + right = PxVec3(dir.z, 0.0f, -dir.x); + right.normalize(); + + // PT: normalize not needed for 'up' because dir & right are unit vectors, + // and by construction the angle between them is 90 degrees (i.e. sin(angle)=1) + up = PxVec3(dir.y * right.z, dir.z * right.x - dir.x * right.z, -dir.y * right.x); + } + else + { + right = PxVec3(1.0f, 0.0f, 0.0f); + + up = PxVec3(0.0f, dir.z, -dir.y); + up.normalize(); + } +} + +PX_INLINE void computeBasis(const PxVec3& p0, const PxVec3& p1, PxVec3& dir, PxVec3& right, PxVec3& up) +{ + // Compute the new direction vector + dir = p1 - p0; + dir.normalize(); + + // Derive two remaining vectors + computeBasis(dir, right, up); +} + +PX_FORCE_INLINE bool isAlmostZero(const PxVec3& v) +{ + if(PxAbs(v.x) > 1e-6f || PxAbs(v.y) > 1e-6f || PxAbs(v.z) > 1e-6f) + return false; + return true; +} + +} // namespace shdfnd +} // namespace physx + +#endif diff --git a/PxShared/src/foundation/include/PsMutex.h b/PxShared/src/foundation/include/PsMutex.h new file mode 100644 index 00000000..a75d5567 --- /dev/null +++ b/PxShared/src/foundation/include/PsMutex.h @@ -0,0 +1,330 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSMUTEX_H +#define PSFOUNDATION_PSMUTEX_H + +#include "PsAllocator.h" + +/* + * This <new> inclusion is the best-known fix for the gcc 4.4.1 error: + * Creating object file for apex/src/PsAllocator.cpp ... + * In file included from apex/include/PsFoundation.h:30, + * from apex/src/PsAllocator.cpp:26: + * apex/include/PsMutex.h: In constructor 'physx::shdfnd::MutexT<Alloc>::MutexT(const Alloc&)': + * apex/include/PsMutex.h:92: error: no matching function for call to 'operator new(unsigned int, + * physx::shdfnd::MutexImpl*&)' + * <built-in>:0: note: candidates are: void* operator new(unsigned int) + */ +#include <new> + +namespace physx +{ +namespace shdfnd +{ +class PX_FOUNDATION_API MutexImpl +{ + public: + /** + The constructor for Mutex creates a mutex. It is initially unlocked. + */ + MutexImpl(); + + /** + The destructor for Mutex deletes the mutex. + */ + ~MutexImpl(); + + /** + Acquire (lock) the mutex. If the mutex is already locked + by another thread, this method blocks until the mutex is + unlocked. + */ + void lock(); + + /** + Acquire (lock) the mutex. If the mutex is already locked + by another thread, this method returns false immediately without + blocking; it returns true if the mutex was acquired. + */ + bool trylock(); + + /** + Release (unlock) the mutex. + */ + void unlock(); + + /** + Size of this class. + */ + static const uint32_t& getSize(); +}; + +template <typename Alloc = ReflectionAllocator<MutexImpl> > +class MutexT : protected Alloc +{ + PX_NOCOPY(MutexT) + public: + class ScopedLock + { + MutexT<Alloc>& mMutex; + PX_NOCOPY(ScopedLock) + public: + PX_INLINE ScopedLock(MutexT<Alloc>& mutex) : mMutex(mutex) + { + mMutex.lock(); + } + PX_INLINE ~ScopedLock() + { + mMutex.unlock(); + } + }; + + /** + The constructor for Mutex creates a mutex. It is initially unlocked. + */ + MutexT(const Alloc& alloc = Alloc()) : Alloc(alloc) + { + mImpl = reinterpret_cast<MutexImpl*>(Alloc::allocate(MutexImpl::getSize(), __FILE__, __LINE__)); + PX_PLACEMENT_NEW(mImpl, MutexImpl)(); + } + + /** + The destructor for Mutex deletes the mutex. + */ + ~MutexT() + { + mImpl->~MutexImpl(); + Alloc::deallocate(mImpl); + } + + /** + Acquire (lock) the mutex. If the mutex is already locked + by another thread, this method blocks until the mutex is + unlocked. + */ + void lock() const + { + mImpl->lock(); + } + + /** + Acquire (lock) the mutex.
If the mutex is already locked + by another thread, this method returns false without blocking; + it returns true if the lock is successfully acquired. + */ + bool trylock() const + { + return mImpl->trylock(); + } + + /** + Release (unlock) the mutex. The calling thread must have + previously called lock(), otherwise an error occurs. + */ + void unlock() const + { + mImpl->unlock(); + } + + private: + MutexImpl* mImpl; +}; + +class PX_FOUNDATION_API ReadWriteLock +{ + PX_NOCOPY(ReadWriteLock) + public: + ReadWriteLock(); + ~ReadWriteLock(); + + void lockReader(); + void lockWriter(); + + void unlockReader(); + void unlockWriter(); + + private: + class ReadWriteLockImpl* mImpl; +}; + +class ScopedReadLock +{ + PX_NOCOPY(ScopedReadLock) + public: + PX_INLINE ScopedReadLock(ReadWriteLock& lock) : mLock(lock) + { + mLock.lockReader(); + } + PX_INLINE ~ScopedReadLock() + { + mLock.unlockReader(); + } + + private: + ReadWriteLock& mLock; +}; + +class ScopedWriteLock +{ + PX_NOCOPY(ScopedWriteLock) + public: + PX_INLINE ScopedWriteLock(ReadWriteLock& lock) : mLock(lock) + { + mLock.lockWriter(); + } + PX_INLINE ~ScopedWriteLock() + { + mLock.unlockWriter(); + } + + private: + ReadWriteLock& mLock; +}; + +typedef MutexT<> Mutex; + +/* + * Use this type of lock for mutex behaviour that must operate on SPU and PPU. + * On non-PS3 platforms, it is implemented using Mutex + */ +class AtomicLock +{ + Mutex mMutex; + PX_NOCOPY(AtomicLock) + + public: + AtomicLock() + { + } + + bool lock() + { + mMutex.lock(); + return true; + } + + bool trylock() + { + return mMutex.trylock(); + } + + bool unlock() + { + mMutex.unlock(); + return true; + } +}; + +class AtomicLockCopy +{ + AtomicLock* pLock; + + public: + AtomicLockCopy() : pLock(NULL) + { + } + + AtomicLockCopy& operator=(AtomicLock& lock) + { + pLock = &lock; + return *this; + } + + bool lock() + { + return pLock->lock(); + } + + bool trylock() + { + return pLock->trylock(); + } + + bool unlock() + { + return pLock->unlock(); + } +}; + +class AtomicRwLock +{ + ReadWriteLock m_Lock; + PX_NOCOPY(AtomicRwLock) + + public: + AtomicRwLock() + { + } + + void lockReader() + { + m_Lock.lockReader(); + } + void lockWriter() + { + m_Lock.lockWriter(); + } + + bool tryLockReader() + { + // Todo - implement this + m_Lock.lockReader(); + return true; + } + + void unlockReader() + { + m_Lock.unlockReader(); + } + void unlockWriter() + { + m_Lock.unlockWriter(); + } +}; + +class ScopedAtomicLock +{ + public: + PX_INLINE ScopedAtomicLock(AtomicLock& lock) : mLock(lock) + { + mLock.lock(); + } + PX_INLINE ~ScopedAtomicLock() + { + mLock.unlock(); + } + + PX_NOCOPY(ScopedAtomicLock) + private: + AtomicLock& mLock; +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSMUTEX_H diff --git a/PxShared/src/foundation/include/PsPool.h b/PxShared/src/foundation/include/PsPool.h new file mode 100644 index 00000000..48280100 --- /dev/null +++ b/PxShared/src/foundation/include/PsPool.h @@ -0,0 +1,298 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited.
+// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSPOOL_H +#define PSFOUNDATION_PSPOOL_H + +#include "PsArray.h" +#include "PsSort.h" +#include "PsBasicTemplates.h" +#include "PsInlineArray.h" + +namespace physx +{ +namespace shdfnd +{ + +/*! +Simple allocation pool +*/ +template <class T, class Alloc = typename AllocatorTraits<T>::Type> +class PoolBase : public UserAllocated, public Alloc +{ + PX_NOCOPY(PoolBase) + protected: + PoolBase(const Alloc& alloc, uint32_t elementsPerSlab, uint32_t slabSize) + : Alloc(alloc), mSlabs(alloc), mElementsPerSlab(elementsPerSlab), mUsed(0), mSlabSize(slabSize), mFreeElement(0) + { + PX_COMPILE_TIME_ASSERT(sizeof(T) >= sizeof(size_t)); + } + + public: + ~PoolBase() + { + if(mUsed) + disposeElements(); + + for(void** slabIt = mSlabs.begin(), *slabEnd = mSlabs.end(); slabIt != slabEnd; ++slabIt) + Alloc::deallocate(*slabIt); + } + + // Allocate space for single object + PX_INLINE T* allocate() + { + if(mFreeElement == 0) + allocateSlab(); + T* p = reinterpret_cast<T*>(mFreeElement); + mFreeElement = mFreeElement->mNext; + mUsed++; +/** +Mark a specified amount of memory with 0xcd pattern. This is used to check that the meta data +definition for serialized classes is complete in checked builds. +*/ +#if PX_CHECKED + for(uint32_t i = 0; i < sizeof(T); ++i) + reinterpret_cast<uint8_t*>(p)[i] = 0xcd; +#endif + return p; + } + + // Put space for a single element back in the lists + PX_INLINE void deallocate(T* p) + { + if(p) + { + PX_ASSERT(mUsed); + mUsed--; + push(reinterpret_cast<FreeList*>(p)); + } + } + + PX_INLINE T* construct() + { + T* t = allocate(); + return t ? new (t) T() : 0; + } + + template <class A1> + PX_INLINE T* construct(A1& a) + { + T* t = allocate(); + return t ? new (t) T(a) : 0; + } + + template <class A1, class A2> + PX_INLINE T* construct(A1& a, A2& b) + { + T* t = allocate(); + return t ? new (t) T(a, b) : 0; + } + + template <class A1, class A2, class A3> + PX_INLINE T* construct(A1& a, A2& b, A3& c) + { + T* t = allocate(); + return t ? new (t) T(a, b, c) : 0; + } + + template <class A1, class A2, class A3> + PX_INLINE T* construct(A1* a, A2& b, A3& c) + { + T* t = allocate(); + return t ? new (t) T(a, b, c) : 0; + } + + template <class A1, class A2, class A3, class A4> + PX_INLINE T* construct(A1& a, A2& b, A3& c, A4& d) + { + T* t = allocate(); + return t ? 
new (t) T(a, b, c, d) : 0; + } + + template <class A1, class A2, class A3, class A4, class A5> + PX_INLINE T* construct(A1& a, A2& b, A3& c, A4& d, A5& e) + { + T* t = allocate(); + return t ? new (t) T(a, b, c, d, e) : 0; + } + + PX_INLINE void destroy(T* const p) + { + if(p) + { + p->~T(); + deallocate(p); + } + } + + protected: + struct FreeList + { + FreeList* mNext; + }; + + // All the allocated slabs, sorted by pointer + InlineArray<void*, 64, Alloc> mSlabs; + + uint32_t mElementsPerSlab; + uint32_t mUsed; + uint32_t mSlabSize; + + FreeList* mFreeElement; // Head of free-list + + // Helper function to push an element onto the head of the free-list + + void push(FreeList* p) + { + p->mNext = mFreeElement; + mFreeElement = p; + } + + // Allocate a slab and segregate it into the freelist + void allocateSlab() + { + T* slab = reinterpret_cast<T*>(Alloc::allocate(mSlabSize, __FILE__, __LINE__)); + + mSlabs.pushBack(slab); + + // Build a chain of nodes for the freelist + T* it = slab + mElementsPerSlab; + while(--it >= slab) + push(reinterpret_cast<FreeList*>(it)); + } + + /* + Cleanup method. Go through all active slabs and call the destructor for live objects; + the slab memory itself is freed in the PoolBase destructor + */ + void disposeElements() + { + Array<void*, Alloc> freeNodes(*this); + while(mFreeElement) + { + freeNodes.pushBack(mFreeElement); + mFreeElement = mFreeElement->mNext; + } + Alloc& alloc(*this); + sort(freeNodes.begin(), freeNodes.size(), Less<void*>(), alloc); + sort(mSlabs.begin(), mSlabs.size(), Less<void*>(), alloc); + + typename Array<void*, Alloc>::Iterator slabIt = mSlabs.begin(), slabEnd = mSlabs.end(); + for(typename Array<void*, Alloc>::Iterator freeIt = freeNodes.begin(); slabIt != slabEnd; ++slabIt) + { + for(T* tIt = reinterpret_cast<T*>(*slabIt), *tEnd = tIt + mElementsPerSlab; tIt != tEnd; ++tIt) + { + if(freeIt != freeNodes.end() && *freeIt == tIt) + ++freeIt; + else + tIt->~T(); + } + } + } + + /* + Go through all slabs and deallocate those whose elements are all on the free-list; + the remaining free elements are pushed back onto the list + */ + void releaseEmptySlabs() + { + Array<void*, Alloc> freeNodes(*this); + Array<void*, Alloc> slabNodes(mSlabs, *this); + while(mFreeElement) + { + freeNodes.pushBack(mFreeElement); + mFreeElement = mFreeElement->mNext; + } + + typename Array<void*, Alloc>::Iterator freeIt = freeNodes.begin(), freeEnd = freeNodes.end(), + lastCheck = freeNodes.end() - mElementsPerSlab; + + if(freeNodes.size() > mElementsPerSlab) + { + Alloc& alloc(*this); + sort(freeNodes.begin(), freeNodes.size(), Less<void*>(), alloc); + sort(slabNodes.begin(), slabNodes.size(), Less<void*>(), alloc); + + mSlabs.clear(); + for(void** slabIt = slabNodes.begin(), *slabEnd = slabNodes.end(); slabIt != slabEnd; ++slabIt) + { + while((freeIt < lastCheck) && (*slabIt > (*freeIt))) + { + push(reinterpret_cast<FreeList*>(*freeIt)); + freeIt++; + } + + if(*slabIt == (*freeIt)) // the slab's first element is in the freeList + { + const size_t endSlabAddress = size_t(*slabIt) + mSlabSize; + const size_t endFreeAddress = size_t(*(freeIt + mElementsPerSlab - 1)); + if(endFreeAddress + sizeof(T) == endSlabAddress) + { // all of the slab's elements are in the freeList + Alloc::deallocate(*slabIt); + freeIt += mElementsPerSlab; + continue; + } + } + + mSlabs.pushBack(*slabIt); + } + } + + while(freeIt != freeEnd) + { + push(reinterpret_cast<FreeList*>(*freeIt)); + ++freeIt; + } + } +}; + +// original pool implementation +template <class T, class Alloc = typename AllocatorTraits<T>::Type> +class Pool : public PoolBase<T, Alloc> +{ + public: + Pool(const Alloc& alloc = Alloc(), uint32_t elementsPerSlab = 32) + :
PoolBase<T, Alloc>(alloc, elementsPerSlab, elementsPerSlab * sizeof(T)) + { + } +}; + +// allows specification of the slab size instead of the occupancy +template <class T, uint32_t slabSize, class Alloc = typename AllocatorTraits<T>::Type> +class Pool2 : public PoolBase<T, Alloc> +{ + public: + Pool2(const Alloc& alloc = Alloc()) : PoolBase<T, Alloc>(alloc, slabSize / sizeof(T), slabSize) + { + } +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSPOOL_H diff --git a/PxShared/src/foundation/include/PsSList.h b/PxShared/src/foundation/include/PsSList.h new file mode 100644 index 00000000..70db740d --- /dev/null +++ b/PxShared/src/foundation/include/PsSList.h @@ -0,0 +1,140 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSSLIST_H +#define PSFOUNDATION_PSSLIST_H + +#include "foundation/Px.h" +#include "foundation/PxAssert.h" +#include "PsAlignedMalloc.h" + +#if PX_P64_FAMILY +#define PX_SLIST_ALIGNMENT 16 +#else +#define PX_SLIST_ALIGNMENT 8 +#endif + +namespace physx +{ +namespace shdfnd +{ + +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4324) // Padding was added at the end of a structure because of a __declspec(align) value. +#endif + +#if !PX_GCC_FAMILY +__declspec(align(PX_SLIST_ALIGNMENT)) +#endif + class SListEntry +{ + friend struct SListImpl; + + public: + SListEntry() : mNext(NULL) + { + PX_ASSERT((size_t(this) & (PX_SLIST_ALIGNMENT - 1)) == 0); + } + + // Only use on elements returned by SList::flush() + // because the operation is not atomic. 
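+ // + // Example drain pattern (illustrative sketch only; 'list' is an SList and + // 'process' is a placeholder for user code). Save the next pointer before + // handing the entry to code that may free or re-push it: + // + //     for(SListEntry* e = list.flush(); e != NULL;) + //     { + //         SListEntry* nxt = e->next(); // safe: 'e' is detached from the list + //         process(e); + //         e = nxt; + //     }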
+ SListEntry* next() + { + return mNext; + } + + private: + SListEntry* mNext; +} +#if PX_GCC_FAMILY +__attribute__((aligned(PX_SLIST_ALIGNMENT))); +#else +; +#endif + +#if PX_VC +#pragma warning(pop) +#endif + +// template-less implementation +struct PX_FOUNDATION_API SListImpl +{ + SListImpl(); + ~SListImpl(); + void push(SListEntry* entry); + SListEntry* pop(); + SListEntry* flush(); + static const uint32_t& getSize(); +}; + +template <typename Alloc = ReflectionAllocator<SListImpl> > +class SListT : protected Alloc +{ + public: + SListT(const Alloc& alloc = Alloc()) : Alloc(alloc) + { + mImpl = reinterpret_cast<SListImpl*>(Alloc::allocate(SListImpl::getSize(), __FILE__, __LINE__)); + PX_ASSERT((size_t(mImpl) & (PX_SLIST_ALIGNMENT - 1)) == 0); + PX_PLACEMENT_NEW(mImpl, SListImpl)(); + } + ~SListT() + { + mImpl->~SListImpl(); + Alloc::deallocate(mImpl); + } + + // pushes a new element to the list + void push(SListEntry& entry) + { + mImpl->push(&entry); + } + + // pops an element from the list + SListEntry* pop() + { + return mImpl->pop(); + } + + // removes all items from list, returns pointer to first element + SListEntry* flush() + { + return mImpl->flush(); + } + + private: + SListImpl* mImpl; +}; + +typedef SListT<> SList; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSSLIST_H diff --git a/PxShared/src/foundation/include/PsSocket.h b/PxShared/src/foundation/include/PsSocket.h new file mode 100644 index 00000000..5f65b9fb --- /dev/null +++ b/PxShared/src/foundation/include/PsSocket.h @@ -0,0 +1,186 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
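+ +// Example client-side usage of the Socket class declared below (illustrative +// sketch only; the host and port values are placeholders and error handling +// is omitted): +// +//     shdfnd::Socket s; +//     if(s.connect("127.0.0.1", 5425, 1000)) +//     { +//         const uint8_t msg[] = { 'p', 'i', 'n', 'g' }; +//         s.write(msg, 4); +//         s.flush();       // blocks until all buffered data is sent +//         s.disconnect(); +//     }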
+ +#ifndef PSFOUNDATION_PSSOCKET_H +#define PSFOUNDATION_PSSOCKET_H + +#include "PsUserAllocated.h" + +namespace physx +{ +namespace shdfnd +{ +/** +Socket abstraction API +*/ + +class PX_FOUNDATION_API Socket : public UserAllocated +{ + public: + static const uint32_t DEFAULT_BUFFER_SIZE; + + Socket(bool inEnableBuffering = true, bool blocking = true); + + virtual ~Socket(); + + /*! + Opens a network socket for input and/or output + + \param host + Name of the host to connect to. This can be an IP address, URL, etc. + + \param port + The port to connect to on the remote host + + \param timeout + Timeout in ms until the connection must be established. + + \return + True if the connection was successful, false otherwise + */ + bool connect(const char* host, uint16_t port, uint32_t timeout = 1000); + + /*! + Opens a network socket for input and/or output as a server and puts the connection in listening mode + + \param port + The port on which the socket listens + */ + bool listen(uint16_t port); + + /*! + Accept a connection on a socket that is in listening mode + + \note + This method only supports a single client connection. Additional clients + that connect to the listening port will overwrite the existing socket handle. + + \param block + whether or not the call should block + + \return whether a connection was established + */ + bool accept(bool block); + + /*! + Disconnects an open socket + */ + void disconnect(); + + /*! + Returns whether the socket is currently open (connected) or not. + + \return + True if the socket is connected, false otherwise + */ + bool isConnected() const; + + /*! + Returns the name of the connected host. This is the same as the string + that was supplied to the connect call. + + \return + The name of the connected host + */ + const char* getHost() const; + + /*! + Returns the port of the connected host. This is the same as the port + that was supplied to the connect call. + + \return + The port of the connected host + */ + uint16_t getPort() const; + + /*! + Flushes the output stream. Until the stream is flushed, there is no + guarantee that the written data has actually reached the destination + storage. Flush forces all buffered data to be sent to the output. + + \note flush always blocks. If the socket is in non-blocking mode, this will result in + the thread spinning. + + \return + True if the flush was successful, false otherwise + */ + bool flush(); + + /*! + Writes data to the output stream. + + \param data + Pointer to a block of data to write to the stream + + \param length + Amount of data to write, in bytes + + \return + Number of bytes actually written. This could be lower than length if the socket is non-blocking. + */ + + uint32_t write(const uint8_t* data, uint32_t length); + + /*! + Reads data from the input stream. + + \param data + Pointer to a buffer where the read data will be stored. + + \param length + Amount of data to read, in bytes. + + \return + Number of bytes actually read. This could be lower than length if the stream end is + encountered or the socket is non-blocking. + */ + uint32_t read(uint8_t* data, uint32_t length); + + /*! + Sets blocking mode of the socket. + The socket must be connected, otherwise calling this method has no effect. + */ + void setBlocking(bool blocking); + + /*! + Returns whether read/write/flush calls to the socket are blocking. + + \return + True if the socket is blocking, false otherwise.
+ */ + bool isBlocking() const; + + private: + class SocketImpl* mImpl; +}; + +} // namespace shdfnd +} // namespace physx + +#endif // PSFOUNDATION_PSSOCKET_H diff --git a/PxShared/src/foundation/include/PsSort.h b/PxShared/src/foundation/include/PsSort.h new file mode 100644 index 00000000..14fa5732 --- /dev/null +++ b/PxShared/src/foundation/include/PsSort.h @@ -0,0 +1,130 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSSORT_H +#define PSFOUNDATION_PSSORT_H + +/** \addtogroup foundation +@{ +*/ + +#include "PsSortInternals.h" +#include "PsAlloca.h" + +#define PX_SORT_PARANOIA PX_DEBUG + +/** +\brief Sorts an array of objects in ascending order, assuming +that the predicate implements the < operator: + +\see Less, Greater +*/ + +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4706) // disable the warning that we did an assignment within a conditional expression, as +// this was intentional. 
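+ +// Example usage of the sort functions defined below (illustrative sketch only): +// +//     float values[] = { 3.0f, 1.0f, 2.0f }; +//     shdfnd::sort(values, 3);                           // ascending, uses Less<float> +//     shdfnd::sort(values, 3, shdfnd::Greater<float>()); // descending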
+#endif + +namespace physx +{ +namespace shdfnd +{ +template <class T, class Predicate, class Allocator> +void sort(T* elements, uint32_t count, const Predicate& compare, const Allocator& inAllocator, + const uint32_t initialStackSize = 32) +{ + static const uint32_t SMALL_SORT_CUTOFF = 5; // must be >= 3 since we need 3 for median + + PX_ALLOCA(stackMem, int32_t, initialStackSize); + internal::Stack<Allocator> stack(stackMem, initialStackSize, inAllocator); + + int32_t first = 0, last = int32_t(count - 1); + if(last > first) + { + for(;;) + { + while(last > first) + { + PX_ASSERT(first >= 0 && last < int32_t(count)); + if(uint32_t(last - first) < SMALL_SORT_CUTOFF) + { + internal::smallSort(elements, first, last, compare); + break; + } + else + { + const int32_t partIndex = internal::partition(elements, first, last, compare); + + // push smaller sublist to minimize stack usage + if((partIndex - first) < (last - partIndex)) + { + stack.push(first, partIndex - 1); + first = partIndex + 1; + } + else + { + stack.push(partIndex + 1, last); + last = partIndex - 1; + } + } + } + + if(stack.empty()) + break; + + stack.pop(first, last); + } + } +#if PX_SORT_PARANOIA + for(uint32_t i = 1; i < count; i++) + PX_ASSERT(!compare(elements[i], elements[i - 1])); +#endif +} + +template <class T, class Predicate> +void sort(T* elements, uint32_t count, const Predicate& compare) +{ + sort(elements, count, compare, typename shdfnd::AllocatorTraits<T>::Type()); +} + +template <class T> +void sort(T* elements, uint32_t count) +{ + sort(elements, count, shdfnd::Less<T>(), typename shdfnd::AllocatorTraits<T>::Type()); +} + +} // namespace shdfnd +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif + +#endif // #ifndef PSFOUNDATION_PSSORT_H diff --git a/PxShared/src/foundation/include/PsSortInternals.h b/PxShared/src/foundation/include/PsSortInternals.h new file mode 100644 index 00000000..7e9ee48e --- /dev/null +++ b/PxShared/src/foundation/include/PsSortInternals.h @@ -0,0 +1,188 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSORTINTERNALS_H
+#define PSFOUNDATION_PSSORTINTERNALS_H
+
+/** \addtogroup foundation
+@{
+*/
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxIntrinsics.h"
+#include "PsBasicTemplates.h"
+#include "PsUserAllocated.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace internal
+{
+template <class T, class Predicate>
+PX_INLINE void median3(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+    /*
+    This creates sentinels because we know there is an element at the start less than
+    (or equal to) the pivot and an element at the end greater than (or equal to) the
+    pivot. Plus the median of 3 reduces the chance of degenerate behaviour.
+    */
+
+    int32_t mid = (first + last) / 2;
+
+    if(compare(elements[mid], elements[first]))
+        swap(elements[first], elements[mid]);
+
+    if(compare(elements[last], elements[first]))
+        swap(elements[first], elements[last]);
+
+    if(compare(elements[last], elements[mid]))
+        swap(elements[mid], elements[last]);
+
+    // keep the pivot at last-1
+    swap(elements[mid], elements[last - 1]);
+}
+
+template <class T, class Predicate>
+PX_INLINE int32_t partition(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+    median3(elements, first, last, compare);
+
+    /*
+    WARNING: using the line:
+
+    T partValue = elements[last-1];
+
+    and changing the scan loops to:
+
+    while(comparator.greater(partValue, elements[++i]));
+    while(comparator.greater(elements[--j], partValue));
+
+    triggers a compiler optimizer bug on xenon where it stores a double to the stack for partValue
+    then loads it as a single...:-(
+    */
+
+    int32_t i = first;    // we know first is less than pivot (but i gets pre-incremented)
+    int32_t j = last - 1; // pivot is in last-1 (but j gets pre-decremented)
+
+    for(;;)
+    {
+        while(compare(elements[++i], elements[last - 1]))
+            ;
+        while(compare(elements[last - 1], elements[--j]))
+            ;
+
+        if(i >= j)
+            break;
+
+        PX_ASSERT(i <= last && j >= first);
+        swap(elements[i], elements[j]);
+    }
+    // put the pivot in place
+
+    PX_ASSERT(i <= last && first <= (last - 1));
+    swap(elements[i], elements[last - 1]);
+
+    return i;
+}
+
+template <class T, class Predicate>
+PX_INLINE void smallSort(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+    // selection sort - could reduce to fsel on 360 with floats.
+ + for(int32_t i = first; i < last; i++) + { + int32_t m = i; + for(int32_t j = i + 1; j <= last; j++) + if(compare(elements[j], elements[m])) + m = j; + + if(m != i) + swap(elements[m], elements[i]); + } +} + +template <class Allocator> +class Stack +{ + Allocator mAllocator; + uint32_t mSize, mCapacity; + int32_t* mMemory; + bool mRealloc; + + public: + Stack(int32_t* memory, uint32_t capacity, const Allocator& inAllocator) + : mAllocator(inAllocator), mSize(0), mCapacity(capacity), mMemory(memory), mRealloc(false) + { + } + ~Stack() + { + if(mRealloc) + mAllocator.deallocate(mMemory); + } + + void grow() + { + mCapacity *= 2; + int32_t* newMem = + reinterpret_cast<int32_t*>(mAllocator.allocate(sizeof(int32_t) * mCapacity, __FILE__, __LINE__)); + intrinsics::memCopy(newMem, mMemory, mSize * sizeof(int32_t)); + if(mRealloc) + mAllocator.deallocate(mMemory); + mRealloc = true; + mMemory = newMem; + } + + PX_INLINE void push(int32_t start, int32_t end) + { + if(mSize >= mCapacity - 1) + grow(); + mMemory[mSize++] = start; + mMemory[mSize++] = end; + } + + PX_INLINE void pop(int32_t& start, int32_t& end) + { + PX_ASSERT(!empty()); + end = mMemory[--mSize]; + start = mMemory[--mSize]; + } + + PX_INLINE bool empty() + { + return mSize == 0; + } +}; +} // namespace internal + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSSORTINTERNALS_H diff --git a/PxShared/src/foundation/include/PsString.h b/PxShared/src/foundation/include/PsString.h new file mode 100644 index 00000000..7a69264a --- /dev/null +++ b/PxShared/src/foundation/include/PsString.h @@ -0,0 +1,90 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#ifndef PSFOUNDATION_PSSTRING_H
+#define PSFOUNDATION_PSSTRING_H
+
+#include "foundation/PxPreprocessor.h"
+#include "foundation/PxSimpleTypes.h"
+#include <stdarg.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// the following functions have C99 semantics. Note that C99 requires for snprintf and vsnprintf:
+// * the resulting string is always NULL-terminated regardless of truncation.
+// * in the case of truncation the return value is the number of characters that would have been created.
+
+PX_FOUNDATION_API int32_t sscanf(const char* buffer, const char* format, ...);
+PX_FOUNDATION_API int32_t strcmp(const char* str1, const char* str2);
+PX_FOUNDATION_API int32_t strncmp(const char* str1, const char* str2, size_t count);
+PX_FOUNDATION_API int32_t snprintf(char* dst, size_t dstSize, const char* format, ...);
+PX_FOUNDATION_API int32_t vsnprintf(char* dst, size_t dstSize, const char* format, va_list arg);
+
+// strlcat and strlcpy have BSD semantics:
+// * dstSize is always the size of the destination buffer
+// * the resulting string is always NULL-terminated regardless of truncation
+// * in the case of truncation the return value is the length of the string that would have been created
+
+PX_FOUNDATION_API size_t strlcat(char* dst, size_t dstSize, const char* src);
+PX_FOUNDATION_API size_t strlcpy(char* dst, size_t dstSize, const char* src);
+
+// case-insensitive string comparison
+PX_FOUNDATION_API int32_t stricmp(const char* str1, const char* str2);
+PX_FOUNDATION_API int32_t strnicmp(const char* str1, const char* str2, size_t count);
+
+// in-place string case conversion
+PX_FOUNDATION_API void strlwr(char* str);
+PX_FOUNDATION_API void strupr(char* str);
+
+/**
+\brief The maximum supported formatted output string length
+(number of characters after replacement).
+
+@see printFormatted()
+*/
+static const size_t MAX_PRINTFORMATTED_LENGTH = 1024;
+
+/**
+\brief Prints the formatted data, trying to make sure it's visible to the app programmer
+
+@see MAX_PRINTFORMATTED_LENGTH
+*/
+PX_FOUNDATION_API void printFormatted(const char*, ...);
+
+/**
+\brief Prints the string literally (does not consume % specifier), trying to make sure it's visible to the app
+programmer
+*/
+PX_FOUNDATION_API void printString(const char*);
+}
+}
+#endif // #ifndef PSFOUNDATION_PSSTRING_H
diff --git a/PxShared/src/foundation/include/PsSync.h b/PxShared/src/foundation/include/PsSync.h
new file mode 100644
index 00000000..e1db6cea
--- /dev/null
+++ b/PxShared/src/foundation/include/PsSync.h
@@ -0,0 +1,138 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSSYNC_H +#define PSFOUNDATION_PSSYNC_H + +#include "PsAllocator.h" + +namespace physx +{ +namespace shdfnd +{ +/*! +Implementation notes: +* - Calling set() on an already signaled Sync does not change its state. +* - Calling reset() on an already reset Sync does not change its state. +* - Calling set() on a reset Sync wakes all waiting threads (potential for thread contention). +* - Calling wait() on an already signaled Sync will return true immediately. +* - NOTE: be careful when pulsing an event with set() followed by reset(), because a +* thread that is not waiting on the event will miss the signal. +*/ +class PX_FOUNDATION_API SyncImpl +{ + public: + static const uint32_t waitForever = 0xffffffff; + + SyncImpl(); + + ~SyncImpl(); + + /** Wait on the object for at most the given number of ms. Returns + * true if the object is signaled. Sync::waitForever will block forever + * or until the object is signaled. + */ + + bool wait(uint32_t milliseconds = waitForever); + + /** Signal the synchronization object, waking all threads waiting on it */ + + void set(); + + /** Reset the synchronization object */ + + void reset(); + + /** + Size of this class. + */ + static const uint32_t& getSize(); +}; + +/*! +Implementation notes: +* - Calling set() on an already signaled Sync does not change its state. +* - Calling reset() on an already reset Sync does not change its state. +* - Calling set() on a reset Sync wakes all waiting threads (potential for thread contention). +* - Calling wait() on an already signaled Sync will return true immediately. +* - NOTE: be careful when pulsing an event with set() followed by reset(), because a +* thread that is not waiting on the event will miss the signal. +*/ +template <typename Alloc = ReflectionAllocator<SyncImpl> > +class SyncT : protected Alloc +{ + public: + static const uint32_t waitForever = SyncImpl::waitForever; + + SyncT(const Alloc& alloc = Alloc()) : Alloc(alloc) + { + mImpl = reinterpret_cast<SyncImpl*>(Alloc::allocate(SyncImpl::getSize(), __FILE__, __LINE__)); + PX_PLACEMENT_NEW(mImpl, SyncImpl)(); + } + + ~SyncT() + { + mImpl->~SyncImpl(); + Alloc::deallocate(mImpl); + } + + /** Wait on the object for at most the given number of ms. Returns + * true if the object is signaled. Sync::waitForever will block forever + * or until the object is signaled. 
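+ *
+ * Illustrative sketch (hypothetical producer/consumer code, not part of this
+ * header), using the Sync typedef declared below:
+ *
+ *   Sync workReady;
+ *   // producer thread: publishWork(); workReady.set();
+ *   // consumer thread: workReady.wait(); workReady.reset(); consumeWork();
+ *
+ * publishWork()/consumeWork() are placeholders; note that a set()/reset()
+ * pulse is missed by any thread that is not already blocked in wait().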
+ */ + + bool wait(uint32_t milliseconds = SyncImpl::waitForever) + { + return mImpl->wait(milliseconds); + } + + /** Signal the synchronization object, waking all threads waiting on it */ + + void set() + { + mImpl->set(); + } + + /** Reset the synchronization object */ + + void reset() + { + mImpl->reset(); + } + + private: + class SyncImpl* mImpl; +}; + +typedef SyncT<> Sync; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSSYNC_H diff --git a/PxShared/src/foundation/include/PsTempAllocator.h b/PxShared/src/foundation/include/PsTempAllocator.h new file mode 100644 index 00000000..40c029d0 --- /dev/null +++ b/PxShared/src/foundation/include/PsTempAllocator.h @@ -0,0 +1,62 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#ifndef PSFOUNDATION_PSTEMPALLOCATOR_H +#define PSFOUNDATION_PSTEMPALLOCATOR_H + +#include "PsAllocator.h" + +namespace physx +{ +namespace shdfnd +{ +union TempAllocatorChunk +{ + TempAllocatorChunk() : mNext(0) + { + } + TempAllocatorChunk* mNext; // while chunk is free + uint32_t mIndex; // while chunk is allocated + uint8_t mPad[16]; // 16 byte aligned allocations +}; + +class TempAllocator +{ + public: + PX_FORCE_INLINE TempAllocator(const char* = 0) + { + } + PX_FOUNDATION_API void* allocate(size_t size, const char* file, int line); + PX_FOUNDATION_API void deallocate(void* ptr); +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSTEMPALLOCATOR_H diff --git a/PxShared/src/foundation/include/PsThread.h b/PxShared/src/foundation/include/PsThread.h new file mode 100644 index 00000000..950d5381 --- /dev/null +++ b/PxShared/src/foundation/include/PsThread.h @@ -0,0 +1,382 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSTHREAD_H +#define PSFOUNDATION_PSTHREAD_H + +#include "PsUserAllocated.h" + +// dsequeira: according to existing comment here (David Black would be my guess) +// "This is useful to reduce bus contention on tight spin locks. And it needs +// to be a macro as the xenon compiler often ignores even __forceinline." What's not +// clear is why a pause function needs inlining...? (TODO: check with XBox team) + +// todo: these need to go somewhere else + +#if PX_WINDOWS_FAMILY || PX_XBOXONE +#define PxSpinLockPause() __asm pause +#elif PX_LINUX || PX_ANDROID || PX_PS4 || PX_APPLE_FAMILY +#define PxSpinLockPause() asm("nop") +#else +#error "Platform not supported!" 
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+struct ThreadPriority // todo: put in some other header file
+{
+    enum Enum
+    {
+        /**
+        \brief High priority
+        */
+        eHIGH = 0,
+
+        /**
+        \brief Above Normal priority
+        */
+        eABOVE_NORMAL = 1,
+
+        /**
+        \brief Normal/default priority
+        */
+        eNORMAL = 2,
+
+        /**
+        \brief Below Normal priority
+        */
+        eBELOW_NORMAL = 3,
+
+        /**
+        \brief Low priority.
+        */
+        eLOW = 4,
+        eFORCE_DWORD = 0xffFFffFF
+    };
+};
+
+class Runnable
+{
+  public:
+    Runnable()
+    {
+    }
+    virtual ~Runnable()
+    {
+    }
+    virtual void execute(void)
+    {
+    }
+};
+
+class PX_FOUNDATION_API ThreadImpl
+{
+  public:
+    typedef size_t Id; // space for a pointer or an integer
+    typedef void* (*ExecuteFn)(void*);
+
+    static uint32_t getDefaultStackSize();
+    static Id getId();
+
+    /**
+    Construct (but do not start) the thread object. The OS thread object will not be created
+    until start() is called. Executes in the context
+    of the spawning thread.
+    */
+
+    ThreadImpl();
+
+    /**
+    Construct and start the thread, passing the given arg to the given fn. (pthread style)
+    */
+
+    ThreadImpl(ExecuteFn fn, void* arg);
+
+    /**
+    Deallocate all resources associated with the thread. Should be called in the
+    context of the spawning thread.
+    */
+
+    ~ThreadImpl();
+
+    /**
+    Create the OS thread and start it running. Called in the context of the spawning thread.
+    If an affinity mask has previously been set then it will be applied after the
+    thread has been created.
+    */
+
+    void start(uint32_t stackSize, Runnable* r);
+
+    /**
+    Violently kill the current thread. Blunt instrument, not recommended since
+    it can leave all kinds of things unreleased (stack, memory, mutexes...) Should
+    be called in the context of the spawning thread.
+    */
+
+    void kill();
+
+    /**
+    Stop the thread. Signals the spawned thread that it should stop, so the
+    spawned thread should check quitIsSignalled() regularly.
+    */
+
+    void signalQuit();
+
+    /**
+    Wait for a thread to stop. Should be called in the context of the spawning
+    thread. Returns false if the thread has not been started.
+    */
+
+    bool waitForQuit();
+
+    /**
+    check whether the thread is signalled to quit. Called in the context of the
+    spawned thread.
+    */
+
+    bool quitIsSignalled();
+
+    /**
+    Cleanly shut down this thread. Called in the context of the spawned thread.
+    */
+    void quit();
+
+    /**
+    Change the affinity mask for this thread. The mask is a platform
+    specific value.
+
+    On Windows, Linux, PS4 and XboxOne platforms, each set mask bit represents
+    the index of a logical processor that the OS may schedule thread execution on.
+    Bits outside the range of valid logical processors may be ignored or cause
+    the function to return an error.
+
+    On Apple platforms, this function has no effect.
+
+    If the thread has not yet been started then the mask is stored
+    and applied when the thread is started.
+
+    If the thread has already been started then this method returns the
+    previous affinity mask on success, otherwise it returns zero.
+    */
+    uint32_t setAffinityMask(uint32_t mask);
+
+    static ThreadPriority::Enum getPriority(Id threadId);
+
+    /** Set thread priority. */
+    void setPriority(ThreadPriority::Enum prio);
+
+    /** set the thread's name */
+    void setName(const char* name);
+
+    /** Put the current thread to sleep for the given number of milliseconds */
+    static void sleep(uint32_t ms);
+
+    /** Yield the current thread's slot on the CPU */
+    static void yield();
+
+    /** Return the number of physical cores (does not include hyper-threaded cores), returns 0 on failure */
+    static uint32_t getNbPhysicalCores();
+
+    /**
+    Size of this class.
+    */
+    static const uint32_t& getSize();
+};
+
+/**
+Thread abstraction API
+*/
+template <typename Alloc = ReflectionAllocator<ThreadImpl> >
+class ThreadT : protected Alloc, public UserAllocated, public Runnable
+{
+  public:
+    typedef ThreadImpl::Id Id; // space for a pointer or an integer
+
+    /**
+    Construct (but do not start) the thread object. Executes in the context
+    of the spawning thread.
+    */
+    ThreadT(const Alloc& alloc = Alloc()) : Alloc(alloc)
+    {
+        mImpl = reinterpret_cast<ThreadImpl*>(Alloc::allocate(ThreadImpl::getSize(), __FILE__, __LINE__));
+        PX_PLACEMENT_NEW(mImpl, ThreadImpl)();
+    }
+
+    /**
+    Construct and start the thread, passing the given arg to the given fn. (pthread style)
+    */
+    ThreadT(ThreadImpl::ExecuteFn fn, void* arg, const Alloc& alloc = Alloc()) : Alloc(alloc)
+    {
+        mImpl = reinterpret_cast<ThreadImpl*>(Alloc::allocate(ThreadImpl::getSize(), __FILE__, __LINE__));
+        PX_PLACEMENT_NEW(mImpl, ThreadImpl)(fn, arg);
+    }
+
+    /**
+    Deallocate all resources associated with the thread. Should be called in the
+    context of the spawning thread.
+    */
+    virtual ~ThreadT()
+    {
+        mImpl->~ThreadImpl();
+        Alloc::deallocate(mImpl);
+    }
+
+    /**
+    start the thread running. Called in the context of the spawning thread.
+    */
+
+    void start(uint32_t stackSize = ThreadImpl::getDefaultStackSize())
+    {
+        mImpl->start(stackSize, this);
+    }
+
+    /**
+    Violently kill the current thread. Blunt instrument, not recommended since
+    it can leave all kinds of things unreleased (stack, memory, mutexes...) Should
+    be called in the context of the spawning thread.
+    */
+
+    void kill()
+    {
+        mImpl->kill();
+    }
+
+    /**
+    The virtual execute() method is the user-defined function that will
+    run in the new thread. Called in the context of the spawned thread.
+    */
+
+    virtual void execute(void)
+    {
+    }
+
+    /**
+    stop the thread. Signals the spawned thread that it should stop, so the
+    spawned thread should check quitIsSignalled() regularly.
+    */
+
+    void signalQuit()
+    {
+        mImpl->signalQuit();
+    }
+
+    /**
+    Wait for a thread to stop. Should be called in the context of the spawning
+    thread. Returns false if the thread has not been started.
+    */
+
+    bool waitForQuit()
+    {
+        return mImpl->waitForQuit();
+    }
+
+    /**
+    check whether the thread is signalled to quit. Called in the context of the
+    spawned thread.
+    */
+
+    bool quitIsSignalled()
+    {
+        return mImpl->quitIsSignalled();
+    }
+
+    /**
+    Cleanly shut down this thread. Called in the context of the spawned thread.
+    */
+    void quit()
+    {
+        mImpl->quit();
+    }
+
+    uint32_t setAffinityMask(uint32_t mask)
+    {
+        return mImpl->setAffinityMask(mask);
+    }
+
+    static ThreadPriority::Enum getPriority(ThreadImpl::Id threadId)
+    {
+        return ThreadImpl::getPriority(threadId);
+    }
+
+    /** Set thread priority.
*/ + void setPriority(ThreadPriority::Enum prio) + { + mImpl->setPriority(prio); + } + + /** set the thread's name */ + void setName(const char* name) + { + mImpl->setName(name); + } + + /** Put the current thread to sleep for the given number of milliseconds */ + static void sleep(uint32_t ms) + { + ThreadImpl::sleep(ms); + } + + /** Yield the current thread's slot on the CPU */ + static void yield() + { + ThreadImpl::yield(); + } + + static uint32_t getDefaultStackSize() + { + return ThreadImpl::getDefaultStackSize(); + } + + static ThreadImpl::Id getId() + { + return ThreadImpl::getId(); + } + + static uint32_t getNbPhysicalCores() + { + return ThreadImpl::getNbPhysicalCores(); + } + + private: + class ThreadImpl* mImpl; +}; + +typedef ThreadT<> Thread; + +PX_FOUNDATION_API uint32_t TlsAlloc(); +PX_FOUNDATION_API void TlsFree(uint32_t index); +PX_FOUNDATION_API void* TlsGet(uint32_t index); +PX_FOUNDATION_API uint32_t TlsSet(uint32_t index, void* value); + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSTHREAD_H diff --git a/PxShared/src/foundation/include/PsTime.h b/PxShared/src/foundation/include/PsTime.h new file mode 100644 index 00000000..09631e64 --- /dev/null +++ b/PxShared/src/foundation/include/PsTime.h @@ -0,0 +1,95 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSTIME_H +#define PSFOUNDATION_PSTIME_H + +#include "Ps.h" + +#if PX_LINUX || PX_ANDROID +#include <time.h> +#endif + +namespace physx +{ +namespace shdfnd +{ + +struct CounterFrequencyToTensOfNanos +{ + uint64_t mNumerator; + uint64_t mDenominator; + CounterFrequencyToTensOfNanos(uint64_t inNum, uint64_t inDenom) : mNumerator(inNum), mDenominator(inDenom) + { + } + + // quite slow. 
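+    // Worked example (illustrative, assuming a 3 GHz counter): with
+    // mNumerator = 100000000 (tens of nanoseconds per second) and
+    // mDenominator = 3000000000 (counts per second),
+    // toTensOfNanos(c) = c * 100000000 / 3000000000 = c / 30.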
+ uint64_t toTensOfNanos(uint64_t inCounter) const + { + return (inCounter * mNumerator) / mDenominator; + } +}; + +class PX_FOUNDATION_API Time +{ + public: + typedef double Second; + static const uint64_t sNumTensOfNanoSecondsInASecond = 100000000; + // This is supposedly guaranteed to not change after system boot + // regardless of processors, speedstep, etc. + static const CounterFrequencyToTensOfNanos& getBootCounterFrequency(); + + static CounterFrequencyToTensOfNanos getCounterFrequency(); + + static uint64_t getCurrentCounterValue(); + + // SLOW!! + // Thar be a 64 bit divide in thar! + static uint64_t getCurrentTimeInTensOfNanoSeconds() + { + uint64_t ticks = getCurrentCounterValue(); + return getBootCounterFrequency().toTensOfNanos(ticks); + } + + Time(); + Second getElapsedSeconds(); + Second peekElapsedSeconds(); + Second getLastTime() const; + + private: +#if PX_LINUX || PX_ANDROID || PX_APPLE_FAMILY || PX_PS4 + Second mLastTime; +#else + int64_t mTickCount; +#endif +}; +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSTIME_H diff --git a/PxShared/src/foundation/include/PsUserAllocated.h b/PxShared/src/foundation/include/PsUserAllocated.h new file mode 100644 index 00000000..56b2e418 --- /dev/null +++ b/PxShared/src/foundation/include/PsUserAllocated.h @@ -0,0 +1,92 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUSERALLOCATED_H +#define PSFOUNDATION_PSUSERALLOCATED_H + +#include "PsAllocator.h" + +namespace physx +{ +namespace shdfnd +{ +/** +Provides new and delete using a UserAllocator. +Guarantees that 'delete x;' uses the UserAllocator too. 
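+
+Illustrative sketch (hypothetical class; PX_NEW is assumed from PsAllocator.h):
+
+    class MyObject : public UserAllocated { };
+    // MyObject* o = PX_NEW(MyObject)(); // allocated via the user allocator
+    // delete o;                         // released via the operator delete below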
+*/
+class UserAllocated
+{
+  public:
+    // PX_SERIALIZATION
+    PX_INLINE void* operator new(size_t, void* address)
+    {
+        return address;
+    }
+    //~PX_SERIALIZATION
+    // Matching operator delete to the above operator new. Don't ask me
+    // how this makes any sense - Nuernberger.
+    PX_INLINE void operator delete(void*, void*)
+    {
+    }
+
+    template <typename Alloc>
+    PX_INLINE void* operator new(size_t size, Alloc alloc, const char* fileName, int line)
+    {
+        return alloc.allocate(size, fileName, line);
+    }
+    template <typename Alloc>
+    PX_INLINE void* operator new [](size_t size, Alloc alloc, const char* fileName, int line)
+    {
+        return alloc.allocate(size, fileName, line);
+    }
+
+    // placement delete
+    template <typename Alloc>
+    PX_INLINE void operator delete(void* ptr, Alloc alloc, const char* fileName, int line)
+    {
+        PX_UNUSED(fileName);
+        PX_UNUSED(line);
+        alloc.deallocate(ptr);
+    }
+    template <typename Alloc>
+    PX_INLINE void operator delete [](void* ptr, Alloc alloc, const char* fileName, int line)
+    {
+        PX_UNUSED(fileName);
+        PX_UNUSED(line);
+        alloc.deallocate(ptr);
+    }
+
+    PX_INLINE void operator delete(void* ptr)
+    {
+        NonTrackingAllocator().deallocate(ptr);
+    }
+    PX_INLINE void operator delete [](void* ptr)
+    {
+        NonTrackingAllocator().deallocate(ptr);
+    }
+};
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSUSERALLOCATED_H
diff --git a/PxShared/src/foundation/include/PsUtilities.h b/PxShared/src/foundation/include/PsUtilities.h
new file mode 100644
index 00000000..34c5e5c6
--- /dev/null
+++ b/PxShared/src/foundation/include/PsUtilities.h
@@ -0,0 +1,165 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#ifndef PSFOUNDATION_PSUTILITIES_H +#define PSFOUNDATION_PSUTILITIES_H + +#include "foundation/PxVec3.h" +#include "foundation/PxAssert.h" +#include "Ps.h" +#include "PsIntrinsics.h" +#include "PsBasicTemplates.h" + +namespace physx +{ +namespace shdfnd +{ +PX_INLINE char littleEndian() +{ + int i = 1; + return *(reinterpret_cast<char*>(&i)); +} + +// PT: checked casts +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 to32(PxU64 value) +{ + PX_ASSERT(value <= 0xffffffff); + return PxU32(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU16 to16(PxU32 value) +{ + PX_ASSERT(value <= 0xffff); + return PxU16(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU16 value) +{ + PX_ASSERT(value <= 0xff); + return PxU8(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU32 value) +{ + PX_ASSERT(value <= 0xff); + return PxU8(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxI32 value) +{ + PX_ASSERT(value <= 0xff); + PX_ASSERT(value >= 0); + return PxU8(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxI8 toI8(PxU32 value) +{ + PX_ASSERT(value <= 0x7f); + return PxI8(value); +} + +/*! +Get number of elements in array +*/ +template <typename T, size_t N> +char (&ArraySizeHelper(T (&array)[N]))[N]; +#define PX_ARRAY_SIZE(_array) (sizeof(physx::shdfnd::ArraySizeHelper(_array))) + +/*! +Sort two elements using operator< + +On return x will be the smaller of the two +*/ +template <class T> +PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y) +{ + if(y < x) + swap(x, y); +} + +// most architectures can do predication on real comparisons, and on VMX, it matters + +PX_CUDA_CALLABLE PX_FORCE_INLINE void order(PxReal& x, PxReal& y) +{ + PxReal newX = PxMin(x, y); + PxReal newY = PxMax(x, y); + x = newX; + y = newY; +} + +/*! +Sort two elements using operator< and also keep order +of any extra data +*/ +template <class T, class E1> +PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y, E1& xe1, E1& ye1) +{ + if(y < x) + { + swap(x, y); + swap(xe1, ye1); + } +} + +#if PX_GCC_FAMILY && !PX_EMSCRIPTEN +__attribute__((noreturn)) +#endif + PX_INLINE void debugBreak() +{ +#if PX_WINDOWS || PX_XBOXONE + __debugbreak(); +#elif PX_ANDROID + raise(SIGTRAP); // works better than __builtin_trap. Proper call stack and can be continued. +#elif PX_LINUX + asm("int $3"); +#elif PX_GCC_FAMILY + __builtin_trap(); +#else + PX_ASSERT(false); +#endif +} + +bool checkValid(const float&); +bool checkValid(const PxVec3&); +bool checkValid(const PxQuat&); +bool checkValid(const PxMat33&); +bool checkValid(const PxTransform&); +bool checkValid(const char*); + +// equivalent to std::max_element +template <typename T> +inline const T* maxElement(const T* first, const T* last) +{ + const T* m = first; + for(const T* it = first + 1; it < last; ++it) + if(*m < *it) + m = it; + + return m; +} + +} // namespace shdfnd +} // namespace physx + +#endif diff --git a/PxShared/src/foundation/include/PsVecMath.h b/PxShared/src/foundation/include/PsVecMath.h new file mode 100644 index 00000000..25054aed --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMath.h @@ -0,0 +1,1330 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECMATH_H
+#define PSFOUNDATION_PSVECMATH_H
+
+#include "Ps.h"
+#include "PsIntrinsics.h"
+#include "foundation/PxVec3.h"
+#include "foundation/PxVec4.h"
+#include "foundation/PxMat33.h"
+#include "foundation/PxUnionCast.h"
+
+// We can opt to use the scalar version of vectorised functions.
+// This can catch type safety issues and might even work out more optimal on pc.
+// It will also be useful for benchmarking and testing.
+// NEVER submit with vector intrinsics deactivated without good reason.
+// AM: deactivating SIMD for debug win64 just so autobuild will also exercise
+// the non-SIMD path, until a dedicated non-SIMD platform such as Arm comes online.
+// TODO: dima: reference all platforms with SIMD support here;
+// all unknown/experimental cases should default to NO SIMD.
+
+// enable/disable SIMD
+#if PX_INTEL_FAMILY
+#define COMPILE_VECTOR_INTRINSICS 1
+#elif PX_ANDROID && PX_NEON
+#define COMPILE_VECTOR_INTRINSICS 1
+#elif PX_IOS && PX_NEON
+#define COMPILE_VECTOR_INTRINSICS 1
+#else
+#define COMPILE_VECTOR_INTRINSICS 0
+#endif
+
+#if COMPILE_VECTOR_INTRINSICS && PX_INTEL_FAMILY && (PX_UNIX_FAMILY || PX_PS4)
+// only SSE2 compatible platforms should reach this
+#if PX_EMSCRIPTEN
+#include <emmintrin.h>
+#else
+#include <xmmintrin.h>
+#endif
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace aos
+{
+
+// Basic AoS types are
+// FloatV - 16-byte aligned representation of float.
+// Vec3V - 16-byte aligned representation of PxVec3 stored as (x y z 0).
+// Vec4V - 16-byte aligned representation of vector of 4 floats stored as (x y z w).
+// BoolV - 16-byte aligned representation of vector of 4 bools stored as (x y z w).
+// VecU32V - 16-byte aligned representation of 4 unsigned ints stored as (x y z w).
+// VecI32V - 16-byte aligned representation of 4 signed ints stored as (x y z w).
+// Mat33V - 16-byte aligned representation of any 3x3 matrix.
+// Mat34V - 16-byte aligned representation of transformation matrix (rotation in col1,col2,col3 and translation in
+// col4).
+// Mat44V - 16-byte aligned representation of any 4x4 matrix.
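+
+// Illustrative workflow sketch (hypothetical caller code, not part of this
+// header): the load / operate / store pattern used with the declarations
+// below.
+//
+//   PxVec3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f), out;
+//   Vec3V av = V3LoadU(a);      // (a.x, a.y, a.z, 0)
+//   Vec3V bv = V3LoadU(b);
+//   Vec3V sum = V3Add(av, bv);  // per-component add
+//   V3StoreU(sum, out);         // write the xyz lanes back to a PxVec3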
+
+#if COMPILE_VECTOR_INTRINSICS
+#include "PsAoS.h"
+#else
+#include "PsVecMathAoSScalar.h"
+#endif
+
+//////////////////////////////////////////
+// Construct a simd type from a scalar type
+//////////////////////////////////////////
+
+// FloatV
+//(f,f,f,f)
+PX_FORCE_INLINE FloatV FLoad(const PxF32 f);
+
+// Vec3V
+//(f,f,f,0)
+PX_FORCE_INLINE Vec3V V3Load(const PxF32 f);
+//(f.x,f.y,f.z,0)
+PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f);
+//(f.x,f.y,f.z,0), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f);
+//(f.x,f.y,f.z,w_undefined), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f);
+//(f.x,f.y,f.z,0)
+PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* f);
+//(f.x,f.y,f.z,0), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* f);
+
+// Vec4V
+//(f,f,f,f)
+PX_FORCE_INLINE Vec4V V4Load(const PxF32 f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f);
+//(f[0],f[1],f[2],f[3]), f must be 16-byte aligned
+PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f);
+//(x,y,z,w)
+PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w);
+
+// BoolV
+//(f,f,f,f)
+PX_FORCE_INLINE BoolV BLoad(const bool f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE BoolV BLoad(const bool* const f);
+
+// VecU32V
+//(f,f,f,f)
+PX_FORCE_INLINE VecU32V U4Load(const PxU32 f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* f);
+//(f[0],f[1],f[2],f[3]), f must be 16-byte aligned
+PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* f);
+//((U32)x, (U32)y, (U32)z, (U32)w)
+PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w);
+
+// VecI32V
+//(i,i,i,i)
+PX_FORCE_INLINE VecI32V I4Load(const PxI32 i);
+//(i[0],i[1],i[2],i[3])
+PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i);
+//(i[0],i[1],i[2],i[3]), i must be 16-byte aligned
+PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i);
+
+// QuatV
+//(x = v[0], y = v[1], z = v[2], w = v[3]); the array does not need to be aligned
+PX_FORCE_INLINE QuatV QuatVLoadU(const PxF32* v);
+//(x = v[0], y = v[1], z = v[2], w = v[3]); the array must be 16-byte aligned (fast load)
+PX_FORCE_INLINE QuatV QuatVLoadA(const PxF32* v);
+//(x, y, z, w)
+PX_FORCE_INLINE QuatV QuatVLoadXYZW(const PxF32 x, const PxF32 y, const PxF32 z, const PxF32 w);
+
+// not added to public api
+Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& v);
+
+///////////////////////////////////////////////////
+// Construct a simd type from a different simd type
+///////////////////////////////////////////////////
+
+// Vec3V
+//(v.x,v.y,v.z,0)
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v);
+//(v.x,v.y,v.z,undefined) - be very careful with w!=0 because many functions require w==0 for correct operation,
+// e.g. V3Dot, V3Length, V3Cross etc.
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v);
+
+// Vec4V
+//(f.x,f.y,f.z,f.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f);
+//((PxF32)a.x, (PxF32)a.y, (PxF32)a.z, (PxF32)a.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a);
+//((PxF32)a.x, (PxF32)a.y, (PxF32)a.z, (PxF32)a.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a);
+//(*reinterpret_cast<PxF32*>(&a.x), *reinterpret_cast<PxF32*>(&a.y), *reinterpret_cast<PxF32*>(&a.z),
+// *reinterpret_cast<PxF32*>(&a.w))
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a);
+//(*reinterpret_cast<PxF32*>(&a.x), *reinterpret_cast<PxF32*>(&a.y), *reinterpret_cast<PxF32*>(&a.z),
+// *reinterpret_cast<PxF32*>(&a.w))
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a);
+
+// VecU32V
+//(*reinterpret_cast<PxU32*>(&a.x), *reinterpret_cast<PxU32*>(&a.y), *reinterpret_cast<PxU32*>(&a.z),
+// *reinterpret_cast<PxU32*>(&a.w))
+PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a);
+//(b[0], b[1], b[2], b[3])
+PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg b);
+
+// VecI32V
+//(*reinterpret_cast<PxI32*>(&a.x), *reinterpret_cast<PxI32*>(&a.y), *reinterpret_cast<PxI32*>(&a.z),
+// *reinterpret_cast<PxI32*>(&a.w))
+PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a);
+//((I32)a.x, (I32)a.y, (I32)a.z, (I32)a.w)
+PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a);
+//((I32)b.x, (I32)b.y, (I32)b.z, (I32)b.w)
+PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg b);
+
+///////////////////////////////////////////////////
+// Convert from a simd type back to a scalar type
+///////////////////////////////////////////////////
+
+// FloatV
+// a.x
+PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f);
+
+// Vec3V
+//(a.x,a.y,a.z)
+PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f);
+//(a.x,a.y,a.z)
+PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f);
+
+// Vec4V
+PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f);
+PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f);
+
+// BoolV
+PX_FORCE_INLINE void BStoreA(const BoolV b, PxU32* f);
+
+// VecU32V
+PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u);
+
+// VecI32V
+PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i);
+
+//////////////////////////////////////////////////////////////////
+// Test that simd types have elements in the floating point range
+//////////////////////////////////////////////////////////////////
+
+// check that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a);
+// check that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a);
+// check that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a);
+
+// Check that w-component is zero.
+PX_FORCE_INLINE bool isValidVec3V(const Vec3V a);
+
+//////////////////////////////////////////////////////////////////
+// Tests that all elements of two 16-byte types are completely equivalent.
+// Use these tests for unit testing and asserts only.
+////////////////////////////////////////////////////////////////// + +namespace _VecMathTests +{ +PX_FORCE_INLINE Vec3V getInvalidVec3V(); +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b); +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b); +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b); +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b); +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b); +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b); + +PX_FORCE_INLINE bool allElementsEqualMat33V(const Mat33V& a, const Mat33V& b) +{ + return (allElementsEqualVec3V(a.col0, b.col0) && allElementsEqualVec3V(a.col1, b.col1) && + allElementsEqualVec3V(a.col2, b.col2)); +} +PX_FORCE_INLINE bool allElementsEqualMat34V(const Mat34V& a, const Mat34V& b) +{ + return (allElementsEqualVec3V(a.col0, b.col0) && allElementsEqualVec3V(a.col1, b.col1) && + allElementsEqualVec3V(a.col2, b.col2) && allElementsEqualVec3V(a.col3, b.col3)); +} +PX_FORCE_INLINE bool allElementsEqualMat44V(const Mat44V& a, const Mat44V& b) +{ + return (allElementsEqualVec4V(a.col0, b.col0) && allElementsEqualVec4V(a.col1, b.col1) && + allElementsEqualVec4V(a.col2, b.col2) && allElementsEqualVec4V(a.col3, b.col3)); +} + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b); +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b); +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b); +PX_FORCE_INLINE bool allElementsNearEqualMat33V(const Mat33V& a, const Mat33V& b) +{ + return (allElementsNearEqualVec3V(a.col0, b.col0) && allElementsNearEqualVec3V(a.col1, b.col1) && + allElementsNearEqualVec3V(a.col2, b.col2)); +} +PX_FORCE_INLINE bool allElementsNearEqualMat34V(const Mat34V& a, const Mat34V& b) +{ + return (allElementsNearEqualVec3V(a.col0, b.col0) && allElementsNearEqualVec3V(a.col1, b.col1) && + allElementsNearEqualVec3V(a.col2, b.col2) && allElementsNearEqualVec3V(a.col3, b.col3)); +} +PX_FORCE_INLINE bool allElementsNearEqualMat44V(const Mat44V& a, const Mat44V& b) +{ + return (allElementsNearEqualVec4V(a.col0, b.col0) && allElementsNearEqualVec4V(a.col1, b.col1) && + allElementsNearEqualVec4V(a.col2, b.col2) && allElementsNearEqualVec4V(a.col3, b.col3)); +} +} + +////////////////////////////////////////////////////////////////// +// Math operations on FloatV +////////////////////////////////////////////////////////////////// + +//(0,0,0,0) +PX_FORCE_INLINE FloatV FZero(); +//(1,1,1,1) +PX_FORCE_INLINE FloatV FOne(); +//(0.5,0.5,0.5,0.5) +PX_FORCE_INLINE FloatV FHalf(); +//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL) +PX_FORCE_INLINE FloatV FEps(); +//(PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL PX_MAX_REAL) +PX_FORCE_INLINE FloatV FMax(); +//(-PX_MAX_REAL, -PX_MAX_REAL, -PX_MAX_REAL -PX_MAX_REAL) +PX_FORCE_INLINE FloatV FNegMax(); +//(1e-6f, 1e-6f, 1e-6f, 1e-6f) +PX_FORCE_INLINE FloatV FEps6(); +//((PxF32*)&1, (PxF32*)&1, (PxF32*)&1, (PxF32*)&1) + +//-f (per component) +PX_FORCE_INLINE FloatV FNeg(const FloatV f); +// a+b (per component) +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b); +// a-b (per component) +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b); +// a*b (per component) +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE FloatV 
FDivFast(const FloatV a, const FloatV b); +// 1.0f/a +PX_FORCE_INLINE FloatV FRecip(const FloatV a); +// 1.0f/a +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a); +// sqrt(a) +PX_FORCE_INLINE FloatV FSqrt(const FloatV a); +// a*b+c +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c); +// c-a*b +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c); +// fabs(a) +PX_FORCE_INLINE FloatV FAbs(const FloatV a); +// c ? a : b (per component) +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b); +// a>b (per component) +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b); +// a>=b (per component) +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b); +// a==b (per component) +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b); +// Max(a,b) (per component) +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b); +// Min(a,b) (per component) +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b); +// Clamp(a,b) (per component) +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV); + +// a.x>b.x +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b); +// a.x>=b.x +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b); +// a.x==b.x +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b); +// a<min || a>max +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max); +// a>=min && a<=max +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max); +// a<-bounds || a>bounds +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds); +// a>=-bounds && a<=bounds +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds); + +// round float a to the near int +PX_FORCE_INLINE FloatV FRound(const FloatV a); +// calculate the sin of float a +PX_FORCE_INLINE FloatV FSin(const FloatV a); +// calculate the cos of float b +PX_FORCE_INLINE FloatV FCos(const FloatV a); + +////////////////////////////////////////////////////////////////// +// Math operations on Vec3V +////////////////////////////////////////////////////////////////// + +//(f,f,f,f) +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f); + +//(x,y,z) +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z); + +//(1,0,0,0) +PX_FORCE_INLINE Vec3V V3UnitX(); +//(0,1,0,0) +PX_FORCE_INLINE Vec3V V3UnitY(); +//(0,0,1,0) +PX_FORCE_INLINE Vec3V V3UnitZ(); + +//(f.x,f.x,f.x,f.x) +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f); +//(f.y,f.y,f.y,f.y) +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f); +//(f.z,f.z,f.z,f.z) +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f); + +//(f,v.y,v.z,v.w) +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f); +//(v.x,f,v.z,v.w) +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f); +//(v.x,v.y,f,v.w) +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f); + +// v.x=f +PX_FORCE_INLINE void V3WriteX(Vec3V& v, const PxF32 f); +// v.y=f +PX_FORCE_INLINE void V3WriteY(Vec3V& v, const PxF32 f); +// v.z=f +PX_FORCE_INLINE void V3WriteZ(Vec3V& v, const PxF32 f); +// v.x=f.x, v.y=f.y, v.z=f.z +PX_FORCE_INLINE void V3WriteXYZ(Vec3V& v, const PxVec3& f); +// return v.x +PX_FORCE_INLINE PxF32 V3ReadX(const Vec3V& v); +// return v.y +PX_FORCE_INLINE PxF32 V3ReadY(const Vec3V& v); +// return v.y +PX_FORCE_INLINE PxF32 
V3ReadZ(const Vec3V& v); +// return (v.x,v.y,v.z) +PX_FORCE_INLINE const PxVec3& V3ReadXYZ(const Vec3V& v); + +//(a.x, b.x, c.x) +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c); +//(a.y, b.y, c.y) +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c); +//(a.z, b.z, c.z) +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c); + +//(0,0,0,0) +PX_FORCE_INLINE Vec3V V3Zero(); +//(1,1,1,1) +PX_FORCE_INLINE Vec3V V3One(); +//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL) +PX_FORCE_INLINE Vec3V V3Eps(); +//-c (per component) +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V c); +// a+b (per component) +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b); +// a-b (per component) +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b); +// a*b (per component) +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b); +// a*b (per component) +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b); +// 1.0f/a +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a); +// 1.0f/a +PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a); +// a*b+c +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c); +// c-a*b +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c); +// a*b+c +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c); +// c-a*b +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c); +// fabs(a) +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a); + +// a.b +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b); +// aXb +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b); +// |a.a|^1/2 +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3Length(const Vec3V a); +// a.a +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a); +// a*|a.a|^-1/2 +// Note: a.w must have value zero +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a); +// a.a>0 ? a*|a.a|^-1/2 : (0,0,0,0) +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3Length(const Vec3V a); +// a.a>0 ? a*|a.a|^-1/2 : unsafeReturnValue +// Note: a.w must have value zero +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue); +// a.x + a.y + a.z +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a); + +// c ? 
+// a>b (per component) +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b); +// a>=b (per component) +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b); +// a==b (per component) +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b); +// Max(a,b) (per component) +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b); +// Min(a,b) (per component) +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b); + +// Extract the maximum value from a +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a); + +// Extract the minimum value from a +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a); + +// Clamp(a, minV, maxV) (per component) +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV); + +// Extract the sign for each component +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a); + +// Test all components. +// (a.x>b.x && a.y>b.y && a.z>b.z) +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b); +// (a.x>=b.x && a.y>=b.y && a.z>=b.z) +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b); +// (a.x==b.x && a.y==b.y && a.z==b.z) +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b); +// a.x<min.x || a.y<min.y || a.z<min.z || a.x>max.x || a.y>max.y || a.z>max.z +// Note: a.w and min.w and max.w must have value zero +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max); +// a.x>=min.x && a.y>=min.y && a.z>=min.z && a.x<=max.x && a.y<=max.y && a.z<=max.z +// Note: a.w and min.w and max.w must have value zero +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max); +// a.x<-bounds.x || a.y<-bounds.y || a.z<-bounds.z || a.x>bounds.x || a.y>bounds.y || a.z>bounds.z +// Note: a.w and bounds.w must have value zero +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds); +// a.x>=-bounds.x && a.y>=-bounds.y && a.z>=-bounds.z && a.x<=bounds.x && a.y<=bounds.y && a.z<=bounds.z +// Note: a.w and bounds.w must have value zero +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds); + +//(floor(a.x + 0.5f), floor(a.y + 0.5f), floor(a.z + 0.5f)) +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a); + +//(sinf(a.x), sinf(a.y), sinf(a.z)) +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a); +//(cosf(a.x), cosf(a.y), cosf(a.z)) +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a); + +//(a.y,a.z,a.z) +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a); +//(a.x,a.y,a.x) +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a); +//(a.y,a.z,a.x) +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a); +//(a.z, a.x, a.y) +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a); +//(a.z,a.z,a.y) +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a); +//(a.y,a.x,a.x) +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a); +//(0, v1.z, v0.y) +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1); +//(v0.z, 0, v1.x) +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1); +//(v1.y, v0.x, 0) +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1); + +// Transpose 3 Vec3Vs in place. Sets the w component to zero +// [ x0, y0, z0, w0] [ x1, y1, z1, w1] [ x2, y2, z2, w2] -> [x0 x1 x2 0] [y0 y1 y2 0] [z0 z1 z2 0] +PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2); + +////////////////////////////////////////////////////////////////// +// Math operations on Vec4V +////////////////////////////////////////////////////////////////// + +//(f,f,f,f) +PX_FORCE_INLINE Vec4V V4Splat(const FloatV f); + +//(f[0],f[1],f[2],f[3]) +PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const f); +//(x,y,z,w) +PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w); +//(x.w, y.w, z.w, w.w) +PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); +//(x.z, y.z, z.z, w.z) +PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); +//(x.y, y.y, z.y, w.y) +PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); +//(x.x, y.x, z.x, w.x) +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); + +//(a.x, b.x, a.y, b.y) +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b); +//(a.z, b.z, a.w, b.w) +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b); + +//(1,0,0,0) +PX_FORCE_INLINE Vec4V V4UnitX(); +//(0,1,0,0) +PX_FORCE_INLINE Vec4V V4UnitY(); +//(0,0,1,0) +PX_FORCE_INLINE Vec4V V4UnitZ(); +//(0,0,0,1) +PX_FORCE_INLINE Vec4V V4UnitW(); + +//(f.x,f.x,f.x,f.x) +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f); +//(f.y,f.y,f.y,f.y) +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f); +//(f.z,f.z,f.z,f.z) +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f); +//(f.w,f.w,f.w,f.w) +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f); + +//(f,v.y,v.z,v.w) +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f); +//(v.x,f,v.z,v.w) +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f); +//(v.x,v.y,f,v.w) +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f); +//(v.x,v.y,v.z,f) +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f); + +//(v.x,v.y,v.z,0) +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v); + +//(a[elementIndex], a[elementIndex], a[elementIndex], a[elementIndex]) +template <int elementIndex> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a); + +// v.x=f +PX_FORCE_INLINE void V4WriteX(Vec4V& v, const PxF32 f); +// v.y=f +PX_FORCE_INLINE void V4WriteY(Vec4V& v, const PxF32 f); +// v.z=f +PX_FORCE_INLINE void V4WriteZ(Vec4V& v, const PxF32 f); +// v.w=f +PX_FORCE_INLINE void V4WriteW(Vec4V& v, const PxF32 f); +// v.x=f.x, v.y=f.y, v.z=f.z +PX_FORCE_INLINE void V4WriteXYZ(Vec4V& v, const PxVec3& f); +// return v.x +PX_FORCE_INLINE PxF32 V4ReadX(const Vec4V& v); +// return v.y +PX_FORCE_INLINE PxF32 V4ReadY(const Vec4V& v); +// return v.z +PX_FORCE_INLINE PxF32 V4ReadZ(const Vec4V& v); +// return v.w +PX_FORCE_INLINE PxF32 V4ReadW(const Vec4V& v); +// return (v.x,v.y,v.z) +PX_FORCE_INLINE const PxVec3& V4ReadXYZ(const Vec4V& v); + +//(0,0,0,0) +PX_FORCE_INLINE Vec4V V4Zero(); +//(1,1,1,1) +PX_FORCE_INLINE Vec4V V4One(); +//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL) +PX_FORCE_INLINE Vec4V V4Eps(); + +//-c (per component) +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V c); +// a+b (per component) +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b); +// a-b (per component) +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b); +// a*b (per component) +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b);
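
// Usage sketches (illustrative only, not part of the original header): typical
// compositions of the declarations above. 'pointPlaneDistance' and 'lerpClamped'
// are hypothetical helpers; FZero()/FOne() are the FloatV constants declared earlier.
PX_FORCE_INLINE FloatV pointPlaneDistance(const Vec3V p, const Vec3V n, const FloatV d)
{
	// Signed distance from p to the plane {x : dot(n, x) + d = 0}; requires p.w == n.w == 0.
	return FAdd(V3Dot(n, p), d);
}

PX_FORCE_INLINE Vec4V lerpClamped(const Vec4V a, const Vec4V b, const FloatV t)
{
	const FloatV tc = FClamp(t, FZero(), FOne()); // clamp the interpolation parameter to [0,1]
	return V4Add(V4Scale(V4Sub(b, a), tc), a);    // a + tc*(b - a), per component
}
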
+// a*b (per component) +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b); +// 1.0f/a +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a); +// 1.0f/a +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a); +// a*b+c +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c); +// c-a*b +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c); +// a*b+c +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c); +// c-a*b +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c); + +// fabs(a) +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a); +// bitwise a & ~b +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b); + +// a.b (W is taken into account) +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b); +// a.b (same computation as V3Dot. W is ignored in input) +PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b); +// aXb (same computation as V3Cross. W is ignored in input and undefined in output) +PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b); + +//|a.a|^1/2 +PX_FORCE_INLINE FloatV V4Length(const Vec4V a); +// a.a +PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a); + +// a*|a.a|^-1/2 +PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a); +// a.a>0 ? a*|a.a|^-1/2 : unsafeReturnValue +PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue); +// a*|a.a|^-1/2 +PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a); + +// c ? a : b (per component) +PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b); +// a>b (per component) +PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b); +// a>=b (per component) +PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b); +// a==b (per component) +PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b); +// Max(a,b) (per component) +PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b); +// Min(a,b) (per component) +PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b); +// Get the maximum component from a +PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a); +// Get the minimum component from a +PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a); + +// Clamp(a, minV, maxV) (per component) +PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV); + +// return 1 if all components of a are greater than the corresponding components of b. +PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b); +// return 1 if all components of a are greater than or equal to the corresponding components of b +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b); +// return 1 if the XYZ components of a are greater than or equal to the corresponding XYZ components of b. W is ignored. +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b); +// return 1 if all components of a are equal to the corresponding components of b +PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b); +// return 1 if any XYZ component of a is greater than the corresponding component of b. W is ignored. +PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b);
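
// Usage sketch (illustrative only, not part of the original header): an AABB overlap
// test built from the comparison helpers above. 'aabbOverlap' is a hypothetical helper.
PX_FORCE_INLINE PxU32 aabbOverlap(const Vec4V minA, const Vec4V maxA, const Vec4V minB, const Vec4V maxB)
{
	// The boxes overlap if, on each of x, y and z, neither box lies strictly beyond the other.
	return V4AllGrtrOrEq3(maxA, minB) & V4AllGrtrOrEq3(maxB, minA);
}
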
+ +// round(a) (per component) +PX_FORCE_INLINE Vec4V V4Round(const Vec4V a); +// sin(a) (per component) +PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a); +// cos(a) (per component) +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a); + +// Permute v into a new vec4v with YXWZ format +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V v); +// Permute v into a new vec4v with XZXZ format +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V v); +// Permute v into a new vec4v with YWYW format +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V v); +// Permute v into a new vec4v with YZXW format +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V v); + +// Permute v into a new vec4v with format {a[x], a[y], a[z], a[w]} +// V4Perm<1,3,1,3> is equal to V4PermYWYW +// V4Perm<0,2,0,2> is equal to V4PermXZXZ +// V4Perm<1,0,3,2> is equal to V4PermYXWZ +template <PxU8 x, PxU8 y, PxU8 z, PxU8 w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a); + +// Transpose 4 Vec4Vs in place. +// [ x0, y0, z0, w0] [ x1, y1, z1, w1] [ x2, y2, z2, w2] [ x3, y3, z3, w3] -> +// [ x0, x1, x2, x3] [ y0, y1, y2, y3] [ z0, z1, z2, z3] [ w0, w1, w2, w3] +PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3); + +// q = cos(a/2) + u*sin(a/2) +PX_FORCE_INLINE QuatV QuatV_From_RotationAxisAngle(const Vec3V u, const FloatV a); +// convert q to a unit quaternion +PX_FORCE_INLINE QuatV QuatNormalize(const QuatV q); +//|q.q|^1/2 +PX_FORCE_INLINE FloatV QuatLength(const QuatV q); +// q.q +PX_FORCE_INLINE FloatV QuatLengthSq(const QuatV q); +// a.b +PX_FORCE_INLINE FloatV QuatDot(const QuatV a, const QuatV b); +//(-q.x, -q.y, -q.z, q.w) +PX_FORCE_INLINE QuatV QuatConjugate(const QuatV q); +//(q.x, q.y, q.z) +PX_FORCE_INLINE Vec3V QuatGetImaginaryPart(const QuatV q); +// convert quaternion to matrix 33 +PX_FORCE_INLINE Mat33V QuatGetMat33V(const QuatVArg q); +// convert quaternion to matrix 33 +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2); +// convert matrix 33 to quaternion +PX_FORCE_INLINE QuatV Mat33GetQuatV(const Mat33V& a); +// computes the rotation of the x-axis +PX_FORCE_INLINE Vec3V QuatGetBasisVector0(const QuatV q); +// computes the rotation of the y-axis +PX_FORCE_INLINE Vec3V QuatGetBasisVector1(const QuatV q); +// computes the rotation of the z-axis +PX_FORCE_INLINE Vec3V QuatGetBasisVector2(const QuatV q); +// rotate v by q +PX_FORCE_INLINE Vec3V QuatRotate(const QuatV q, const Vec3V v); +// rotate v by the conjugate of q (the inverse rotation) +PX_FORCE_INLINE Vec3V QuatRotateInv(const QuatV q, const Vec3V v); +// quaternion multiplication +PX_FORCE_INLINE QuatV QuatMul(const QuatV a, const QuatV b); +// quaternion add +PX_FORCE_INLINE QuatV QuatAdd(const QuatV a, const QuatV b); +// (-q.x, -q.y, -q.z, -q.w) +PX_FORCE_INLINE QuatV QuatNeg(const QuatV q); +// (a.x - b.x, a.y-b.y, a.z-b.z, a.w-b.w ) +PX_FORCE_INLINE QuatV QuatSub(const QuatV a, const QuatV b); +// (a.x*b, a.y*b, a.z*b, a.w*b) +PX_FORCE_INLINE QuatV QuatScale(const QuatV a, const FloatV b); +// (x = v[0], y = v[1], z = v[2], w = v[3]) +PX_FORCE_INLINE QuatV QuatMerge(const FloatV* const v); +// (x = v[0], y = v[1], z = v[2], w = v[3]) +PX_FORCE_INLINE QuatV QuatMerge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w); +// (x = 0.f, y = 0.f, z = 0.f, w = 1.f) +PX_FORCE_INLINE QuatV QuatIdentity(); +// check that each component is finite +PX_FORCE_INLINE bool isFiniteQuatV(const QuatV q); +// check that each component is valid +PX_FORCE_INLINE bool isValidQuatV(const QuatV q); +// check that each component is valid +PX_FORCE_INLINE bool isSaneQuatV(const QuatV q); + +// Math operations on 16-byte aligned booleans. +// x=false y=false z=false w=false +PX_FORCE_INLINE BoolV BFFFF(); +// x=false y=false z=false w=true +PX_FORCE_INLINE BoolV BFFFT(); +// x=false y=false z=true w=false +PX_FORCE_INLINE BoolV BFFTF(); +// x=false y=false z=true w=true +PX_FORCE_INLINE BoolV BFFTT(); +// x=false y=true z=false w=false +PX_FORCE_INLINE BoolV BFTFF(); +// x=false y=true z=false w=true +PX_FORCE_INLINE BoolV BFTFT(); +// x=false y=true z=true w=false +PX_FORCE_INLINE BoolV BFTTF(); +// x=false y=true z=true w=true +PX_FORCE_INLINE BoolV BFTTT(); +// x=true y=false z=false w=false +PX_FORCE_INLINE BoolV BTFFF(); +// x=true y=false z=false w=true +PX_FORCE_INLINE BoolV BTFFT(); +// x=true y=false z=true w=false +PX_FORCE_INLINE BoolV BTFTF(); +// x=true y=false z=true w=true +PX_FORCE_INLINE BoolV BTFTT(); +// x=true y=true z=false w=false +PX_FORCE_INLINE BoolV BTTFF(); +// x=true y=true z=false w=true +PX_FORCE_INLINE BoolV BTTFT(); +// x=true y=true z=true w=false +PX_FORCE_INLINE BoolV BTTTF(); +// x=true y=true z=true w=true +PX_FORCE_INLINE BoolV BTTTT(); + +// x=false y=false z=false w=true +PX_FORCE_INLINE BoolV BWMask(); +// x=true y=false z=false w=false +PX_FORCE_INLINE BoolV BXMask(); +// x=false y=true z=false w=false +PX_FORCE_INLINE BoolV BYMask(); +// x=false y=false z=true w=false +PX_FORCE_INLINE BoolV BZMask(); + +// get x component +PX_FORCE_INLINE BoolV BGetX(const BoolV f); +// get y component +PX_FORCE_INLINE BoolV BGetY(const BoolV f); +// get z component +PX_FORCE_INLINE BoolV BGetZ(const BoolV f); +// get w component +PX_FORCE_INLINE BoolV BGetW(const BoolV f); + +// Use elementIndex to splat xxxx or yyyy or zzzz or wwww +template <int elementIndex> +PX_FORCE_INLINE BoolV BSplatElement(Vec4V a); + +// component-wise && (AND) +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b); +// component-wise || (OR) +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b); +// component-wise ! (NOT) +PX_FORCE_INLINE BoolV BNot(const BoolV a); + +// if all four components are true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a); + +// if any of the four components is true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a); + +// if all three (0, 1, 2) components are true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a); + +// if any of the three (0, 1, 2) components is true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a); + +// Return 1 if all components are equal, zero otherwise. +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b); + +// Specialized/faster BAllEq function for b==TTTT +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a); +// Specialized/faster BAllEq function for b==FFFF +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a); + +/// Get a BoolV as bits set in a PxU32. A bit in the output is set if the corresponding element of the input is 'true'. +/// There is a bit for each element in a, with element 0's value held in bit 0, element 1's in bit 1, and so forth. +/// If nothing is true in the input it will return 0, and if all elements are true it will return 0xf. +/// NOTE: the performance of this function varies considerably by platform, so it is recommended to use it only +/// where your algorithm really needs a BoolV in an integer variable. +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a);
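
// Usage sketch (illustrative only, not part of the original header): BGetBitMask packs a
// BoolV into the low four bits of a PxU32, e.g. BGetBitMask(BTTFF()) == 0x3 (bit 0 = x,
// bit 1 = y, bit 2 = z, bit 3 = w). 'anyXYZTrue' is a hypothetical helper.
PX_FORCE_INLINE bool anyXYZTrue(const BoolV b)
{
	return (BGetBitMask(b) & 0x7u) != 0; // test the x, y and z bits, ignoring w
}
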
+ +// VecI32V stuff + +PX_FORCE_INLINE VecI32V VecI32V_Zero(); + +PX_FORCE_INLINE VecI32V VecI32V_One(); + +PX_FORCE_INLINE VecI32V VecI32V_Two(); + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne(); + +// Compute a shift parameter for VecI32V_LeftShift and VecI32V_RightShift (a usage sketch follows these declarations) +// Each element of shift must be identical, i.e. the vector must have the form {count, count, count, count} with count>=0 +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift); + +// Shift each element of a leftwards by the same amount +// Compute shift with VecI32V_PrepareShift +//{a.x<<shift[0], a.y<<shift[0], a.z<<shift[0], a.w<<shift[0]} +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg shift); + +// Shift each element of a rightwards by the same amount +// Compute shift with VecI32V_PrepareShift +//{a.x>>shift[0], a.y>>shift[0], a.z>>shift[0], a.w>>shift[0]} +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg shift); + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b); + +// VecU32V stuff + +PX_FORCE_INLINE VecU32V U4Zero(); + +PX_FORCE_INLINE VecU32V U4One(); + +PX_FORCE_INLINE VecU32V U4Two(); + +PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b); +
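
// Usage sketch (illustrative only, not part of the original header): the documented
// pattern for the shift helpers above -- prepare the (uniform) shift count once, then
// reuse it. 'shiftLeftByTwo' is a hypothetical helper.
PX_FORCE_INLINE VecI32V shiftLeftByTwo(const VecI32V v)
{
	const VecShiftV shift = VecI32V_PrepareShift(VecI32V_Two()); // {2, 2, 2, 2}
	return VecI32V_LeftShift(v, shift);                          // each lane shifted left by 2
}
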
+// VecU32 - why does this not return a bool? +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b); + +// Math operations on 16-byte aligned Mat33s (represents any 3x3 matrix; a usage sketch follows these declarations) +// a*b +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b); +// A*b + c +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c); +// transpose(a) * b +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b); +// a*b +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b); +// a+b +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b); +// a-b +PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b); +//-a +PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a); +// absolute value of the matrix +PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a); +// inverse mat +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a); +// transpose(a) +PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a); +// create an identity matrix +PX_FORCE_INLINE Mat33V M33Identity(); + +// create a Mat33V with the elements of the given Vec3V on its diagonal +PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg); + +// Not implemented +// return 1 if all components of a are equal to all components of b +// PX_FORCE_INLINE PxU32 V4U32AllEq(const VecU32V a, const VecU32V b); +// v.w=f +// PX_FORCE_INLINE void V3WriteW(Vec3V& v, const PxF32 f); +// PX_FORCE_INLINE PxF32 V3ReadW(const Vec3V& v); + +// Not used +// PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr); +// PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr); +// floor(a)(per component) +// PX_FORCE_INLINE Vec4V V4Floor(Vec4V a); +// ceil(a) (per component) +// PX_FORCE_INLINE Vec4V V4Ceil(Vec4V a); +// PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V a, PxU32 power); +
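
// Usage sketch (illustrative only, not part of the original header): using the Mat33V
// helpers above. 'worldToLocal' is a hypothetical helper; 'rot' is assumed to be a pure
// rotation matrix, for which the inverse equals the transpose.
PX_FORCE_INLINE Vec3V worldToLocal(const Mat33V& rot, const Vec3V pWorld)
{
	return M33TrnspsMulV3(rot, pWorld); // transpose(rot)*p maps world space into the local frame
}
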
+// Math operations on 16-byte aligned Mat34s (represents transformation matrix - rotation and translation). +// namespace _Mat34V +//{ +// //a*b +// PX_FORCE_INLINE Vec3V multiplyV(const Mat34V& a, const Vec3V b); +// //a_rotation * b +// PX_FORCE_INLINE Vec3V multiply3X3V(const Mat34V& a, const Vec3V b); +// //transpose(a_rotation)*b +// PX_FORCE_INLINE Vec3V multiplyTranspose3X3V(const Mat34V& a, const Vec3V b); +// //a*b +// PX_FORCE_INLINE Mat34V multiplyV(const Mat34V& a, const Mat34V& b); +// //a_rotation*b +// PX_FORCE_INLINE Mat33V multiply3X3V(const Mat34V& a, const Mat33V& b); +// //a_rotation*b_rotation +// PX_FORCE_INLINE Mat33V multiply3X3V(const Mat34V& a, const Mat34V& b); +// //a+b +// PX_FORCE_INLINE Mat34V addV(const Mat34V& a, const Mat34V& b); +// //a^-1 +// PX_FORCE_INLINE Mat34V getInverseV(const Mat34V& a); +// //transpose(a_rotation) +// PX_FORCE_INLINE Mat33V getTranspose3X3(const Mat34V& a); +//}; //namespace _Mat34V + +// a*b +//#define M34MulV3(a,b) (M34MulV3(a,b)) +////a_rotation * b +//#define M34Mul33V3(a,b) (M34Mul33V3(a,b)) +////transpose(a_rotation)*b +//#define M34TrnspsMul33V3(a,b) (M34TrnspsMul33V3(a,b)) +////a*b +//#define M34MulM34(a,b) (_Mat34V::multiplyV(a,b)) +// a_rotation*b +//#define M34MulM33(a,b) (M34MulM33(a,b)) +// a_rotation*b_rotation +//#define M34Mul33MM34(a,b) (M34MulM33(a,b)) +// a+b +//#define M34Add(a,b) (M34Add(a,b)) +////a^-1 +//#define M34Inverse(a,b) (M34Inverse(a)) +// transpose(a_rotation) +//#define M34Trnsps33(a) (M33Trnsps3X3(a)) + +// Math operations on 16-byte aligned Mat44s (represents any 4x4 matrix) +// namespace _Mat44V +//{ +// //a*b +// PX_FORCE_INLINE Vec4V multiplyV(const Mat44V& a, const Vec4V b); +// //transpose(a)*b +// PX_FORCE_INLINE Vec4V multiplyTransposeV(const Mat44V& a, const Vec4V b); +// //a*b +// PX_FORCE_INLINE Mat44V multiplyV(const Mat44V& a, const Mat44V& b); +// //a+b +// PX_FORCE_INLINE Mat44V addV(const Mat44V& a, const Mat44V& b); +// //a^-1 +// PX_FORCE_INLINE Mat44V getInverseV(const Mat44V& a); +// //transpose(a) +// PX_FORCE_INLINE Mat44V getTransposeV(const Mat44V& a); +//}; //namespace _Mat44V + +// namespace _VecU32V +//{ +// // pack 8 U32s to 8 U16s with saturation +// PX_FORCE_INLINE VecU16V pack2U32VToU16VSaturate(VecU32V a, VecU32V b); +// PX_FORCE_INLINE VecU32V orV(VecU32V a, VecU32V b); +// PX_FORCE_INLINE VecU32V andV(VecU32V a, VecU32V b); +// PX_FORCE_INLINE VecU32V andcV(VecU32V a, VecU32V b); +// // conversion from integer to float +// PX_FORCE_INLINE Vec4V convertToVec4V(VecU32V a); +// // splat a[elementIndex] into all fields of a +// template<int elementIndex> +// PX_FORCE_INLINE VecU32V splatElement(VecU32V a); +// PX_FORCE_INLINE void storeAligned(VecU32V a, VecU32V* address); +//}; + +// namespace _VecI32V +//{ +// template<int a> PX_FORCE_INLINE VecI32V splatI32(); +//}; +// +// namespace _VecU16V +//{ +// PX_FORCE_INLINE VecU16V orV(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU16V andV(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU16V andcV(VecU16V a, VecU16V b); +// PX_FORCE_INLINE void storeAligned(VecU16V val, VecU16V *address); +// PX_FORCE_INLINE VecU16V loadAligned(VecU16V* addr); +// PX_FORCE_INLINE VecU16V loadUnaligned(VecU16V* addr); +// PX_FORCE_INLINE VecU16V compareGt(VecU16V a, VecU16V b); +// template<int elementIndex> +// PX_FORCE_INLINE VecU16V splatElement(VecU16V a); +// PX_FORCE_INLINE VecU16V subtractModulo(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU16V addModulo(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU32V getLo16(VecU16V a); // [0,2,4,6] 16-bit values to [0,1,2,3] 32-bit vector +// PX_FORCE_INLINE VecU32V getHi16(VecU16V a); // [1,3,5,7]
16-bit values to [0,1,2,3] 32-bit vector +//}; +// +// namespace _VecI16V +//{ +// template <int val> PX_FORCE_INLINE VecI16V splatImmediate(); +//}; +// +// namespace _VecU8V +//{ +//}; + +// a*b +//#define M44MulV4(a,b) (M44MulV4(a,b)) +////transpose(a)*b +//#define M44TrnspsMulV4(a,b) (M44TrnspsMulV4(a,b)) +////a*b +//#define M44MulM44(a,b) (M44MulM44(a,b)) +////a+b +//#define M44Add(a,b) (M44Add(a,b)) +////a^-1 +//#define M44Inverse(a) (M44Inverse(a)) +////transpose(a) +//#define M44Trnsps(a) (M44Trnsps(a)) + +// dsequeira: these used to be assert'd out in SIMD builds, but they're necessary if +// we want to be able to write some scalar functions which run using SIMD data structures + +PX_FORCE_INLINE void V3WriteX(Vec3V& v, const PxF32 f) +{ + reinterpret_cast<PxVec3&>(v).x = f; +} + +PX_FORCE_INLINE void V3WriteY(Vec3V& v, const PxF32 f) +{ + reinterpret_cast<PxVec3&>(v).y = f; +} + +PX_FORCE_INLINE void V3WriteZ(Vec3V& v, const PxF32 f) +{ + reinterpret_cast<PxVec3&>(v).z = f; +} + +PX_FORCE_INLINE void V3WriteXYZ(Vec3V& v, const PxVec3& f) +{ + reinterpret_cast<PxVec3&>(v) = f; +} + +PX_FORCE_INLINE PxF32 V3ReadX(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v).x; +} + +PX_FORCE_INLINE PxF32 V3ReadY(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v).y; +} + +PX_FORCE_INLINE PxF32 V3ReadZ(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v).z; +} + +PX_FORCE_INLINE const PxVec3& V3ReadXYZ(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v); +} + +PX_FORCE_INLINE void V4WriteX(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).x = f; +} + +PX_FORCE_INLINE void V4WriteY(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).y = f; +} + +PX_FORCE_INLINE void V4WriteZ(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).z = f; +} + +PX_FORCE_INLINE void V4WriteW(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).w = f; +} + +PX_FORCE_INLINE void V4WriteXYZ(Vec4V& v, const PxVec3& f) +{ + reinterpret_cast<PxVec3&>(v) = f; +} + +PX_FORCE_INLINE PxF32 V4ReadX(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).x; +} + +PX_FORCE_INLINE PxF32 V4ReadY(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).y; +} + +PX_FORCE_INLINE PxF32 V4ReadZ(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).z; +} + +PX_FORCE_INLINE PxF32 V4ReadW(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).w; +} + +PX_FORCE_INLINE const PxVec3& V4ReadXYZ(const Vec4V& v) +{ + return reinterpret_cast<const PxVec3&>(v); +} + +// this macro transposes 4 Vec4V into 3 Vec4V (assuming that the W component can be ignored) +#define PX_TRANSPOSE_44_34(inA, inB, inC, inD, outA, outB, outC) \ + outA = V4UnpackXY(inA, inC); \ + inA = V4UnpackZW(inA, inC); \ + inC = V4UnpackXY(inB, inD); \ + inB = V4UnpackZW(inB, inD); \ + outB = V4UnpackZW(outA, inC); \ + outA = V4UnpackXY(outA, inC); \ + outC = V4UnpackXY(inA, inB);
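
// Usage sketch (illustrative only, not part of the original header): gathering the x, y
// and z lanes of four xyzw vectors with PX_TRANSPOSE_44_34. 'gatherXYZ' is a hypothetical
// helper; note that the macro reuses its first four arguments as scratch, so the inputs
// are passed by value here and clobbered.
PX_FORCE_INLINE void gatherXYZ(Vec4V p0, Vec4V p1, Vec4V p2, Vec4V p3, Vec4V& xs, Vec4V& ys, Vec4V& zs)
{
	// xs = (p0.x, p1.x, p2.x, p3.x); ys and zs hold the y and z lanes likewise.
	PX_TRANSPOSE_44_34(p0, p1, p2, p3, xs, ys, zs);
}
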
+// this macro transposes 3 Vec4V into 4 Vec4V (with W components as garbage!) +#define PX_TRANSPOSE_34_44(inA, inB, inC, outA, outB, outC, outD) \ + outA = V4UnpackXY(inA, inC); \ + inA = V4UnpackZW(inA, inC); \ + outC = V4UnpackXY(inB, inB); \ + inC = V4UnpackZW(inB, inB); \ + outB = V4UnpackZW(outA, outC); \ + outA = V4UnpackXY(outA, outC); \ + outC = V4UnpackXY(inA, inC); \ + outD = V4UnpackZW(inA, inC); + +#define PX_TRANSPOSE_44(inA, inB, inC, inD, outA, outB, outC, outD) \ + outA = V4UnpackXY(inA, inC); \ + inA = V4UnpackZW(inA, inC); \ + inC = V4UnpackXY(inB, inD); \ + inB = V4UnpackZW(inB, inD); \ + outB = V4UnpackZW(outA, inC); \ + outA = V4UnpackXY(outA, inC); \ + outC = V4UnpackXY(inA, inB); \ + outD = V4UnpackZW(inA, inB); + +// This function returns a Vec4V, where each element is the dot product of one pair of Vec3Vs. On PC, each element in +// the result should be identical to the result of performing V3Dot on the corresponding pair of Vec3Vs. +// However, on other platforms the results might diverge by some small margin due to differences in FP rounding, e.g. if +// _mm_dp_ps or some other approximate dot-product or fused multiply-add operation was used. +// Where no hw-accelerated dot-product operation exists, this approach should be the fastest way to compute +// the dot products of 4 pairs of vectors. +PX_FORCE_INLINE Vec4V V3Dot4(const Vec3VArg a0, const Vec3VArg b0, const Vec3VArg a1, const Vec3VArg b1, + const Vec3VArg a2, const Vec3VArg b2, const Vec3VArg a3, const Vec3VArg b3) +{ + Vec4V a0b0 = Vec4V_From_Vec3V(V3Mul(a0, b0)); + Vec4V a1b1 = Vec4V_From_Vec3V(V3Mul(a1, b1)); + Vec4V a2b2 = Vec4V_From_Vec3V(V3Mul(a2, b2)); + Vec4V a3b3 = Vec4V_From_Vec3V(V3Mul(a3, b3)); + + Vec4V aTrnsps, bTrnsps, cTrnsps; + + PX_TRANSPOSE_44_34(a0b0, a1b1, a2b2, a3b3, aTrnsps, bTrnsps, cTrnsps); + + return V4Add(V4Add(aTrnsps, bTrnsps), cTrnsps); +} + +//(f.x,f.y,f.z,0) - Alternative/faster V3LoadU implementation when it is safe to read "W", i.e. the 32 bits after the PxVec3. +PX_FORCE_INLINE Vec3V V3LoadU_SafeReadW(const PxVec3& f) +{ + return Vec3V_From_Vec4V(V4LoadU(&f.x)); +} + +// Now for the cross-platform implementations of the 16-byte aligned maths functions (win32/360/ppu/spu etc). +#if COMPILE_VECTOR_INTRINSICS +#include "PsInlineAoS.h" +#else // #if COMPILE_VECTOR_INTRINSICS +#include "PsVecMathAoSScalarInline.h" +#endif // #if COMPILE_VECTOR_INTRINSICS +#include "PsVecQuat.h" + +} // namespace aos +} // namespace shdfnd +} // namespace physx + +#endif // PSFOUNDATION_PSVECMATH_H diff --git a/PxShared/src/foundation/include/PsVecMathAoSScalar.h b/PxShared/src/foundation/include/PsVecMathAoSScalar.h new file mode 100644 index 00000000..e3a72a12 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathAoSScalar.h @@ -0,0 +1,242 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.".
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHAOSSCALAR_H +#define PSFOUNDATION_PSVECMATHAOSSCALAR_H + +#if COMPILE_VECTOR_INTRINSICS +#error Scalar version should not be included when using vector intrinsics. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +struct VecI16V; +struct VecU16V; +struct VecI32V; +struct VecU32V; +struct Vec4V; +typedef Vec4V QuatV; + +PX_ALIGN_PREFIX(16) +struct FloatV +{ + PxF32 x; + PxF32 pad[3]; + FloatV() + { + } + FloatV(const PxF32 _x) : x(_x) + { + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Vec4V +{ + PxF32 x, y, z, w; + Vec4V() + { + } + Vec4V(const PxF32 _x, const PxF32 _y, const PxF32 _z, const PxF32 _w) : x(_x), y(_y), z(_z), w(_w) + { + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Vec3V +{ + PxF32 x, y, z; + PxF32 pad; + Vec3V() + { + } + Vec3V(const PxF32 _x, const PxF32 _y, const PxF32 _z) : x(_x), y(_y), z(_z), pad(0.0f) + { + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct BoolV +{ + PxU32 ux, uy, uz, uw; + BoolV() + { + } + BoolV(const PxU32 _x, const PxU32 _y, const PxU32 _z, const PxU32 _w) : ux(_x), uy(_y), uz(_z), uw(_w) + { + } +} PX_ALIGN_SUFFIX(16); + +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V col0; + Vec3V col1; + Vec3V col2; +}; + +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V col0; + Vec3V col1; + Vec3V col2; + Vec3V col3; +}; + +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V col0; + Vec4V col1; + Vec4V col2; +}; + +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V col0; + Vec4V col1; + Vec4V col2; + Vec4V col3; +}; + +PX_ALIGN_PREFIX(16) +struct VecU32V +{ + PxU32 u32[4]; + PX_FORCE_INLINE VecU32V() + { + } + PX_FORCE_INLINE VecU32V(PxU32 a, PxU32 b, PxU32 c, PxU32 d) + { + u32[0] = a; + u32[1] = b; + u32[2] = c; + u32[3] = d; + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct VecI32V +{ + PxI32 i32[4]; + PX_FORCE_INLINE VecI32V() + { + } + PX_FORCE_INLINE VecI32V(PxI32 a, PxI32 b, PxI32 c, PxI32 
d) + { + i32[0] = a; + i32[1] = b; + i32[2] = c; + i32[3] = d; + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct VecI16V +{ + PxI16 i16[8]; + PX_FORCE_INLINE VecI16V() + { + } + PX_FORCE_INLINE VecI16V(PxI16 a, PxI16 b, PxI16 c, PxI16 d, PxI16 e, PxI16 f, PxI16 g, PxI16 h) + { + i16[0] = a; + i16[1] = b; + i16[2] = c; + i16[3] = d; + i16[4] = e; + i16[5] = f; + i16[6] = g; + i16[7] = h; + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct VecU16V +{ + union + { + PxU16 u16[8]; + PxI16 i16[8]; + }; + PX_FORCE_INLINE VecU16V() + { + } + PX_FORCE_INLINE VecU16V(PxU16 a, PxU16 b, PxU16 c, PxU16 d, PxU16 e, PxU16 f, PxU16 g, PxU16 h) + { + u16[0] = a; + u16[1] = b; + u16[2] = c; + u16[3] = d; + u16[4] = e; + u16[5] = f; + u16[6] = g; + u16[7] = h; + } +} PX_ALIGN_SUFFIX(16); + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define QuatVArg QuatV & + +#define VecCrossV Vec3V + +typedef VecI32V VecShiftV; +#define VecShiftVArg VecShiftV & + +#endif // PX_PHYSICS_COMMON_VECMATH_INLINE_SCALAR diff --git a/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h b/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h new file mode 100644 index 00000000..d1061290 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h @@ -0,0 +1,2254 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H +#define PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H + +#if COMPILE_VECTOR_INTRINSICS +#error Scalar version should not be included when using vector intrinsics. 
+#endif + +#define BOOL_TO_U32(b) (PxU32)(- PxI32(b)) +#define TRUE_TO_U32 (PxU32)(-1) +#define FALSE_TO_U32 (PxU32)(0) + +#define BOOL_TO_U16(b) (PxU16)(- PxI32(b)) + + +#define VECMATHAOS_ASSERT(x) { PX_ASSERT(x); } + +///////////////////////////////////////////////////////////////////// +////INTERNAL USE ONLY AND TESTS +///////////////////////////////////////////////////////////////////// + +namespace internalScalarSimd +{ +PX_FORCE_INLINE PxF32 FStore(const FloatV a) +{ + return a.x; +} + +PX_FORCE_INLINE bool hasZeroElementInFloatV(const FloatV a) +{ + return (0 == a.x); +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + return (0 == a.x || 0 == a.y || 0 == a.z); +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + return (0 == a.x || 0 == a.y || 0 == a.z || 0 == a.w); +} +} + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + Vec3V tmp; + tmp.x = tmp.y = tmp.z = 0.0f; + tmp.pad = 1.0f; + return tmp; +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + return (a.x == b.x); +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + return (a.x == b.x && a.y == b.y && a.z == b.z); +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return (a.ux == b.ux && a.uy == b.uy && a.uz == b.uz && a.uw == b.uw); +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return (a.u32[0] == b.u32[0] && a.u32[1] == b.u32[1] && a.u32[2] == b.u32[2] && a.u32[3] == b.u32[3]); +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + return (a.i32[0] == b.i32[0] && a.i32[1] == b.i32[1] && a.i32[2] == b.i32[2] && a.i32[3] == b.i32[3]); +} + +#define VECMATH_AOS_EPSILON (1e-3f) + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + const PxF32 cx = a.x - b.x; + return (cx > -VECMATH_AOS_EPSILON && cx < VECMATH_AOS_EPSILON); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + const PxF32 cx = a.x - b.x; + const PxF32 cy = a.y - b.y; + const PxF32 cz = a.z - b.z; + return (cx > -VECMATH_AOS_EPSILON && cx < VECMATH_AOS_EPSILON && cy > -VECMATH_AOS_EPSILON && + cy < VECMATH_AOS_EPSILON && cz > -VECMATH_AOS_EPSILON && cz < VECMATH_AOS_EPSILON); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const PxF32 cx = a.x - b.x; + const PxF32 cy = a.y - b.y; + const PxF32 cz = a.z - b.z; + const PxF32 cw = a.w - b.w; + return (cx > -VECMATH_AOS_EPSILON && cx < VECMATH_AOS_EPSILON && cy > -VECMATH_AOS_EPSILON && + cy < VECMATH_AOS_EPSILON && cz > -VECMATH_AOS_EPSILON && cz < VECMATH_AOS_EPSILON && + cw > -VECMATH_AOS_EPSILON && cw < VECMATH_AOS_EPSILON); +} +} + +/////////////////////////////////////////////////////// + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + return a.pad == 0.f; +} + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + return PxIsFinite(a.x); +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + return PxIsFinite(a.x) && PxIsFinite(a.y) && PxIsFinite(a.z); +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + return PxIsFinite(a.x) && PxIsFinite(a.y) && PxIsFinite(a.z) && PxIsFinite(a.w); +} + 
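
// Usage sketch (illustrative only, not part of the original source): exercising the
// helpers above in a unit-test style check of the Vec3V padding invariant.
// 'checkVec3VPadding' is a hypothetical test function.
PX_FORCE_INLINE bool checkVec3VPadding()
{
	const Vec3V good = Vec3V(1.0f, 2.0f, 3.0f);         // the constructor sets pad to 0.0f
	const Vec3V bad = _VecMathTests::getInvalidVec3V(); // pad deliberately set to 1.0f
	return isValidVec3V(good) && !isValidVec3V(bad) && isFiniteVec3V(good);
}
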
+///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return FloatV(f); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + return Vec3V(f, f, f); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return Vec4V(f, f, f, f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ +#if PX_ARM + // SD: Android ARM builds fail if this is done with a cast. + // Might also fail because of something else but the select + // operator here seems to fix everything that failed in release builds. + return f ? BTTTT() : BFFFF(); +#else + return BoolV(BOOL_TO_U32(f), BOOL_TO_U32(f), BOOL_TO_U32(f), BOOL_TO_U32(f)); +#endif +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) +{ + return Vec3V(f[0], f[1], f[2]); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const f) +{ + return Vec3V(f[0], f[1], f[2]); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) +{ + return Vec3V(v.x, v.y, v.z); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + return Vec4V(f.x, f.y, f.z, 0.0f); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return Vec4V(f.x, f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + return Vec3V(f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + return Vec3V(f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + return Vec4V(f[0], f[1], f[2], f[3]); +} + +PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f) +{ + *reinterpret_cast<Vec4V*>(f) = a; +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + *reinterpret_cast<PxVec4*>(f) = *reinterpret_cast<const PxVec4*>(&a.x); +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) +{ + *reinterpret_cast<BoolV*>(f) = a; +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + *reinterpret_cast<VecU32V*>(u) = uv; +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + *reinterpret_cast<VecI32V*>(i) = iv; +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return Vec4V(f[0], f[1], f[2], f[3]); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + return Vec4V(f[0], f[1], f[2], 0.0f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + return BoolV(BOOL_TO_U32(f[0]), BOOL_TO_U32(f[1]), BOOL_TO_U32(f[2]), BOOL_TO_U32(f[3])); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + *f = a.x; +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + f = PxVec3(a.x, a.y, a.z); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + f = PxVec3(a.x, a.y, a.z); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) +{ + *b2 = b.ux; +} + +////////////////////////// +// FLOATV +////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return FLoad(0.0f); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE 
FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + return FloatV(-f.x); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) +{ + return FloatV(a.x + b.x); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + return FloatV(a.x - b.x); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + return FloatV(a.x * b.x); +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + VECMATHAOS_ASSERT(b.x != 0.0f); + return FloatV(a.x / b.x); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + VECMATHAOS_ASSERT(b.x != 0.0f); + return FloatV(a.x / b.x); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return 1.0f / a.x; +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return 1.0f / a.x; +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return PxRecipSqrt(a.x); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + return PxSqrt(a.x); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return PxRecipSqrt(a.x); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + return FAdd(FMul(a, b), c); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + return FSub(c, FMul(a, b)); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + return FloatV(PxAbs(a.x)); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + return FloatV(c.ux ? a.x : b.x); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + return BLoad(a.x > b.x); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + return BLoad(a.x >= b.x); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + return BLoad(a.x == b.x); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + return (a.x > b.x ? FloatV(a.x) : FloatV(b.x)); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + return (a.x > b.x ? 
FloatV(b.x) : FloatV(a.x)); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + return FMax(FMin(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + return BOOL_TO_U32(a.x > b.x); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + return BOOL_TO_U32(a.x >= b.x); +} +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b) +{ + return BOOL_TO_U32(a.x == b.x); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + return floor(a.x + 0.5f); +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + return sinf(a.x); +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + return cosf(a.x); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + return BOOL_TO_U32(a.x > max.x || a.x < min.x); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + return BOOL_TO_U32(a.x >= min.x && a.x <= max.x); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + return FOutOfBounds(a, FNeg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + return FInBounds(a, FNeg(bounds), bounds); +} + +///////////////////// +// VEC3V +///////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + return Vec3V(f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + return Vec3V(x.x, y.x, z.x); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + return Vec3V(1.0f, 0.0f, 0.0f); +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + return Vec3V(0.0f, 1.0f, 0.0f); +} + +PX_FORCE_INLINE Vec3V V3UnitZ() +{ + return Vec3V(0.0f, 0.0f, 1.0f); +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + return FloatV(f.x); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + return FloatV(f.y); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + return FloatV(f.z); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + return Vec3V(f.x, v.y, v.z); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + return Vec3V(v.x, f.x, v.z); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + return Vec3V(v.x, v.y, f.x); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return Vec3V(a.x, b.x, c.x); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return Vec3V(a.y, b.y, c.y); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return Vec3V(a.z, b.z, c.z); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return V3Load(0.0f); +} + +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V c) +{ + return Vec3V(-c.x, -c.y, -c.z); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x + b.x, a.y + b.y, a.z + b.z); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x - b.x, a.y - b.y, a.z - b.z); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ + return Vec3V(a.x * b.x, a.y * b.x, a.z * b.x); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x * b.x, a.y * b.y, a.z * b.z); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec3V(a.x * bInv, a.y * bInv, a.z * bInv); +} + +PX_FORCE_INLINE Vec3V 
V3Div(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x / b.x, a.y / b.y, a.z / b.z); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec3V(a.x * bInv, a.y * bInv, a.z * bInv); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x / b.x, a.y / b.y, a.z / b.z); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + return Vec3V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); +} + +PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a) +{ + return Vec3V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + return Vec3V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z)); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + return Vec3V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z)); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + return V3Add(V3Scale(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + return V3Sub(c, V3Scale(a, b)); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return V3Add(V3Mul(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return V3Sub(c, V3Mul(a, b)); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + return FloatV(a.x * b.x + a.y * b.y + a.z * b.z); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3VArg normal) +{ + return normal; +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + return FloatV(PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z)); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + return FloatV(a.x * a.x + a.y * a.y + a.z * a.z); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + VECMATHAOS_ASSERT(a.x != 0 || a.y != 0 || a.z != 0); + const PxF32 lengthInv = 1.0f / PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z); + return Vec3V(a.x * lengthInv, a.y * lengthInv, a.z * lengthInv); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + const PxF32 length = PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z); + if(PX_EPS_REAL >= length) + { + return unsafeReturnValue; + } + else + { + const PxF32 lengthInv = 1.0f / length; + return Vec3V(a.x * lengthInv, a.y * lengthInv, a.z * lengthInv); + } +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + VECMATHAOS_ASSERT(a.x != 0 || a.y != 0 || a.z != 0); + const PxF32 lengthInv = 1.0f / PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z); + return Vec3V(a.x * lengthInv, a.y * lengthInv, a.z * lengthInv); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + return Vec3V(c.ux ? a.x : b.x, c.uy ? a.y : b.y, c.uz ? a.z : b.z); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + return BoolV(BOOL_TO_U32(a.x > b.x), BOOL_TO_U32(a.y > b.y), BOOL_TO_U32(a.z > b.z), FALSE_TO_U32); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + return BoolV(BOOL_TO_U32(a.x >= b.x), BOOL_TO_U32(a.y >= b.y), BOOL_TO_U32(a.z >= b.z), TRUE_TO_U32); +} + +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b) +{ + return BoolV(BOOL_TO_U32(a.x == b.x), BOOL_TO_U32(a.y == b.y), BOOL_TO_U32(a.z == b.z), TRUE_TO_U32); +} + +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x > b.x ? 
a.x : b.x, a.y > b.y ? a.y : b.y, a.z > b.z ? a.z : b.z); +} + +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x < b.x ? a.x : b.x, a.y < b.y ? a.y : b.y, a.z < b.z ? a.z : b.z); +} + +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a) +{ + const PxF32 t0 = (a.x >= a.y) ? a.x : a.y; + return t0 >= a.z ? t0 : a.z; +} + +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a) +{ + const PxF32 t0 = (a.x <= a.y) ? a.x : a.y; + return t0 <= a.z ? t0 : a.z; +} + +// return (a >= 0.0f) ? 1.0f : -1.0f; +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a) +{ + return Vec3V((a.x >= 0.f ? 1.f : -1.f), (a.y >= 0.f ? 1.f : -1.f), (a.z >= 0.f ? 1.f : -1.f)); +} + +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV) +{ + return V3Max(V3Min(a, maxV), minV); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + return V3Max(a, V3Neg(a)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b) +{ + return BOOL_TO_U32((a.x > b.x) & (a.y > b.y) & (a.z > b.z)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b) +{ + return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z)); +} + +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b) +{ + return BOOL_TO_U32((a.x == b.x) & (a.y == b.y) & (a.z == b.z)); +} + +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a) +{ + return Vec3V(floor(a.x + 0.5f), floor(a.y + 0.5f), floor(a.z + 0.5f)); +} + +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a) +{ + return Vec3V(sinf(a.x), sinf(a.y), sinf(a.z)); +} + +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a) +{ + return Vec3V(cosf(a.x), cosf(a.y), cosf(a.z)); +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + return Vec3V(a.y, a.z, a.z); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + return Vec3V(a.x, a.y, a.x); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + return Vec3V(a.y, a.z, a.x); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + return Vec3V(a.z, a.x, a.y); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + return Vec3V(a.z, a.z, a.y); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + return Vec3V(a.y, a.x, a.x); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + return Vec3V(0.0f, v1.z, v0.y); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + return Vec3V(v0.z, 0.0f, v1.x); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + return Vec3V(v1.y, v0.x, 0.0f); +} + +PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a) +{ + return FloatV(a.x + a.y + a.z); +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + return BOOL_TO_U32(a.x > max.x || a.y > max.y || a.z > max.z || a.x < min.x || a.y < min.y || a.z < min.z); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + return BOOL_TO_U32(a.x <= max.x && a.y <= max.y && a.z <= max.z && a.x >= min.x && a.y >= min.y && a.z >= min.z); +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds) +{ + return V3OutOfBounds(a, V3Neg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds) +{ + return V3InBounds(a, V3Neg(bounds), bounds); +} + +PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2) +{ + const PxF32 t01 = col0.y, t02 = col0.z, t12 = col1.z; + col0.y = col1.x; + col0.z = col2.x; + col1.z = col2.y; + col1.x = t01; + col2.x = t02; + col2.y = t12; +} + +///////////////////////// +// VEC4V +///////////////////////// 
+ +PX_FORCE_INLINE Vec4V V4Splat(const FloatV f) +{ + return Vec4V(f.x, f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray) +{ + return Vec4V(floatVArray[0].x, floatVArray[1].x, floatVArray[2].x, floatVArray[3].x); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w) +{ + return Vec4V(x.x, y.x, z.x, w.x); +} + +PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.w, y.w, z.w, w.w); +} + +PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.z, y.z, z.z, w.z); +} + +PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.y, y.y, z.y, w.y); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.x, y.x, z.x, w.x); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return Vec4V(a.x, b.x, a.y, b.y); +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return Vec4V(a.z, b.z, a.w, b.w); +} + +PX_FORCE_INLINE Vec4V V4UnitX() +{ + return Vec4V(1.0f, 0.0f, 0.0f, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + return Vec4V(0.0f, 1.0f, 0.0f, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + return Vec4V(0.0f, 0.0f, 1.0f, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + return Vec4V(0.0f, 0.0f, 0.0f, 1.0f); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + return FloatV(f.x); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + return FloatV(f.y); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + return FloatV(f.z); +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + return FloatV(f.w); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + return Vec4V(f.x, v.y, v.z, v.w); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + return Vec4V(v.x, f.x, v.z, v.w); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + return Vec4V(v.x, v.y, f.x, v.w); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + return Vec4V(v.x, v.y, v.z, f.x); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec3V v, const FloatV f) +{ + return Vec4V(v.x, v.y, v.z, f.x); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return Vec4V(v.x, v.y, v.z, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V v) +{ + return Vec4V(v.y, v.x, v.w, v.z); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V v) +{ + return Vec4V(v.x, v.z, v.x, v.z); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V v) +{ + return Vec4V(v.y, v.w, v.y, v.w); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V v) +{ + return Vec4V(v.y, v.z, v.x, v.w); +} + +template <PxU8 _x, PxU8 _y, PxU8 _z, PxU8 _w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V v) +{ + const PxF32 f[4] = { v.x, v.y, v.z, v.w }; + return Vec4V(f[_x], f[_y], f[_z], f[_w]); +} + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return V4Load(0.0f); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return V4Load(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V c) +{ + return Vec4V(-c.x, -c.y, -c.z, -c.w); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x - b.x, a.y - b.y, a.z - b.z, 
a.w - b.w); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return Vec4V(a.x * b.x, a.y * b.x, a.z * b.x, a.w * b.x); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} + +PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec4V(a.x * bInv, a.y * bInv, a.z * bInv, a.w * bInv); +} + +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b) +{ + VECMATHAOS_ASSERT(b.x != 0 && b.y != 0 && b.z != 0 && b.w != 0); + return Vec4V(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} + +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec4V(a.x * bInv, a.y * bInv, a.z * bInv, a.w * bInv); +} + +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} + +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a) +{ + return Vec4V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); +} + +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a) +{ + return Vec4V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); +} + +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a) +{ + return Vec4V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z), PxRecipSqrt(a.w)); +} + +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a) +{ + return Vec4V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z), PxRecipSqrt(a.w)); +} + +PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a) +{ + return Vec4V(PxSqrt(a.x), PxSqrt(a.y), PxSqrt(a.z), PxSqrt(a.w)); +} + +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c) +{ + return V4Add(V4Scale(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c) +{ + return V4Sub(c, V4Scale(a, b)); +} + +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Add(V4Mul(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Sub(c, V4Mul(a, b)); +} + +PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a) +{ + return FloatV(a.x + a.y + a.z + a.w); +} + +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b) +{ + return FloatV(a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w); +} + +PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b) +{ + return FloatV(a.x * b.x + a.y * b.y + a.z * b.z); +} + +PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); +} + +PX_FORCE_INLINE FloatV V4Length(const Vec4V a) +{ + return FloatV(PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w)); +} + +PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a) +{ + return V4Dot(a, a); +} + +PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a) +{ + VECMATHAOS_ASSERT(0 != a.x || 0 != a.y || 0 != a.z || 0 != a.w); + const FloatV length = FloatV(V4Length(a)); + return V4ScaleInv(a, length); +} + +PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue) +{ + const FloatV length = FloatV(V4Length(a)); + if(PX_EPS_REAL >= length.x) + { + return unsafeReturnValue; + } + else + { + return V4ScaleInv(a, length); + } +} +PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a) +{ + VECMATHAOS_ASSERT(0 != a.x || 0 != a.y || 0 != a.z || 0 != a.w); + const FloatV length = FloatV(V4Length(a)); + return V4ScaleInv(a, length); +} + +PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b) +{ + return Vec4V(c.ux ? 
a.x : b.x, c.uy ? a.y : b.y, c.uz ? a.z : b.z, c.uw ? a.w : b.w);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+ return BoolV(BOOL_TO_U32(a.x > b.x), BOOL_TO_U32(a.y > b.y), BOOL_TO_U32(a.z > b.z), BOOL_TO_U32(a.w > b.w));
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return BoolV(BOOL_TO_U32(a.x >= b.x), BOOL_TO_U32(a.y >= b.y), BOOL_TO_U32(a.z >= b.z), BOOL_TO_U32(a.w >= b.w));
+}
+
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+ return BoolV(BOOL_TO_U32(a.x == b.x), BOOL_TO_U32(a.y == b.y), BOOL_TO_U32(a.z == b.z), BOOL_TO_U32(a.w == b.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+ return Vec4V(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y, a.z > b.z ? a.z : b.z, a.w > b.w ? a.w : b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+ return Vec4V(a.x < b.x ? a.x : b.x, a.y < b.y ? a.y : b.y, a.z < b.z ? a.z : b.z, a.w < b.w ? a.w : b.w);
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+ const PxF32 t0 = (a.x >= a.y) ? a.x : a.y;
+ const PxF32 t1 = (a.z >= a.w) ? a.z : a.w;
+ return t0 >= t1 ? t0 : t1;
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+ const PxF32 t0 = (a.x <= a.y) ? a.x : a.y;
+ const PxF32 t1 = (a.z <= a.w) ? a.z : a.w;
+ return t0 <= t1 ? t0 : t1;
+}
+
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+ return V4Max(V4Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+ return Vec4V(floor(a.x + 0.5f), floor(a.y + 0.5f), floor(a.z + 0.5f), floor(a.w + 0.5f));
+}
+
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+ return Vec4V(sinf(a.x), sinf(a.y), sinf(a.z), sinf(a.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a)
+{
+ return Vec4V(cosf(a.x), cosf(a.y), cosf(a.z), cosf(a.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x > b.x) & (a.y > b.y) & (a.z > b.z) & (a.w > b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z) & (a.w >= b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z));
+}
+
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x == b.x) & (a.y == b.y) & (a.z == b.z) & (a.w == b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x > b.x) | (a.y > b.y) | (a.z > b.z));
+}
+
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+ const PxF32 t01 = col0.y, t02 = col0.z, t03 = col0.w;
+ const PxF32 t12 = col1.z, t13 = col1.w;
+ const PxF32 t23 = col2.w;
+ col0.y = col1.x;
+ col0.z = col2.x;
+ col0.w = col3.x;
+ col1.z = col2.y;
+ col1.w = col3.y;
+ col2.w = col3.z;
+ col1.x = t01;
+ col2.x = t02;
+ col3.x = t03;
+ col2.y = t12;
+ col3.y = t13;
+ col3.z = t23;
+}
+
+PX_FORCE_INLINE BoolV BFFFF()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFFT()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFTF()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFTT()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFTFF()
+{
+ return BoolV(FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE
BoolV BFTFT() +{ + return BoolV(FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BFTTF() +{ + return BoolV(FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BFTTT() +{ + return BoolV(FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFFF() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFFT() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFTF() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFTT() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTFF() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTFT() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTTF() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTTT() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + return BTFFF(); +} +PX_FORCE_INLINE BoolV BYMask() +{ + return BFTFF(); +} +PX_FORCE_INLINE BoolV BZMask() +{ + return BFFTF(); +} +PX_FORCE_INLINE BoolV BWMask() +{ + return BFFFT(); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV a) +{ + return BoolV(a.ux, a.ux, a.ux, a.ux); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV a) +{ + return BoolV(a.uy, a.uy, a.uy, a.uy); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV a) +{ + return BoolV(a.uz, a.uz, a.uz, a.uz); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV a) +{ + return BoolV(a.uw, a.uw, a.uw, a.uw); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return BoolV(f.ux, v.uy, v.uz, v.uw); +} + +PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return BoolV(v.ux, f.uy, v.uz, v.uw); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return BoolV(v.ux, v.uy, f.uz, v.uw); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return BoolV(v.ux, v.uy, v.uz, f.uw); +} + +template <int index> +BoolV BSplatElement(BoolV a) +{ + PxU32* b = (PxU32*)&a; + return BoolV(b[index], b[index], b[index], b[index]); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return BoolV(BOOL_TO_U32(a.ux && b.ux), BOOL_TO_U32(a.uy && b.uy), BOOL_TO_U32(a.uz && b.uz), BOOL_TO_U32(a.uw && b.uw)); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + return BoolV(a.ux & ~b.ux, a.uy & ~b.uy, a.uz & ~b.uz, a.uw & ~b.uw); +} + +PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + return BoolV(~a.ux, ~a.uy, ~a.uz, ~a.uw); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return BoolV(BOOL_TO_U32(a.ux || b.ux), BOOL_TO_U32(a.uy || b.uy), BOOL_TO_U32(a.uz || b.uz), BOOL_TO_U32(a.uw || b.uw)); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + return (a.ux == b.ux && a.uy == b.uy && a.uz == b.uz && a.uw == b.uw ? TRUE_TO_U32 : FALSE_TO_U32); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return BAllEq(a, BTTTT()); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return BAllEq(a, BFFFF()); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + return (a.ux & a.uy & a.uz & a.uw) ? BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + return (a.ux | a.uy | a.uz | a.uw) ? 
BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + return (a.ux & a.uy & a.uz) ? BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + return (a.ux | a.uy | a.uz) ? BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + return (a.ux & 1) | (a.uy & 2) | (a.uz & 4) | (a.uw & 8); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z, a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z, a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z, + a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b) +{ + return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b)); +} + +PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a) +{ + return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a) +{ + return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d) +{ + const Vec3V x = V3Mul(V3UnitX(), d); + const Vec3V y = V3Mul(V3UnitY(), d); + const Vec3V z = V3Mul(V3UnitZ(), d); + return Mat33V(x, y, z); +} + +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a) +{ + const PxF32 det = a.col0.x * (a.col1.y * a.col2.z - a.col1.z * a.col2.y) - + a.col1.x * (a.col0.y * a.col2.z - a.col2.y * a.col0.z) + + a.col2.x * (a.col0.y * a.col1.z - a.col1.y * a.col0.z); + + const PxF32 invDet = 1.0f / det; + + Mat33V ret; + ret.col0.x = invDet * (a.col1.y * a.col2.z - a.col2.y * a.col1.z); + ret.col0.y = invDet * (a.col2.y * a.col0.z - a.col0.y * a.col2.z); + ret.col0.z = invDet * (a.col0.y * a.col1.z - a.col1.y * a.col0.z); + + ret.col1.x = invDet * (a.col2.x * a.col1.z - a.col1.x * a.col2.z); + ret.col1.y = invDet * (a.col0.x * a.col2.z - a.col2.x * a.col0.z); + ret.col1.z = invDet * (a.col1.x * a.col0.z - a.col0.x * a.col1.z); + + ret.col2.x = invDet * (a.col1.x * a.col2.y - a.col2.x * a.col1.y); + ret.col2.y = invDet * (a.col2.x * a.col0.y - a.col0.x * a.col2.y); + ret.col2.z = invDet * (a.col0.x * a.col1.y - a.col1.x * a.col0.y); + + return ret; +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + PX_ASSERT((size_t(&out) & 15) == 0); + V3StoreU(m.col0, out.column0); + 
V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a) +{ + return Mat33V(Vec3V(a.col0.x, a.col1.x, a.col2.x), Vec3V(a.col0.y, a.col1.y, a.col2.y), + Vec3V(a.col0.z, a.col1.z, a.col2.z)); +} + +PX_FORCE_INLINE Mat33V M33Identity() +{ + return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ()); +} + +////////////////////////////////// +// MAT34V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z + a.col3.x, + a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z + a.col3.y, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z + a.col3.z); +} + +PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z, a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z, a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z, + a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33V3(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + return Mat33V(Vec3V(a.col0.x, a.col1.x, a.col2.x), Vec3V(a.col0.y, a.col1.y, a.col2.y), + Vec3V(a.col0.z, a.col1.z, a.col2.z)); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + return Vec4V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z + a.col3.x * b.w, + a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z + a.col3.y * b.w, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z + a.col3.z * b.w, + a.col0.w * b.x + a.col1.w * b.y + a.col2.w * b.z + a.col3.w * b.w); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + return Vec4V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z + a.col0.w * b.w, + a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z + a.col1.w * b.w, + a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z + a.col2.w * b.w, + a.col3.x * b.x + a.col3.y * b.y + a.col3.z * b.z + a.col3.w * b.w); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + PxF32 tmp[12]; + PxF32 dst[16]; + PxF32 det; + + const PxF32 src[16] = { 
a.col0.x, a.col0.y, a.col0.z, a.col0.w, a.col1.x, a.col1.y, a.col1.z, a.col1.w, + a.col2.x, a.col2.y, a.col2.z, a.col2.w, a.col3.x, a.col3.y, a.col3.z, a.col3.w }; + + tmp[0] = src[10] * src[15]; + tmp[1] = src[11] * src[14]; + tmp[2] = src[9] * src[15]; + tmp[3] = src[11] * src[13]; + tmp[4] = src[9] * src[14]; + tmp[5] = src[10] * src[13]; + tmp[6] = src[8] * src[15]; + tmp[7] = src[11] * src[12]; + tmp[8] = src[8] * src[14]; + tmp[9] = src[10] * src[12]; + tmp[10] = src[8] * src[13]; + tmp[11] = src[9] * src[12]; + + dst[0] = tmp[0] * src[5] + tmp[3] * src[6] + tmp[4] * src[7]; + dst[0] -= tmp[1] * src[5] + tmp[2] * src[6] + tmp[5] * src[7]; + dst[1] = tmp[1] * src[4] + tmp[6] * src[6] + tmp[9] * src[7]; + dst[1] -= tmp[0] * src[4] + tmp[7] * src[6] + tmp[8] * src[7]; + dst[2] = tmp[2] * src[4] + tmp[7] * src[5] + tmp[10] * src[7]; + dst[2] -= tmp[3] * src[4] + tmp[6] * src[5] + tmp[11] * src[7]; + dst[3] = tmp[5] * src[4] + tmp[8] * src[5] + tmp[11] * src[6]; + dst[3] -= tmp[4] * src[4] + tmp[9] * src[5] + tmp[10] * src[6]; + dst[4] = tmp[1] * src[1] + tmp[2] * src[2] + tmp[5] * src[3]; + dst[4] -= tmp[0] * src[1] + tmp[3] * src[2] + tmp[4] * src[3]; + dst[5] = tmp[0] * src[0] + tmp[7] * src[2] + tmp[8] * src[3]; + dst[5] -= tmp[1] * src[0] + tmp[6] * src[2] + tmp[9] * src[3]; + dst[6] = tmp[3] * src[0] + tmp[6] * src[1] + tmp[11] * src[3]; + dst[6] -= tmp[2] * src[0] + tmp[7] * src[1] + tmp[10] * src[3]; + dst[7] = tmp[4] * src[0] + tmp[9] * src[1] + tmp[10] * src[2]; + dst[7] -= tmp[5] * src[0] + tmp[8] * src[1] + tmp[11] * src[2]; + + tmp[0] = src[2] * src[7]; + tmp[1] = src[3] * src[6]; + tmp[2] = src[1] * src[7]; + tmp[3] = src[3] * src[5]; + tmp[4] = src[1] * src[6]; + tmp[5] = src[2] * src[5]; + tmp[6] = src[0] * src[7]; + tmp[7] = src[3] * src[4]; + tmp[8] = src[0] * src[6]; + tmp[9] = src[2] * src[4]; + tmp[10] = src[0] * src[5]; + tmp[11] = src[1] * src[4]; + + dst[8] = tmp[0] * src[13] + tmp[3] * src[14] + tmp[4] * src[15]; + dst[8] -= tmp[1] * src[13] + tmp[2] * src[14] + tmp[5] * src[15]; + dst[9] = tmp[1] * src[12] + tmp[6] * src[14] + tmp[9] * src[15]; + dst[9] -= tmp[0] * src[12] + tmp[7] * src[14] + tmp[8] * src[15]; + dst[10] = tmp[2] * src[12] + tmp[7] * src[13] + tmp[10] * src[15]; + dst[10] -= tmp[3] * src[12] + tmp[6] * src[13] + tmp[11] * src[15]; + dst[11] = tmp[5] * src[12] + tmp[8] * src[13] + tmp[11] * src[14]; + dst[11] -= tmp[4] * src[12] + tmp[9] * src[13] + tmp[10] * src[14]; + dst[12] = tmp[2] * src[10] + tmp[5] * src[11] + tmp[1] * src[9]; + dst[12] -= tmp[4] * src[11] + tmp[0] * src[9] + tmp[3] * src[10]; + dst[13] = tmp[8] * src[11] + tmp[0] * src[8] + tmp[7] * src[10]; + dst[13] -= tmp[6] * src[10] + tmp[9] * src[11] + tmp[1] * src[8]; + dst[14] = tmp[6] * src[9] + tmp[11] * src[11] + tmp[3] * src[8]; + dst[14] -= tmp[10] * src[11] + tmp[2] * src[8] + tmp[7] * src[9]; + dst[15] = tmp[10] * src[10] + tmp[4] * src[8] + tmp[9] * src[9]; + dst[15] -= tmp[8] * src[9] + tmp[11] * src[10] + tmp[5] * src[8]; + + det = src[0] * dst[0] + src[1] * dst[1] + src[2] * dst[2] + src[3] * dst[3]; + + det = 1.0f / det; + for(PxU32 j = 0; j < 16; j++) + { + dst[j] *= det; + } + + return Mat44V(Vec4V(dst[0], dst[4], dst[8], dst[12]), Vec4V(dst[1], dst[5], dst[9], dst[13]), + Vec4V(dst[2], dst[6], dst[10], dst[14]), Vec4V(dst[3], dst[7], dst[11], dst[15])); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + return Mat44V(Vec4V(a.col0.x, a.col1.x, a.col2.x, a.col3.x), Vec4V(a.col0.y, a.col1.y, a.col2.y, a.col3.y), + Vec4V(a.col0.z, a.col1.z, a.col2.z, a.col3.z), 
Vec4V(a.col0.w, a.col1.w, a.col2.w, a.col3.w)); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) +{ + return Vec4V(x, y, z, w); +} + +/* +PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b) +{ + return VecU16V( + PxU16(PxClamp<PxU32>((a).u32[0], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((a).u32[1], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((a).u32[2], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((a).u32[3], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[0], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[1], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[2], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[3], 0, 0xFFFF))); +} +*/ + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return VecU32V(c.ux ? a.u32[0] : b.u32[0], c.uy ? a.u32[1] : b.u32[1], c.uz ? a.u32[2] : b.u32[2], + c.uw ? a.u32[3] : b.u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] | (b).u32[0], (a).u32[1] | (b).u32[1], (a).u32[2] | (b).u32[2], (a).u32[3] | (b).u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] ^ (b).u32[0], (a).u32[1] ^ (b).u32[1], (a).u32[2] ^ (b).u32[2], (a).u32[3] ^ (b).u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] & (b).u32[0], (a).u32[1] & (b).u32[1], (a).u32[2] & (b).u32[2], (a).u32[3] & (b).u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] & ~(b).u32[0], (a).u32[1] & ~(b).u32[1], (a).u32[2] & ~(b).u32[2], + (a).u32[3] & ~(b).u32[3]); +} + +/* +PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b) +{ + return VecU16V( + (a).u16[0]|(b).u16[0], (a).u16[1]|(b).u16[1], (a).u16[2]|(b).u16[2], (a).u16[3]|(b).u16[3], + (a).u16[4]|(b).u16[4], (a).u16[5]|(b).u16[5], (a).u16[6]|(b).u16[6], (a).u16[7]|(b).u16[7]); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b) +{ + return VecU16V( + (a).u16[0]&(b).u16[0], (a).u16[1]&(b).u16[1], (a).u16[2]&(b).u16[2], (a).u16[3]&(b).u16[3], + (a).u16[4]&(b).u16[4], (a).u16[5]&(b).u16[5], (a).u16[6]&(b).u16[6], (a).u16[7]&(b).u16[7]); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b) +{ + return VecU16V( + (a).u16[0]&~(b).u16[0], (a).u16[1]&~(b).u16[1], (a).u16[2]&~(b).u16[2], (a).u16[3]&~(b).u16[3], + (a).u16[4]&~(b).u16[4], (a).u16[5]&~(b).u16[5], (a).u16[6]&~(b).u16[6], (a).u16[7]&~(b).u16[7]); +} +*/ + +/* +template<int a> PX_FORCE_INLINE VecI32V V4ISplat() +{ + return VecI32V(a, a, a, a); +} + +template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat() +{ + return VecU32V(a, a, a, a); +} +*/ + +/* +PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address) +{ + *address = val; +} +*/ + +PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + *address = val; +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + VecU32V r = V4U32Andc(*reinterpret_cast<const VecU32V*>(&a), b); + return (*reinterpret_cast<const Vec4V*>(&r)); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return VecU32V(a.x > b.x ? 0xFFFFffff : 0, a.y > b.y ? 0xFFFFffff : 0, a.z > b.z ? 0xFFFFffff : 0, + a.w > b.w ? 
0xFFFFffff : 0); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + return VecU16V + ( + BOOL_TO_U16(a.u16[0] > b.u16[0]), BOOL_TO_U16(a.u16[1] > b.u16[1]), BOOL_TO_U16(a.u16[2] > b.u16[2]), BOOL_TO_U16(a.u16[3] > b.u16[3]), + BOOL_TO_U16(a.u16[4] > b.u16[4]), BOOL_TO_U16(a.u16[5] > b.u16[5]), BOOL_TO_U16(a.u16[6] > b.u16[6]), BOOL_TO_U16(a.u16[7] > b.u16[7]) + ); +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) +{ + return VecU16V + ( + BOOL_TO_U16(a.i16[0] > b.i16[0]), BOOL_TO_U16(a.i16[1] > b.i16[1]), BOOL_TO_U16(a.i16[2] > b.i16[2]), BOOL_TO_U16(a.i16[3] > b.i16[3]), + BOOL_TO_U16(a.i16[4] > b.i16[4]), BOOL_TO_U16(a.i16[5] > b.i16[5]), BOOL_TO_U16(a.i16[6] > b.i16[6]), BOOL_TO_U16(a.i16[7] > b.i16[7]) + ); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + return Vec4V(PxF32((a).u32[0]), PxF32((a).u32[1]), PxF32((a).u32[2]), PxF32((a).u32[3])); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) +{ + return Vec4V(PxF32((a).i32[0]), PxF32((a).i32[1]), PxF32((a).i32[2]), PxF32((a).i32[3])); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + float* data = (float*)&a; + return VecI32V(PxI32(data[0]), PxI32(data[1]), PxI32(data[2]), PxI32(data[3])); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + Vec4V b = *reinterpret_cast<Vec4V*>(&a); + return b; +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + Vec4V b = *reinterpret_cast<Vec4V*>(&a); + return b; +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + VecU32V b = *reinterpret_cast<VecU32V*>(&a); + return b; +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + VecI32V b = *reinterpret_cast<VecI32V*>(&a); + return b; +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + return VecU32V((a).u32[index], (a).u32[index], (a).u32[index], (a).u32[index]); +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(BoolV a) +{ + const PxU32 u = (&a.ux)[index]; + return VecU32V(u, u, u, u); +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + float* data = (float*)&a; + return Vec4V(data[index], data[index], data[index], data[index]); +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + return VecU32V(x, y, z, w); +} + +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a) +{ + return V4Max(a, V4Neg(a)); +} + +PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b) +{ + return BoolV(BOOL_TO_U32(a.u32[0] == b.u32[0]), BOOL_TO_U32(a.u32[1] == b.u32[1]), BOOL_TO_U32(a.u32[2] == b.u32[2]), BOOL_TO_U32(a.u32[3] == b.u32[3])); +} + +PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) +{ + return VecU32V(i, i, i, i); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return VecU32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + return VecU32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return VecI32V(i, i, i, i); +} + +PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) +{ + return VecI32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + return VecI32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] + b.i32[0], a.i32[1] + b.i32[1], a.i32[2] + b.i32[2], 
a.i32[3] + b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] - b.i32[0], a.i32[1] - b.i32[1], a.i32[2] - b.i32[2], a.i32[3] - b.i32[3]); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return BoolV(BOOL_TO_U32(a.i32[0] > b.i32[0]), BOOL_TO_U32(a.i32[1] > b.i32[1]), BOOL_TO_U32(a.i32[2] > b.i32[2]), BOOL_TO_U32(a.i32[3] > b.i32[3])); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return BoolV(BOOL_TO_U32(a.i32[0] == b.i32[0]), BOOL_TO_U32(a.i32[1] == b.i32[1]), BOOL_TO_U32(a.i32[2] == b.i32[2]), BOOL_TO_U32(a.i32[3] == b.i32[3])); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return VecI32V(c.ux ? a.i32[0] : b.i32[0], c.uy ? a.i32[1] : b.i32[1], c.uz ? a.i32[2] : b.i32[2], + c.uw ? a.i32[3] : b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return VecI32V(0, 0, 0, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return VecI32V(1, 1, 1, 1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return VecI32V(2, 2, 2, 2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return VecI32V(-1, -1, -1, -1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return VecU32V(0, 0, 0, 0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return VecU32V(1, 1, 1, 1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return VecU32V(2, 2, 2, 2); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + return shift; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return VecI32V(a.i32[0] << count.i32[0], a.i32[1] << count.i32[1], a.i32[2] << count.i32[2], a.i32[3] + << count.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return VecI32V(a.i32[0] >> count.i32[0], a.i32[1] >> count.i32[1], a.i32[2] >> count.i32[2], + a.i32[3] >> count.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] & b.i32[0], a.i32[1] & b.i32[1], a.i32[2] & b.i32[2], a.i32[3] & b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] | b.i32[0], a.i32[1] | b.i32[1], a.i32[2] | b.i32[2], a.i32[3] | b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) +{ + return VecI32V(a.i32[0], a.i32[0], a.i32[0], a.i32[0]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) +{ + return VecI32V(a.i32[1], a.i32[1], a.i32[1], a.i32[1]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) +{ + return VecI32V(a.i32[2], a.i32[2], a.i32[2], a.i32[2]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) +{ + return VecI32V(a.i32[3], a.i32[3], a.i32[3], a.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(c.ux ? a.i32[0] : b.i32[0], c.uy ? a.i32[1] : b.i32[1], c.uz ? a.i32[2] : b.i32[2], + c.uw ? 
a.i32[3] : b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d) +{ + return VecI32V(a.i32[0], b.i32[0], c.i32[0], d.i32[0]); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + *i = a.i32[0]; +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg b) +{ + return VecI32V(PxI32(b.ux), PxI32(b.uy), PxI32(b.uz), PxI32(b.uw)); +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg b) +{ + return VecU32V(b.ux, b.uy, b.uz, b.uw); +} + +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2) +{ + const FloatV one = FOne(); + const FloatV x = V4GetX(q); + const FloatV y = V4GetY(q); + const FloatV z = V4GetZ(q); + const FloatV w = V4GetW(q); + + const FloatV x2 = FAdd(x, x); + const FloatV y2 = FAdd(y, y); + const FloatV z2 = FAdd(z, z); + + const FloatV xx = FMul(x2, x); + const FloatV yy = FMul(y2, y); + const FloatV zz = FMul(z2, z); + + const FloatV xy = FMul(x2, y); + const FloatV xz = FMul(x2, z); + const FloatV xw = FMul(x2, w); + + const FloatV yz = FMul(y2, z); + const FloatV yw = FMul(y2, w); + const FloatV zw = FMul(z2, w); + + const FloatV v = FSub(one, xx); + + column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw)); + column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw)); + column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy)); +} + + +// not used + +/* +PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) +{ + return *addr; +} +*/ + +/* +PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) +{ + return *addr; +} +*/ + +/* +PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V a) +{ + return Vec4V(PxCeil(a.x), PxCeil(a.y), PxCeil(a.z), PxCeil(a.w)); +} + +PX_FORCE_INLINE Vec4V V4Floor(const Vec4V a) +{ + return Vec4V(PxFloor(a.x), PxFloor(a.y), PxFloor(a.z), PxFloor(a.w)); +} +*/ + +/* +PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V a, PxU32 power) +{ + PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate"); + PX_UNUSED(power); // prevent warning in release builds + PxF32 ffffFFFFasFloat = PxF32(0xFFFF0000); + return VecU32V( + PxU32(PxClamp<PxF32>((a).x, 0.0f, ffffFFFFasFloat)), + PxU32(PxClamp<PxF32>((a).y, 0.0f, ffffFFFFasFloat)), + PxU32(PxClamp<PxF32>((a).z, 0.0f, ffffFFFFasFloat)), + PxU32(PxClamp<PxF32>((a).w, 0.0f, ffffFFFFasFloat))); +} +*/ + +#endif // PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H diff --git a/PxShared/src/foundation/include/PsVecMathSSE.h b/PxShared/src/foundation/include/PsVecMathSSE.h new file mode 100644 index 00000000..08027e73 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathSSE.h @@ -0,0 +1,56 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHSSE_H +#define PSFOUNDATION_PSVECMATHSSE_H + +namespace +{ + const PX_ALIGN(16, PxF32) minus1w[4] = { 0.0f, 0.0f, 0.0f, -1.0f }; +} + +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2) +{ + const __m128 q2 = V4Add(q, q); + const __m128 qw2 = V4MulAdd(q2, V4GetW(q), _mm_load_ps(minus1w)); // (2wx, 2wy, 2wz, 2ww-1) + const __m128 nw2 = Vec3V_From_Vec4V(V4Neg(qw2)); // (-2wx, -2wy, -2wz, 0) + const __m128 v = Vec3V_From_Vec4V(q); + + const __m128 a0 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 1, 2, 3)); // (2ww-1, 2wz, -2wy, 0) + column0 = V4MulAdd(v, V4GetX(q2), a0); + + const __m128 a1 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 2, 0, 3)); // (2ww-1, 2wx, -2wz, 0) + column1 = V4MulAdd(v, V4GetY(q2), _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 1, 0, 2))); + + const __m128 a2 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 0, 1, 3)); // (2ww-1, 2wy, -2wx, 0) + column2 = V4MulAdd(v, V4GetZ(q2), _mm_shuffle_ps(a2, a2, _MM_SHUFFLE(3, 0, 2, 1))); +} + +#endif // PSFOUNDATION_PSVECMATHSSE_H + diff --git a/PxShared/src/foundation/include/PsVecMathUtilities.h b/PxShared/src/foundation/include/PsVecMathUtilities.h new file mode 100644 index 00000000..7bdb4dab --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathUtilities.h @@ -0,0 +1,57 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHUTILITIES_H +#define PSFOUNDATION_PSVECMATHUTILITIES_H + +#include "PsVecMath.h" + +namespace physx +{ +namespace shdfnd +{ +namespace aos +{ +/*! + Extend an edge along its length by a factor + */ +PX_FORCE_INLINE void makeFatEdge(Vec3V& p0, Vec3V& p1, const FloatVArg fatCoeff) +{ + const Vec3V delta = V3Sub(p1, p0); + const FloatV m = V3Length(delta); + const BoolV con = FIsGrtr(m, FZero()); + const Vec3V fatDelta = V3Scale(V3ScaleInv(delta, m), fatCoeff); + p0 = V3Sel(con, V3Sub(p0, fatDelta), p0); + p1 = V3Sel(con, V3Add(p1, fatDelta), p1); +} +} +} +} + +#endif diff --git a/PxShared/src/foundation/include/PsVecQuat.h b/PxShared/src/foundation/include/PsVecQuat.h new file mode 100644 index 00000000..73eddd1f --- /dev/null +++ b/PxShared/src/foundation/include/PsVecQuat.h @@ -0,0 +1,455 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
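+// Quaternion layout note: a QuatV is stored like a Vec4V, with x, y, z holding
+// the imaginary (axis) part and w the real part; QuatConjugate and
+// QuatGetImaginaryPart below rely on that convention.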
+
+#ifndef PSFOUNDATION_PSVECQUAT_H
+#define PSFOUNDATION_PSVECQUAT_H
+
+//#include "PsInlineAoS.h"
+
+#ifndef PX_PIDIV2
+#define PX_PIDIV2 1.570796327f
+#endif
+
+//////////////////////////////////
+// QuatV
+//////////////////////////////////
+PX_FORCE_INLINE QuatV QuatVLoadXYZW(const PxF32 x, const PxF32 y, const PxF32 z, const PxF32 w)
+{
+ return V4LoadXYZW(x, y, z, w);
+}
+
+PX_FORCE_INLINE QuatV QuatVLoadU(const PxF32* v)
+{
+ return V4LoadU(v);
+}
+
+PX_FORCE_INLINE QuatV QuatVLoadA(const PxF32* v)
+{
+ return V4LoadA(v);
+}
+
+PX_FORCE_INLINE QuatV QuatV_From_RotationAxisAngle(const Vec3V u, const FloatV a)
+{
+ // q = cos(a/2) + u*sin(a/2)
+ const FloatV half = FLoad(0.5f);
+ const FloatV hangle = FMul(a, half);
+ const FloatV piByTwo(FLoad(PX_PIDIV2));
+ const FloatV PiByTwoMinHangle(FSub(piByTwo, hangle));
+ const Vec4V hangle2(Vec4V_From_Vec3V(V3Merge(hangle, PiByTwoMinHangle, hangle)));
+
+ /*const FloatV sina = FSin(hangle);
+ const FloatV cosa = FCos(hangle);*/
+
+ // one V4Sin evaluates both terms: sin(a/2) in x and sin(pi/2 - a/2) == cos(a/2) in y
+ const Vec4V _sina = V4Sin(hangle2);
+ const FloatV sina = V4GetX(_sina);
+ const FloatV cosa = V4GetY(_sina);
+
+ const Vec3V v = V3Scale(u, sina);
+ // return V4Sel(BTTTF(), Vec4V_From_Vec3V(v), V4Splat(cosa));
+ return V4SetW(Vec4V_From_Vec3V(v), cosa);
+}
+
+// Normalize
+PX_FORCE_INLINE QuatV QuatNormalize(const QuatV q)
+{
+ return V4Normalize(q);
+}
+
+PX_FORCE_INLINE FloatV QuatLength(const QuatV q)
+{
+ return V4Length(q);
+}
+
+PX_FORCE_INLINE FloatV QuatLengthSq(const QuatV q)
+{
+ return V4LengthSq(q);
+}
+
+PX_FORCE_INLINE FloatV QuatDot(const QuatV a, const QuatV b) // 4D dot product of two quaternions
+{
+ return V4Dot(a, b);
+}
+
+PX_FORCE_INLINE QuatV QuatConjugate(const QuatV q)
+{
+ return V4SetW(V4Neg(q), V4GetW(q));
+}
+
+PX_FORCE_INLINE Vec3V QuatGetImaginaryPart(const QuatV q)
+{
+ return Vec3V_From_Vec4V(q);
+}
+
+/** \brief computes rotation of x-axis */
+PX_FORCE_INLINE Vec3V QuatGetBasisVector0(const QuatV q)
+{
+ /*const PxF32 x2 = x*2.0f;
+ const PxF32 w2 = w*2.0f;
+ return PxVec3( (w * w2) - 1.0f + x*x2,
+ (z * w2) + y*x2,
+ (-y * w2) + z*x2);*/
+
+ const FloatV two = FLoad(2.f);
+ const FloatV w = V4GetW(q);
+ const Vec3V u = Vec3V_From_Vec4V(q);
+
+ const FloatV x2 = FMul(V3GetX(u), two);
+ const FloatV w2 = FMul(w, two);
+
+ const Vec3V a = V3Scale(u, x2);
+ const Vec3V tmp = V3Merge(w, V3GetZ(u), FNeg(V3GetY(u)));
+ // const Vec3V b = V3Scale(tmp, w2);
+ // const Vec3V ab = V3Add(a, b);
+ const Vec3V ab = V3ScaleAdd(tmp, w2, a);
+ return V3SetX(ab, FSub(V3GetX(ab), FOne()));
+}
+
+/** \brief computes rotation of y-axis */
+PX_FORCE_INLINE Vec3V QuatGetBasisVector1(const QuatV q)
+{
+ /*const PxF32 y2 = y*2.0f;
+ const PxF32 w2 = w*2.0f;
+ return PxVec3( (-z * w2) + x*y2,
+ (w * w2) - 1.0f + y*y2,
+ (x * w2) + z*y2);*/
+
+ const FloatV two = FLoad(2.f);
+ const FloatV w = V4GetW(q);
+ const Vec3V u = Vec3V_From_Vec4V(q);
+
+ const FloatV y2 = FMul(V3GetY(u), two);
+ const FloatV w2 = FMul(w, two);
+
+ const Vec3V a = V3Scale(u, y2);
+ const Vec3V tmp = V3Merge(FNeg(V3GetZ(u)), w, V3GetX(u));
+ // const Vec3V b = V3Scale(tmp, w2);
+ // const Vec3V ab = V3Add(a, b);
+ const Vec3V ab = V3ScaleAdd(tmp, w2, a);
+ return V3SetY(ab, FSub(V3GetY(ab), FOne()));
+}
+
+/** \brief computes rotation of z-axis */
+PX_FORCE_INLINE Vec3V QuatGetBasisVector2(const QuatV q)
+{
+ /*const PxF32 z2 = z*2.0f;
+ const PxF32 w2 = w*2.0f;
+ return PxVec3( (y * w2) + x*z2,
+ (-x * w2) + y*z2,
+ (w * w2) - 1.0f + z*z2);*/
+
+ const FloatV two = FLoad(2.f);
+ const FloatV w = V4GetW(q);
+ const Vec3V u =
Vec3V_From_Vec4V(q); + + const FloatV z2 = FMul(V3GetZ(u), two); + const FloatV w2 = FMul(w, two); + + const Vec3V a = V3Scale(u, z2); + const Vec3V tmp = V3Merge(V3GetY(u), FNeg(V3GetX(u)), w); + /*const Vec3V b = V3Scale(tmp, w2); + const Vec3V ab = V3Add(a, b);*/ + const Vec3V ab = V3ScaleAdd(tmp, w2, a); + return V3SetZ(ab, FSub(V3GetZ(ab), FOne())); +} + +PX_FORCE_INLINE Vec3V QuatRotate(const QuatV q, const Vec3V v) +{ + /* + const PxVec3 qv(x,y,z); + return (v*(w*w-0.5f) + (qv.cross(v))*w + qv*(qv.dot(v)))*2; + */ + + const FloatV two = FLoad(2.f); + // const FloatV half = FloatV_From_F32(0.5f); + const FloatV nhalf = FLoad(-0.5f); + const Vec3V u = Vec3V_From_Vec4V(q); + const FloatV w = V4GetW(q); + // const FloatV w2 = FSub(FMul(w, w), half); + const FloatV w2 = FScaleAdd(w, w, nhalf); + const Vec3V a = V3Scale(v, w2); + // const Vec3V b = V3Scale(V3Cross(u, v), w); + // const Vec3V c = V3Scale(u, V3Dot(u, v)); + // return V3Scale(V3Add(V3Add(a, b), c), two); + const Vec3V temp = V3ScaleAdd(V3Cross(u, v), w, a); + return V3Scale(V3ScaleAdd(u, V3Dot(u, v), temp), two); +} + +PX_FORCE_INLINE Vec3V QuatTransform(const QuatV q, const Vec3V p, const Vec3V v) +{ + // p + q.rotate(v) + const FloatV two = FLoad(2.f); + // const FloatV half = FloatV_From_F32(0.5f); + const FloatV nhalf = FLoad(-0.5f); + const Vec3V u = Vec3V_From_Vec4V(q); + const FloatV w = V4GetW(q); + // const FloatV w2 = FSub(FMul(w, w), half); + const FloatV w2 = FScaleAdd(w, w, nhalf); + const Vec3V a = V3Scale(v, w2); + /*const Vec3V b = V3Scale(V3Cross(u, v), w); + const Vec3V c = V3Scale(u, V3Dot(u, v)); + return V3ScaleAdd(V3Add(V3Add(a, b), c), two, p);*/ + const Vec3V temp = V3ScaleAdd(V3Cross(u, v), w, a); + const Vec3V z = V3ScaleAdd(u, V3Dot(u, v), temp); + return V3ScaleAdd(z, two, p); +} + +PX_FORCE_INLINE Vec3V QuatRotateInv(const QuatV q, const Vec3V v) +{ + + // const PxVec3 qv(x,y,z); + // return (v*(w*w-0.5f) - (qv.cross(v))*w + qv*(qv.dot(v)))*2; + + const FloatV two = FLoad(2.f); + const FloatV nhalf = FLoad(-0.5f); + const Vec3V u = Vec3V_From_Vec4V(q); + const FloatV w = V4GetW(q); + const FloatV w2 = FScaleAdd(w, w, nhalf); + const Vec3V a = V3Scale(v, w2); + /*const Vec3V b = V3Scale(V3Cross(u, v), w); + const Vec3V c = V3Scale(u, V3Dot(u, v)); + return V3Scale(V3Add(V3Sub(a, b), c), two);*/ + const Vec3V temp = V3NegScaleSub(V3Cross(u, v), w, a); + return V3Scale(V3ScaleAdd(u, V3Dot(u, v), temp), two); +} + +PX_FORCE_INLINE QuatV QuatMul(const QuatV a, const QuatV b) +{ + const Vec3V imagA = Vec3V_From_Vec4V(a); + const Vec3V imagB = Vec3V_From_Vec4V(b); + const FloatV rA = V4GetW(a); + const FloatV rB = V4GetW(b); + + const FloatV real = FSub(FMul(rA, rB), V3Dot(imagA, imagB)); + const Vec3V v0 = V3Scale(imagA, rB); + const Vec3V v1 = V3Scale(imagB, rA); + const Vec3V v2 = V3Cross(imagA, imagB); + const Vec3V imag = V3Add(V3Add(v0, v1), v2); + + return V4SetW(Vec4V_From_Vec3V(imag), real); +} + +PX_FORCE_INLINE QuatV QuatAdd(const QuatV a, const QuatV b) +{ + return V4Add(a, b); +} + +PX_FORCE_INLINE QuatV QuatNeg(const QuatV q) +{ + return V4Neg(q); +} + +PX_FORCE_INLINE QuatV QuatSub(const QuatV a, const QuatV b) +{ + return V4Sub(a, b); +} + +PX_FORCE_INLINE QuatV QuatScale(const QuatV a, const FloatV b) +{ + return V4Scale(a, b); +} + +PX_FORCE_INLINE QuatV QuatMerge(const FloatV* const floatVArray) +{ + return V4Merge(floatVArray); +} + +PX_FORCE_INLINE QuatV QuatMerge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w) +{ + return V4Merge(x, y, z, w); +} + 
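+
+// Illustrative composition (all names as defined in this header): QuatMul(a, b)
+// concatenates rotations so that rotating by the product applies b first and
+// then a, i.e. QuatRotate(QuatMul(a, b), v) == QuatRotate(a, QuatRotate(b, v)).
+//   const QuatV qz = QuatV_From_RotationAxisAngle(V3UnitZ(), FLoad(PX_PIDIV2));
+//   const Vec3V r = QuatRotate(qz, V3UnitX());   // ~(0, 1, 0)
+//   const Vec3V back = QuatRotateInv(qz, r);     // ~(1, 0, 0)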
+PX_FORCE_INLINE QuatV QuatIdentity() +{ + return V4SetW(V4Zero(), FOne()); +} + +PX_FORCE_INLINE bool isFiniteQuatV(const QuatV q) +{ + return isFiniteVec4V(q); +} + +PX_FORCE_INLINE bool isValidQuatV(const QuatV q) +{ + const FloatV unitTolerance = FLoad(1e-4f); + const FloatV tmp = FAbs(FSub(QuatLength(q), FOne())); + const BoolV con = FIsGrtr(unitTolerance, tmp); + return isFiniteVec4V(q) & (BAllEqTTTT(con) == 1); +} + +PX_FORCE_INLINE bool isSaneQuatV(const QuatV q) +{ + const FloatV unitTolerance = FLoad(1e-2f); + const FloatV tmp = FAbs(FSub(QuatLength(q), FOne())); + const BoolV con = FIsGrtr(unitTolerance, tmp); + return isFiniteVec4V(q) & (BAllEqTTTT(con) == 1); +} + +PX_FORCE_INLINE Mat33V QuatGetMat33V(const QuatVArg q) +{ + // const FloatV two = FloatV_From_F32(2.f); + // const FloatV one = FOne(); + + // const FloatV x = V4GetX(q); + // const FloatV y = V4GetY(q); + // const FloatV z = V4GetZ(q); + // const Vec4V _q = V4Mul(q, two); + // + ////const FloatV w = V4GetW(q); + + // const Vec4V t0 = V4Mul(_q, x); // 2xx, 2xy, 2xz, 2xw + // const Vec4V t1 = V4Mul(_q, y); // 2xy, 2yy, 2yz, 2yw + // const Vec4V t2 = V4Mul(_q, z); // 2xz, 2yz, 2zz, 2zw + ////const Vec4V t3 = V4Mul(_q, w); // 2xw, 2yw, 2zw, 2ww + + // const FloatV xx2 = V4GetX(t0); + // const FloatV xy2 = V4GetY(t0); + // const FloatV xz2 = V4GetZ(t0); + // const FloatV xw2 = V4GetW(t0); + + // const FloatV yy2 = V4GetY(t1); + // const FloatV yz2 = V4GetZ(t1); + // const FloatV yw2 = V4GetW(t1); + + // const FloatV zz2 = V4GetZ(t2); + // const FloatV zw2 = V4GetW(t2); + + ////const FloatV ww2 = V4GetW(t3); + + // const FloatV c00 = FSub(one, FAdd(yy2, zz2)); + // const FloatV c01 = FSub(xy2, zw2); + // const FloatV c02 = FAdd(xz2, yw2); + + // const FloatV c10 = FAdd(xy2, zw2); + // const FloatV c11 = FSub(one, FAdd(xx2, zz2)); + // const FloatV c12 = FSub(yz2, xw2); + + // const FloatV c20 = FSub(xz2, yw2); + // const FloatV c21 = FAdd(yz2, xw2); + // const FloatV c22 = FSub(one, FAdd(xx2, yy2)); + + // const Vec3V c0 = V3Merge(c00, c10, c20); + // const Vec3V c1 = V3Merge(c01, c11, c21); + // const Vec3V c2 = V3Merge(c02, c12, c22); + + // return Mat33V(c0, c1, c2); + + const FloatV one = FOne(); + const FloatV x = V4GetX(q); + const FloatV y = V4GetY(q); + const FloatV z = V4GetZ(q); + const FloatV w = V4GetW(q); + + const FloatV x2 = FAdd(x, x); + const FloatV y2 = FAdd(y, y); + const FloatV z2 = FAdd(z, z); + + const FloatV xx = FMul(x2, x); + const FloatV yy = FMul(y2, y); + const FloatV zz = FMul(z2, z); + + const FloatV xy = FMul(x2, y); + const FloatV xz = FMul(x2, z); + const FloatV xw = FMul(x2, w); + + const FloatV yz = FMul(y2, z); + const FloatV yw = FMul(y2, w); + const FloatV zw = FMul(z2, w); + + const FloatV v = FSub(one, xx); + + const Vec3V column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw)); + const Vec3V column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw)); + const Vec3V column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy)); + return Mat33V(column0, column1, column2); +} + +PX_FORCE_INLINE QuatV Mat33GetQuatV(const Mat33V& a) +{ + const FloatV one = FOne(); + const FloatV zero = FZero(); + const FloatV half = FLoad(0.5f); + const FloatV two = FLoad(2.f); + const FloatV scale = FLoad(0.25f); + const FloatV a00 = V3GetX(a.col0); + const FloatV a11 = V3GetY(a.col1); + const FloatV a22 = V3GetZ(a.col2); + + const FloatV a21 = V3GetZ(a.col1); // row=2, col=1; + const FloatV a12 = V3GetY(a.col2); // row=1, col=2; + const FloatV a02 = V3GetX(a.col2); // row=0, col=2; + const 
FloatV a20 = V3GetZ(a.col0); // row=2, col=0; + const FloatV a10 = V3GetY(a.col0); // row=1, col=0; + const FloatV a01 = V3GetX(a.col1); // row=0, col=1; + + const Vec3V vec0 = V3Merge(a21, a02, a10); + const Vec3V vec1 = V3Merge(a12, a20, a01); + const Vec3V v = V3Sub(vec0, vec1); + const Vec3V g = V3Add(vec0, vec1); + + const FloatV trace = FAdd(a00, FAdd(a11, a22)); + + if(FAllGrtrOrEq(trace, zero)) + { + const FloatV h = FSqrt(FAdd(trace, one)); + const FloatV w = FMul(half, h); + const FloatV s = FMul(half, FRecip(h)); + const Vec3V u = V3Scale(v, s); + return V4SetW(Vec4V_From_Vec3V(u), w); + } + else + { + const FloatV ntrace = FNeg(trace); + const Vec3V d = V3Merge(a00, a11, a22); + const BoolV con0 = BAllTrue3(V3IsGrtrOrEq(V3Splat(a00), d)); + const BoolV con1 = BAllTrue3(V3IsGrtrOrEq(V3Splat(a11), d)); + + const FloatV t0 = FAdd(one, FScaleAdd(a00, two, ntrace)); + const FloatV t1 = FAdd(one, FScaleAdd(a11, two, ntrace)); + const FloatV t2 = FAdd(one, FScaleAdd(a22, two, ntrace)); + + const FloatV t = FSel(con0, t0, FSel(con1, t1, t2)); + + const FloatV h = FMul(two, FSqrt(t)); + const FloatV s = FRecip(h); + const FloatV g0 = FMul(scale, h); + const Vec3V vs = V3Scale(v, s); + const Vec3V gs = V3Scale(g, s); + const FloatV gsx = V3GetX(gs); + const FloatV gsy = V3GetY(gs); + const FloatV gsz = V3GetZ(gs); + // vs.x= (a21 - a12)*s; vs.y=(a02 - a20)*s; vs.z=(a10 - a01)*s; + // gs.x= (a21 + a12)*s; gs.y=(a02 + a20)*s; gs.z=(a10 + a01)*s; + const Vec4V v0 = V4Merge(g0, gsz, gsy, V3GetX(vs)); + const Vec4V v1 = V4Merge(gsz, g0, gsx, V3GetY(vs)); + const Vec4V v2 = V4Merge(gsy, gsx, g0, V3GetZ(vs)); + return V4Sel(con0, v0, V4Sel(con1, v1, v2)); + } +} + +#endif diff --git a/PxShared/src/foundation/include/PsVecTransform.h b/PxShared/src/foundation/include/PsVecTransform.h new file mode 100644 index 00000000..974f6fa2 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecTransform.h @@ -0,0 +1,283 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. 
All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECTRANSFORM_H +#define PSFOUNDATION_PSVECTRANSFORM_H + +#include "PsVecMath.h" +#include "foundation/PxTransform.h" + +namespace physx +{ +namespace shdfnd +{ +namespace aos +{ + +class PsTransformV +{ + public: + QuatV q; + Vec3V p; + + PX_FORCE_INLINE PsTransformV(const PxTransform& orientation) + { + // const PxQuat oq = orientation.q; + // const PxF32 f[4] = {oq.x, oq.y, oq.z, oq.w}; + q = QuatVLoadXYZW(orientation.q.x, orientation.q.y, orientation.q.z, orientation.q.w); + // q = QuatV_From_F32Array(&oq.x); + p = V3LoadU(orientation.p); + } + + PX_FORCE_INLINE PsTransformV(const Vec3VArg p0 = V3Zero(), const QuatVArg q0 = QuatIdentity()) : q(q0), p(p0) + { + PX_ASSERT(isSaneQuatV(q0)); + } + + PX_FORCE_INLINE PsTransformV operator*(const PsTransformV& x) const + { + PX_ASSERT(x.isSane()); + return transform(x); + } + + PX_FORCE_INLINE PsTransformV getInverse() const + { + PX_ASSERT(isFinite()); + // return PxTransform(q.rotateInv(-p),q.getConjugate()); + return PsTransformV(QuatRotateInv(q, V3Neg(p)), QuatConjugate(q)); + } + + PX_FORCE_INLINE void normalize() + { + p = V3Zero(); + q = QuatIdentity(); + } + + PX_FORCE_INLINE void Invalidate() + { + p = V3Splat(FMax()); + q = QuatIdentity(); + } + + PX_FORCE_INLINE Vec3V transform(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotate(input) + p; + return QuatTransform(q, p, input); + } + + PX_FORCE_INLINE Vec3V transformInv(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotateInv(input-p); + return QuatRotateInv(q, V3Sub(input, p)); + } + + PX_FORCE_INLINE Vec3V rotate(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotate(input); + return QuatRotate(q, input); + } + + PX_FORCE_INLINE Vec3V rotateInv(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotateInv(input); + return QuatRotateInv(q, input); + } + + //! Transform transform to parent (returns compound transform: first src, then *this) + PX_FORCE_INLINE PsTransformV transform(const PsTransformV& src) const + { + PX_ASSERT(src.isSane()); + PX_ASSERT(isSane()); + // src = [srct, srcr] -> [r*srct + t, r*srcr] + // return PxTransform(q.rotate(src.p) + p, q*src.q); + return PsTransformV(V3Add(QuatRotate(q, src.p), p), QuatMul(q, src.q)); + } + + /** + \brief returns true if finite and q is a unit quaternion + */ + + PX_FORCE_INLINE bool isValid() const + { + // return p.isFinite() && q.isFinite() && q.isValid(); + return isFiniteVec3V(p) & isFiniteQuatV(q) & isValidQuatV(q); + } + + /** + \brief returns true if finite and quat magnitude is reasonably close to unit to allow for some accumulation of error + vs isValid + */ + + PX_FORCE_INLINE bool isSane() const + { + // return isFinite() && q.isSane(); + return isFinite() & isSaneQuatV(q); + } + + /** + \brief returns true if all elems are finite (not NAN or INF, etc.) + */ + PX_FORCE_INLINE bool isFinite() const + { + // return p.isFinite() && q.isFinite(); + return isFiniteVec3V(p) & isFiniteQuatV(q); + } + + //! 
Transform transform from parent (returns compound transform: first src, then this->inverse) + PX_FORCE_INLINE PsTransformV transformInv(const PsTransformV& src) const + { + PX_ASSERT(src.isSane()); + PX_ASSERT(isFinite()); + // src = [srct, srcr] -> [r^-1*(srct-t), r^-1*srcr] + /*PxQuat qinv = q.getConjugate(); + return PxTransform(qinv.rotate(src.p - p), qinv*src.q);*/ + const QuatV qinv = QuatConjugate(q); + const Vec3V v = QuatRotate(qinv, V3Sub(src.p, p)); + const QuatV rot = QuatMul(qinv, src.q); + return PsTransformV(v, rot); + } + + static PX_FORCE_INLINE PsTransformV createIdentity() + { + return PsTransformV(V3Zero()); + } +}; + +PX_FORCE_INLINE PsTransformV loadTransformA(const PxTransform& transform) +{ + const QuatV q0 = QuatVLoadA(&transform.q.x); + const Vec3V p0 = V3LoadA(&transform.p.x); + + return PsTransformV(p0, q0); +} + +PX_FORCE_INLINE PsTransformV loadTransformU(const PxTransform& transform) +{ + const QuatV q0 = QuatVLoadU(&transform.q.x); + const Vec3V p0 = V3LoadU(&transform.p.x); + + return PsTransformV(p0, q0); +} + +class PsMatTransformV +{ + public: + Mat33V rot; + Vec3V p; + + PX_FORCE_INLINE PsMatTransformV() + { + p = V3Zero(); + rot = M33Identity(); + } + PX_FORCE_INLINE PsMatTransformV(const Vec3VArg _p, const Mat33V& _rot) + { + p = _p; + rot = _rot; + } + + PX_FORCE_INLINE PsMatTransformV(const PsTransformV& other) + { + p = other.p; + QuatGetMat33V(other.q, rot.col0, rot.col1, rot.col2); + } + + PX_FORCE_INLINE PsMatTransformV(const Vec3VArg _p, const QuatV& quat) + { + p = _p; + QuatGetMat33V(quat, rot.col0, rot.col1, rot.col2); + } + + PX_FORCE_INLINE Vec3V getCol0() const + { + return rot.col0; + } + + PX_FORCE_INLINE Vec3V getCol1() const + { + return rot.col1; + } + + PX_FORCE_INLINE Vec3V getCol2() const + { + return rot.col2; + } + + PX_FORCE_INLINE void setCol0(const Vec3VArg col0) + { + rot.col0 = col0; + } + + PX_FORCE_INLINE void setCol1(const Vec3VArg col1) + { + rot.col1 = col1; + } + + PX_FORCE_INLINE void setCol2(const Vec3VArg col2) + { + rot.col2 = col2; + } + + PX_FORCE_INLINE Vec3V transform(const Vec3VArg input) const + { + return V3Add(p, M33MulV3(rot, input)); + } + + PX_FORCE_INLINE Vec3V transformInv(const Vec3VArg input) const + { + return M33TrnspsMulV3(rot, V3Sub(input, p)); // QuatRotateInv(q, V3Sub(input, p)); + } + + PX_FORCE_INLINE Vec3V rotate(const Vec3VArg input) const + { + return M33MulV3(rot, input); + } + + PX_FORCE_INLINE Vec3V rotateInv(const Vec3VArg input) const + { + return M33TrnspsMulV3(rot, input); + } + + PX_FORCE_INLINE PsMatTransformV transformInv(const PsMatTransformV& src) const + { + + const Vec3V v = M33TrnspsMulV3(rot, V3Sub(src.p, p)); + const Mat33V mat = M33MulM33(M33Trnsps(rot), src.rot); + return PsMatTransformV(v, mat); + } +}; +} +} +} + +#endif diff --git a/PxShared/src/foundation/include/unix/PsUnixAoS.h b/PxShared/src/foundation/include/unix/PsUnixAoS.h new file mode 100644 index 00000000..a40ddb73 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixAoS.h @@ -0,0 +1,47 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXAOS_H +#define PSFOUNDATION_PSUNIXAOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +#if PX_INTEL_FAMILY +#include "sse2/PsUnixSse2AoS.h" +#elif PX_NEON +#include "neon/PsUnixNeonAoS.h" +#else +#error No SIMD implementation for this unix platform. +#endif + +#endif // PSFOUNDATION_PSUNIXAOS_H diff --git a/PxShared/src/foundation/include/unix/PsUnixFPU.h b/PxShared/src/foundation/include/unix/PsUnixFPU.h new file mode 100644 index 00000000..db1acc6d --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixFPU.h @@ -0,0 +1,66 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXFPU_H +#define PSFOUNDATION_PSUNIXFPU_H + +#include "foundation/PxPreprocessor.h" + +#if PX_LINUX || PX_PS4 || PX_OSX + +#if PX_X86 || PX_X64 +#include <xmmintrin.h> +#elif PX_NEON +#include <arm_neon.h> +#endif + + +PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard() +{ +#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64) + mControlWord = _mm_getcsr(); + // set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6)) + _mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6)); +#endif +} + +PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard() +{ +#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64) + // restore control word and clear exception flags + // (setting exception state flags causes exceptions on the first following fp operation) + _mm_setcsr(mControlWord & ~_MM_EXCEPT_MASK); +#endif +} + +#else +#error No SIMD implementation for this unix platform. +#endif // PX_LINUX || PX_PS4 || PX_OSX + +#endif // #ifndef PSFOUNDATION_PSUNIXFPU_H
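The two inline definitions above are what give SIMDGuard its scoped (RAII) behavior on the x86 unix targets: the constructor snapshots the MXCSR control word and switches on flush-to-zero and denormals-are-zero, and the destructor writes the saved word back with the sticky exception flags cleared, so the restore itself cannot raise a pending exception. A minimal usage sketch, assuming the SIMDGuard declaration that lives in PsFPU.h in this same foundation layer; solverHotLoop is a hypothetical caller, not part of this commit:

void solverHotLoop()
{
    // constructor above runs here: MXCSR saved, FTZ/DAZ enabled
    physx::shdfnd::SIMDGuard simdGuard;

    // ... denormal-sensitive SIMD work ...

} // destructor above runs here: saved MXCSR restored, exception flags masked out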
diff --git a/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h b/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h new file mode 100644 index 00000000..1ba626e8 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h @@ -0,0 +1,48 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXINLINEAOS_H +#define PSFOUNDATION_PSUNIXINLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +#if PX_INTEL_FAMILY +#include "sse2/PsUnixSse2InlineAoS.h" +#elif PX_NEON +#include "neon/PsUnixNeonInlineAoS.h" +#else +#error No SIMD implementation for this unix platform. +#endif + +#endif // PSFOUNDATION_PSUNIXINLINEAOS_H diff --git a/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h b/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h new file mode 100644 index 00000000..1b738518 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h @@ -0,0 +1,153 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXINTRINSICS_H +#define PSFOUNDATION_PSUNIXINTRINSICS_H + +#include "Ps.h" +#include "foundation/PxAssert.h" +#include <math.h> + +#if PX_ANDROID +#include <signal.h> // for Ns::debugBreak() { raise(SIGTRAP); } +#endif + +#if 0 +#include <libkern/OSAtomic.h> +#endif + +// this file is for internal intrinsics - that is, intrinsics that are used in +// cross platform code but do not appear in the API + +#if !(PX_LINUX || PX_ANDROID || PX_PS4 || PX_APPLE_FAMILY) +#error "This file should only be included by unix builds!!" +#endif + +namespace physx +{ +namespace shdfnd +{ + +PX_FORCE_INLINE void memoryBarrier() +{ + __sync_synchronize(); +} + +/*! +Return the index of the highest set bit. Undefined for zero arg. +*/ +PX_INLINE uint32_t highestSetBitUnsafe(uint32_t v) +{ + return 31 - __builtin_clz(v); +}
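A quick worked example of the two bit-scan helpers on either side of this point (illustrative values only, not part of the header):

// For v = 0x50u (binary 0101'0000, bits 4 and 6 set):
//   __builtin_clz(v) == 25, so highestSetBitUnsafe(v) == 31 - 25 == 6
//   __builtin_ctz(v) == 4,  so lowestSetBitUnsafe(v)  == 4
// Both are undefined for v == 0 (hence the "Unsafe" suffix);
// countLeadingZeros() below adds the explicit zero check and returns 32.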
+ +/*! +Return the index of the lowest set bit. Undefined for zero arg. +*/ +PX_INLINE int32_t lowestSetBitUnsafe(uint32_t v) +{ + return __builtin_ctz(v); +} + +/*! +Returns the number of leading zeros in v. Returns 32 for v=0. +*/ +PX_INLINE uint32_t countLeadingZeros(uint32_t v) +{ + if(v) + return __builtin_clz(v); + else + return 32; +} + +/*! +Prefetch one cache line (64 bytes on x86, 32 bytes on 32-bit ARM) around \c ptr+offset. +*/ +PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) +{ + __builtin_prefetch(reinterpret_cast<const char* PX_RESTRICT>(ptr) + offset, 0, 3); +} + +/*! +Prefetch \c count bytes starting at \c ptr. +*/ +#if PX_ANDROID || PX_IOS +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = static_cast<const char*>(ptr); + size_t p = reinterpret_cast<size_t>(ptr); + uint32_t startLine = uint32_t(p >> 5), endLine = uint32_t((p + count - 1) >> 5); + uint32_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 32; + } while(--lines); +} +#else +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = reinterpret_cast<const char*>(ptr); + uint64_t p = size_t(ptr); + uint64_t startLine = p >> 6, endLine = (p + count - 1) >> 6; + uint64_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 64; + } while(--lines); +} +#endif + +//! \brief platform-specific reciprocal +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipFast(float a) +{ + return 1.0f / a; +} + +//! \brief platform-specific fast reciprocal square root +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipSqrtFast(float a) +{ + return 1.0f / ::sqrtf(a); +} + +//! \brief platform-specific floor +PX_CUDA_CALLABLE PX_FORCE_INLINE float floatFloor(float x) +{ + return ::floorf(x); +} + +#define NS_EXPECT_TRUE(x) x +#define NS_EXPECT_FALSE(x) x + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSUNIXINTRINSICS_H diff --git a/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h b/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h new file mode 100644 index 00000000..f742e293 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h @@ -0,0 +1,82 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXTRIGCONSTANTS_H +#define PSFOUNDATION_PSUNIXTRIGCONSTANTS_H + +//#define PX_GLOBALCONST extern const __declspec(selectany) +#define PX_GLOBALCONST extern const __attribute__((weak)) + +PX_ALIGN_PREFIX(16) +struct PX_VECTORF32 +{ + float f[4]; +} PX_ALIGN_SUFFIX(16); + +PX_GLOBALCONST PX_VECTORF32 g_PXSinCoefficients0 = { { 1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients1 = { { 2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients2 = { { 2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXCosCoefficients0 = { { 1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients1 = { { 2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients2 = { { 4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanCoefficients0 = { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients1 = { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients2 = { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients0 = { { -0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients1 = { { 0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients2 = { { -1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXATanCoefficients0 = { { 1.0f, 0.333333334f, 0.2f, 0.142857143f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients1 = { { 1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients2 = { { 5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinEstCoefficients = { { 1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosEstCoefficients = { { 1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanEstCoefficients = { { 2.484f, -1.954923183e-1f, 2.467401101f, PxInvPi } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanEstCoefficients = { { 7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, PxPiDivTwo } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinEstCoefficients = { { -1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXASinEstConstants = { { 1.00000011921f, PxPiDivTwo, 0.0f, 0.0f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXPiConstants0 = { { PxPi, PxTwoPi, PxInvPi, PxInvTwoPi } }; +PX_GLOBALCONST PX_VECTORF32 g_PXReciprocalTwoPi = { { PxInvTwoPi, PxInvTwoPi, PxInvTwoPi, PxInvTwoPi } }; 
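These tables feed the vectorized FSin/FCos further down in this commit: FSin, for example, loads g_PXSinCoefficients0..2 and folds them in term by term with FScaleAdd. The full-precision sin/cos entries are simply the reciprocal-factorial terms of the Taylor series, with alternating signs baked into the constants, which is easy to verify by hand:

// sin(x) = x - x^3/3! + x^5/5! - x^7/7! + x^9/9! - ...
//   1/3! = 0.166666667, 1/5! = 8.333333333e-3, 1/7! = 1.984126984e-4,
//   1/9! = 2.755731922e-6, ... matching g_PXSinCoefficients0/1/2 above.
// cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
//   1/2! = 0.5, 1/4! = 4.166666667e-2, 1/6! = 1.388888889e-3, ...
// The *Est* tables appear to hold shorter low-degree fits for the fast
// estimate paths rather than truncated Taylor terms.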
+PX_GLOBALCONST PX_VECTORF32 g_PXTwoPi = { { PxTwoPi, PxTwoPi, PxTwoPi, PxTwoPi } }; + +#endif diff --git a/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h b/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h new file mode 100644 index 00000000..3a3a02e1 --- /dev/null +++ b/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h @@ -0,0 +1,129 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXNEONAOS_H +#define PSFOUNDATION_PSUNIXNEONAOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. 
+#endif + +// only ARM NEON compatible platforms should reach this +#include <arm_neon.h> + +typedef float32x2_t FloatV; +typedef float32x4_t Vec3V; +typedef float32x4_t Vec4V; +typedef uint32x4_t BoolV; +typedef float32x4_t QuatV; + +typedef uint32x4_t VecU32V; +typedef int32x4_t VecI32V; +typedef uint16x8_t VecU16V; +typedef int16x8_t VecI16V; +typedef uint8x16_t VecU8V; + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define VecU8VArg VecU8V & +#define QuatVArg QuatV & + +// KS - TODO - make an actual VecCrossV type for NEON +#define VecCrossV Vec3V + +typedef VecI32V VecShiftV; +#define VecShiftVArg VecShiftV & + +PX_ALIGN_PREFIX(16) +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); + Vec3V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); + Vec4V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +#endif // PSFOUNDATION_PSUNIXNEONAOS_H diff --git a/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h b/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h new file mode 100644 index 00000000..a4f820ea --- /dev/null +++ b/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h @@ -0,0 +1,3577 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXNEONINLINEAOS_H +#define PSFOUNDATION_PSUNIXNEONINLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// improved estimates +#define VRECIPEQ recipq_newton<1> +#define VRECIPE recip_newton<1> +#define VRECIPSQRTEQ rsqrtq_newton<1> +#define VRECIPSQRTE rsqrt_newton<1> + +// "exact" +#define VRECIPQ recipq_newton<4> +#define VRECIP recip_newton<4> +#define VRECIPSQRTQ rsqrtq_newton<4> +#define VRECIPSQRT rsqrt_newton<4> + +#define VECMATH_AOS_EPSILON (1e-3f) + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +////////////////////////////////////////////////////////////////////// +//Test that Vec3V and FloatV are legal +////////////////////////////////// + +#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f +PX_FORCE_INLINE bool isValidFloatV(const FloatV a) +{ + /* + PX_ALIGN(16, PxF32) data[4]; + vst1_f32(reinterpret_cast<float32_t*>(data), a); + return + PxU32* intData = reinterpret_cast<PxU32*>(data); + return intData[0] == intData[1]; + */ + PX_ALIGN(16, PxF32) data[4]; + vst1_f32(reinterpret_cast<float32_t*>(data), a); + const float32_t x = data[0]; + const float32_t y = data[1]; + + if (PxAbs(x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + { + return true; + } + + if (PxAbs((x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + { + return true; + } + + return false; +} + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + const float32_t w = vgetq_lane_f32(a, 3); + return (0.0f == w); + //const PxU32* intData = reinterpret_cast<const PxU32*>(&w); + //return *intData == 0; +} + +PX_FORCE_INLINE bool isAligned16(const void* a) +{ + return(0 == (size_t(a) & 0x0f)); +} + +#if PX_DEBUG +#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a)) +#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a)) +#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(static_cast<const void*>(a))) +#else +#define ASSERT_ISVALIDVEC3V(a) +#define ASSERT_ISVALIDFLOATV(a) +#define ASSERT_ISALIGNED16(a) +#endif + +namespace internalUnitNeonSimd +{ +PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a) +{ + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32(vget_lane_u32(finalReduce, 0) == 0xffffFFFF); +} + +PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) +{ + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32((vget_lane_u32(finalReduce, 0) & 0xffFFff) == 0xffFFff); +} + +PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a) +{ + const uint16x4_t dHigh = 
vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32(vget_lane_u32(finalReduce, 0) != 0x0); +} + +PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a) +{ + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32((vget_lane_u32(finalReduce, 0) & 0xffFFff) != 0); +} +} + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + PX_ALIGN(16, PxF32) data[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vceq_f32(a, b), 0) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V3AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return V4AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return internalUnitNeonSimd::BAllTrue4_R(vceqq_u32(a, b)) != 0; +} + +PX_FORCE_INLINE PxU32 V4U32AllEq(const VecU32V a, const VecU32V b) +{ + return internalUnitNeonSimd::BAllTrue4_R(V4IsEqU32(a, b)); +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return V4U32AllEq(a, b) != 0; +} + +PX_FORCE_INLINE BoolV V4IsEqI32(const VecI32V a, const VecI32V b) +{ + return vceqq_s32(a, b); +} + +PX_FORCE_INLINE PxU32 V4I32AllEq(const VecI32V a, const VecI32V b) +{ + return internalUnitNeonSimd::BAllTrue4_R(V4IsEqI32(a, b)); +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + return V4I32AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + + const float32x2_t c = vsub_f32(a, b); + const float32x2_t error = vdup_n_f32(VECMATH_AOS_EPSILON); +// absolute compare abs(error) > abs(c) + const uint32x2_t greater = vcagt_f32(error, c); + const uint32x2_t min = vpmin_u32(greater, greater); + return vget_lane_u32(min, 0) != 0x0; +} + +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + const float32x4_t c = vsubq_f32(a, b); + const float32x4_t error = vdupq_n_f32(VECMATH_AOS_EPSILON); +// absolute compare abs(error) > abs(c) + const uint32x4_t greater = vcagtq_f32(error, c); + return internalUnitNeonSimd::BAllTrue3_R(greater) != 0; +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const float32x4_t c = vsubq_f32(a, b); + const float32x4_t error = vdupq_n_f32(VECMATH_AOS_EPSILON); +// absolute compare abs(error) > abs(c) + const uint32x4_t greater = vcagtq_f32(error, c); + return internalUnitNeonSimd::BAllTrue4_R(greater) != 0x0; +} +} + +#if 0 // debugging printfs +#include <stdio.h> +PX_FORCE_INLINE void printVec(const float32x4_t& v, const char* name) +{ + PX_ALIGN(16, float32_t) data[4]; + vst1q_f32(data, v); + printf("%s: (%f, %f, %f, %f)\n", name, data[0], data[1], data[2], data[3]); +} + 
+PX_FORCE_INLINE void printVec(const float32x2_t& v, const char* name) +{ + PX_ALIGN(16, float32_t) data[2]; + vst1_f32(data, v); + printf("%s: (%f, %f)\n", name, data[0], data[1]); +} + +PX_FORCE_INLINE void printVec(const uint32x4_t& v, const char* name) +{ + PX_ALIGN(16, uint32_t) data[4]; + vst1q_u32(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]); +} + +PX_FORCE_INLINE void printVec(const uint16x8_t& v, const char* name) +{ + PX_ALIGN(16, uint16_t) data[8]; + vst1q_u16(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); +} + +PX_FORCE_INLINE void printVec(const int32x4_t& v, const char* name) +{ + PX_ALIGN(16, int32_t) data[4]; + vst1q_s32(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]); +} + +PX_FORCE_INLINE void printVec(const int16x8_t& v, const char* name) +{ + PX_ALIGN(16, int16_t) data[8]; + vst1q_s16(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); +} + +PX_FORCE_INLINE void printVec(const uint16x4_t& v, const char* name) +{ + PX_ALIGN(16, uint16_t) data[4]; + vst1_u16(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]); +} + +PX_FORCE_INLINE void printVec(const uint32x2_t& v, const char* name) +{ + PX_ALIGN(16, uint32_t) data[2]; + vst1_u32(data, v); + printf("%s: (0x%x, 0x%x)\n", name, data[0], data[1]); +} + +PX_FORCE_INLINE void printVar(const PxU32 v, const char* name) +{ + printf("%s: 0x%x\n", name, v); +} + +PX_FORCE_INLINE void printVar(const PxF32 v, const char* name) +{ + printf("%s: %f\n", name, v); +} + +#define PRINT_VAR(X) printVar((X), #X) +#define PRINT_VEC(X) printVec((X), #X) +#define PRINT_VEC_TITLE(TITLE, X) printVec((X), TITLE #X) +#endif // debugging printf + +///////////////////////////////////////////////////////////////////// +////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + PX_ALIGN(16, PxF32) data[4]; + vst1_f32(reinterpret_cast<float32_t*>(data), a); + return PxIsFinite(data[0]) && PxIsFinite(data[1]); +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + PX_ALIGN(16, PxF32) data[4]; + vst1q_f32(reinterpret_cast<float32_t*>(data), a); + return PxIsFinite(data[0]) && PxIsFinite(data[1]) && PxIsFinite(data[2]); +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + PX_ALIGN(16, PxF32) data[4]; + vst1q_f32(reinterpret_cast<float32_t*>(data), a); + return PxIsFinite(data[0]) && PxIsFinite(data[1]) && PxIsFinite(data[2]) && PxIsFinite(data[3]); +} + +PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return vget_lane_u32(vreinterpret_u32_f32(a), 0) == 0; +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + const uint32x2_t dLow = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t dMin = vpmin_u32(dLow, dLow); + + return vget_lane_u32(dMin, 0) == 0 || vgetq_lane_u32(vreinterpretq_u32_f32(a), 2) == 0; +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + const uint32x2_t dHigh = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t dLow = vget_low_u32(vreinterpretq_u32_f32(a)); + + const uint32x2_t dMin = vmin_u32(dHigh, dLow); + const uint32x2_t pairMin = 
vpmin_u32(dMin, dMin); + return vget_lane_u32(pairMin, 0) == 0; +} + +///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return vdup_n_f32(reinterpret_cast<const float32_t&>(f)); +} + +PX_FORCE_INLINE FloatV FLoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return vld1_f32(reinterpret_cast<const float32_t*>(f)); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + PX_ALIGN(16, PxF32) data[4] = { f, f, f, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return vdupq_n_f32(reinterpret_cast<const float32_t&>(f)); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ + const PxU32 i = static_cast<PxU32>(-(static_cast<PxI32>(f))); + return vdupq_n_u32(i); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* f) +{ + ASSERT_ISALIGNED16(f); + PX_ALIGN(16, PxF32) data[4] = { f[0], f[1], f[2], 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* f) +{ + PX_ALIGN(16, PxF32) data[4] = { f[0], f[1], f[2], 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) +{ + return vsetq_lane_f32(0.0f, v, 3); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(Vec4V v) +{ + return v; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + return f; // ok if it is implemented as the same type. 
+} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return vcombine_f32(f, f); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + return Vec3V_From_Vec4V(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + return Vec3V_From_Vec4V_WUndefined(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + V3StoreU(m.col0, out.column0); + V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return vld1q_f32(reinterpret_cast<const float32_t*>(f)); +} + +PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f) +{ + ASSERT_ISALIGNED16(f); + vst1q_f32(reinterpret_cast<float32_t*>(f), a); +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + PX_ALIGN(16, PxF32) f2[4]; + vst1q_f32(reinterpret_cast<float32_t*>(f2), a); + f[0] = f2[0]; + f[1] = f2[1]; + f[2] = f2[2]; + f[3] = f2[3]; +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + vst1q_u32(reinterpret_cast<uint32_t*>(u), a); +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + vst1q_u32(reinterpret_cast<uint32_t*>(u), uv); +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + ASSERT_ISALIGNED16(i); + vst1q_s32(reinterpret_cast<int32_t*>(i), iv); +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return vld1q_f32(reinterpret_cast<const float32_t*>(f)); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + const PX_ALIGN(16, PxU32) b[4] = { static_cast<PxU32>(-static_cast<PxI32>(f[0])), + static_cast<PxU32>(-static_cast<PxI32>(f[1])), + static_cast<PxU32>(-static_cast<PxI32>(f[2])), + static_cast<PxU32>(-static_cast<PxI32>(f[3])) }; + return vld1q_u32(b); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + ASSERT_ISVALIDFLOATV(a); + // vst1q_lane_f32(f, a, 0); // causes vst1 alignment bug + *f = vget_lane_f32(a, 0); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV a, PxU32* PX_RESTRICT f) +{ + *f = vget_lane_u32(vget_low_u32(a), 0); +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) f2[4]; + vst1q_f32(reinterpret_cast<float32_t*>(f2), a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + PX_ALIGN(16, PxF32) f2[4]; + vst1q_f32(reinterpret_cast<float32_t*>(f2), a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +////////////////////////////////// +// FLOATV +////////////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return FLoad(0.0f); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV IZero() +{ + return vreinterpret_f32_u32(vdup_n_u32(0)); +} + +PX_FORCE_INLINE FloatV IOne() +{ + return 
vreinterpret_f32_u32(vdup_n_u32(1)); +} + +PX_FORCE_INLINE FloatV ITwo() +{ + return vreinterpret_f32_u32(vdup_n_u32(2)); +} + +PX_FORCE_INLINE FloatV IThree() +{ + return vreinterpret_f32_u32(vdup_n_u32(3)); +} + +PX_FORCE_INLINE FloatV IFour() +{ + return vreinterpret_f32_u32(vdup_n_u32(4)); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return vneg_f32(f); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vadd_f32(a, b); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vsub_f32(a, b); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vmul_f32(a, b); +} + +template <int n> +PX_FORCE_INLINE float32x2_t recip_newton(const float32x2_t& in) +{ + float32x2_t recip = vrecpe_f32(in); + for(int i = 0; i < n; ++i) + recip = vmul_f32(recip, vrecps_f32(in, recip)); + return recip; +} + +template <int n> +PX_FORCE_INLINE float32x4_t recipq_newton(const float32x4_t& in) +{ + float32x4_t recip = vrecpeq_f32(in); + for(int i = 0; i < n; ++i) + recip = vmulq_f32(recip, vrecpsq_f32(recip, in)); + return recip; +} + +template <int n> +PX_FORCE_INLINE float32x2_t rsqrt_newton(const float32x2_t& in) +{ + float32x2_t rsqrt = vrsqrte_f32(in); + for(int i = 0; i < n; ++i) + rsqrt = vmul_f32(rsqrt, vrsqrts_f32(vmul_f32(rsqrt, rsqrt), in)); + return rsqrt; +} + +template <int n> +PX_FORCE_INLINE float32x4_t rsqrtq_newton(const float32x4_t& in) +{ + float32x4_t rsqrt = vrsqrteq_f32(in); + for(int i = 0; i < n; ++i) + rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(rsqrt, rsqrt), in)); + return rsqrt; +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vmul_f32(a, VRECIP(b)); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vmul_f32(a, VRECIPE(b)); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIP(a); +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIPE(a); +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIPSQRT(a); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return FSel(FIsEq(a, FZero()), a, vmul_f32(a, VRECIPSQRT(a))); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIPSQRTE(a); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return vmla_f32(c, a, b); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return vmls_f32(c, a, b); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return vabs_f32(a); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + PX_ASSERT( _VecMathTests::allElementsEqualBoolV(c, BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c, BFFFF())); + ASSERT_ISVALIDFLOATV(vbsl_f32(vget_low_u32(c), a, b)); + return vbsl_f32(vget_low_u32(c), a, b); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + 
ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vdupq_lane_u32(vcgt_f32(a, b), 0); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vdupq_lane_u32(vcge_f32(a, b), 0); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vdupq_lane_u32(vceq_f32(a, b), 0); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + //ASSERT_ISVALIDFLOATV(a); + //ASSERT_ISVALIDFLOATV(b); + return vmax_f32(a, b); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + //ASSERT_ISVALIDFLOATV(a); + //ASSERT_ISVALIDFLOATV(b); + return vmin_f32(a, b); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + ASSERT_ISVALIDFLOATV(minV); + ASSERT_ISVALIDFLOATV(maxV); + return vmax_f32(vmin_f32(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vcgt_f32(a, b), 0); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vcge_f32(a, b), 0); +} + +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vceq_f32(a, b), 0); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // truncate(a + (0.5f - sign(a))) + const float32x2_t half = vdup_n_f32(0.5f); + const float32x2_t sign = vcvt_f32_u32((vshr_n_u32(vreinterpret_u32_f32(a), 31))); + const float32x2_t aPlusHalf = vadd_f32(a, half); + const float32x2_t aRound = vsub_f32(aPlusHalf, sign); + int32x2_t tmp = vcvt_s32_f32(aRound); + return vcvt_f32_s32(tmp); +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = FLoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = FLoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V3 = FMul(V2, V1); + const FloatV V5 = FMul(V3, V2); + const FloatV V7 = FMul(V5, V2); + const FloatV V9 = FMul(V7, V2); + const FloatV V11 = FMul(V9, V2); + const FloatV V13 = FMul(V11, V2); + const FloatV V15 = FMul(V13, V2); + const FloatV V17 = FMul(V15, V2); + const FloatV V19 = FMul(V17, V2); + const FloatV V21 = FMul(V19, V2); + const FloatV V23 = FMul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + FloatV Result; + Result = FScaleAdd(S1, V3, V1); + Result = FScaleAdd(S2, V5, Result); + Result = FScaleAdd(S3, V7, Result); + Result = FScaleAdd(S4, V9, Result); + Result = FScaleAdd(S5, V11, Result); + Result = FScaleAdd(S6, V13, Result); + Result = FScaleAdd(S7, V15, Result); + Result = FScaleAdd(S8, V17, Result); + Result = FScaleAdd(S9, V19, Result); + Result = FScaleAdd(S10, V21, Result); + Result = FScaleAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = FLoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = FLoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V4 = FMul(V2, V2); + const FloatV V6 = FMul(V4, V2); + const FloatV V8 = FMul(V4, V4); + const FloatV V10 = FMul(V6, V4); + const FloatV V12 = FMul(V6, V6); + const FloatV V14 = FMul(V8, V6); + const FloatV V16 = FMul(V8, V8); + const FloatV V18 = FMul(V10, V8); + const FloatV V20 = FMul(V10, V10); + const FloatV V22 = FMul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + FloatV Result; + Result = FScaleAdd(C1, V2, FOne()); + Result = FScaleAdd(C2, V4, Result); + Result = FScaleAdd(C3, V6, Result); + Result = FScaleAdd(C4, V8, Result); + Result = FScaleAdd(C5, V10, Result); + Result = FScaleAdd(C6, V12, Result); + Result = FScaleAdd(C7, V14, Result); + Result = FScaleAdd(C8, V16, Result); + Result = FScaleAdd(C9, V18, Result); + Result = FScaleAdd(C10, V20, Result); + Result = FScaleAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + + const BoolV c = BOr(FIsGrtr(a, max), FIsGrtr(min, a)); + return PxU32(!BAllEqFFFF(c)); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + + const BoolV c = BAnd(FIsGrtrOrEq(a, min), FIsGrtrOrEq(max, a)); + return PxU32(BAllEqTTTT(c)); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + const uint32x2_t greater = vcagt_f32(a, bounds); + return vget_lane_u32(greater, 0); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + const uint32x2_t geq = vcage_f32(bounds, a); + return vget_lane_u32(geq, 0); +} + +////////////////////////////////// +// VEC3V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t uHigh = vreinterpret_u32_f32(f); + const float32x2_t dHigh = vreinterpret_f32_u32(vand_u32(uHigh, mask)); + + return vcombine_f32(f, dHigh); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t dHigh = vand_u32(vreinterpret_u32_f32(z), mask); + const uint32x2_t dLow = vext_u32(vreinterpret_u32_f32(x), vreinterpret_u32_f32(y), 1); + return vreinterpretq_f32_u32(vcombine_u32(dLow, dHigh)); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + const float32x4_t x = { 1.0f, 0.0f, 0.0f, 0.0f }; + return x; +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + const 
float32x4_t y = { 0, 1.0f, 0, 0 }; + return y; +} + +PX_FORCE_INLINE Vec3V V3UnitZ() +{ + const float32x4_t z = { 0, 0, 1.0f, 0 }; + return z; +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 0); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 1); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x2_t fhigh = vget_high_f32(f); + return vdup_lane_f32(fhigh, 0); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + + const float32x2_t aLow = vget_low_f32(a); + const float32x2_t bLow = vget_low_f32(b); + const float32x2_t cLow = vget_low_f32(c); + const float32x2_t zero = vdup_n_f32(0.0f); + + const float32x2x2_t zipL = vzip_f32(aLow, bLow); + const float32x2x2_t zipH = vzip_f32(cLow, zero); + + return vcombine_f32(zipL.val[0], zipH.val[0]); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + + const float32x2_t aLow = vget_low_f32(a); + const float32x2_t bLow = vget_low_f32(b); + const float32x2_t cLow = vget_low_f32(c); + const float32x2_t zero = vdup_n_f32(0.0f); + + const float32x2x2_t zipL = vzip_f32(aLow, bLow); + const float32x2x2_t zipH = vzip_f32(cLow, zero); + + return vcombine_f32(zipL.val[1], zipH.val[1]); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + + const float32x2_t aHi = vget_high_f32(a); + const float32x2_t bHi = vget_high_f32(b); + const float32x2_t cHi = vget_high_f32(c); + + const float32x2x2_t zipL = vzip_f32(aHi, bHi); + + return vcombine_f32(zipL.val[0], cHi); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return vdupq_n_f32(0.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x4_t tmp = vnegq_f32(f); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vaddq_f32(a, b); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return vaddq_f32(a, Vec3V_From_FloatV(b)); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vsubq_f32(a, b); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return vsubq_f32(a, Vec3V_From_FloatV(b)); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ 
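+ // scale all four lanes by lane 0 of b, then re-zero the W lane so that the
+ // Vec3V invariant (W == 0) holds for the result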
+ ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + const float32x4_t tmp = vmulq_lane_f32(a, b, 0); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vmulq_f32(a, b); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIP(b); + const float32x4_t tmp = vmulq_lane_f32(a, invB, 0); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + float32x4_t invB = VRECIPQ(b); + invB = vsetq_lane_f32(0.0f, invB, 3); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIPE(b); + const float32x4_t tmp = vmulq_lane_f32(a, invB, 0); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + float32x4_t invB = VRECIPEQ(b); + invB = vsetq_lane_f32(0.0f, invB, 3); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t recipA = VRECIPQ(a); + return vsetq_lane_f32(0.0f, recipA, 3); +} + +PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t recipA = VRECIPEQ(a); + return vsetq_lane_f32(0.0f, recipA, 3); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t rSqrA = VRECIPSQRTQ(a); + return vsetq_lane_f32(0.0f, rSqrA, 3); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t rSqrA = VRECIPSQRTEQ(a); + return vsetq_lane_f32(0.0f, rSqrA, 3); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + + float32x4_t tmp = vmlaq_lane_f32(c, a, b, 0); + // using vsetq_lane_f32 resulted in failures, + // probably related to a compiler bug on + // ndk r9d-win32, gcc 4.8, cardhu/shield + + // code with issue + // return vsetq_lane_f32(0.0f, tmp, 3); + + // workaround + float32x2_t w_z = vget_high_f32(tmp); + float32x2_t y_x = vget_low_f32(tmp); + w_z = vset_lane_f32(0.0f, w_z, 1); + return vcombine_f32(y_x, w_z); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + + float32x4_t tmp = vmlsq_lane_f32(c, a, b, 0); + // using vsetq_lane_f32 resulted in failures, + // probably related to a compiler bug on + // ndk r9d-win32, gcc 4.8, cardhu/shield + + // code with issue + // return vsetq_lane_f32(0.0f, tmp, 3); + + // workaround + float32x2_t w_z = vget_high_f32(tmp); + float32x2_t y_x = vget_low_f32(tmp); + w_z = vset_lane_f32(0.0f, w_z, 1); + return vcombine_f32(y_x, w_z); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return vmlaq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return vmlsq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); 
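+ // |0| == 0, so vabsq preserves the zeroed W lane and no explicit clear is needed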
+ return vabsq_f32(a); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + + // const uint32x2_t mask = {0xffffFFFF, 0x0}; + const float32x4_t tmp = vmulq_f32(a, b); + + const float32x2_t low = vget_low_f32(tmp); + const float32x2_t high = vget_high_f32(tmp); + // const float32x2_t high = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(high_), mask)); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y} + const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z} + + return sum0ZYX; +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + + const uint32x2_t TF = { 0xffffFFFF, 0x0 }; + const float32x2_t ay_ax = vget_low_f32(a); // d2 + const float32x2_t aw_az = vget_high_f32(a); // d3 + const float32x2_t by_bx = vget_low_f32(b); // d4 + const float32x2_t bw_bz = vget_high_f32(b); // d5 + // Hi, Lo + const float32x2_t bz_by = vext_f32(by_bx, bw_bz, 1); // bz, by + const float32x2_t az_ay = vext_f32(ay_ax, aw_az, 1); // az, ay + + const float32x2_t azbx = vmul_f32(aw_az, by_bx); // 0, az*bx + const float32x2_t aybz_axby = vmul_f32(ay_ax, bz_by); // ay*bz, ax*by + + const float32x2_t azbxSUBaxbz = vmls_f32(azbx, bw_bz, ay_ax); // 0, az*bx-ax*bz + const float32x2_t aybzSUBazby_axbySUBaybx = vmls_f32(aybz_axby, by_bx, az_ay); // ay*bz-az*by, ax*by-ay*bx + + const float32x2_t retLow = vext_f32(aybzSUBazby_axbySUBaybx, azbxSUBaxbz, 1); // az*bx-ax*bz, ay*bz-az*by + const uint32x2_t retHigh = vand_u32(TF, vreinterpret_u32_f32(aybzSUBazby_axbySUBaybx)); // 0, ax*by-ay*bx + + return vcombine_f32(retLow, vreinterpret_f32_u32(retHigh)); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return a; +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // const uint32x2_t mask = {0xffffFFFF, 0x0}; + + const float32x4_t tmp = vmulq_f32(a, a); + const float32x2_t low = vget_low_f32(tmp); + const float32x2_t high = vget_high_f32(tmp); + // const float32x2_t high = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(high_), mask)); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y} + const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z} + + return FSqrt(sum0ZYX); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Dot(a, a); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero())); + return V3ScaleInv(a, V3Length(a)); +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero())); + return V3Scale(a, VRECIPSQRTE(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + ASSERT_ISVALIDVEC3V(a); + const FloatV zero = vdup_n_f32(0.0f); + const FloatV length = V3Length(a); + const uint32x4_t isGreaterThanZero = FIsGrtr(length, zero); + return V3Sel(isGreaterThanZero, V3ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V( vbslq_f32(c, a, b)); + return vbslq_f32(c, a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vcgtq_f32(a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + 
ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vcgeq_f32(a, b);
+}
+
+PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vceqq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vmaxq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vminq_f32(a, b);
+}
+
+PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t zz = vdup_lane_f32(high, 0);
+ const float32x2_t max0 = vpmax_f32(zz, low);
+ const float32x2_t max1 = vpmax_f32(max0, max0);
+
+ return max1;
+}
+
+PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t zz = vdup_lane_f32(high, 0);
+ const float32x2_t min0 = vpmin_f32(zz, low);
+ const float32x2_t min1 = vpmin_f32(min0, min0);
+
+ return min1;
+}
+
+// return (a >= 0.0f) ? 1.0f : -1.0f;
+PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ const Vec3V zero = V3Zero();
+ const Vec3V one = V3One();
+ const Vec3V none = V3Neg(one);
+ return V3Sel(V3IsGrtrOrEq(a, zero), one, none);
+}
+
+PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(minV);
+ ASSERT_ISVALIDVEC3V(maxV);
+ return V3Max(V3Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsEq(a, b));
+}
+
+PX_FORCE_INLINE Vec3V V3Round(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ // round to nearest: truncate(a + (0.5f - sign(a))), where sign(a) is 1 for negative a and 0 otherwise
+ const Vec3V half = V3Load(0.5f);
+ const float32x4_t sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31)));
+ const Vec3V aPlusHalf = V3Add(a, half);
+ const Vec3V aRound = V3Sub(aPlusHalf, sign);
+ return vcvtq_f32_s32(vcvtq_s32_f32(aRound));
+}
+
+PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ // Map each angle into [-PI, PI] by subtracting the nearest multiple of 2*PI
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec3V tmp = V4Mul(a, recipTwoPi);
+ const Vec3V b = V3Round(tmp);
+ const Vec3V V1 = V4NegMulSub(twoPi, b, a);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+ // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23!
(for -PI <= V < PI)
+ const Vec3V V2 = V3Mul(V1, V1);
+ const Vec3V V3 = V3Mul(V2, V1);
+ const Vec3V V5 = V3Mul(V3, V2);
+ const Vec3V V7 = V3Mul(V5, V2);
+ const Vec3V V9 = V3Mul(V7, V2);
+ const Vec3V V11 = V3Mul(V9, V2);
+ const Vec3V V13 = V3Mul(V11, V2);
+ const Vec3V V15 = V3Mul(V13, V2);
+ const Vec3V V17 = V3Mul(V15, V2);
+ const Vec3V V19 = V3Mul(V17, V2);
+ const Vec3V V21 = V3Mul(V19, V2);
+ const Vec3V V23 = V3Mul(V21, V2);
+
+ const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f);
+ const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f);
+ const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f);
+
+ const FloatV S1 = V4GetY(sinCoefficients0);
+ const FloatV S2 = V4GetZ(sinCoefficients0);
+ const FloatV S3 = V4GetW(sinCoefficients0);
+ const FloatV S4 = V4GetX(sinCoefficients1);
+ const FloatV S5 = V4GetY(sinCoefficients1);
+ const FloatV S6 = V4GetZ(sinCoefficients1);
+ const FloatV S7 = V4GetW(sinCoefficients1);
+ const FloatV S8 = V4GetX(sinCoefficients2);
+ const FloatV S9 = V4GetY(sinCoefficients2);
+ const FloatV S10 = V4GetZ(sinCoefficients2);
+ const FloatV S11 = V4GetW(sinCoefficients2);
+
+ Vec3V Result;
+ Result = V4ScaleAdd(V3, S1, V1);
+ Result = V4ScaleAdd(V5, S2, Result);
+ Result = V4ScaleAdd(V7, S3, Result);
+ Result = V4ScaleAdd(V9, S4, Result);
+ Result = V4ScaleAdd(V11, S5, Result);
+ Result = V4ScaleAdd(V13, S6, Result);
+ Result = V4ScaleAdd(V15, S7, Result);
+ Result = V4ScaleAdd(V17, S8, Result);
+ Result = V4ScaleAdd(V19, S9, Result);
+ Result = V4ScaleAdd(V21, S10, Result);
+ Result = V4ScaleAdd(V23, S11, Result);
+
+ return Result;
+}
+
+PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ // Map each angle into [-PI, PI] by subtracting the nearest multiple of 2*PI
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec3V tmp = V4Mul(a, recipTwoPi);
+ const Vec3V b = V3Round(tmp);
+ const Vec3V V1 = V4NegMulSub(twoPi, b, a);
+
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+ // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22!
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V4 = V3Mul(V2, V2); + const Vec3V V6 = V3Mul(V4, V2); + const Vec3V V8 = V3Mul(V4, V4); + const Vec3V V10 = V3Mul(V6, V4); + const Vec3V V12 = V3Mul(V6, V6); + const Vec3V V14 = V3Mul(V8, V6); + const Vec3V V16 = V3Mul(V8, V8); + const Vec3V V18 = V3Mul(V10, V8); + const Vec3V V20 = V3Mul(V10, V10); + const Vec3V V22 = V3Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec3V Result; + Result = V4ScaleAdd(V2, C1, V4One()); + Result = V4ScaleAdd(V4, C2, Result); + Result = V4ScaleAdd(V6, C3, Result); + Result = V4ScaleAdd(V8, C4, Result); + Result = V4ScaleAdd(V10, C5, Result); + Result = V4ScaleAdd(V12, C6, Result); + Result = V4ScaleAdd(V14, C7, Result); + Result = V4ScaleAdd(V16, C8, Result); + Result = V4ScaleAdd(V18, C9, Result); + Result = V4ScaleAdd(V20, C10, Result); + Result = V4ScaleAdd(V22, C11, Result); + + return V4ClearW(Result); +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2_t yz = vext_f32(xy, zw, 1); + return vcombine_f32(yz, zw); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t xw = vand_u32(xy, mask); + return vreinterpretq_f32_u32(vcombine_u32(xy, xw)); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t yz = vext_u32(xy, zw, 1); + const uint32x2_t xw = vand_u32(xy, mask); + return vreinterpretq_f32_u32(vcombine_u32(yz, xw)); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t wz = vrev64_u32(zw); + + const uint32x2_t zx = vext_u32(wz, xy, 1); + const uint32x2_t yw = vext_u32(xy, wz, 1); + + return vreinterpretq_f32_u32(vcombine_u32(zx, yw)); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + + const uint32x2_t wz = vrev64_u32(zw); + const uint32x2_t yw = vext_u32(xy, wz, 1); + const uint32x2_t zz = vdup_lane_u32(wz, 1); + + return vreinterpretq_f32_u32(vcombine_u32(zz, yw)); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const 
uint32x2_t yx = vrev64_u32(xy); + const uint32x2_t xw = vand_u32(xy, mask); + return vreinterpretq_f32_u32(vcombine_u32(yx, xw)); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(v0)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(v1)); + const uint32x2_t wz = vrev64_u32(zw); + const uint32x2_t yw = vext_u32(xy, wz, 1); + + return vreinterpretq_f32_u32(vcombine_u32(wz, yw)); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(v0)); + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(v1)); + const uint32x2_t xw = vand_u32(xy, mask); + + return vreinterpretq_f32_u32(vcombine_u32(zw, xw)); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + + const uint32x2_t axy = vget_low_u32(vreinterpretq_u32_f32(v0)); + const uint32x2_t bxy = vget_low_u32(vreinterpretq_u32_f32(v1)); + const uint32x2_t byax = vext_u32(bxy, axy, 1); + const uint32x2_t ww = vdup_n_u32(0); + + return vreinterpretq_f32_u32(vcombine_u32(byax, ww)); +} + +PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // const uint32x2_t mask = {0xffffFFFF, 0x0}; + + const float32x2_t low = vget_low_f32(a); + const float32x2_t high = vget_high_f32(a); + // const float32x2_t high = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(high_), mask)); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y} + const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z} + + return sum0ZYX; +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(min); + ASSERT_ISVALIDVEC3V(max); + + const BoolV c = BOr(V3IsGrtr(a, max), V3IsGrtr(min, a)); + return internalUnitNeonSimd::BAnyTrue3_R(c); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(min); + ASSERT_ISVALIDVEC3V(max); + + const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a)); + return internalUnitNeonSimd::BAllTrue4_R(c); +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(bounds); + + const BoolV greater = V3IsGrtr(V3Abs(a), bounds); + return internalUnitNeonSimd::BAnyTrue3_R(greater); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(bounds); + + const BoolV greaterOrEq = V3IsGrtrOrEq(bounds, V3Abs(a)); + return internalUnitNeonSimd::BAllTrue4_R(greaterOrEq); +} + +PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2) +{ + ASSERT_ISVALIDVEC3V(col0); + ASSERT_ISVALIDVEC3V(col1); + ASSERT_ISVALIDVEC3V(col2); + + Vec3V col3 = V3Zero(); + const float32x4x2_t v0v1 = vzipq_f32(col0, col2); + const float32x4x2_t v2v3 = vzipq_f32(col1, col3); + const float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]); + const float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]); + col0 = zip0.val[0]; + col1 = zip0.val[1]; + col2 = zip1.val[0]; + // col3 = zip1.val[1]; +} + +////////////////////////////////// +// VEC4V +////////////////////////////////// + 
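+// Unlike the Vec3V operations above, the Vec4V operations use all four lanes,
+// so they need neither the W-lane clears nor the ASSERT_ISVALIDVEC3V checks.
+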
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return vcombine_f32(f, f); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray) +{ + ASSERT_ISVALIDFLOATV(floatVArray[0]); + ASSERT_ISVALIDFLOATV(floatVArray[1]); + ASSERT_ISVALIDFLOATV(floatVArray[2]); + ASSERT_ISVALIDFLOATV(floatVArray[3]); + + const uint32x2_t xLow = vreinterpret_u32_f32(floatVArray[0]); + const uint32x2_t yLow = vreinterpret_u32_f32(floatVArray[1]); + const uint32x2_t zLow = vreinterpret_u32_f32(floatVArray[2]); + const uint32x2_t wLow = vreinterpret_u32_f32(floatVArray[3]); + + const uint32x2_t dLow = vext_u32(xLow, yLow, 1); + const uint32x2_t dHigh = vext_u32(zLow, wLow, 1); + + return vreinterpretq_f32_u32(vcombine_u32(dLow, dHigh)); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + ASSERT_ISVALIDFLOATV(w); + + const uint32x2_t xLow = vreinterpret_u32_f32(x); + const uint32x2_t yLow = vreinterpret_u32_f32(y); + const uint32x2_t zLow = vreinterpret_u32_f32(z); + const uint32x2_t wLow = vreinterpret_u32_f32(w); + + const uint32x2_t dLow = vext_u32(xLow, yLow, 1); + const uint32x2_t dHigh = vext_u32(zLow, wLow, 1); + + return vreinterpretq_f32_u32(vcombine_u32(dLow, dHigh)); +} + +PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_high_f32(x); + const float32x2_t yy = vget_high_f32(y); + const float32x2_t zz = vget_high_f32(z); + const float32x2_t ww = vget_high_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[1], zipH.val[1]); +} + +PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_high_f32(x); + const float32x2_t yy = vget_high_f32(y); + const float32x2_t zz = vget_high_f32(z); + const float32x2_t ww = vget_high_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[0], zipH.val[0]); +} + +PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_low_f32(x); + const float32x2_t yy = vget_low_f32(y); + const float32x2_t zz = vget_low_f32(z); + const float32x2_t ww = vget_low_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[1], zipH.val[1]); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_low_f32(x); + const float32x2_t yy = vget_low_f32(y); + const float32x2_t zz = vget_low_f32(z); + const float32x2_t ww = vget_low_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[0], zipH.val[0]); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return vzipq_f32(a, b).val[0]; +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return vzipq_f32(a, b).val[1]; +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t zo = vext_f32(zeros, ones, 1); + return vcombine_f32(zeros, zo); +} + +PX_FORCE_INLINE 
Vec4V V4UnitX() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t oz = vext_f32(ones, zeros, 1); + return vcombine_f32(oz, zeros); +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t zo = vext_f32(zeros, ones, 1); + return vcombine_f32(zo, zeros); +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t oz = vext_f32(ones, zeros, 1); + return vcombine_f32(zeros, oz); +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + const float32x2_t fhigh = vget_high_f32(f); + return vdup_lane_f32(fhigh, 1); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 0); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 1); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + const float32x2_t fhigh = vget_high_f32(f); + return vdup_lane_f32(fhigh, 0); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTTF(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return V4Sel(BTTTF(), v, V4Zero()); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a) +{ + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2_t yx = vext_f32(xy, xy, 1); + const float32x2_t wz = vext_f32(zw, zw, 1); + return vcombine_f32(yx, wz); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a) +{ + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2x2_t xzyw = vzip_f32(xy, zw); + return vcombine_f32(xzyw.val[0], xzyw.val[0]); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a) +{ + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2x2_t xzyw = vzip_f32(xy, zw); + return vcombine_f32(xzyw.val[1], xzyw.val[1]); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t yz = vext_u32(xy, zw, 1); + const uint32x2_t xw = vrev64_u32(vext_u32(zw, xy, 1)); + return vreinterpretq_f32_u32(vcombine_u32(yz, xw)); +} + +template <PxU8 E0, PxU8 E1, PxU8 E2, PxU8 E3> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V V) +{ + static const uint32_t ControlElement[4] = + { +#if 1 + 0x03020100, // XM_SWIZZLE_X + 0x07060504, // XM_SWIZZLE_Y + 0x0B0A0908, // XM_SWIZZLE_Z + 0x0F0E0D0C, // XM_SWIZZLE_W +#else + 0x00010203, // XM_SWIZZLE_X + 0x04050607, // XM_SWIZZLE_Y + 0x08090A0B, // XM_SWIZZLE_Z + 0x0C0D0E0F, // XM_SWIZZLE_W +#endif + }; + + uint8x8x2_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V)); + + uint8x8_t idx = + 
vcreate_u8(static_cast<uint64_t>(ControlElement[E0]) | (static_cast<uint64_t>(ControlElement[E1]) << 32)); + const uint8x8_t rL = vtbl2_u8(tbl, idx); + idx = vcreate_u8(static_cast<uint64_t>(ControlElement[E2]) | (static_cast<uint64_t>(ControlElement[E3]) << 32)); + const uint8x8_t rH = vtbl2_u8(tbl, idx); + return vreinterpretq_f32_u8(vcombine_u8(rL, rH)); +} + +// PT: this seems measurably slower than the hardcoded version +/*PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + return V4Perm<1, 2, 0, 3>(a); +}*/ + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return vreinterpretq_f32_u32(vmovq_n_u32(0)); + // return vmovq_n_f32(0.0f); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return vmovq_n_f32(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + // return vmovq_n_f32(PX_EPS_REAL); + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f) +{ + return vnegq_f32(f); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return vaddq_f32(a, b); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return vsubq_f32(a, b); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return vmulq_lane_f32(a, b, 0); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b) +{ + return vmulq_f32(a, b); +} + +PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIP(b); + return vmulq_lane_f32(a, invB, 0); +} + +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b) +{ + const float32x4_t invB = VRECIPQ(b); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIPE(b); + return vmulq_lane_f32(a, invB, 0); +} + +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b) +{ + const float32x4_t invB = VRECIPEQ(b); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a) +{ + return VRECIPQ(a); +} + +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a) +{ + return VRECIPEQ(a); +} + +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a) +{ + return VRECIPSQRTQ(a); +} + +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a) +{ + return VRECIPSQRTEQ(a); +} + +PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a) +{ + return V4Sel(V4IsEq(a, V4Zero()), a, V4Mul(a, VRECIPSQRTQ(a))); +} + +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return vmlaq_lane_f32(c, a, b, 0); +} + +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return vmlsq_lane_f32(c, a, b, 0); +} + +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return vmlaq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return vmlsq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a) +{ + return vabsq_f32(a); +} + +PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a) +{ + const Vec4V xy = V4UnpackXY(a, a); // x,x,y,y + const Vec4V zw = V4UnpackZW(a, a); // z,z,w,w + const Vec4V xz_yw = V4Add(xy, zw); // x+z,x+z,y+w,y+w + const FloatV xz = V4GetX(xz_yw); // x+z + const FloatV yw = V4GetZ(xz_yw); // y+w + return FAdd(xz, yw); // sum +} + +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b) +{ + const float32x4_t tmp = vmulq_f32(a, b); + const float32x2_t low = vget_low_f32(tmp); + const float32x2_t high = vget_high_f32(tmp); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {z+w, x+y} + const 
float32x2_t sumWZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z+w, x+y+z+w}
+ return sumWZYX;
+}
+
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V aa, const Vec4V bb)
+{
+ // PT: the V3Dot code relies on the fact that W=0 so we can't reuse it as-is, we need to clear W first.
+ // TODO: find a better implementation that does not need to clear W.
+ const Vec4V a = V4ClearW(aa);
+ const Vec4V b = V4ClearW(bb);
+
+ const float32x4_t tmp = vmulq_f32(a, b);
+ const float32x2_t low = vget_low_f32(tmp);
+ const float32x2_t high = vget_high_f32(tmp);
+
+ const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y}
+ const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z}
+ return sum0ZYX;
+}
+
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+ const uint32x2_t TF = { 0xffffFFFF, 0x0 };
+ const float32x2_t ay_ax = vget_low_f32(a); // d2
+ const float32x2_t aw_az = vget_high_f32(a); // d3
+ const float32x2_t by_bx = vget_low_f32(b); // d4
+ const float32x2_t bw_bz = vget_high_f32(b); // d5
+ // Hi, Lo
+ const float32x2_t bz_by = vext_f32(by_bx, bw_bz, 1); // bz, by
+ const float32x2_t az_ay = vext_f32(ay_ax, aw_az, 1); // az, ay
+
+ const float32x2_t azbx = vmul_f32(aw_az, by_bx); // 0, az*bx
+ const float32x2_t aybz_axby = vmul_f32(ay_ax, bz_by); // ay*bz, ax*by
+
+ const float32x2_t azbxSUBaxbz = vmls_f32(azbx, bw_bz, ay_ax); // 0, az*bx-ax*bz
+ const float32x2_t aybzSUBazby_axbySUBaybx = vmls_f32(aybz_axby, by_bx, az_ay); // ay*bz-az*by, ax*by-ay*bx
+
+ const float32x2_t retLow = vext_f32(aybzSUBazby_axbySUBaybx, azbxSUBaxbz, 1); // az*bx-ax*bz, ay*bz-az*by
+ const uint32x2_t retHigh = vand_u32(TF, vreinterpret_u32_f32(aybzSUBazby_axbySUBaybx)); // 0, ax*by-ay*bx
+
+ return vcombine_f32(retLow, vreinterpret_f32_u32(retHigh));
+}
+
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+ const float32x4_t tmp = vmulq_f32(a, a);
+ const float32x2_t low = vget_low_f32(tmp);
+ const float32x2_t high = vget_high_f32(tmp);
+
+ const float32x2_t sumTmp = vpadd_f32(low, high); // = {z+w, x+y}
+ const float32x2_t sumWZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z+w, x+y+z+w}
+ return FSqrt(sumWZYX);
+}
+
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+ return V4Dot(a, a);
+}
+
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+ //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero()));
+ return V4ScaleInv(a, V4Length(a));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+ //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero()));
+ return V4Scale(a, FRsqrtFast(V4Dot(a, a)));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue)
+{
+ const FloatV zero = FZero();
+ const FloatV length = V4Length(a);
+ const uint32x4_t isGreaterThanZero = FIsGrtr(length, zero);
+ return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue);
+}
+
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+ return vceqq_u32(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+ return vbslq_f32(c, a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+ return vcgtq_f32(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return vcgeq_f32(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+ return vceqq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+ return vmaxq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+ return vminq_f32(a, b);
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMax(const
Vec4V a)
+{
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t max0 = vpmax_f32(high, low);
+ const float32x2_t max1 = vpmax_f32(max0, max0);
+
+ return max1;
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t min0 = vpmin_f32(high, low);
+ const float32x2_t min1 = vpmin_f32(min0, min0);
+
+ return min1;
+}
+
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+ return V4Max(V4Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue4_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue4_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue4_R(V4IsEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAnyTrue3_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+ // round to nearest: truncate(a + (0.5f - sign(a))), where sign(a) is 1 for negative a and 0 otherwise
+ const Vec4V half = V4Load(0.5f);
+ const float32x4_t sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31)));
+ const Vec4V aPlusHalf = V4Add(a, half);
+ const Vec4V aRound = V4Sub(aPlusHalf, sign);
+ return vcvtq_f32_s32(vcvtq_s32_f32(aRound));
+}
+
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec4V tmp = V4Mul(a, recipTwoPi);
+ const Vec4V b = V4Round(tmp);
+ const Vec4V V1 = V4NegMulSub(twoPi, b, a);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+ // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23!
(for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V3 = V4Mul(V2, V1); + const Vec4V V5 = V4Mul(V3, V2); + const Vec4V V7 = V4Mul(V5, V2); + const Vec4V V9 = V4Mul(V7, V2); + const Vec4V V11 = V4Mul(V9, V2); + const Vec4V V13 = V4Mul(V11, V2); + const Vec4V V15 = V4Mul(V13, V2); + const Vec4V V17 = V4Mul(V15, V2); + const Vec4V V19 = V4Mul(V17, V2); + const Vec4V V21 = V4Mul(V19, V2); + const Vec4V V23 = V4Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec4V Result; + Result = V4ScaleAdd(V3, S1, V1); + Result = V4ScaleAdd(V5, S2, Result); + Result = V4ScaleAdd(V7, S3, Result); + Result = V4ScaleAdd(V9, S4, Result); + Result = V4ScaleAdd(V11, S5, Result); + Result = V4ScaleAdd(V13, S6, Result); + Result = V4ScaleAdd(V15, S7, Result); + Result = V4ScaleAdd(V17, S8, Result); + Result = V4ScaleAdd(V19, S9, Result); + Result = V4ScaleAdd(V21, S10, Result); + Result = V4ScaleAdd(V23, S11, Result); + + return Result; +} + +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI)
+ const Vec4V V2 = V4Mul(V1, V1);
+ const Vec4V V4 = V4Mul(V2, V2);
+ const Vec4V V6 = V4Mul(V4, V2);
+ const Vec4V V8 = V4Mul(V4, V4);
+ const Vec4V V10 = V4Mul(V6, V4);
+ const Vec4V V12 = V4Mul(V6, V6);
+ const Vec4V V14 = V4Mul(V8, V6);
+ const Vec4V V16 = V4Mul(V8, V8);
+ const Vec4V V18 = V4Mul(V10, V8);
+ const Vec4V V20 = V4Mul(V10, V10);
+ const Vec4V V22 = V4Mul(V12, V10);
+
+ const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+ const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+ const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+ const FloatV C1 = V4GetY(cosCoefficients0);
+ const FloatV C2 = V4GetZ(cosCoefficients0);
+ const FloatV C3 = V4GetW(cosCoefficients0);
+ const FloatV C4 = V4GetX(cosCoefficients1);
+ const FloatV C5 = V4GetY(cosCoefficients1);
+ const FloatV C6 = V4GetZ(cosCoefficients1);
+ const FloatV C7 = V4GetW(cosCoefficients1);
+ const FloatV C8 = V4GetX(cosCoefficients2);
+ const FloatV C9 = V4GetY(cosCoefficients2);
+ const FloatV C10 = V4GetZ(cosCoefficients2);
+ const FloatV C11 = V4GetW(cosCoefficients2);
+
+ Vec4V Result;
+ Result = V4ScaleAdd(V2, C1, V4One());
+ Result = V4ScaleAdd(V4, C2, Result);
+ Result = V4ScaleAdd(V6, C3, Result);
+ Result = V4ScaleAdd(V8, C4, Result);
+ Result = V4ScaleAdd(V10, C5, Result);
+ Result = V4ScaleAdd(V12, C6, Result);
+ Result = V4ScaleAdd(V14, C7, Result);
+ Result = V4ScaleAdd(V16, C8, Result);
+ Result = V4ScaleAdd(V18, C9, Result);
+ Result = V4ScaleAdd(V20, C10, Result);
+ Result = V4ScaleAdd(V22, C11, Result);
+
+ return Result;
+}
+
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+ const float32x4x2_t v0v1 = vzipq_f32(col0, col2);
+ const float32x4x2_t v2v3 = vzipq_f32(col1, col3);
+ const float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]);
+ const float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]);
+ col0 = zip0.val[0];
+ col1 = zip0.val[1];
+ col2 = zip1.val[0];
+ col3 = zip1.val[1];
+}
+
+//////////////////////////////////
+// BOOLV
+//////////////////////////////////
+
+PX_FORCE_INLINE BoolV BFFFF()
+{
+ return vmovq_n_u32(0);
+}
+
+PX_FORCE_INLINE BoolV BFFFT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ return vcombine_u32(zeros, zo);
+}
+
+PX_FORCE_INLINE BoolV BFFTF()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t oz = vext_u32(ones, zeros, 1);
+ return vcombine_u32(zeros, oz);
+}
+
+PX_FORCE_INLINE BoolV BFFTT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ return vcombine_u32(zeros, ones);
+}
+
+PX_FORCE_INLINE BoolV BFTFF()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ return vcombine_u32(zo, zeros);
+}
+
+PX_FORCE_INLINE BoolV BFTFT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ return vcombine_u32(zo, zo);
+}
+
+PX_FORCE_INLINE BoolV BFTTF()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ const uint32x2_t oz = vext_u32(ones, zeros, 1);
+ return vcombine_u32(zo, oz);
+}
+
+PX_FORCE_INLINE BoolV BFTTT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones =
vmov_n_u32(0xffffFFFF); + const uint32x2_t zo = vext_u32(zeros, ones, 1); + return vcombine_u32(zo, ones); +} + +PX_FORCE_INLINE BoolV BTFFF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + // const uint32x2_t zo = vext_u32(zeros, ones, 1); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, zeros); +} + +PX_FORCE_INLINE BoolV BTFFT() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t zo = vext_u32(zeros, ones, 1); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, zo); +} + +PX_FORCE_INLINE BoolV BTFTF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, oz); +} + +PX_FORCE_INLINE BoolV BTFTT() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, ones); +} + +PX_FORCE_INLINE BoolV BTTFF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + return vcombine_u32(ones, zeros); +} + +PX_FORCE_INLINE BoolV BTTFT() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t zo = vext_u32(zeros, ones, 1); + return vcombine_u32(ones, zo); +} + +PX_FORCE_INLINE BoolV BTTTF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(ones, oz); +} + +PX_FORCE_INLINE BoolV BTTTT() +{ + return vmovq_n_u32(0xffffFFFF); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + return BTFFF(); +} + +PX_FORCE_INLINE BoolV BYMask() +{ + return BFTFF(); +} + +PX_FORCE_INLINE BoolV BZMask() +{ + return BFFTF(); +} + +PX_FORCE_INLINE BoolV BWMask() +{ + return BFFFT(); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV f) +{ + const uint32x2_t fLow = vget_low_u32(f); + return vdupq_lane_u32(fLow, 0); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV f) +{ + const uint32x2_t fLow = vget_low_u32(f); + return vdupq_lane_u32(fLow, 1); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV f) +{ + const uint32x2_t fHigh = vget_high_u32(f); + return vdupq_lane_u32(fHigh, 0); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV f) +{ + const uint32x2_t fHigh = vget_high_u32(f); + return vdupq_lane_u32(fHigh, 1); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return vbslq_u32(BFTTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return vbslq_u32(BTFTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return vbslq_u32(BTTFT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return vbslq_u32(BTTTF(), v, f); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return vandq_u32(a, b); +} + +PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + return vmvnq_u32(a); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + // return vbicq_u32(a, b); + return vandq_u32(a, vmvnq_u32(b)); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return vorrq_u32(a, b); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + const uint32x2_t allTrue = vmov_n_u32(0xffffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + 
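+ // each BoolV lane is either all zeros or all ones, so narrowing twice packs
+ // one byte per lane into a single 32-bit word; comparing that word against
+ // 0xffffFFFF then tests all four lanes at once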
const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vceq_u32(finalReduce, allTrue); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + const uint32x2_t allTrue = vmov_n_u32(0xffffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vtst_u32(finalReduce, allTrue); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + const uint32x2_t allTrue3 = vmov_n_u32(0x00ffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vceq_u32(vand_u32(finalReduce, allTrue3), allTrue3); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + const uint32x2_t allTrue3 = vmov_n_u32(0x00ffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vtst_u32(vand_u32(finalReduce, allTrue3), allTrue3); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + const BoolV bTest = vceqq_u32(a, b); + return internalUnitNeonSimd::BAllTrue4_R(bTest); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return BAllEq(a, BTTTT()); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return BAllEq(a, BFFFF()); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + static PX_ALIGN(16, const PxU32) bitMaskData[4] = { 1, 2, 4, 8 }; + const uint32x4_t bitMask = *(reinterpret_cast<const uint32x4_t*>(bitMaskData)); + const uint32x4_t t0 = vandq_u32(a, bitMask); + const uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // Pairwise add (0 + 1), (2 + 3) + return PxU32(vget_lane_u32(vpadd_u32(t1, t1), 0)); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3Dot(a.col0, b); + const FloatV y = V3Dot(a.col1, b); + const FloatV z = V3Dot(a.col2, b); + return V3Merge(x, y, z); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + 
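+// Mat33V stores its columns as Vec3V: M33MulV3 above forms the linear
+// combination x*col0 + y*col1 + z*col2, while M33TrnspsMulV3 dots b against
+// each column, i.e. multiplies by the transpose without materializing it.
+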
+PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b)
+{
+ return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b));
+}
+
+PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a)
+{
+ const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0));
+ const BoolV btttf = BTTTF();
+
+ const Vec3V cross01 = V3Cross(a.col0, a.col1);
+ const Vec3V cross12 = V3Cross(a.col1, a.col2);
+ const Vec3V cross20 = V3Cross(a.col2, a.col0);
+ const FloatV dot = V3Dot(cross01, a.col2);
+ const FloatV invDet = FRecipFast(dot);
+
+ const float32x4x2_t merge = vzipq_f32(cross12, cross01);
+ const float32x4_t mergeh = merge.val[0];
+ const float32x4_t mergel = merge.val[1];
+
+ // const Vec3V colInv0 = XMVectorPermute(mergeh,cross20,PxPermuteControl(0,4,1,7));
+ const float32x4_t colInv0_xxyy = vzipq_f32(mergeh, cross20).val[0];
+ const float32x4_t colInv0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(colInv0_xxyy), btttf));
+
+ // const Vec3V colInv1 = XMVectorPermute(mergeh,cross20,PxPermuteControl(2,5,3,7));
+ const float32x2_t zw0 = vget_high_f32(mergeh);
+ const float32x2_t xy1 = vget_low_f32(cross20);
+ const float32x2_t yzero1 = vext_f32(xy1, zeros, 1);
+ const float32x2x2_t merge1 = vzip_f32(zw0, yzero1);
+ const float32x4_t colInv1 = vcombine_f32(merge1.val[0], merge1.val[1]);
+
+ // const Vec3V colInv2 = XMVectorPermute(mergel,cross20,PxPermuteControl(0,6,1,7));
+ const float32x2_t x0y0 = vget_low_f32(mergel);
+ const float32x2_t z1w1 = vget_high_f32(cross20);
+ const float32x2x2_t merge2 = vzip_f32(x0y0, z1w1);
+ const float32x4_t colInv2 = vcombine_f32(merge2.val[0], merge2.val[1]);
+
+ return Mat33V(vmulq_lane_f32(colInv0, invDet, 0), vmulq_lane_f32(colInv1, invDet, 0),
+ vmulq_lane_f32(colInv2, invDet, 0));
+}
+
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+ return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)),
+ V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)),
+ V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2)));
+}
+
+PX_FORCE_INLINE Mat33V M33Identity()
+{
+ return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ());
+}
+
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+ return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+ return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+ return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v)
+{
+ const BoolV bTFFF = BTFFF();
+ const BoolV bFTFF = BFTFF();
+ const BoolV bFFTF = BFFTF();
+
+ const Vec3V zero = V3Zero();
+
+ return Mat33V(V3Sel(bTFFF, v, zero), V3Sel(bFTFF, v, zero), V3Sel(bFFTF, v, zero));
+}
+
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+ const Vec3V x = V3Mul(V3UnitX(), d);
+ const Vec3V y = V3Mul(V3UnitY(), d);
+ const Vec3V z = V3Mul(V3UnitZ(), d);
+ return Mat33V(x, y, z);
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3GetX(b);
+ const FloatV y = V3GetY(b);
+ const FloatV z = V3GetZ(b);
+ const Vec3V v0 = V3Scale(a.col0, x);
+ const Vec3V v1 = V3Scale(a.col1, y);
+ const Vec3V v2 = V3Scale(a.col2, z);
+ const Vec3V v0PlusV1 = V3Add(v0, v1);
+ const Vec3V v0PlusV1Plusv2 = V3Add(v0PlusV1, v2);
+ return V3Add(v0PlusV1Plusv2, a.col3);
+}
+
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
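+ // rotate/scale b by the upper 3x3 block only; unlike M34MulV3 above,
+ // the translation column a.col3 is not added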
+ const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) +{ + const FloatV x = V3Dot(a.col0, b); + const FloatV y = V3Dot(a.col1, b); + const FloatV z = V3Dot(a.col2, b); + return V3Merge(x, y, z); +} + +PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)), + V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)), + V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2))); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + const FloatV x = V4GetX(b); + const FloatV y = V4GetY(b); + const FloatV z = V4GetZ(b); + const FloatV w = V4GetW(b); + + const Vec4V v0 = V4Scale(a.col0, x); + const Vec4V v1 = V4Scale(a.col1, y); + const Vec4V v2 = V4Scale(a.col2, z); + const Vec4V v3 = V4Scale(a.col3, w); + const Vec4V v0PlusV1 = V4Add(v0, v1); + const Vec4V v0PlusV1Plusv2 = V4Add(v0PlusV1, v2); + return V4Add(v0PlusV1Plusv2, v3); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + return V4Merge(V4Dot(a.col0, b), V4Dot(a.col1, b), V4Dot(a.col2, b), V4Dot(a.col3, b)); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + // asm volatile( + // "vzip.f32 %q0, %q2 \n\t" + // "vzip.f32 %q1, %q3 \n\t" + // "vzip.f32 %q0, %q1 \n\t" + // "vzip.f32 %q2, %q3 \n\t" + // : "+w" (a.col0), "+w" (a.col1), "+w" (a.col2), "+w" a.col3)); + + const float32x4x2_t v0v1 = vzipq_f32(a.col0, a.col2); + const float32x4x2_t v2v3 = vzipq_f32(a.col1, a.col3); + const float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]); + const float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]); + + return Mat44V(zip0.val[0], zip0.val[1], zip1.val[0], zip1.val[1]); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + float32x4_t minor0, minor1, minor2, minor3; + float32x4_t row0, row1, row2, row3; + float32x4_t det, tmp1; + + tmp1 = vmovq_n_f32(0.0f); + row1 = vmovq_n_f32(0.0f); + row3 = vmovq_n_f32(0.0f); + + row0 = a.col0; + row1 = vextq_f32(a.col1, a.col1, 2); + row2 = a.col2; + row3 = vextq_f32(a.col3, a.col3, 2); + + tmp1 = vmulq_f32(row2, row3); + tmp1 = vrev64q_f32(tmp1); + minor0 = 
vmulq_f32(row1, tmp1); + minor1 = vmulq_f32(row0, tmp1); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0); + minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1); + minor1 = vextq_f32(minor1, minor1, 2); + + tmp1 = vmulq_f32(row1, row2); + tmp1 = vrev64q_f32(tmp1); + minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0); + minor3 = vmulq_f32(row0, tmp1); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1)); + minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3); + minor3 = vextq_f32(minor3, minor3, 2); + + tmp1 = vmulq_f32(vextq_f32(row1, row1, 2), row3); + tmp1 = vrev64q_f32(tmp1); + row2 = vextq_f32(row2, row2, 2); + minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0); + minor2 = vmulq_f32(row0, tmp1); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1)); + minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2); + minor2 = vextq_f32(minor2, minor2, 2); + + tmp1 = vmulq_f32(row0, row1); + tmp1 = vrev64q_f32(tmp1); + minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2); + minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2); + minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1)); + + tmp1 = vmulq_f32(row0, row3); + tmp1 = vrev64q_f32(tmp1); + minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1)); + minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1); + minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1)); + + tmp1 = vmulq_f32(row0, row2); + tmp1 = vrev64q_f32(tmp1); + minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1); + minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1)); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1)); + minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3); + + det = vmulq_f32(row0, minor0); + det = vaddq_f32(vextq_f32(det, det, 2), det); + det = vaddq_f32(vrev64q_f32(det), det); + det = vdupq_lane_f32(VRECIPE(vget_low_f32(det)), 0); + + minor0 = vmulq_f32(det, minor0); + minor1 = vmulq_f32(det, minor1); + minor2 = vmulq_f32(det, minor2); + minor3 = vmulq_f32(det, minor3); + Mat44V invTrans(minor0, minor1, minor2, minor3); + return M44Trnsps(invTrans); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) +{ + const float32x4_t ret = { x, y, z, w }; + return ret; +} + +/* +PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b) +{ + return vcombine_u16(vqmovn_u32(a), vqmovn_u32(b)); +} +*/ + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return vbslq_u32(c, a, b); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return vorrq_u32(a, b); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return veorq_u32(a, b); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return vandq_u32(a, b); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + // return vbicq_u32(a, b); // creates gcc compiler bug in RTreeQueries.cpp + return vandq_u32(a, vmvnq_u32(b)); +} + +/* +PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b) +{ + return vorrq_u16(a, b); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b) +{ + return vandq_u16(a, b); +} +*/ +/* +PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b) +{ + return vbicq_u16(a, b); +} +*/ + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return vdupq_n_s32(i); +} + +PX_FORCE_INLINE VecI32V 
I4LoadU(const PxI32* i) +{ + return vld1q_s32(i); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + return vld1q_s32(i); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return vaddq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return vsubq_s32(a, b); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return vcgtq_s32(a, b); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return vceqq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return vbslq_s32(c, a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return vdupq_n_s32(0); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return vdupq_n_s32(1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return vdupq_n_s32(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return vdupq_n_s32(-1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return U4Load(0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return U4Load(1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return U4Load(2); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + return shift; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return vshlq_s32(a, count); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return vshlq_s32(a, VecI32V_Sub(I4Load(0), count)); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return vandq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return vorrq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg f) +{ + const int32x2_t fLow = vget_low_s32(f); + return vdupq_lane_s32(fLow, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg f) +{ + const int32x2_t fLow = vget_low_s32(f); + return vdupq_lane_s32(fLow, 1); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg f) +{ + const int32x2_t fHigh = vget_high_s32(f); + return vdupq_lane_s32(fHigh, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg f) +{ + const int32x2_t fHigh = vget_high_s32(f); + return vdupq_lane_s32(fHigh, 1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + return vbslq_s32(c, a, b); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + *i = vgetq_lane_s32(a, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d) +{ + const int32x2_t aLow = vget_low_s32(a); + const int32x2_t bLow = vget_low_s32(b); + const int32x2_t cLow = vget_low_s32(c); + const int32x2_t dLow = vget_low_s32(d); + + const int32x2_t low = vext_s32(aLow, bLow, 1); + const int32x2_t high = vext_s32(cLow, dLow, 1); + + return vcombine_s32(low, high); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) +{ + return vreinterpretq_s32_u32(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +/* +template<int a> PX_FORCE_INLINE VecI32V V4ISplat() +{ + return vdupq_n_s32(a); +} + +template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat() +{ + return vdupq_n_u32(a); +} +*/ + +/* +PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address) +{ + vst1q_u16((uint16_t*)address, val); +} +*/ + +PX_FORCE_INLINE void 
V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + vst1q_u32(reinterpret_cast<uint32_t*>(address), val); +} + +PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) +{ + return vld1q_f32(reinterpret_cast<float32_t*>(addr)); +} + +PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) +{ + return vld1q_f32(reinterpret_cast<float32_t*>(addr)); +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + return vreinterpretq_f32_u32(V4U32Andc(vreinterpretq_u32_f32(a), b)); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return V4IsGrtr(a, b); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return vld1q_u16(reinterpret_cast<uint16_t*>(addr)); +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return vld1q_u16(reinterpret_cast<uint16_t*>(addr)); +} + +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + return vcgtq_u16(a, b); +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecI16V a, VecI16V b) +{ + return vcgtq_s16(a, b); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + return vcvtq_f32_u32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) +{ + return vcvtq_f32_s32(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + return vcvtq_s32_f32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + return vreinterpretq_f32_u32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + return vreinterpretq_f32_s32(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return vreinterpretq_u32_f32(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return vreinterpretq_s32_f32(a); +} + +template <int index> +PX_FORCE_INLINE BoolV BSplatElement(BoolV a) +{ + if(index < 2) + { + return vdupq_lane_u32(vget_low_u32(a), index); + } + else if(index == 2) + { + return vdupq_lane_u32(vget_high_u32(a), 0); + } + else if(index == 3) + { + return vdupq_lane_u32(vget_high_u32(a), 1); + } +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + if(index < 2) + { + return vdupq_lane_u32(vget_low_u32(a), index); + } + else if(index == 2) + { + return vdupq_lane_u32(vget_high_u32(a), 0); + } + else if(index == 3) + { + return vdupq_lane_u32(vget_high_u32(a), 1); + } +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + if(index < 2) + { + return vdupq_lane_f32(vget_low_f32(a), index); + } + else if(index == 2) + { + return vdupq_lane_f32(vget_high_f32(a), 0); + } + else if(index == 3) + { + return vdupq_lane_f32(vget_high_f32(a), 1); + } +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + const uint32x4_t ret = { x, y, z, w }; + return ret; +} + +PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) +{ + return vdupq_n_u32(i); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return vld1q_u32(i); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + return vld1q_u32(i); +} + +PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in) +{ + const float32x4_t ones = vdupq_n_f32(1.0f); + const float32x4_t rdToZero = vcvtq_f32_s32(vcvtq_s32_f32(in)); + const float32x4_t rdToZeroPlusOne = vaddq_f32(rdToZero, ones); + const uint32x4_t gt = vcgtq_f32(in, rdToZero); + return vbslq_f32(gt, rdToZeroPlusOne, rdToZero); +} + +PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in) +{ + const float32x4_t ones = vdupq_n_f32(1.0f); + const float32x4_t rdToZero = vcvtq_f32_s32(vcvtq_s32_f32(in)); + const float32x4_t rdToZeroMinusOne = vsubq_f32(rdToZero, 
ones); + const uint32x4_t lt = vcltq_f32(in, rdToZero); + return vbslq_f32(lt, rdToZeroMinusOne, rdToZero); +} + +PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power) +{ + PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate"); + PX_UNUSED(power); // prevent warning in release builds + + return vcvtq_u32_f32(in); +} + +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2) +{ + const FloatV one = FOne(); + const FloatV x = V4GetX(q); + const FloatV y = V4GetY(q); + const FloatV z = V4GetZ(q); + const FloatV w = V4GetW(q); + + const FloatV x2 = FAdd(x, x); + const FloatV y2 = FAdd(y, y); + const FloatV z2 = FAdd(z, z); + + const FloatV xx = FMul(x2, x); + const FloatV yy = FMul(y2, y); + const FloatV zz = FMul(z2, z); + + const FloatV xy = FMul(x2, y); + const FloatV xz = FMul(x2, z); + const FloatV xw = FMul(x2, w); + + const FloatV yz = FMul(y2, z); + const FloatV yw = FMul(y2, w); + const FloatV zw = FMul(z2, w); + + const FloatV v = FSub(one, xx); + + column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw)); + column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw)); + column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy)); +} + +#endif // PSFOUNDATION_PSUNIXNEONINLINEAOS_H diff --git a/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h new file mode 100644 index 00000000..54c622f6 --- /dev/null +++ b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h @@ -0,0 +1,179 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXSSE2AOS_H +#define PSFOUNDATION_PSUNIXSSE2AOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! 
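+// (For orientation: this header only sets up the SSE2 register types for the AoS vector math layer.
+// FloatV/Vec3V/Vec4V/BoolV/QuatV all alias __m128 and VecI32V aliases __m128i, while the remaining
+// integer types go through UnionM128 so their lane views can alias the same 128-bit value.)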
+ +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif +#ifdef __EMSCRIPTEN__ +typedef int8_t __int8_t; +typedef int16_t __int16_t; +typedef int32_t __int32_t; +typedef int64_t __int64_t; +typedef uint16_t __uint16_t; +typedef uint32_t __uint32_t; +typedef uint64_t __uint64_t; +#endif + +typedef union UnionM128 +{ + UnionM128() + { + } + UnionM128(__m128 in) + { + m128 = in; + } + + UnionM128(__m128i in) + { + m128i = in; + } + + operator __m128() + { + return m128; + } + + operator const __m128() const + { + return m128; + } + + float m128_f32[4]; + __int8_t m128_i8[16]; + __int16_t m128_i16[8]; + __int32_t m128_i32[4]; + __int64_t m128_i64[2]; + __uint16_t m128_u16[8]; + __uint32_t m128_u32[4]; + __uint64_t m128_u64[2]; + __m128 m128; + __m128i m128i; +} UnionM128; + +typedef __m128 FloatV; +typedef __m128 Vec3V; +typedef __m128 Vec4V; +typedef __m128 BoolV; +typedef __m128 QuatV; +typedef __m128i VecI32V; +typedef UnionM128 VecU32V; +typedef UnionM128 VecU16V; +typedef UnionM128 VecI16V; +typedef UnionM128 VecU8V; + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define VecU8VArg VecU8V & +#define QuatVArg QuatV & + +// Optimization for situations in which you cross product multiple vectors with the same vector. +// Avoids 2X shuffles per product +struct VecCrossV +{ + Vec3V mL1; + Vec3V mR1; +}; + +struct VecShiftV +{ + VecI32V shift; +}; +#define VecShiftVArg VecShiftV & + +PX_ALIGN_PREFIX(16) +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); + Vec3V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); + Vec4V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +#endif // PSFOUNDATION_PSUNIXSSE2AOS_H diff --git a/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h new file mode 100644 index 00000000..4503c80e --- /dev/null +++ b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h @@ -0,0 +1,3208 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXSSE2INLINEAOS_H +#define PSFOUNDATION_PSUNIXSSE2INLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +#ifdef __SSE4_2__ +#include "smmintrin.h" +#endif + +#include "../../PsVecMathSSE.h" + +#define PX_FPCLASS_SNAN 0x0001 /* signaling NaN */ +#define PX_FPCLASS_QNAN 0x0002 /* quiet NaN */ +#define PX_FPCLASS_NINF 0x0004 /* negative infinity */ +#define PX_FPCLASS_PINF 0x0200 /* positive infinity */ + +PX_FORCE_INLINE __m128 m128_I2F(__m128i n) +{ + return _mm_castsi128_ps(n); +} +PX_FORCE_INLINE __m128i m128_F2I(__m128 n) +{ + return _mm_castps_si128(n); +} + +////////////////////////////////////////////////////////////////////// +//Test that Vec3V and FloatV are legal +////////////////////////////////////////////////////////////////////// + +#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f +PX_FORCE_INLINE static bool isValidFloatV(const FloatV a) +{ + const PxF32 x = V4ReadX(a); + const PxF32 y = V4ReadY(a); + const PxF32 z = V4ReadZ(a); + const PxF32 w = V4ReadW(a); + + if ( + (PxAbs(x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - z) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - w) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + + if ( + (PxAbs((x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - z) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - w) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + + return false; +} + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + PX_ALIGN(16, PxF32 f[4]); + V4StoreA(a, f); + return (f[3] == 0.0f); +} + +PX_FORCE_INLINE bool isFiniteLength(const Vec3V a) +{ + return !FAllEq(V4LengthSq(a), FZero()); +} + +PX_FORCE_INLINE bool isAligned16(void* a) +{ + return(0 == (size_t(a) & 0x0f)); +} + +//ASSERT_FINITELENGTH is deactivated because there is a lot of code that calls a simd normalisation function with zero length but then ignores the result. 
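+// The checks that remain encode the representation invariants assumed throughout this file: a
+// FloatV carries (approximately) the same value in all four lanes, a Vec3V keeps 0.0f in its w
+// lane, and the *A load/store variants require 16-byte-aligned pointers. Illustrative (PX_DEBUG):
+// const Vec3V v = V3LoadU(PxVec3(1.0f, 2.0f, 3.0f)); // w lane cleared => ASSERT_ISVALIDVEC3V(v) holds
+// const FloatV f = FLoad(4.0f); // all lanes 4.0f => ASSERT_ISVALIDFLOATV(f) holds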
+ +#if PX_DEBUG +#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a)) +#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a)) +#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(reinterpret_cast<void*>(a))) +#define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a)) +#else +#define ASSERT_ISVALIDVEC3V(a) +#define ASSERT_ISVALIDFLOATV(a) +#define ASSERT_ISALIGNED16(a) +#define ASSERT_ISFINITELENGTH(a) +#endif + + +namespace internalUnitSSE2Simd +{ +PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask == 0xf); +} + +PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32((moveMask & 0x7) == 0x7); +} + +PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask != 0x0); +} + +PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32((moveMask & 0x7) != 0x0); +} + +PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b) +{ + // This is a bit of a bodge. + //_mm_comieq_ss returns 1 if either value is nan so we need to re-cast a and b with true encoded as a non-nan + // number. + // There must be a better way of doing this in sse. + const BoolV one = FOne(); + const BoolV zero = FZero(); + const BoolV a1 = V4Sel(a, one, zero); + const BoolV b1 = V4Sel(b, one, zero); + return ( + _mm_comieq_ss(a1, b1) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(1, 1, 1, 1))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(2, 2, 2, 2))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 3, 3, 3)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(3, 3, 3, 3)))); +} + +const PX_ALIGN(16, PxF32 gMaskXYZ[4]) = { physx::PxUnionCast<PxF32>(0xffffffff), physx::PxUnionCast<PxF32>(0xffffffff), + physx::PxUnionCast<PxF32>(0xffffffff), 0 }; +} + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + const float f = 1.0f; + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comieq_ss(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + return V3AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return V4AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return internalUnitSSE2Simd::BAllTrue4_R(VecI32V_IsEq(m128_F2I(a), m128_F2I(b))) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return internalUnitSSE2Simd::BAllTrue4_R(V4IsEqU32(a, b)) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + BoolV c = m128_I2F(_mm_cmpeq_epi32(a, b)); + return internalUnitSSE2Simd::BAllTrue4_R(c) != 0; +} + +#define VECMATH_AOS_EPSILON (1e-3f) + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + const FloatV c = FSub(a, b); + const FloatV minError = FLoad(-VECMATH_AOS_EPSILON); + const FloatV maxError = FLoad(VECMATH_AOS_EPSILON); + return _mm_comigt_ss(c, minError) && _mm_comilt_ss(c, maxError); +} + +PX_FORCE_INLINE bool 
allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + const Vec3V c = V3Sub(a, b); + const Vec3V minError = V3Load(-VECMATH_AOS_EPSILON); + const Vec3V maxError = V3Load(VECMATH_AOS_EPSILON); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxError)); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const Vec4V c = V4Sub(a, b); + const Vec4V minError = V4Load(-VECMATH_AOS_EPSILON); + const Vec4V maxError = V4Load(VECMATH_AOS_EPSILON); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), maxError)); +} +} + +///////////////////////////////////////////////////////////////////// +////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + PxF32 badNumber = + physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); + const FloatV vBadNum = FLoad(badNumber); + const BoolV vMask = BAnd(vBadNum, a); + return internalUnitSSE2Simd::FiniteTestEq(vMask, BFFFF()) == 1; +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + PxF32 badNumber = + physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); + const Vec3V vBadNum = V3Load(badNumber); + const BoolV vMask = BAnd(BAnd(vBadNum, a), BTTTF()); + return internalUnitSSE2Simd::FiniteTestEq(vMask, BFFFF()) == 1; +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + /*Vec4V a; + PX_ALIGN(16, PxF32 f[4]); + F32Array_Aligned_From_Vec4V(a, f); + return PxIsFinite(f[0]) + && PxIsFinite(f[1]) + && PxIsFinite(f[2]) + && PxIsFinite(f[3]);*/ + + PxF32 badNumber = + physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); + const Vec4V vBadNum = V4Load(badNumber); + const BoolV vMask = BAnd(vBadNum, a); + + return internalUnitSSE2Simd::FiniteTestEq(vMask, BFFFF()) == 1; +} + +PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) ? 
true : false; +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero())); +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), FZero())); +} + +///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + return _mm_set_ps(0.0f, f, f, f); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ + const PxU32 i = -PxI32(f); + return _mm_load1_ps(reinterpret_cast<const float*>(&i)); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f)); + return _mm_and_ps(reinterpret_cast<const Vec3V&>(f), V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f)); + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(const_cast<PxF32*>(f)); + return _mm_and_ps(V4LoadA(f), V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i) +{ + return _mm_set_ps(0.0f, i[2], i[1], i[0]); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) +{ + return V4ClearW(v); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) +{ + return v; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return f; // ok if it is implemented as the same type. 
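+ // (Vec3V and Vec4V are both __m128 on this platform, and a valid Vec3V already stores 0.0f in
+ // its w lane, so no conversion work is needed; the assert above is what enforces that invariant.)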
+} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return f; +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return Vec3V_From_Vec4V(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return Vec3V_From_Vec4V_WUndefined(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + V3StoreU(m.col0, out.column0); + V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(const_cast<PxF32*>(f)); + return _mm_load_ps(f); +} + +PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps(f, a); +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + _mm_storeu_ps(f, a); +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps(reinterpret_cast<PxF32*>(f), a); +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + _mm_store_ps(reinterpret_cast<float*>(u), uv); +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + ASSERT_ISALIGNED16(i); + _mm_store_ps(reinterpret_cast<float*>(i), m128_I2F(iv)); +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return _mm_loadu_ps(f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + const PX_ALIGN(16, PxI32) b[4] = { -PxI32(f[0]), -PxI32(f[1]), -PxI32(f[2]), -PxI32(f[3]) }; + return _mm_load_ps(reinterpret_cast<const float*>(&b)); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + ASSERT_ISVALIDFLOATV(a); + _mm_store_ss(f, a); +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) f2[4]; + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + PX_ALIGN(16, PxF32) f2[4]; + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) +{ + _mm_store_ss(reinterpret_cast<PxF32*>(b2), b); +} + +PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) +{ + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&i)); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return _mm_loadu_ps(reinterpret_cast<const PxF32*>(i)); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + ASSERT_ISALIGNED16(const_cast<PxU32*>(i)); + return _mm_load_ps(reinterpret_cast<const PxF32*>(i)); +} + +////////////////////////////////// +// FLOATV +////////////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return FLoad(0.0f); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV IZero() +{ + const PxU32 zero = 0; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&zero)); +} + +PX_FORCE_INLINE FloatV IOne() +{ + const PxU32 one = 1;
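+ // Note: the I*() constants (IZero() through IFour()) place raw integer bit patterns in the
+ // float lanes, e.g. IOne() yields 0x00000001 per lane rather than 1.0f; they are reinterpreted,
+ // not converted.
+ 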
return _mm_load1_ps(reinterpret_cast<const PxF32*>(&one)); +} + +PX_FORCE_INLINE FloatV ITwo() +{ + const PxU32 two = 2; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&two)); +} + +PX_FORCE_INLINE FloatV IThree() +{ + const PxU32 three = 3; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&three)); +} + +PX_FORCE_INLINE FloatV IFour() +{ + PxU32 four = 4; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&four)); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); +/* + if(!isValidFloatV(a)) + { +assert(false); + } + if(!isValidFloatV(b)) + { +assert(false); + } +*/ + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), a); +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_rcp_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), _mm_sqrt_ps(a)); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_sqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_rsqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FAdd(FMul(a, b), c); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FSub(c, FMul(a, b)); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + PX_ALIGN(16, const PxU32) absMask[4] = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF }; + return _mm_and_ps(a, _mm_load_ps(reinterpret_cast<const PxF32*>(absMask))); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c,BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c,BFFFF())); + ASSERT_ISVALIDFLOATV(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + 
ASSERT_ISVALIDFLOATV(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + ASSERT_ISVALIDFLOATV(minV); + ASSERT_ISVALIDFLOATV(maxV); + return _mm_max_ps(_mm_min_ps(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comigt_ss(a, b); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comige_ss(a, b); +} + +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comieq_ss(a, b); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); +#ifdef __SSE4_2__ + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + // return _mm_round_ps(a, 0x0); + const FloatV half = FLoad(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const FloatV aRound = FSub(FAdd(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +#endif +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V3 = FMul(V2, V1); + const FloatV V5 = FMul(V3, V2); + const FloatV V7 = FMul(V5, V2); + const FloatV V9 = FMul(V7, V2); + const FloatV V11 = FMul(V9, V2); + const FloatV V13 = FMul(V11, V2); + const FloatV V15 = FMul(V13, V2); + const FloatV V17 = FMul(V15, V2); + const FloatV V19 = FMul(V17, V2); + const FloatV V21 = FMul(V19, V2); + const FloatV V23 = FMul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + FloatV Result; + Result = FScaleAdd(S1, V3, V1); + Result = FScaleAdd(S2, V5, Result); + Result = FScaleAdd(S3, V7, Result); + Result = FScaleAdd(S4, V9, Result); + Result = FScaleAdd(S5, V11, Result); + Result = FScaleAdd(S6, V13, Result); + Result = FScaleAdd(S7, V15, Result); + Result = FScaleAdd(S8, V17, Result); + Result = FScaleAdd(S9, V19, Result); + Result = FScaleAdd(S10, V21, Result); + Result = FScaleAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V4 = FMul(V2, V2); + const FloatV V6 = FMul(V4, V2); + const FloatV V8 = FMul(V4, V4); + const FloatV V10 = FMul(V6, V4); + const FloatV V12 = FMul(V6, V6); + const FloatV V14 = FMul(V8, V6); + const FloatV V16 = FMul(V8, V8); + const FloatV V18 = FMul(V10, V8); + const FloatV V20 = FMul(V10, V10); + const FloatV V22 = FMul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + FloatV Result; + Result = FScaleAdd(C1, V2, V4One()); + Result = FScaleAdd(C2, V4, Result); + Result = FScaleAdd(C3, V6, Result); + Result = FScaleAdd(C4, V8, Result); + Result = FScaleAdd(C5, V10, Result); + Result = FScaleAdd(C6, V12, Result); + Result = FScaleAdd(C7, V14, Result); + Result = FScaleAdd(C8, V16, Result); + Result = FScaleAdd(C9, V18, Result); + Result = FScaleAdd(C10, V20, Result); + Result = FScaleAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + const BoolV c = BOr(FIsGrtr(a, max), FIsGrtr(min, a)); + return !BAllEqFFFF(c); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max) + const BoolV c = BAnd(FIsGrtrOrEq(a, min), FIsGrtrOrEq(max, a)); + return BAllEqTTTT(c); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FOutOfBounds(a, FNeg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FInBounds(a, FNeg(bounds), bounds); +} + +////////////////////////////////// +// VEC3V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + const __m128 zero = FZero(); + const __m128 fff0 = _mm_move_ss(f, zero); + return _mm_shuffle_ps(fff0, fff0, _MM_SHUFFLE(0, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + // static on zero causes compiler crash on x64 debug_opt + const __m128 zero = FZero(); + const __m128 xy = _mm_move_ss(x, y); + const __m128 z0 = _mm_move_ss(zero, z); + + return _mm_shuffle_ps(xy, z0, _MM_SHUFFLE(1, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec3V V3UnitZ() +{ + 
const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f) + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0)); + return V3SetY(r, V3GetX(b)); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c) + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1)); + return V3SetY(r, V3GetY(b)); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2)); + return V3SetY(r, V3GetZ(b)); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return V3Load(0.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_div_ps(a, b)); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_mul_ps(a, _mm_rcp_ps(b))); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3RecipFast(const 
Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rcp_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), _mm_sqrt_ps(a)); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rsqrt_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Scale(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Scale(a, b)); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Mul(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Mul(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Max(a, V3Neg(a)); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); +#ifdef __SSE4_2__ + return _mm_dp_ps(a, b, 0x7f); +#else + const __m128 t0 = _mm_mul_ps(a, b); // aw*bw | az*bz | ay*by | ax*bx + const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); // ay*by | ax*bx | aw*bw | az*bz + const __m128 t2 = _mm_add_ps(t0, t1); // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx + const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by + return _mm_add_ps(t3, t2); // ax*bx + az*bz + ay*by + aw*bw + // ay*by + aw*bw + ax*bx + az*bz + // az*bz + ax*bx + aw*bw + ay*by + // aw*bw + ay*by + az*bz + ax*bx +#endif +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2)); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + VecCrossV v; + v.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + v.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + return v; +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(b); + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(a.mL1, l2), _mm_mul_ps(a.mR1, r2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 r2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + 
const __m128 l2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(b.mR1, r2), _mm_mul_ps(b.mL1, l2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b) +{ + return _mm_sub_ps(_mm_mul_ps(a.mL1, b.mR1), _mm_mul_ps(a.mR1, b.mL1)); +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_sqrt_ps(V3Dot(a, a)); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Dot(a, a); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3ScaleInv(a, _mm_sqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3Scale(a, _mm_rsqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 eps = V3Eps(); + const __m128 length = V3Length(a); + const __m128 isGreaterThanZero = FIsGrtr(length, eps); + return V3Sel(isGreaterThanZero, V3ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + + return _mm_max_ps(_mm_max_ps(shuf1, shuf2), shuf3); +} + +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + + return _mm_min_ps(_mm_min_ps(shuf1, shuf2), shuf3); +} + +// return (a >= 0.0f) ? 
1.0f : -1.0f; +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 one = V3One(); + const __m128 none = V3Neg(one); + return V3Sel(V3IsGrtrOrEq(a, zero), one, none); +} + +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV) +{ + ASSERT_ISVALIDVEC3V(maxV); + ASSERT_ISVALIDVEC3V(minV); + return V3Max(V3Min(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalUnitSSE2Simd::BAllTrue3_R(V4IsEq(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); +#ifdef __SSE4_2__ + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + // return _mm_round_ps(a, 0x0); + const Vec3V half = V3Load(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const Vec3V aRound = V3Sub(V3Add(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +#endif +} + +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V3 = V3Mul(V2, V1); + const Vec3V V5 = V3Mul(V3, V2); + const Vec3V V7 = V3Mul(V5, V2); + const Vec3V V9 = V3Mul(V7, V2); + const Vec3V V11 = V3Mul(V9, V2); + const Vec3V V13 = V3Mul(V11, V2); + const Vec3V V15 = V3Mul(V13, V2); + const Vec3V V17 = V3Mul(V15, V2); + const Vec3V V19 = V3Mul(V17, V2); + const Vec3V V21 = V3Mul(V19, V2); + const Vec3V V23 = V3Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V3, S1, V1); + Result = V3ScaleAdd(V5, S2, Result); + Result = V3ScaleAdd(V7, S3, Result); + Result = V3ScaleAdd(V9, S4, Result); + Result = V3ScaleAdd(V11, S5, Result); + Result = V3ScaleAdd(V13, S6, Result); + Result = V3ScaleAdd(V15, S7, Result); + Result = V3ScaleAdd(V17, S8, Result); + Result = V3ScaleAdd(V19, S9, Result); + Result = V3ScaleAdd(V21, S10, Result); + Result = V3ScaleAdd(V23, S11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V4 = V3Mul(V2, V2); + const Vec3V V6 = V3Mul(V4, V2); + const Vec3V V8 = V3Mul(V4, V4); + const Vec3V V10 = V3Mul(V6, V4); + const Vec3V V12 = V3Mul(V6, V6); + const Vec3V V14 = V3Mul(V8, V6); + const Vec3V V16 = V3Mul(V8, V8); + const Vec3V V18 = V3Mul(V10, V8); + const Vec3V V20 = V3Mul(V10, V10); + const Vec3V V22 = V3Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V2, C1, V3One()); + Result = V3ScaleAdd(V4, C2, Result); + Result = V3ScaleAdd(V6, C3, Result); + Result = V3ScaleAdd(V8, C4, Result); + Result = V3ScaleAdd(V10, C5, Result); + Result = V3ScaleAdd(V12, C6, Result); + Result = V3ScaleAdd(V14, C7, Result); + Result = V3ScaleAdd(V16, C8, Result); + Result = V3ScaleAdd(V18, C9, Result); + Result = V3ScaleAdd(V20, C10, Result); + Result = V3ScaleAdd(V22, C11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0)); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2)); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + // There must be a better way to do this. 
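+ // A possible two-instruction alternative (illustrative sketch, not from the
+ // original source): interleave the x/y lanes of v0 and v1, then shuffle with
+ // zero to build (v1.y, v0.x, 0, 0):
+ // const __m128 lo = _mm_unpacklo_ps(v0, v1); // v0.x, v1.x, v0.y, v1.y
+ // return _mm_shuffle_ps(lo, V3Zero(), _MM_SHUFFLE(0, 0, 0, 3)); // v1.y, v0.x, 0, 0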
+ Vec3V v2 = V3Zero();
+ FloatV y1 = V3GetY(v1);
+ FloatV x0 = V3GetX(v0);
+ v2 = V3SetX(v2, y1);
+ return V3SetY(v2, x0);
+}
+
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+#ifdef __SSE4_2__
+ FloatV r = _mm_hadd_ps(a, a);
+ r = _mm_hadd_ps(r, r);
+ return r;
+#else
+ __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+ __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+ __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+ return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); // x+y+z in every lane
+#endif
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(min);
+ ASSERT_ISVALIDVEC3V(max);
+ const BoolV c = BOr(V3IsGrtr(a, max), V3IsGrtr(min, a));
+ return !BAllEqFFFF(c);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(min);
+ ASSERT_ISVALIDVEC3V(max);
+ const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a));
+ return BAllEqTTTT(c);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(bounds);
+ return V3OutOfBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(bounds);
+ return V3InBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+ ASSERT_ISVALIDVEC3V(col0);
+ ASSERT_ISVALIDVEC3V(col1);
+ ASSERT_ISVALIDVEC3V(col2);
+
+ const Vec3V col3 = _mm_setzero_ps();
+ Vec3V tmp0 = _mm_unpacklo_ps(col0, col1);
+ Vec3V tmp2 = _mm_unpacklo_ps(col2, col3);
+ Vec3V tmp1 = _mm_unpackhi_ps(col0, col1);
+ Vec3V tmp3 = _mm_unpackhi_ps(col2, col3);
+ col0 = _mm_movelh_ps(tmp0, tmp2);
+ col1 = _mm_movehl_ps(tmp2, tmp0);
+ col2 = _mm_movelh_ps(tmp1, tmp3);
+}
+
+//////////////////////////////////
+// VEC4V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+ ASSERT_ISVALIDFLOATV(f);
+ // return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0));
+ return f;
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+ ASSERT_ISVALIDFLOATV(floatVArray[0]);
+ ASSERT_ISVALIDFLOATV(floatVArray[1]);
+ ASSERT_ISVALIDFLOATV(floatVArray[2]);
+ ASSERT_ISVALIDFLOATV(floatVArray[3]);
+ const __m128 xw = _mm_move_ss(floatVArray[1], floatVArray[0]); // y, y, y, x
+ const __m128 yz = _mm_move_ss(floatVArray[2], floatVArray[3]); // z, z, z, w
+ return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+ ASSERT_ISVALIDFLOATV(x);
+ ASSERT_ISVALIDFLOATV(y);
+ ASSERT_ISVALIDFLOATV(z);
+ ASSERT_ISVALIDFLOATV(w);
+ const __m128 xw = _mm_move_ss(y, x); // y, y, y, x
+ const __m128 yz = _mm_move_ss(z, w); // z, z, z, w
+ return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+ const Vec4V xz = _mm_unpackhi_ps(x, z);
+ const Vec4V yw = _mm_unpackhi_ps(y, w);
+ return _mm_unpackhi_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+ const Vec4V xz = _mm_unpackhi_ps(x, z);
+ const Vec4V yw = _mm_unpackhi_ps(y, w);
+ return _mm_unpacklo_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg 
x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const Vec4V xz = _mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpackhi_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const Vec4V xz = _mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpacklo_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpacklo_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpackhi_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + const PX_ALIGN(16, PxF32) w[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; + const __m128 w128 = _mm_load_ps(w); + return w128; +} + +PX_FORCE_INLINE Vec4V V4UnitX() +{ + const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTTF(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return _mm_and_ps(v, V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0)); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +template <PxU8 x, PxU8 y, PxU8 z, PxU8 w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x)); +} + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return V4Load(0.0f); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return V4Load(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f) +{ + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, 
const Vec4V b)
+{
+ return _mm_mul_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return _mm_div_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b)
+{
+ return _mm_div_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return _mm_mul_ps(a, _mm_rcp_ps(b));
+}
+
+PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b)
+{
+ return _mm_mul_ps(a, _mm_rcp_ps(b));
+}
+
+PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a)
+{
+ return _mm_div_ps(V4One(), a);
+}
+
+PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a)
+{
+ return _mm_rcp_ps(a);
+}
+
+PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a)
+{
+ return _mm_div_ps(V4One(), _mm_sqrt_ps(a));
+}
+
+PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a)
+{
+ return _mm_rsqrt_ps(a);
+}
+
+PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a)
+{
+ return _mm_sqrt_ps(a);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return V4Add(V4Scale(a, b), c);
+}
+
+PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return V4Sub(c, V4Scale(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+ return V4Add(V4Mul(a, b), c);
+}
+
+PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+ return V4Sub(c, V4Mul(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a)
+{
+ return V4Max(a, V4Neg(a));
+}
+
+PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a)
+{
+#ifdef __SSE4_2__
+ FloatV r = _mm_hadd_ps(a, a);
+ r = _mm_hadd_ps(r, r);
+ return r;
+#else
+ const Vec4V xy = V4UnpackXY(a, a); // x,x,y,y
+ const Vec4V zw = V4UnpackZW(a, a); // z,z,w,w
+ const Vec4V xz_yw = V4Add(xy, zw); // x+z,x+z,y+w,y+w
+ const FloatV xz = V4GetX(xz_yw); // x+z
+ const FloatV yw = V4GetZ(xz_yw); // y+w
+ return FAdd(xz, yw); // sum
+#endif
+}
+
+PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b)
+{
+#ifdef __SSE4_2__
+ return _mm_dp_ps(a, b, 0xff);
+#else
+ const __m128 dot1 = _mm_mul_ps(a, b); // x,y,z,w
+ const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+ const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+ const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+ return _mm_add_ps(_mm_add_ps(shuf2, shuf3), _mm_add_ps(dot1, shuf1));
+#endif
+}
+
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b)
+{
+#ifdef __SSE4_2__
+ return _mm_dp_ps(a, b, 0x7f);
+#else
+ const __m128 dot1 = _mm_mul_ps(a, b); // x,y,z,w
+ const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+ const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+ const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+ return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); // x+y+z in every lane
+#endif
+}
+
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+ const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+ const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+ const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+ const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+ return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2));
+}
+
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+ return _mm_sqrt_ps(V4Dot(a, a));
+}
+
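+// Illustrative usage sketch (not part of the original header): the dot and
+// length helpers above compose lane-wise; every lane of a FloatV holds the
+// same scalar, so FMul/FDiv (assumed from the FloatV API declared alongside
+// these helpers) act as scalar operations. The name V4CosAngle is
+// hypothetical.
+PX_FORCE_INLINE FloatV V4CosAngle(const Vec4V a, const Vec4V b)
+{
+ ASSERT_ISFINITELENGTH(a);
+ ASSERT_ISFINITELENGTH(b);
+ // cos(theta) = (a . b) / (|a| * |b|)
+ return FDiv(V4Dot(a, b), FMul(V4Length(a), V4Length(b)));
+}
+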
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+ return V4Dot(a, a);
+}
+
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+ ASSERT_ISFINITELENGTH(a);
+ return V4ScaleInv(a, _mm_sqrt_ps(V4Dot(a, a)));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+ ASSERT_ISFINITELENGTH(a);
+ return V4ScaleInvFast(a, _mm_sqrt_ps(V4Dot(a, a)));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue)
+{
+ const __m128 eps = V4Eps();
+ const __m128 length = V4Length(a);
+ const __m128 isGreaterThanZero = V4IsGrtr(length, eps);
+ return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue);
+}
+
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+ return m128_I2F(_mm_cmpeq_epi32(m128_F2I(a), m128_F2I(b)));
+}
+
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+ return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a));
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+ return _mm_cmpgt_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return _mm_cmpge_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+ return _mm_cmpeq_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+ return _mm_max_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+ return _mm_min_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+ const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3));
+ const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
+ const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
+
+ return _mm_max_ps(_mm_max_ps(a, shuf1), _mm_max_ps(shuf2, shuf3));
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+ const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3));
+ const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
+ const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
+
+ return _mm_min_ps(_mm_min_ps(a, shuf1), _mm_min_ps(shuf2, shuf3));
+}
+
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+ return V4Max(V4Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue4_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue4_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue4_R(V4IsEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAnyTrue3_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+#ifdef __SSE4_2__
+ return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#else
+ // return _mm_round_ps(a, 0x0);
+ const Vec4V half = V4Load(0.5f);
+ const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31));
+ const Vec4V aRound = V4Sub(V4Add(a, half), signBit);
+ __m128i tmp = _mm_cvttps_epi32(aRound);
+ return _mm_cvtepi32_ps(tmp);
+#endif
+}
+
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec4V tmp = V4Mul(a, recipTwoPi);
+ const Vec4V b = 
V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V3 = V4Mul(V2, V1); + const Vec4V V5 = V4Mul(V3, V2); + const Vec4V V7 = V4Mul(V5, V2); + const Vec4V V9 = V4Mul(V7, V2); + const Vec4V V11 = V4Mul(V9, V2); + const Vec4V V13 = V4Mul(V11, V2); + const Vec4V V15 = V4Mul(V13, V2); + const Vec4V V17 = V4Mul(V15, V2); + const Vec4V V19 = V4Mul(V17, V2); + const Vec4V V21 = V4Mul(V19, V2); + const Vec4V V23 = V4Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec4V Result; + Result = V4MulAdd(S1, V3, V1); + Result = V4MulAdd(S2, V5, Result); + Result = V4MulAdd(S3, V7, Result); + Result = V4MulAdd(S4, V9, Result); + Result = V4MulAdd(S5, V11, Result); + Result = V4MulAdd(S6, V13, Result); + Result = V4MulAdd(S7, V15, Result); + Result = V4MulAdd(S8, V17, Result); + Result = V4MulAdd(S9, V19, Result); + Result = V4MulAdd(S10, V21, Result); + Result = V4MulAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V4 = V4Mul(V2, V2); + const Vec4V V6 = V4Mul(V4, V2); + const Vec4V V8 = V4Mul(V4, V4); + const Vec4V V10 = V4Mul(V6, V4); + const Vec4V V12 = V4Mul(V6, V6); + const Vec4V V14 = V4Mul(V8, V6); + const Vec4V V16 = V4Mul(V8, V8); + const Vec4V V18 = V4Mul(V10, V8); + const Vec4V V20 = V4Mul(V10, V10); + const Vec4V V22 = V4Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec4V Result; + Result = V4MulAdd(C1, V2, V4One()); + Result = V4MulAdd(C2, V4, Result); + Result = V4MulAdd(C3, V6, Result); + Result = V4MulAdd(C4, V8, Result); + Result = V4MulAdd(C5, V10, Result); + Result = V4MulAdd(C6, V12, Result); + Result = V4MulAdd(C7, V14, Result); + Result = V4MulAdd(C8, V16, Result); + Result = V4MulAdd(C9, V18, Result); + Result = V4MulAdd(C10, V20, Result); + Result = V4MulAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3) +{ + Vec4V tmp0 = _mm_unpacklo_ps(col0, col1); + Vec4V tmp2 = _mm_unpacklo_ps(col2, col3); + Vec4V tmp1 = _mm_unpackhi_ps(col0, col1); + Vec4V tmp3 = _mm_unpackhi_ps(col2, col3); + col0 = _mm_movelh_ps(tmp0, tmp2); + col1 = _mm_movehl_ps(tmp2, tmp0); + col2 = _mm_movelh_ps(tmp1, tmp3); + col3 = _mm_movehl_ps(tmp3, tmp1); +} + +////////////////////////////////// +// BoolV +////////////////////////////////// + +PX_FORCE_INLINE BoolV BFFFF() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE BoolV BFFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fftt=_mm_load_ps((float*)&f); + return fftt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ftft=_mm_load_ps((float*)&f); + return ftft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 fttf=_mm_load_ps((float*)&f); + return fttf;*/ + return m128_I2F(_mm_set_epi32(0, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fttt=_mm_load_ps((float*)&f); + return 
fttt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BTFFF() +{ + // const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + // const __m128 tfff=_mm_load_ps((float*)&f); + // return tfff; + return m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0xFFFFFFFF}; + const __m128 tfft=_mm_load_ps((float*)&f); + return tfft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0}; + const __m128 tftf=_mm_load_ps((float*)&f); + return tftf;*/ + return m128_I2F(_mm_set_epi32(0, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tftt=_mm_load_ps((float*)&f); + return tftt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0}; + const __m128 ttff=_mm_load_ps((float*)&f); + return ttff;*/ + return m128_I2F(_mm_set_epi32(0, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ttft=_mm_load_ps((float*)&f); + return ttft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 tttf=_mm_load_ps((float*)&f); + return tttf;*/ + return m128_I2F(_mm_set_epi32(0, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tttt=_mm_load_ps((float*)&f); + return tttt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + const __m128 tfff=_mm_load_ps((float*)&f); + return tfff;*/ + return m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BYMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BZMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BWMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return V4Sel(BTTTF(), v, f); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return _mm_and_ps(a, b); +} + 
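+// Illustrative sketch (not part of the original header): because the BoolV
+// constants above are full-width lane masks, they compose directly with
+// V4Sel to splice two vectors per component. The helper name below is
+// hypothetical.
+PX_FORCE_INLINE Vec4V V4SelectXYFromA_ZWFromB(const Vec4V a, const Vec4V b)
+{
+ return V4Sel(BTTFF(), a, b); // x,y lanes true -> from a; z,w lanes false -> from b
+}
+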
+PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + const BoolV bAllTrue(BTTTT()); + return _mm_xor_ps(a, bAllTrue); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + return _mm_andnot_ps(b, a); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return _mm_or_ps(a, b); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + const BoolV bTest = m128_I2F(_mm_cmpeq_epi32(m128_F2I(a), m128_F2I(b))); + return internalUnitSSE2Simd::BAllTrue4_R(bTest); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==15); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==0); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3Dot(a.col0, b); + const FloatV y = V3Dot(a.col1, b); + const FloatV z = V3Dot(a.col2, b); + return V3Merge(x, y, z); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b) +{ + return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b)); +} + +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a) +{ + const BoolV tfft = BTFFT(); + 
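+ // Adjugate (Cramer) form of the 3x3 inverse: for columns (c0, c1, c2), the
+ // rows of the inverse are (c1 x c2), (c2 x c0) and (c0 x c1), each divided
+ // by det = (c0 x c1) . c2. The unpacks/shuffles below transpose those three
+ // cross products into the output columns; note that _mm_rcp_ps yields only
+ // an approximate (~12-bit) reciprocal of the determinant, traded for speed.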
const BoolV tttf = BTTTF();
+ const FloatV zero = FZero();
+ const Vec3V cross01 = V3Cross(a.col0, a.col1);
+ const Vec3V cross12 = V3Cross(a.col1, a.col2);
+ const Vec3V cross20 = V3Cross(a.col2, a.col0);
+ const FloatV dot = V3Dot(cross01, a.col2);
+ const FloatV invDet = _mm_rcp_ps(dot);
+ const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01);
+ const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01);
+ Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20);
+ colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0));
+ const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2));
+ const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0));
+ const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(tfft, pbwp), _mm_and_ps(tfft, zppd));
+ const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0));
+ const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0));
+ const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd));
+
+ return Mat33V(_mm_mul_ps(colInv0, invDet), _mm_mul_ps(colInv1, invDet), _mm_mul_ps(colInv2, invDet));
+}
+
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+ return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)),
+ V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)),
+ V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2)));
+}
+
+PX_FORCE_INLINE Mat33V M33Identity()
+{
+ return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ());
+}
+
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+ return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+ return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+ return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v)
+{
+ const BoolV bTFFF = BTFFF();
+ const BoolV bFTFF = BFTFF();
+ const BoolV bFFTF = BFFTF();
+
+ const Vec3V zero = V3Zero();
+
+ return Mat33V(V3Sel(bTFFF, v, zero), V3Sel(bFTFF, v, zero), V3Sel(bFFTF, v, zero));
+}
+
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+ const Vec3V x = V3Mul(V3UnitX(), d);
+ const Vec3V y = V3Mul(V3UnitY(), d);
+ const Vec3V z = V3Mul(V3UnitZ(), d);
+ return Mat33V(x, y, z);
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3GetX(b);
+ const FloatV y = V3GetY(b);
+ const FloatV z = V3GetZ(b);
+ const Vec3V v0 = V3Scale(a.col0, x);
+ const Vec3V v1 = V3Scale(a.col1, y);
+ const Vec3V v2 = V3Scale(a.col2, z);
+ const Vec3V v0PlusV1 = V3Add(v0, v1);
+ const Vec3V v0PlusV1Plusv2 = V3Add(v0PlusV1, v2);
+ return V3Add(v0PlusV1Plusv2, a.col3);
+}
+
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3GetX(b);
+ const FloatV y = V3GetY(b);
+ const FloatV z = V3GetZ(b);
+ const Vec3V v0 = V3Scale(a.col0, x);
+ const Vec3V v1 = V3Scale(a.col1, y);
+ const Vec3V v2 = V3Scale(a.col2, z);
+ const Vec3V v0PlusV1 = V3Add(v0, v1);
+ return V3Add(v0PlusV1, v2);
+}
+
+PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3Dot(a.col0, b);
+ const FloatV y = V3Dot(a.col1, b);
+ const FloatV z = V3Dot(a.col2, b);
+ return V3Merge(x, y, z);
+}
+
+PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b)
+{
+ return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, 
b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)), + V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)), + V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2))); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + const FloatV x = V4GetX(b); + const FloatV y = V4GetY(b); + const FloatV z = V4GetZ(b); + const FloatV w = V4GetW(b); + + const Vec4V v0 = V4Scale(a.col0, x); + const Vec4V v1 = V4Scale(a.col1, y); + const Vec4V v2 = V4Scale(a.col2, z); + const Vec4V v3 = V4Scale(a.col3, w); + const Vec4V v0PlusV1 = V4Add(v0, v1); + const Vec4V v0PlusV1Plusv2 = V4Add(v0PlusV1, v2); + return V4Add(v0PlusV1Plusv2, v3); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + PX_ALIGN(16, FloatV) dotProdArray[4] = { V4Dot(a.col0, b), V4Dot(a.col1, b), V4Dot(a.col2, b), V4Dot(a.col3, b) }; + return V4Merge(dotProdArray); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + const Vec4V v0 = _mm_unpacklo_ps(a.col0, a.col2); + const Vec4V v1 = _mm_unpackhi_ps(a.col0, a.col2); + const Vec4V v2 = _mm_unpacklo_ps(a.col1, a.col3); + const Vec4V v3 = _mm_unpackhi_ps(a.col1, a.col3); + return Mat44V(_mm_unpacklo_ps(v0, v2), _mm_unpackhi_ps(v0, v2), _mm_unpacklo_ps(v1, v3), _mm_unpackhi_ps(v1, v3)); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + __m128 minor0, minor1, minor2, minor3; + __m128 row0, row1, row2, row3; + __m128 det, tmp1; + + tmp1 = V4Zero(); + row1 = V4Zero(); + row3 = V4Zero(); + + row0 = a.col0; + row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2)); + row2 = a.col2; + row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2)); + + tmp1 = _mm_mul_ps(row2, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_mul_ps(row1, tmp1); + minor1 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); + minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); + minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); + + tmp1 = _mm_mul_ps(row1, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); + minor3 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); + minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); + + tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 
0xB1); + row2 = _mm_shuffle_ps(row2, row2, 0x4E); + minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); + minor2 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); + minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); + + tmp1 = _mm_mul_ps(row0, row1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); + + tmp1 = _mm_mul_ps(row0, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); + minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); + + tmp1 = _mm_mul_ps(row0, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); + + det = _mm_mul_ps(row0, minor0); + det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); + det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); + tmp1 = _mm_rcp_ss(det); +#if 0 + det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); + det = _mm_shuffle_ps(det, det, 0x00); +#else + det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0)); +#endif + + minor0 = _mm_mul_ps(det, minor0); + minor1 = _mm_mul_ps(det, minor1); + minor2 = _mm_mul_ps(det, minor2); + minor3 = _mm_mul_ps(det, minor3); + Mat44V invTrans(minor0, minor1, minor2, minor3); + return M44Trnsps(invTrans); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) +{ + return _mm_set_ps(w, z, y, x); +} + +/* +// AP: work in progress - use proper SSE intrinsics where possible +PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b) +{ + VecU16V result; + result.m128_u16[0] = PxU16(PxClamp<PxU32>((a).m128_u32[0], 0, 0xFFFF)); + result.m128_u16[1] = PxU16(PxClamp<PxU32>((a).m128_u32[1], 0, 0xFFFF)); + result.m128_u16[2] = PxU16(PxClamp<PxU32>((a).m128_u32[2], 0, 0xFFFF)); + result.m128_u16[3] = PxU16(PxClamp<PxU32>((a).m128_u32[3], 0, 0xFFFF)); + result.m128_u16[4] = PxU16(PxClamp<PxU32>((b).m128_u32[0], 0, 0xFFFF)); + result.m128_u16[5] = PxU16(PxClamp<PxU32>((b).m128_u32[1], 0, 0xFFFF)); + result.m128_u16[6] = PxU16(PxClamp<PxU32>((b).m128_u32[2], 0, 0xFFFF)); + result.m128_u16[7] = PxU16(PxClamp<PxU32>((b).m128_u32[3], 0, 0xFFFF)); + return result; +} +*/ + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return m128_I2F(_mm_or_si128(_mm_andnot_si128(m128_F2I(c), m128_F2I(b)), _mm_and_si128(m128_F2I(c), m128_F2I(a)))); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return m128_I2F(_mm_or_si128(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return m128_I2F(_mm_xor_si128(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return m128_I2F(_mm_and_si128(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + return 
m128_I2F(_mm_andnot_si128(m128_F2I(b), m128_F2I(a))); +} + +/* +PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_or_si128(m128_F2I(a), m128_F2I(b))); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_and_si128(m128_F2I(a), m128_F2I(b))); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_andnot_si128(m128_F2I(b), m128_F2I(a))); +} +*/ + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return m128_F2I(_mm_load1_ps(reinterpret_cast<const PxF32*>(&i))); +} + +PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) +{ + return m128_F2I(_mm_loadu_ps(reinterpret_cast<const PxF32*>(i))); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + return m128_F2I(_mm_load_ps(reinterpret_cast<const PxF32*>(i))); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_add_epi32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_sub_epi32(a, b); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return m128_I2F(_mm_cmpgt_epi32(a, b)); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return m128_I2F(_mm_cmpeq_epi32(a, b)); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return _mm_or_si128(_mm_andnot_si128(m128_F2I(c), b), _mm_and_si128(m128_F2I(c), a)); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return _mm_setzero_si128(); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return I4Load(1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return I4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return I4Load(-1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return U4Load(0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return U4Load(1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return U4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + return _mm_or_si128(_mm_andnot_si128(m128_F2I(c), b), _mm_and_si128(m128_F2I(c), a)); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + VecShiftV s; + s.shift = VecI32V_Sel(BTFFF(), shift, VecI32V_Zero()); + return s; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return _mm_sll_epi32(a, count.shift); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return _mm_srl_epi32(a, count.shift); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_and_si128(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_or_si128(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(0, 0, 0, 0))); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(2, 2, 2, 2))); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(3, 3, 3, 3))); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + _mm_store_ss(reinterpret_cast<PxF32*>(i), 
m128_I2F(a)); +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg x, const VecI32VArg y, const VecI32VArg z, const VecI32VArg w) +{ + const __m128 xw = _mm_move_ss(m128_I2F(y), m128_I2F(x)); // y, y, y, x + const __m128 yz = _mm_move_ss(m128_I2F(z), m128_I2F(w)); // z, z, z, w + return m128_F2I(_mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0))); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) +{ + return m128_F2I(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +/* +template<int a> PX_FORCE_INLINE VecI32V V4ISplat() +{ + VecI32V result; + result.m128_i32[0] = a; + result.m128_i32[1] = a; + result.m128_i32[2] = a; + result.m128_i32[3] = a; + return result; +} + +template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat() +{ + VecU32V result; + result.m128_u32[0] = a; + result.m128_u32[1] = a; + result.m128_u32[2] = a; + result.m128_u32[3] = a; + return result; +} +*/ + +/* +PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address) +{ + *address = val; +} +*/ + +PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + *address = val; +} + +PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) +{ + return V4LoadU(reinterpret_cast<float*>(addr)); +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + VecU32V result32(a); + result32 = V4U32Andc(result32, b); + return Vec4V(result32); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return V4IsGrtr(a, b); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + // _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately + // return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b))); + VecU16V result; + result.m128_u16[0] = (a).m128_u16[0] > (b).m128_u16[0]; + result.m128_u16[1] = (a).m128_u16[1] > (b).m128_u16[1]; + result.m128_u16[2] = (a).m128_u16[2] > (b).m128_u16[2]; + result.m128_u16[3] = (a).m128_u16[3] > (b).m128_u16[3]; + result.m128_u16[4] = (a).m128_u16[4] > (b).m128_u16[4]; + result.m128_u16[5] = (a).m128_u16[5] > (b).m128_u16[5]; + result.m128_u16[6] = (a).m128_u16[6] > (b).m128_u16[6]; + result.m128_u16[7] = (a).m128_u16[7] > (b).m128_u16[7]; + return result; +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + Vec4V result = V4LoadXYZW(PxF32(a.m128_u32[0]), PxF32(a.m128_u32[1]), PxF32(a.m128_u32[2]), PxF32(a.m128_u32[3])); + return result; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V in) +{ + return _mm_cvtepi32_ps(in); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + return _mm_cvttps_epi32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + return Vec4V(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + return m128_I2F(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return VecU32V(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return m128_F2I(a); +} + +/* +template<int index> PX_FORCE_INLINE BoolV BSplatElement(BoolV a) +{ + BoolV result; + result[0] = result[1] = result[2] = result[3] = a[index]; + return result; +} +*/ + +template <int index> +BoolV 
BSplatElement(BoolV a) +{ + float* data = reinterpret_cast<float*>(&a); + return V4Load(data[index]); +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + VecU32V result; + result.m128_u32[0] = result.m128_u32[1] = result.m128_u32[2] = result.m128_u32[3] = a.m128_u32[index]; + return result; +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + float* data = reinterpret_cast<float*>(&a); + return V4Load(data[index]); +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + VecU32V result; + result.m128_u32[0] = x; + result.m128_u32[1] = y; + result.m128_u32[2] = z; + result.m128_u32[3] = w; + return result; +} + +PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in) +{ + UnionM128 a(in); + return V4LoadXYZW(PxCeil(a.m128_f32[0]), PxCeil(a.m128_f32[1]), PxCeil(a.m128_f32[2]), PxCeil(a.m128_f32[3])); +} + +PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in) +{ + UnionM128 a(in); + return V4LoadXYZW(PxFloor(a.m128_f32[0]), PxFloor(a.m128_f32[1]), PxFloor(a.m128_f32[2]), PxFloor(a.m128_f32[3])); +} + +PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power) +{ + PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate"); + PX_UNUSED(power); // prevent warning in release builds + PxF32 ffffFFFFasFloat = PxF32(0xFFFF0000); + UnionM128 a(in); + VecU32V result; + result.m128_u32[0] = PxU32(PxClamp<PxF32>((a).m128_f32[0], 0.0f, ffffFFFFasFloat)); + result.m128_u32[1] = PxU32(PxClamp<PxF32>((a).m128_f32[1], 0.0f, ffffFFFFasFloat)); + result.m128_u32[2] = PxU32(PxClamp<PxF32>((a).m128_f32[2], 0.0f, ffffFFFFasFloat)); + result.m128_u32[3] = PxU32(PxClamp<PxF32>((a).m128_f32[3], 0.0f, ffffFFFFasFloat)); + return result; +} + +#endif // PSFOUNDATION_PSUNIXSSE2INLINEAOS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsAoS.h b/PxShared/src/foundation/include/windows/PsWindowsAoS.h new file mode 100644 index 00000000..dd4288d5 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsAoS.h @@ -0,0 +1,131 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSAOS_H +#define PSFOUNDATION_PSWINDOWSAOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +typedef __m128 FloatV; +typedef __m128 Vec3V; +typedef __m128 Vec4V; +typedef __m128 BoolV; +typedef __m128 VecU32V; +typedef __m128 VecI32V; +typedef __m128 VecU16V; +typedef __m128 VecI16V; +typedef __m128 QuatV; + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define QuatVArg QuatV & + +// Optimization for situations in which you cross product multiple vectors with the same vector. +// Avoids 2X shuffles per product +struct VecCrossV +{ + Vec3V mL1; + Vec3V mR1; +}; + +struct VecShiftV +{ + VecI32V shift; +}; +#define VecShiftVArg VecShiftV & + +PX_ALIGN_PREFIX(16) +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); + Vec3V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); + Vec4V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +#endif // PSFOUNDATION_PSWINDOWSAOS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsFPU.h b/PxShared/src/foundation/include/windows/PsWindowsFPU.h new file mode 100644 index 00000000..28694885 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsFPU.h @@ -0,0 +1,51 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSFPU_H +#define PSFOUNDATION_PSWINDOWSFPU_H + +PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard() +{ +#if !PX_ARM + mControlWord = _mm_getcsr(); + // set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6)) + _mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6)); +#endif +} + +PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard() +{ +#if !PX_ARM + // restore control word and clear any exception flags + // (setting exception state flags cause exceptions on the first following fp operation) + _mm_setcsr(mControlWord & ~_MM_EXCEPT_MASK); +#endif +} + +#endif // #ifndef PSFOUNDATION_PSWINDOWSFPU_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsInclude.h b/PxShared/src/foundation/include/windows/PsWindowsInclude.h new file mode 100644 index 00000000..4b4fd2f9 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsInclude.h @@ -0,0 +1,96 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
+#define PSFOUNDATION_PSWINDOWSINCLUDE_H
+
+#include "Ps.h"
+
+#ifndef _WIN32
+#error "This file should only be included by Windows builds!!"
+#endif
+
+#ifdef _WINDOWS_ // windows already included
+#error "Only include windows.h through this file!!"
+#endif
+
+// We only support >= Windows XP, and we need this for critical sections and the Interlocked* functions.
+#define _WIN32_WINNT 0x0501
+
+// Turn off as much as we can for windows. All we really need are the thread functions (critical sections/Interlocked*,
+// etc.)
+#define NOGDICAPMASKS
+#define NOVIRTUALKEYCODES
+#define NOWINMESSAGES
+#define NOWINSTYLES
+#define NOSYSMETRICS
+#define NOMENUS
+#define NOICONS
+#define NOKEYSTATES
+#define NOSYSCOMMANDS
+#define NORASTEROPS
+#define NOSHOWWINDOW
+#define NOATOM
+#define NOCLIPBOARD
+#define NOCOLOR
+#define NOCTLMGR
+#define NODRAWTEXT
+#define NOGDI
+#define NOMB
+#define NOMEMMGR
+#define NOMETAFILE
+#define NOMINMAX
+#define NOOPENFILE
+#define NOSCROLL
+#define NOSERVICE
+#define NOSOUND
+#define NOTEXTMETRIC
+#define NOWH
+#define NOWINOFFSETS
+#define NOCOMM
+#define NOKANJI
+#define NOHELP
+#define NOPROFILER
+#define NODEFERWINDOWPOS
+#define NOMCX
+#define WIN32_LEAN_AND_MEAN
+#define NOUSER
+#define NONLS
+#define NOMSG
+
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#include <windows.h>
+#pragma warning(pop)
+
+#if PX_SSE2
+#include <xmmintrin.h>
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h b/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h
new file mode 100644
index 00000000..5bfb62f7
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h
@@ -0,0 +1,3119 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSINLINEAOS_H +#define PSFOUNDATION_PSWINDOWSINLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +#include "../PsVecMathSSE.h" + +////////////////////////////////////////////////////////////////////// +//Test that Vec3V and FloatV are legal +////////////////////////////////////////////////////////////////////// + +#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f +PX_FORCE_INLINE bool isValidFloatV(const FloatV a) +{ + const PxF32 x = V4ReadX(a); + const PxF32 y = V4ReadY(a); + const PxF32 z = V4ReadZ(a); + const PxF32 w = V4ReadW(a); + + if ( + (PxAbs(x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - z) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - w) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + + if ( + (PxAbs((x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - z) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - w) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + return false; +} + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + //using _mm_comieq_ss to do the comparison doesn't work for NaN. + PX_ALIGN(16, PxF32 f[4]); + V4StoreA((const Vec4V&)a, f); + return f[3] == 0.0f; +} + +PX_FORCE_INLINE bool isFiniteLength(const Vec3V a) +{ + return !FAllEq(V4LengthSq(a), FZero()); +} + +PX_FORCE_INLINE bool isAligned16(void* a) +{ + return(0 == ((size_t)a & 0x0f)); +} + +//ASSERT_FINITELENGTH is deactivated because there is a lot of code that calls a simd normalisation function with zero length but then ignores the result. 
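+
+// A minimal usage sketch of the validity helpers above, assuming the load and
+// length functions defined later in this file; these checks are what the
+// ASSERT_* macros below reduce to in PX_DEBUG builds:
+//
+//     const Vec3V v = V3LoadU(PxVec3(1.0f, 2.0f, 3.0f)); // w lane cleared to zero
+//     PX_ASSERT(isValidVec3V(v));                        // passes: w == 0.0f
+//     const FloatV len = V3Length(v);                    // length splatted to all four lanes
+//     PX_ASSERT(isValidFloatV(len));                     // passes: all lanes (nearly) equal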
+ +#if PX_DEBUG +#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a)) +#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a)) +#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16((void*)a)) +#define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a)) +#else +#define ASSERT_ISVALIDVEC3V(a) +#define ASSERT_ISVALIDFLOATV(a) +#define ASSERT_ISALIGNED16(a) +#define ASSERT_ISFINITELENGTH(a) +#endif +///////////////////////////////////////////////////////////////////// +////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////// +// USED ONLY INTERNALLY +////////////////////////////////////////////////////////////////////// + +namespace internalWindowsSimd +{ +PX_FORCE_INLINE __m128 m128_I2F(__m128i n) +{ + return _mm_castsi128_ps(n); +} + +PX_FORCE_INLINE __m128i m128_F2I(__m128 n) +{ + return _mm_castps_si128(n); +} + +PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask == 0xf); +} + +PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32((moveMask & 0x7) == 0x7); +} + +PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask != 0x0); +} + +PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(((moveMask & 0x7) != 0x0)); +} + +PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b) +{ + // This is a bit of a bodge. + //_mm_comieq_ss returns 1 if either value is nan so we need to re-cast a and b with true encoded as a non-nan + // number. + // There must be a better way of doing this in sse. + const BoolV one = FOne(); + const BoolV zero = FZero(); + const BoolV a1 = V4Sel(a, one, zero); + const BoolV b1 = V4Sel(b, one, zero); + return (PxU32( + _mm_comieq_ss(a1, b1) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(1, 1, 1, 1))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(2, 2, 2, 2))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 3, 3, 3)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(3, 3, 3, 3))))); +} + +PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) ? 
true : false; +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero())); +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), FZero())); +} + +const PX_ALIGN(16, PxU32 gMaskXYZ[4]) = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; +} //internalWindowsSimd + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + const float f = 1.0f; + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comieq_ss(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + return V3AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return V4AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return internalWindowsSimd::BAllTrue4_R(VecI32V_IsEq(a, b)) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsEqU32(a, b)) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + BoolV c = internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); + return internalWindowsSimd::BAllTrue4_R(c) != 0; +} + +#define VECMATH_AOS_EPSILON (1e-3f) +static const FloatV minFError = FLoad(-VECMATH_AOS_EPSILON); +static const FloatV maxFError = FLoad(VECMATH_AOS_EPSILON); +static const Vec3V minV3Error = V3Load(-VECMATH_AOS_EPSILON); +static const Vec3V maxV3Error = V3Load(VECMATH_AOS_EPSILON); +static const Vec4V minV4Error = V4Load(-VECMATH_AOS_EPSILON); +static const Vec4V maxV4Error = V4Load(VECMATH_AOS_EPSILON); + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + const FloatV c = FSub(a, b); + return _mm_comigt_ss(c, minFError) && _mm_comilt_ss(c, maxFError); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + const Vec3V c = V3Sub(a, b); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minV3Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxV3Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), minV3Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxV3Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minV3Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxV3Error)); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const Vec4V c = V4Sub(a, b); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxV4Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, 
_MM_SHUFFLE(1, 1, 1, 1)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxV4Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxV4Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), maxV4Error)); +} +} //_VecMathTests + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + PxF32 f; + FStore(a, &f); + return PxIsFinite(f); + /* + const PxU32 badNumber = (_FPCLASS_SNAN | _FPCLASS_QNAN | _FPCLASS_NINF | _FPCLASS_PINF); + const FloatV vBadNum = FloatV_From_F32((PxF32&)badNumber); + const BoolV vMask = BAnd(vBadNum, a); + return FiniteTestEq(vMask, BFFFF()) == 1; + */ +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + PX_ALIGN(16, PxF32 f[4]); + V4StoreA((Vec4V&)a, f); + return PxIsFinite(f[0]) && PxIsFinite(f[1]) && PxIsFinite(f[2]); + + /* + const PxU32 badNumber = (_FPCLASS_SNAN | _FPCLASS_QNAN | _FPCLASS_NINF | _FPCLASS_PINF); + const Vec3V vBadNum = Vec3V_From_F32((PxF32&)badNumber); + const BoolV vMask = BAnd(BAnd(vBadNum, a), BTTTF()); + return FiniteTestEq(vMask, BFFFF()) == 1; + */ +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + PX_ALIGN(16, PxF32 f[4]); + V4StoreA(a, f); + return PxIsFinite(f[0]) && PxIsFinite(f[1]) && PxIsFinite(f[2]) && PxIsFinite(f[3]); + + /* + const PxU32 badNumber = (_FPCLASS_SNAN | _FPCLASS_QNAN | _FPCLASS_NINF | _FPCLASS_PINF); + const Vec4V vBadNum = Vec4V_From_U32((PxF32&)badNumber); + const BoolV vMask = BAnd(vBadNum, a); + + return FiniteTestEq(vMask, BFFFF()) == 1; + */ +} + +///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + return _mm_set_ps(0.0f, f, f, f); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ + const PxU32 i = PxU32(-(PxI32)f); + return _mm_load1_ps((float*)&i); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + return _mm_and_ps(_mm_load_ps(&f.x), reinterpret_cast<const Vec4V&>(internalWindowsSimd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +// w component of result is undefined +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + return _mm_load_ps(&f.x); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return V4ClearW(_mm_load_ps(f)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i) +{ + return _mm_set_ps(0.0f, i[2], i[1], i[0]); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) +{ + return V4ClearW(v); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) +{ + return v; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + return f; // ok if it is implemented as the same type. 
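+	// (FloatV, Vec3V and Vec4V are all typedefs of __m128 on this platform,
+	// see PsWindowsAoS.h above, so these conversions compile away; only the
+	// convention for the unused w lane differs.)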
+} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return f; +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + return Vec3V_From_Vec4V(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + return Vec3V_From_Vec4V_WUndefined(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return _mm_load_ps(f); +} + +PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps(f, a); +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + _mm_storeu_ps(f, a); +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps((PxF32*)f, a); +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + _mm_store_ps((PxF32*)u, uv); +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + ASSERT_ISALIGNED16(i); + _mm_store_ps((PxF32*)i, iv); +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return _mm_loadu_ps(f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + const PX_ALIGN(16, PxU32 b[4]) = { PxU32(-(PxI32)f[0]), PxU32(-(PxI32)f[1]), + PxU32(-(PxI32)f[2]), PxU32(-(PxI32)f[3]) }; + return _mm_load_ps((float*)&b); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + ASSERT_ISVALIDFLOATV(a); + _mm_store_ss(f, a); +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32 f2[4]); + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) +{ + _mm_store_ss((PxF32*)b2, b); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + PX_ALIGN(16, PxF32 f2[4]); + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + ASSERT_ISALIGNED16(&out); + V3StoreU(m.col0, out.column0); + V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +////////////////////////////////// +// FLOATV +////////////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV IZero() +{ + const PxU32 zero = 0; + return _mm_load1_ps((PxF32*)&zero); +} + +PX_FORCE_INLINE FloatV IOne() +{ + const PxU32 one = 1; + return _mm_load1_ps((PxF32*)&one); +} + +PX_FORCE_INLINE FloatV ITwo() +{ + const PxU32 two = 2; + return _mm_load1_ps((PxF32*)&two); +} + +PX_FORCE_INLINE FloatV IThree() +{ + const PxU32 three = 3; + return _mm_load1_ps((PxF32*)&three); +} + +PX_FORCE_INLINE FloatV IFour() +{ + const PxU32 four = 4; + return _mm_load1_ps((PxF32*)&four); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const 
FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), a); +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + return _mm_rcp_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), _mm_sqrt_ps(a)); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_sqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_rsqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FAdd(FMul(a, b), c); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FSub(c, FMul(a, b)); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + PX_ALIGN(16, const static PxU32 absMask[4]) = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF }; + return _mm_and_ps(a, _mm_load_ps((PxF32*)absMask)); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c, BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c, BFFFF())); + ASSERT_ISVALIDFLOATV(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + ASSERT_ISVALIDFLOATV(minV); + ASSERT_ISVALIDFLOATV(maxV); + return _mm_max_ps(_mm_min_ps(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return PxU32(_mm_comigt_ss(a, b)); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return PxU32(_mm_comige_ss(a, b)); +} + +PX_FORCE_INLINE PxU32 
FAllEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return PxU32(_mm_comieq_ss(a, b)); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + // return _mm_round_ps(a, 0x0); + const FloatV half = FLoad(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const FloatV aRound = FSub(FAdd(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V3 = FMul(V2, V1); + const FloatV V5 = FMul(V3, V2); + const FloatV V7 = FMul(V5, V2); + const FloatV V9 = FMul(V7, V2); + const FloatV V11 = FMul(V9, V2); + const FloatV V13 = FMul(V11, V2); + const FloatV V15 = FMul(V13, V2); + const FloatV V17 = FMul(V15, V2); + const FloatV V19 = FMul(V17, V2); + const FloatV V21 = FMul(V19, V2); + const FloatV V23 = FMul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + FloatV Result; + Result = FScaleAdd(S1, V3, V1); + Result = FScaleAdd(S2, V5, Result); + Result = FScaleAdd(S3, V7, Result); + Result = FScaleAdd(S4, V9, Result); + Result = FScaleAdd(S5, V11, Result); + Result = FScaleAdd(S6, V13, Result); + Result = FScaleAdd(S7, V15, Result); + Result = FScaleAdd(S8, V17, Result); + Result = FScaleAdd(S9, V19, Result); + Result = FScaleAdd(S10, V21, Result); + Result = FScaleAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V4 = FMul(V2, V2); + const FloatV V6 = FMul(V4, V2); + const FloatV V8 = FMul(V4, V4); + const FloatV V10 = FMul(V6, V4); + const FloatV V12 = FMul(V6, V6); + const FloatV V14 = FMul(V8, V6); + const FloatV V16 = FMul(V8, V8); + const FloatV V18 = FMul(V10, V8); + const FloatV V20 = FMul(V10, V10); + const FloatV V22 = FMul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + FloatV Result; + Result = FScaleAdd(C1, V2, V4One()); + Result = FScaleAdd(C2, V4, Result); + Result = FScaleAdd(C3, V6, Result); + Result = FScaleAdd(C4, V8, Result); + Result = FScaleAdd(C5, V10, Result); + Result = FScaleAdd(C6, V12, Result); + Result = FScaleAdd(C7, V14, Result); + Result = FScaleAdd(C8, V16, Result); + Result = FScaleAdd(C9, V18, Result); + Result = FScaleAdd(C10, V20, Result); + Result = FScaleAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + const BoolV c = BOr(FIsGrtr(a, max), FIsGrtr(min, a)); + return PxU32(!BAllEqFFFF(c)); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + const BoolV c = BAnd(FIsGrtrOrEq(a, min), FIsGrtrOrEq(max, a)); + return BAllEqTTTT(c); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FOutOfBounds(a, FNeg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FInBounds(a, FNeg(bounds), bounds); +} + +////////////////////////////////// +// VEC3V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + const __m128 zero = V3Zero(); + const __m128 fff0 = _mm_move_ss(f, zero); + return _mm_shuffle_ps(fff0, fff0, _MM_SHUFFLE(0, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + // static on zero causes compiler crash on x64 debug_opt + const __m128 zero = V3Zero(); + const __m128 xy = _mm_move_ss(x, y); + const __m128 z0 = _mm_move_ss(zero, z); + + return _mm_shuffle_ps(xy, z0, _MM_SHUFFLE(1, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + const PX_ALIGN(16, PxF32 x[4]) = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + const PX_ALIGN(16, PxF32 y[4]) = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec3V V3UnitZ() 
+{ + const PX_ALIGN(16, PxF32 z[4]) = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0)); + return V3SetY(r, V3GetX(b)); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1)); + return V3SetY(r, V3GetY(b)); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2)); + return V3SetY(r, V3GetZ(b)); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_div_ps(a, b)); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_mul_ps(a, _mm_rcp_ps(b))); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V 
V3RecipFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rcp_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), _mm_sqrt_ps(a)); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rsqrt_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Scale(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Scale(a, b)); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Mul(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Mul(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Max(a, V3Neg(a)); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + + const __m128 t0 = _mm_mul_ps(a, b); // aw*bw | az*bz | ay*by | ax*bx + const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); // ay*by | ax*bx | aw*bw | az*bz + const __m128 t2 = _mm_add_ps(t0, t1); // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx + const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by + return _mm_add_ps(t3, t2); // ax*bx + az*bz + ay*by + aw*bw + // ay*by + aw*bw + ax*bx + az*bz + // az*bz + ax*bx + aw*bw + ay*by + // aw*bw + ay*by + az*bz + ax*bx +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2)); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + VecCrossV v; + v.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + v.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + return v; +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(b); + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(a.mL1, l2), _mm_mul_ps(a.mR1, r2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 r2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(a, a, 
_MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(b.mR1, r2), _mm_mul_ps(b.mL1, l2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b) +{ + return _mm_sub_ps(_mm_mul_ps(a.mL1, b.mR1), _mm_mul_ps(a.mR1, b.mL1)); +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_sqrt_ps(V3Dot(a, a)); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Dot(a, a); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3ScaleInv(a, _mm_sqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3Scale(a, _mm_rsqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 eps = FEps(); + const __m128 length = V3Length(a); + const __m128 isGreaterThanZero = FIsGrtr(length, eps); + return V3Sel(isGreaterThanZero, V3ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + return _mm_max_ps(_mm_max_ps(shuf1, shuf2), shuf3); +} + +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + return _mm_min_ps(_mm_min_ps(shuf1, shuf2), shuf3); +} + +//// if(a > 0.0f) return 1.0f; else if a == 0.f return 0.f, else return -1.f; +// PX_FORCE_INLINE Vec3V V3MathSign(const Vec3V a) +//{ +// VECMATHAOS_ASSERT(isValidVec3V(a)); +// +// const __m128i ai = _mm_cvtps_epi32(a); +// const __m128i bi = _mm_cvtps_epi32(V3Neg(a)); +// const __m128 aa = _mm_cvtepi32_ps(_mm_srai_epi32(ai, 31)); +// const __m128 bb = _mm_cvtepi32_ps(_mm_srai_epi32(bi, 31)); +// return _mm_or_ps(aa, bb); +//} + +// return (a >= 0.0f) ? 
1.0f : -1.0f; +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 one = V3One(); + const __m128 none = V3Neg(one); + return V3Sel(V3IsGrtrOrEq(a, zero), one, none); +} + +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(minV); + ASSERT_ISVALIDVEC3V(maxV); + return V3Max(V3Min(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalWindowsSimd::BAllTrue3_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalWindowsSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalWindowsSimd::BAllTrue3_R(V4IsEq(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // return _mm_round_ps(a, 0x0); + const Vec3V half = V3Load(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const Vec3V aRound = V3Sub(V3Add(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +} + +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V3 = V3Mul(V2, V1); + const Vec3V V5 = V3Mul(V3, V2); + const Vec3V V7 = V3Mul(V5, V2); + const Vec3V V9 = V3Mul(V7, V2); + const Vec3V V11 = V3Mul(V9, V2); + const Vec3V V13 = V3Mul(V11, V2); + const Vec3V V15 = V3Mul(V13, V2); + const Vec3V V17 = V3Mul(V15, V2); + const Vec3V V19 = V3Mul(V17, V2); + const Vec3V V21 = V3Mul(V19, V2); + const Vec3V V23 = V3Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V3, S1, V1); + Result = V3ScaleAdd(V5, S2, Result); + Result = V3ScaleAdd(V7, S3, Result); + Result = V3ScaleAdd(V9, S4, Result); + Result = V3ScaleAdd(V11, S5, Result); + Result = V3ScaleAdd(V13, S6, Result); + Result = V3ScaleAdd(V15, S7, Result); + Result = V3ScaleAdd(V17, S8, Result); + Result = V3ScaleAdd(V19, S9, Result); + Result = V3ScaleAdd(V21, S10, Result); + Result = V3ScaleAdd(V23, S11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V4 = V3Mul(V2, V2); + const Vec3V V6 = V3Mul(V4, V2); + const Vec3V V8 = V3Mul(V4, V4); + const Vec3V V10 = V3Mul(V6, V4); + const Vec3V V12 = V3Mul(V6, V6); + const Vec3V V14 = V3Mul(V8, V6); + const Vec3V V16 = V3Mul(V8, V8); + const Vec3V V18 = V3Mul(V10, V8); + const Vec3V V20 = V3Mul(V10, V10); + const Vec3V V22 = V3Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V2, C1, V3One()); + Result = V3ScaleAdd(V4, C2, Result); + Result = V3ScaleAdd(V6, C3, Result); + Result = V3ScaleAdd(V8, C4, Result); + Result = V3ScaleAdd(V10, C5, Result); + Result = V3ScaleAdd(V12, C6, Result); + Result = V3ScaleAdd(V14, C7, Result); + Result = V3ScaleAdd(V16, C8, Result); + Result = V3ScaleAdd(V18, C9, Result); + Result = V3ScaleAdd(V20, C10, Result); + Result = V3ScaleAdd(V22, C11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0)); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2)); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + // There must be a better way to do this. 
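+	// The result is (v1.y, v0.x, 0, 0): two scalar extracts and two lane
+	// selects rather than a single cross-register shuffle.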
+	Vec3V v2 = V3Zero();
+	FloatV y1 = V3GetY(v1);
+	FloatV x0 = V3GetX(v0);
+	v2 = V3SetX(v2, y1);
+	return V3SetY(v2, x0);
+}
+
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+	const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+	const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+	return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV c = BOr(V3IsGrtr(a, max), V3IsGrtr(min, a));
+	return PxU32(!BAllEqFFFF(c));
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a));
+	return BAllEqTTTT(c);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+	return V3OutOfBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+	return V3InBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+	ASSERT_ISVALIDVEC3V(col0);
+	ASSERT_ISVALIDVEC3V(col1);
+	ASSERT_ISVALIDVEC3V(col2);
+	const Vec3V col3 = _mm_setzero_ps();
+	Vec3V tmp0 = _mm_unpacklo_ps(col0, col1);
+	Vec3V tmp2 = _mm_unpacklo_ps(col2, col3);
+	Vec3V tmp1 = _mm_unpackhi_ps(col0, col1);
+	Vec3V tmp3 = _mm_unpackhi_ps(col2, col3);
+	col0 = _mm_movelh_ps(tmp0, tmp2);
+	col1 = _mm_movehl_ps(tmp2, tmp0);
+	col2 = _mm_movelh_ps(tmp1, tmp3);
+}
+
+//////////////////////////////////
+// VEC4V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0));
+	return f;
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+	ASSERT_ISVALIDFLOATV(floatVArray[0]);
+	ASSERT_ISVALIDFLOATV(floatVArray[1]);
+	ASSERT_ISVALIDFLOATV(floatVArray[2]);
+	ASSERT_ISVALIDFLOATV(floatVArray[3]);
+	const __m128 xw = _mm_move_ss(floatVArray[1], floatVArray[0]); // y, y, y, x
+	const __m128 yz = _mm_move_ss(floatVArray[2], floatVArray[3]); // z, z, z, w
+	return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+	ASSERT_ISVALIDFLOATV(w);
+	const __m128 xw = _mm_move_ss(y, x); // y, y, y, x
+	const __m128 yz = _mm_move_ss(z, w); // z, z, z, w
+	return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz = _mm_unpackhi_ps(x, z);
+	const Vec4V yw = _mm_unpackhi_ps(y, w);
+	return _mm_unpackhi_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz = _mm_unpackhi_ps(x, z);
+	const Vec4V yw = _mm_unpackhi_ps(y, w);
+	return _mm_unpacklo_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz =
_mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpackhi_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const Vec4V xz = _mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpacklo_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpacklo_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpackhi_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0)); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +template <PxU8 x, PxU8 y, PxU8 z, PxU8 w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x)); +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + const PX_ALIGN(16, PxF32 w[4]) = { 0.0f, 0.0f, 0.0f, 1.0f }; + const __m128 w128 = _mm_load_ps(w); + return w128; +} + +PX_FORCE_INLINE Vec4V V4UnitX() +{ + const PX_ALIGN(16, PxF32 x[4]) = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + const PX_ALIGN(16, PxF32 y[4]) = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + const PX_ALIGN(16, PxF32 z[4]) = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTTF(), v, f); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return _mm_and_ps(v, (VecI32V&)internalWindowsSimd::gMaskXYZ); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return V4Load(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f) +{ + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b) +{ + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec4V 
V4ScaleInv(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b) +{ + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b) +{ + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a) +{ + return _mm_div_ps(V4One(), a); +} + +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a) +{ + return _mm_rcp_ps(a); +} + +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a) +{ + return _mm_div_ps(V4One(), _mm_sqrt_ps(a)); +} + +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a) +{ + return _mm_rsqrt_ps(a); +} + +PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a) +{ + return _mm_sqrt_ps(a); +} + +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return V4Add(V4Scale(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return V4Sub(c, V4Scale(a, b)); +} + +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Add(V4Mul(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Sub(c, V4Mul(a, b)); +} + +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a) +{ + return V4Max(a, V4Neg(a)); +} + +PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a) +{ + const Vec4V xy = V4UnpackXY(a, a); // x,x,y,y + const Vec4V zw = V4UnpackZW(a, a); // z,z,w,w + const Vec4V xz_yw = V4Add(xy, zw); // x+z,x+z,y+w,y+w + const FloatV xz = V4GetX(xz_yw); // x+z + const FloatV yw = V4GetZ(xz_yw); // y+w + return FAdd(xz, yw); // sum +} + +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b) +{ + const __m128 dot1 = _mm_mul_ps(a, b); // x,y,z,w + const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z + const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y + const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x + return _mm_add_ps(_mm_add_ps(shuf2, shuf3), _mm_add_ps(dot1, shuf1)); + + // PT: this version has two less instructions but we should check its accuracy + // aw*bw | az*bz | ay*by | ax*bx + // const __m128 t0 = _mm_mul_ps(a, b); + // ay*by | ax*bx | aw*bw | az*bz + // const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); + // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx + // const __m128 t2 = _mm_add_ps(t0, t1); + // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by + // const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); + // ax*bx + az*bz + ay*by + aw*bw + // return _mm_add_ps(t3, t2); + // ay*by + aw*bw + ax*bx + az*bz + // az*bz + ax*bx + aw*bw + ay*by + // aw*bw + ay*by + az*bz + ax*bx +} + +PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b) +{ + const __m128 dot1 = _mm_mul_ps(a, b); // aw*bw | az*bz | ay*by | ax*bx + const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 0, 0, 0)); // ax*bx | ax*bx | ax*bx | ax*bx + const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 1, 1, 1)); // ay*by | ay*by | ay*by | ay*by + const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 2, 2, 2)); // az*bz | az*bz | az*bz | az*bz + return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); // ax*bx + ay*by + az*bz in each component +} + +PX_FORCE_INLINE Vec4V 
V4Cross(const Vec4V a, const Vec4V b) +{ + const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2)); +} + +PX_FORCE_INLINE FloatV V4Length(const Vec4V a) +{ + return _mm_sqrt_ps(V4Dot(a, a)); +} + +PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a) +{ + return V4Dot(a, a); +} + +PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a) +{ + ASSERT_ISFINITELENGTH(a); + return V4ScaleInv(a, _mm_sqrt_ps(V4Dot(a, a))); +} + +PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a) +{ + ASSERT_ISFINITELENGTH(a); + return V4ScaleInvFast(a, _mm_sqrt_ps(V4Dot(a, a))); +} + +PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue) +{ + const __m128 eps = V3Eps(); + const __m128 length = V4Length(a); + const __m128 isGreaterThanZero = V4IsGrtr(length, eps); + return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b) +{ + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b) +{ + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b) +{ + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b) +{ + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b) +{ + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b) +{ + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a) +{ + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); + + return _mm_max_ps(_mm_max_ps(a, shuf1), _mm_max_ps(shuf2, shuf3)); +} + +PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a) +{ + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); + + return _mm_min_ps(_mm_min_ps(a, shuf1), _mm_min_ps(shuf2, shuf3)); +} + +PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV) +{ + return V4Max(V4Min(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAnyTrue3_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE Vec4V V4Round(const Vec4V a) +{ + // return _mm_round_ps(a, 0x0); + 
const Vec4V half = V4Load(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const Vec4V aRound = V4Sub(V4Add(a, half), signBit); + const __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +} + +PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V3 = V4Mul(V2, V1); + const Vec4V V5 = V4Mul(V3, V2); + const Vec4V V7 = V4Mul(V5, V2); + const Vec4V V9 = V4Mul(V7, V2); + const Vec4V V11 = V4Mul(V9, V2); + const Vec4V V13 = V4Mul(V11, V2); + const Vec4V V15 = V4Mul(V13, V2); + const Vec4V V17 = V4Mul(V15, V2); + const Vec4V V19 = V4Mul(V17, V2); + const Vec4V V21 = V4Mul(V19, V2); + const Vec4V V23 = V4Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec4V Result; + Result = V4MulAdd(S1, V3, V1); + Result = V4MulAdd(S2, V5, Result); + Result = V4MulAdd(S3, V7, Result); + Result = V4MulAdd(S4, V9, Result); + Result = V4MulAdd(S5, V11, Result); + Result = V4MulAdd(S6, V13, Result); + Result = V4MulAdd(S7, V15, Result); + Result = V4MulAdd(S8, V17, Result); + Result = V4MulAdd(S9, V19, Result); + Result = V4MulAdd(S10, V21, Result); + Result = V4MulAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V4 = V4Mul(V2, V2); + const Vec4V V6 = V4Mul(V4, V2); + const Vec4V V8 = V4Mul(V4, V4); + const Vec4V V10 = V4Mul(V6, V4); + const Vec4V V12 = V4Mul(V6, V6); + const Vec4V V14 = V4Mul(V8, V6); + const Vec4V V16 = V4Mul(V8, V8); + const Vec4V V18 = V4Mul(V10, V8); + const Vec4V V20 = V4Mul(V10, V10); + const Vec4V V22 = V4Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec4V Result; + Result = V4MulAdd(C1, V2, V4One()); + Result = V4MulAdd(C2, V4, Result); + Result = V4MulAdd(C3, V6, Result); + Result = V4MulAdd(C4, V8, Result); + Result = V4MulAdd(C5, V10, Result); + Result = V4MulAdd(C6, V12, Result); + Result = V4MulAdd(C7, V14, Result); + Result = V4MulAdd(C8, V16, Result); + Result = V4MulAdd(C9, V18, Result); + Result = V4MulAdd(C10, V20, Result); + Result = V4MulAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3) +{ + Vec4V tmp0 = _mm_unpacklo_ps(col0, col1); + Vec4V tmp2 = _mm_unpacklo_ps(col2, col3); + Vec4V tmp1 = _mm_unpackhi_ps(col0, col1); + Vec4V tmp3 = _mm_unpackhi_ps(col2, col3); + col0 = _mm_movelh_ps(tmp0, tmp2); + col1 = _mm_movehl_ps(tmp2, tmp0); + col2 = _mm_movelh_ps(tmp1, tmp3); + col3 = _mm_movehl_ps(tmp3, tmp1); +} + +////////////////////////////////// +// BoolV +////////////////////////////////// + +PX_FORCE_INLINE BoolV BFFFF() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE BoolV BFFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fftt=_mm_load_ps((float*)&f); + return fftt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ftft=_mm_load_ps((float*)&f); + return ftft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 fttf=_mm_load_ps((float*)&f); + return fttf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTT() +{ + 
/*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fttt=_mm_load_ps((float*)&f); + return fttt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BTFFF() +{ + // const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + // const __m128 tfff=_mm_load_ps((float*)&f); + // return tfff; + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0xFFFFFFFF}; + const __m128 tfft=_mm_load_ps((float*)&f); + return tfft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0}; + const __m128 tftf=_mm_load_ps((float*)&f); + return tftf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tftt=_mm_load_ps((float*)&f); + return tftt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0}; + const __m128 ttff=_mm_load_ps((float*)&f); + return ttff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ttft=_mm_load_ps((float*)&f); + return ttft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 tttf=_mm_load_ps((float*)&f); + return tttf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tttt=_mm_load_ps((float*)&f); + return tttt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + const __m128 tfff=_mm_load_ps((float*)&f); + return tfff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BYMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BZMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BWMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return V4Sel(BFTTT(), v, f); +} + 
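The BoolV constants above (BFFFF through BTTTT, plus the BXMask family) are nothing more than per-lane all-ones/all-zero bit masks, and V4Sel combines them with the classic branchless select, (c & a) | (~c & b) — that is also how V4SetW(v, f) = V4Sel(BTTTF(), v, f) works earlier in this file. A minimal standalone sketch of the same pattern in plain SSE2 (hypothetical names, using _mm_castsi128_ps directly instead of the internalWindowsSimd::m128_I2F wrapper):

    #include <emmintrin.h> // SSE2
    #include <cstdio>

    // Branchless select, same bit trick as V4Sel: lanes where 'mask' is all-ones
    // come from 'a', the rest from 'b'.
    static __m128 sel(__m128 mask, __m128 a, __m128 b)
    {
        return _mm_or_ps(_mm_andnot_ps(mask, b), _mm_and_ps(mask, a));
    }

    int main()
    {
        // BTTTF-style mask: true in x,y,z and false in w. _mm_set_epi32 takes
        // its arguments high-to-low (w,z,y,x), hence (0,-1,-1,-1).
        const __m128 tttf = _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1));
        const __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // w=4,z=3,y=2,x=1
        const __m128 f = _mm_set1_ps(9.0f);
        float out[4];
        _mm_storeu_ps(out, sel(tttf, v, f)); // like V4SetW: keep x,y,z, replace w
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 3 9
        return 0;
    }

The same high-to-low argument order is why _MM_SHUFFLE(w, z, y, x) appears throughout the permutes and splat helpers above.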
+PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return V4Sel(BTTTF(), v, f); +} + +template <int index> +BoolV BSplatElement(BoolV a) +{ + return internalWindowsSimd::m128_I2F( + _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index))); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return _mm_and_ps(a, b); +} + +PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + const BoolV bAllTrue(BTTTT()); + return _mm_xor_ps(a, bAllTrue); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + return _mm_andnot_ps(b, a); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return _mm_or_ps(a, b); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + const BoolV bTest = internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); + return internalWindowsSimd::BAllTrue4_R(bTest); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==15); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==0); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + Vec3V v0 = V3Mul(a.col0, b); + Vec3V v1 = V3Mul(a.col1, b); + Vec3V v2 = V3Mul(a.col2, b); + V3Transpose(v0, v1, v2); + return V3Add(V3Add(v0, v1), v2); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = 
V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b) +{ + return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b)); +} + +PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a) +{ + return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a) +{ + return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a) +{ + const BoolV tfft = BTFFT(); + const BoolV tttf = BTTTF(); + const FloatV zero = V3Zero(); + const Vec3V cross01 = V3Cross(a.col0, a.col1); + const Vec3V cross12 = V3Cross(a.col1, a.col2); + const Vec3V cross20 = V3Cross(a.col2, a.col0); + const FloatV dot = V3Dot(cross01, a.col2); + const FloatV invDet = _mm_rcp_ps(dot); + const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01); + const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01); + Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20); + colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0)); + const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2)); + const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0)); + const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(BTFFT(), pbwp), _mm_and_ps(BTFFT(), zppd)); + const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0)); + const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0)); + const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd)); + + return Mat33V(_mm_mul_ps(colInv0, invDet), _mm_mul_ps(colInv1, invDet), _mm_mul_ps(colInv2, invDet)); +} + +PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a) +{ + Vec3V col0 = a.col0, col1 = a.col1, col2 = a.col2; + V3Transpose(col0, col1, col2); + return Mat33V(col0, col1, col2); +} + +PX_FORCE_INLINE Mat33V M33Identity() +{ + return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ()); +} + +PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d) +{ + const FloatV x = V3Mul(V3UnitX(), d); + const FloatV y = V3Mul(V3UnitY(), d); + const FloatV z = V3Mul(V3UnitZ(), d); + return Mat33V(x, y, z); +} + +////////////////////////////////// +// MAT34V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + const Vec3V v0PlusV1Plusv2 = V3Add(v0PlusV1, v2); + return V3Add(v0PlusV1Plusv2, a.col3); +} + +PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE 
Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) +{ + Vec3V v0 = V3Mul(a.col0, b); + Vec3V v1 = V3Mul(a.col1, b); + Vec3V v2 = V3Mul(a.col2, b); + V3Transpose(v0, v1, v2); + return V3Add(V3Add(v0, v1), v2); +} + +PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat34V M34Inverse(const Mat34V& a) +{ + Mat34V aInv; + const BoolV tfft = BTFFT(); + const BoolV tttf = BTTTF(); + const FloatV zero = V3Zero(); + const Vec3V cross01 = V3Cross(a.col0, a.col1); + const Vec3V cross12 = V3Cross(a.col1, a.col2); + const Vec3V cross20 = V3Cross(a.col2, a.col0); + const FloatV dot = V3Dot(cross01, a.col2); + const FloatV invDet = _mm_rcp_ps(dot); + const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01); + const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01); + Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20); + colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0)); + const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2)); + const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0)); + const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(BTFFT(), pbwp), _mm_and_ps(BTFFT(), zppd)); + const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0)); + const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0)); + const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd)); + aInv.col0 = _mm_mul_ps(colInv0, invDet); + aInv.col1 = _mm_mul_ps(colInv1, invDet); + aInv.col2 = _mm_mul_ps(colInv2, invDet); + aInv.col3 = M34Mul33V3(aInv, V3Neg(a.col3)); + return aInv; +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + Vec3V col0 = a.col0, col1 = a.col1, col2 = a.col2; + V3Transpose(col0, col1, col2); + return Mat33V(col0, col1, col2); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + const FloatV x = V4GetX(b); + const FloatV y = V4GetY(b); + const FloatV z = V4GetZ(b); + const FloatV w = V4GetW(b); + + const Vec4V v0 = V4Scale(a.col0, x); + const Vec4V v1 = V4Scale(a.col1, y); + const Vec4V v2 = V4Scale(a.col2, z); + const Vec4V v3 = V4Scale(a.col3, w); + const Vec4V v0PlusV1 = V4Add(v0, v1); + const Vec4V v0PlusV1Plusv2 = V4Add(v0PlusV1, v2); + return V4Add(v0PlusV1Plusv2, v3); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + Vec4V v0 = V4Mul(a.col0, b); + Vec4V v1 = V4Mul(a.col1, b); + Vec4V v2 = V4Mul(a.col2, b); + Vec4V v3 = V4Mul(a.col3, b); + V4Transpose(v0, v1, v2, v3); + return V4Add(V4Add(v0, v1), V4Add(v2, v3)); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), 
V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + Vec4V col0 = a.col0, col1 = a.col1, col2 = a.col2, col3 = a.col3; + V4Transpose(col0, col1, col2, col3); + return Mat44V(col0, col1, col2, col3); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + __m128 minor0, minor1, minor2, minor3; + __m128 row0, row1, row2, row3; + __m128 det, tmp1; + + tmp1 = V4Zero(); + row1 = V4Zero(); + row3 = V4Zero(); + + row0 = a.col0; + row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2)); + row2 = a.col2; + row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2)); + + tmp1 = _mm_mul_ps(row2, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_mul_ps(row1, tmp1); + minor1 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); + minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); + minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); + + tmp1 = _mm_mul_ps(row1, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); + minor3 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); + minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); + + tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + row2 = _mm_shuffle_ps(row2, row2, 0x4E); + minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); + minor2 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); + minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); + + tmp1 = _mm_mul_ps(row0, row1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); + + tmp1 = _mm_mul_ps(row0, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); + minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); + + tmp1 = _mm_mul_ps(row0, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); + + det = _mm_mul_ps(row0, minor0); + det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); + det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); + tmp1 = _mm_rcp_ss(det); +#if 0 + det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); + det = _mm_shuffle_ps(det, det, 0x00); +#else + det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0)); +#endif + + minor0 = _mm_mul_ps(det, minor0); + minor1 = _mm_mul_ps(det, minor1); + minor2 = _mm_mul_ps(det, minor2); + minor3 = _mm_mul_ps(det, minor3); + Mat44V invTrans(minor0, minor1, minor2, minor3); + return M44Trnsps(invTrans); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& 
w) +{ + return _mm_set_ps(w, z, y, x); +} + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_or_si128(_mm_andnot_si128(internalWindowsSimd::m128_F2I(c), internalWindowsSimd::m128_F2I(b)), + _mm_and_si128(internalWindowsSimd::m128_F2I(c), internalWindowsSimd::m128_F2I(a)))); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F(_mm_or_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_xor_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_and_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_andnot_si128(internalWindowsSimd::m128_F2I(b), internalWindowsSimd::m128_F2I(a))); +} + +PX_FORCE_INLINE VecI32V U4Load(const PxU32 i) +{ + return _mm_load1_ps((PxF32*)&i); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return _mm_loadu_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + ASSERT_ISALIGNED16(i); + return _mm_load_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return _mm_load1_ps((PxF32*)&i); +} + +PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) +{ + return _mm_loadu_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + ASSERT_ISALIGNED16(i); + return _mm_load_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_add_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_sub_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpgt_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return V4U32Sel(c, a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return V4Zero(); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return I4Load(1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return I4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return I4Load(-1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return U4Load(0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return U4Load(1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return U4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c, BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c, BFFFF())); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + VecShiftV preparedShift; + preparedShift.shift = 
_mm_or_ps(_mm_andnot_ps(BTFFF(), VecI32V_Zero()), _mm_and_ps(BTFFF(), shift)); + return preparedShift; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return internalWindowsSimd::m128_I2F( + _mm_sll_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(count.shift))); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return internalWindowsSimd::m128_I2F( + _mm_srl_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(count.shift))); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_and_ps(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_or_ps(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + _mm_store_ss((PxF32*)i, a); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d) +{ + const __m128 xw = _mm_move_ss(b, a); // y, y, y, x + const __m128 yz = _mm_move_ss(c, d); // z, z, z, w + return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0)); +} + +PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + *address = val; +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + VecU32V result32(a); + result32 = V4U32Andc(result32, b); + return Vec4V(result32); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return V4IsGrtr(a, b); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return *addr; +} + +// unsigned compares are not supported on x86 +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + // _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately + // return m128_I2F(_mm_cmpgt_epi16(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); + VecU16V result; + result.m128_u16[0] = PxU16((a).m128_u16[0] > (b).m128_u16[0]); + result.m128_u16[1] = PxU16((a).m128_u16[1] > (b).m128_u16[1]); + result.m128_u16[2] = PxU16((a).m128_u16[2] > (b).m128_u16[2]); + result.m128_u16[3] = PxU16((a).m128_u16[3] > (b).m128_u16[3]); + result.m128_u16[4] = PxU16((a).m128_u16[4] > (b).m128_u16[4]); + result.m128_u16[5] = PxU16((a).m128_u16[5] > (b).m128_u16[5]); + result.m128_u16[6] = PxU16((a).m128_u16[6] > (b).m128_u16[6]); + result.m128_u16[7] = PxU16((a).m128_u16[7] > (b).m128_u16[7]); + return result; +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpgt_epi16(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + Vec4V result = V4LoadXYZW(PxF32(a.m128_u32[0]), PxF32(a.m128_u32[1]), 
PxF32(a.m128_u32[2]), PxF32(a.m128_u32[3])); + return result; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) +{ + return _mm_cvtepi32_ps(internalWindowsSimd::m128_F2I(a)); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + return internalWindowsSimd::m128_I2F(_mm_cvttps_epi32(a)); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + return Vec4V(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + return Vec4V(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return VecU32V(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return VecI32V(a); +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + return internalWindowsSimd::m128_I2F( + _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index))); +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + return internalWindowsSimd::m128_I2F( + _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index))); +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + VecU32V result; + result.m128_u32[0] = x; + result.m128_u32[1] = y; + result.m128_u32[2] = z; + result.m128_u32[3] = w; + return result; +} + +PX_FORCE_INLINE Vec4V V4ConvertFromI32V(const VecI32V in) +{ + return _mm_cvtepi32_ps(internalWindowsSimd::m128_F2I(in)); +} + +#endif // PSFOUNDATION_PSWINDOWSINLINEAOS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h b/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h new file mode 100644 index 00000000..ccf6d620 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h @@ -0,0 +1,190 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. 
All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSINTRINSICS_H +#define PSFOUNDATION_PSWINDOWSINTRINSICS_H + +#include "Ps.h" +#include "foundation/PxAssert.h" + +// this file is for internal intrinsics - that is, intrinsics that are used in +// cross platform code but do not appear in the API + +#if !PX_WINDOWS_FAMILY +#error "This file should only be included by Windows builds!!" +#endif + +#pragma warning(push) +//'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4668) +#if PX_VC == 10 +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#endif +#include <intrin.h> +#pragma warning(pop) + +#pragma warning(push) +#pragma warning(disable : 4985) // 'symbol name': attributes not present on previous declaration +#include <math.h> +#pragma warning(pop) + +#include <float.h> +#include <mmintrin.h> + +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse) + +namespace physx +{ +namespace shdfnd +{ + +/* +* Implements a memory barrier +*/ +PX_FORCE_INLINE void memoryBarrier() +{ + _ReadWriteBarrier(); + /* long Barrier; + __asm { + xchg Barrier, eax + }*/ +} + +/*! +Returns the index of the highest set bit. Not valid for zero arg. +*/ +PX_FORCE_INLINE uint32_t highestSetBitUnsafe(uint32_t v) +{ + unsigned long retval; + _BitScanReverse(&retval, v); + return retval; +} + +/*! +Returns the index of the lowest set bit. Not valid for zero arg. +*/ +PX_FORCE_INLINE uint32_t lowestSetBitUnsafe(uint32_t v) +{ + unsigned long retval; + _BitScanForward(&retval, v); + return retval; +} + +/*! +Returns the number of leading zeros in v. Returns 32 for v=0. +*/ +PX_FORCE_INLINE uint32_t countLeadingZeros(uint32_t v) +{ + if(v) + { + unsigned long bsr = (unsigned long)-1; + _BitScanReverse(&bsr, v); + return 31 - bsr; + } + else + return 32; +} + +/*! +Prefetch one cache line around \c ptr+offset. +*/ +#if !PX_ARM +PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) +{ + // the cache line on X86/X64 is 64 bytes, so a 128-byte prefetch would require 2 prefetches. + // However, we can only dispatch a limited number of prefetch instructions, so we opt to prefetch just 1 cache line + /*_mm_prefetch(((const char*)ptr + offset), _MM_HINT_T0);*/ + // We get slightly better performance prefetching to non-temporal addresses instead of all cache levels + _mm_prefetch(((const char*)ptr + offset), _MM_HINT_NTA); +} +#else +PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) +{ + // ARM has a 32-byte cache line + __prefetch(((const char*)ptr + offset)); +} +#endif + +/*! +Prefetch \c count bytes starting at \c ptr. +*/ +#if !PX_ARM +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = (char*)ptr; + uint64_t p = size_t(ptr); + uint64_t startLine = p >> 6, endLine = (p + count - 1) >> 6; + uint64_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 64; + } while(--lines); +} +#else +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = (char*)ptr; + uint32_t p = size_t(ptr); + uint32_t startLine = p >> 5, endLine = (p + count - 1) >> 5; + uint32_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 32; + } while(--lines); +} +#endif + +//! \brief platform-specific reciprocal +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipFast(float a) +{ + return 1.0f / a; +} + +//! 
\brief platform-specific fast reciprocal square root +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipSqrtFast(float a) +{ + return 1.0f / ::sqrtf(a); +} + +//! \brief platform-specific floor +PX_CUDA_CALLABLE PX_FORCE_INLINE float floatFloor(float x) +{ + return ::floorf(x); +} + +#define NS_EXPECT_TRUE(x) x +#define NS_EXPECT_FALSE(x) x + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSWINDOWSINTRINSICS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h b/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h new file mode 100644 index 00000000..882f0b70 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h @@ -0,0 +1,72 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
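The bit-scan helpers in PsWindowsIntrinsics.h above are thin wrappers over _BitScanForward/_BitScanReverse (so countLeadingZeros(v) equals 31 - highestSetBitUnsafe(v) for nonzero v), and the prefetch() loop's startLine/endLine arithmetic simply counts how many 64-byte (32-byte on ARM) cache lines the range [ptr, ptr + count) touches. A portable sketch of both calculations, with no MSVC intrinsics (helper names are hypothetical):

    #include <cassert>
    #include <cstdint>

    // Portable equivalent of countLeadingZeros: returns 32 for v == 0.
    static uint32_t clz32(uint32_t v)
    {
        if (v == 0) return 32;
        uint32_t n = 0;
        while (!(v & 0x80000000u)) { v <<= 1; ++n; }
        return n;
    }

    // Number of 64-byte cache lines spanned by [addr, addr + count), count >= 1 --
    // the same start/end arithmetic as the x86 branch of prefetch() above.
    static uint64_t linesSpanned(uint64_t addr, uint64_t count)
    {
        const uint64_t startLine = addr >> 6, endLine = (addr + count - 1) >> 6;
        return endLine - startLine + 1;
    }

    int main()
    {
        assert(clz32(1u) == 31 && clz32(0x80000000u) == 0); // 31 - highest set bit
        assert(linesSpanned(0, 64) == 1);  // exactly one line
        assert(linesSpanned(63, 2) == 2);  // straddles a line boundary
        assert(linesSpanned(64, 1) == 1);  // first byte of the next line
        return 0;
    }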
+ + +#ifndef PS_WINDOWS_FOUNDATION_LOADLIBRARY_H +#define PS_WINDOWS_FOUNDATION_LOADLIBRARY_H + +#include "foundation/PxPreprocessor.h" +#include "windows/PsWindowsInclude.h" +#include "foundation/windows/PxWindowsFoundationDelayLoadHook.h" + +namespace physx +{ +namespace shdfnd +{ + EXTERN_C IMAGE_DOS_HEADER __ImageBase; + + PX_INLINE FARPROC WINAPI foundationDliNotePreLoadLibrary(const char* libraryName, const physx::PxFoundationDelayLoadHook* delayLoadHook) + { + if(!delayLoadHook) + { + return (FARPROC)::LoadLibraryA(libraryName); + } + else + { + if(strstr(libraryName, "PxFoundation")) + { + if(strstr(libraryName, "DEBUG")) + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationDEBUGDllName()); + + if(strstr(libraryName, "CHECKED")) + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationCHECKEDDllName()); + + if(strstr(libraryName, "PROFILE")) + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationPROFILEDllName()); + + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationDllName()); + } + } + return NULL; + } +} // namespace shdfnd +} // namespace physx + + +#endif // PS_WINDOWS_FOUNDATION_LOADLIBRARY_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h b/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h new file mode 100644 index 00000000..6693fc2a --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h @@ -0,0 +1,87 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
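foundationDliNotePreLoadLibrary above is intended to be driven from an MSVC delay-load notification hook: it picks the DEBUG/CHECKED/PROFILE variant of the PxFoundation DLL by substring match on the requested name, and falls back to a plain LoadLibraryA when no PxFoundationDelayLoadHook is installed, returning the loaded module as a FARPROC, which is what the delay-load helper expects from a dliNotePreLoadLibrary notification. A hedged sketch of the host-side wiring using the documented delayimp.h hook points; getHook() is a hypothetical application-side accessor, and the build is assumed to link delayimp.lib with /DELAYLOAD for the PxFoundation DLL:

    #include <windows.h>
    #include <delayimp.h>
    #include "windows/PsWindowsLoadLibrary.h"

    // Hypothetical: however the application stores its installed hook (may be NULL).
    static const physx::PxFoundationDelayLoadHook* getHook();

    static FARPROC WINAPI delayHook(unsigned dliNotify, PDelayLoadInfo pdli)
    {
        // Only handle the pre-LoadLibrary notification; pdli->szDll is the DLL name
        // recorded by the linker. Returning non-NULL substitutes our own module.
        if (dliNotify == dliNotePreLoadLibrary)
            return physx::shdfnd::foundationDliNotePreLoadLibrary(pdli->szDll, getHook());
        return NULL;
    }

    // delayimp.h's notification hook pointer (declared non-const on older toolsets).
    ExternC const PfnDliHook __pfnDliNotifyHook2 = delayHook;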
+ +#ifndef PSFOUNDATION_PSWINDOWSTRIGCONSTANTS_H +#define PSFOUNDATION_PSWINDOWSTRIGCONSTANTS_H + +#define PX_GLOBALCONST extern const __declspec(selectany) + +__declspec(align(16)) struct PX_VECTORF32 +{ + float f[4]; +}; + +//#define PX_PI 3.141592654f +//#define PX_2PI 6.283185307f +//#define PX_1DIVPI 0.318309886f +//#define PX_1DIV2PI 0.159154943f +//#define PX_PIDIV2 1.570796327f +//#define PX_PIDIV4 0.785398163f + +PX_GLOBALCONST PX_VECTORF32 g_PXSinCoefficients0 = { { 1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients1 = { { 2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients2 = { { 2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXCosCoefficients0 = { { 1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients1 = { { 2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients2 = { { 4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanCoefficients0 = { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients1 = { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients2 = { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients0 = { { -0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients1 = { { 0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients2 = { { -1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXATanCoefficients0 = { { 1.0f, 0.333333334f, 0.2f, 0.142857143f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients1 = { { 1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients2 = { { 5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinEstCoefficients = { { 1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosEstCoefficients = { { 1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanEstCoefficients = { { 2.484f, -1.954923183e-1f, 2.467401101f, PxInvPi } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanEstCoefficients = { { 7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, PxPiDivTwo } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinEstCoefficients = { { -1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXASinEstConstants = { { 1.00000011921f, PxPiDivTwo, 0.0f, 0.0f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXPiConstants0 = { { PxPi, PxTwoPi, PxInvPi, PxInvTwoPi } }; +PX_GLOBALCONST PX_VECTORF32 g_PXReciprocalTwoPi = { { PxInvTwoPi, PxInvTwoPi, PxInvTwoPi, PxInvTwoPi } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTwoPi = { { PxTwoPi, PxTwoPi, PxTwoPi, PxTwoPi } }; + +#endif
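The g_PXSinCoefficients0..2 tables above are just the reciprocal-factorial Taylor terms (-1/3!, 1/5!, -1/7!, ...; compare -0.166666667f with -1/6) that V4Sin accumulates after range-reducing its argument to [-PI, PI) via V4Round, and the g_PXCosCoefficients tables hold the matching even-power terms for V4Cos. A scalar sketch of the same evaluation, truncated to the four terms of g_PXSinCoefficients0 (accurate to a few digits only; the vector version carries the series through x^23):

    #include <cmath>
    #include <cstdio>

    // Scalar mirror of V4Sin: round-to-nearest range reduction by 2*PI, then
    // odd-power accumulation with sin x ~= x - x^3/3! + x^5/5! - x^7/7!.
    static float sinApprox(float a)
    {
        const float twoPi = 6.283185307f;
        const float v = a - twoPi * std::floor(a / twoPi + 0.5f); // reduce to [-PI, PI)
        const float v2 = v * v, v3 = v2 * v, v5 = v3 * v2, v7 = v5 * v2;
        return v + (-0.166666667f) * v3 + 8.333333333e-3f * v5 + (-1.984126984e-4f) * v7;
    }

    int main()
    {
        for (float x = -3.0f; x <= 3.0f; x += 1.5f)
            std::printf("x=%4.1f approx=% .6f libm=% .6f\n", x, sinApprox(x), std::sin(x));
        return 0;
    }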