| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd | /PxShared/src/foundation/include |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'PxShared/src/foundation/include')
58 files changed, 22760 insertions, 0 deletions
diff --git a/PxShared/src/foundation/include/Ps.h b/PxShared/src/foundation/include/Ps.h
new file mode 100644
index 00000000..53c289db
--- /dev/null
+++ b/PxShared/src/foundation/include/Ps.h
@@ -0,0 +1,70 @@
// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS." NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#ifndef PSFOUNDATION_PS_H
#define PSFOUNDATION_PS_H

/*! \file top level include file for shared foundation */

#include "foundation/Px.h"

/**
Platform specific defines
*/
#if PX_WINDOWS_FAMILY || PX_XBOXONE
#pragma intrinsic(memcmp)
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#pragma intrinsic(abs)
#pragma intrinsic(labs)
#endif

// An expression that should expand to nothing in non PX_CHECKED builds.
// We currently use this only for tagging the purpose of containers for memory use tracking.
#if PX_CHECKED
#define PX_DEBUG_EXP(x) (x)
#else
#define PX_DEBUG_EXP(x)
#endif

#define PX_SIGN_BITMASK 0x80000000

namespace physx
{
namespace shdfnd
{
// Int-as-bool type - has some uses for efficiency and with SIMD
typedef int IntBool;
static const IntBool IntFalse = 0;
static const IntBool IntTrue = 1;
}

} // namespace physx

#endif // #ifndef PSFOUNDATION_PS_H
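PX_SIGN_BITMASK is reused further down: PsArray.h stashes its "user memory" flag in the top bit of the 32-bit capacity field. A standalone sketch of that trick, not part of the PhysX sources:

```cpp
#include <cstdint>
#include <cassert>

static const uint32_t SIGN_BITMASK = 0x80000000; // mirrors PX_SIGN_BITMASK

int main()
{
    uint32_t capacity = 1024;
    capacity |= SIGN_BITMASK;                   // mark as user-provided memory
    assert((capacity & ~SIGN_BITMASK) == 1024); // capacity still recoverable
    assert((capacity & SIGN_BITMASK) != 0);     // flag still set
    return 0;
}
```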
diff --git a/PxShared/src/foundation/include/PsAlignedMalloc.h b/PxShared/src/foundation/include/PsAlignedMalloc.h
new file mode 100644
index 00000000..5dd889c4
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAlignedMalloc.h
@@ -0,0 +1,88 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSALIGNEDMALLOC_H
#define PSFOUNDATION_PSALIGNEDMALLOC_H

#include "PsUserAllocated.h"

/*!
Allocate aligned memory.
Alignment must be a power of 2!
-- should be templated by a base allocator
*/

namespace physx
{
namespace shdfnd
{
/**
Allocator, which is used to access the global PxAllocatorCallback instance
(used for dynamic data types template instantiation), which can align memory
*/

// SCS: AlignedMalloc with 3 params not found, seems not used on PC either
// disabled for now to avoid GCC error

template <uint32_t N, typename BaseAllocator = NonTrackingAllocator>
class AlignedAllocator : public BaseAllocator
{
  public:
    AlignedAllocator(const BaseAllocator& base = BaseAllocator()) : BaseAllocator(base)
    {
    }

    void* allocate(size_t size, const char* file, int line)
    {
        size_t pad = N - 1 + sizeof(size_t); // store offset for delete.
        uint8_t* base = reinterpret_cast<uint8_t*>(BaseAllocator::allocate(size + pad, file, line));
        if(!base)
            return NULL;

        uint8_t* ptr = reinterpret_cast<uint8_t*>(size_t(base + pad) & ~(size_t(N) - 1)); // aligned pointer, ensuring N
                                                                                          // is a size_t
                                                                                          // wide mask
        reinterpret_cast<size_t*>(ptr)[-1] = size_t(ptr - base); // store offset

        return ptr;
    }
    void deallocate(void* ptr)
    {
        if(ptr == NULL)
            return;

        uint8_t* base = reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<size_t*>(ptr)[-1];
        BaseAllocator::deallocate(base);
    }
};

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSALIGNEDMALLOC_H
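The allocate/deallocate pair above is the classic over-allocate-then-align scheme: pad the request by `N - 1 + sizeof(size_t)`, round the result up to an N-byte boundary, and stash the offset back to the real block just before the returned pointer. A standalone sketch of the same technique on top of plain malloc/free (the names here are illustrative, not PhysX API):

```cpp
#include <cstdint>
#include <cstdlib>
#include <cassert>

void* alignedAlloc(size_t size, size_t N) // N must be a power of two
{
    size_t pad = N - 1 + sizeof(size_t); // room for alignment + stored offset
    uint8_t* base = static_cast<uint8_t*>(::malloc(size + pad));
    if(!base)
        return NULL;
    uint8_t* ptr = reinterpret_cast<uint8_t*>(size_t(base + pad) & ~(N - 1));
    reinterpret_cast<size_t*>(ptr)[-1] = size_t(ptr - base); // offset for free
    return ptr;
}

void alignedFree(void* ptr)
{
    if(!ptr)
        return;
    uint8_t* base = static_cast<uint8_t*>(ptr) - reinterpret_cast<size_t*>(ptr)[-1];
    ::free(base);
}

int main()
{
    void* p = alignedAlloc(100, 64);
    assert((reinterpret_cast<size_t>(p) & 63) == 0); // 64-byte aligned
    alignedFree(p);
    return 0;
}
```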
diff --git a/PxShared/src/foundation/include/PsAlloca.h b/PxShared/src/foundation/include/PsAlloca.h
new file mode 100644
index 00000000..f1f36732
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAlloca.h
@@ -0,0 +1,76 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSALLOCA_H
#define PSFOUNDATION_PSALLOCA_H

#include "PsTempAllocator.h"

namespace physx
{
namespace shdfnd
{
template <typename T, typename Alloc = TempAllocator>
class ScopedPointer : private Alloc
{
  public:
    ~ScopedPointer()
    {
        if(mOwned)
            Alloc::deallocate(mPointer);
    }

    operator T*() const
    {
        return mPointer;
    }

    T* mPointer;
    bool mOwned;
};

} // namespace shdfnd
} // namespace physx

/*! Stack allocation for \c count instances of \c type. Falling back to temp allocator if using more than 1kB. */
#ifdef __SPU__
#define PX_ALLOCA(var, type, count) type* var = reinterpret_cast<type*>(PxAlloca(sizeof(type) * (count)))
#else
#define PX_ALLOCA(var, type, count) \
    physx::shdfnd::ScopedPointer<type> var; \
    { \
        uint32_t size = sizeof(type) * (count); \
        var.mOwned = size > 1024; \
        if(var.mOwned) \
            var.mPointer = reinterpret_cast<type*>(physx::shdfnd::TempAllocator().allocate(size, __FILE__, __LINE__)); \
        else \
            var.mPointer = reinterpret_cast<type*>(PxAlloca(size)); \
    }
#endif

#endif // #ifndef PSFOUNDATION_PSALLOCA_H
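PX_ALLOCA keeps small buffers on the stack and falls back to the temp allocator past 1 kB. A standalone sketch of that policy, with malloc standing in for TempAllocator (`<alloca.h>` is the Unix spelling; Windows would use `_alloca` from `<malloc.h>`, as PsAllocator.h below sets up):

```cpp
#include <alloca.h>
#include <cstdio>
#include <cstdlib>

int main()
{
    const unsigned count = 100;
    const size_t size = sizeof(float) * count;
    const bool owned = size > 1024; // same 1 kB threshold as PX_ALLOCA

    float* buf;
    if(owned)
        buf = static_cast<float*>(::malloc(size));
    else
        buf = static_cast<float*>(alloca(size)); // released on function return

    for(unsigned i = 0; i < count; ++i)
        buf[i] = float(i);
    std::printf("%f\n", buf[count - 1]);

    if(owned)
        ::free(buf);
    return 0;
}
```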
diff --git a/PxShared/src/foundation/include/PsAllocator.h b/PxShared/src/foundation/include/PsAllocator.h
new file mode 100644
index 00000000..4fed4614
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAllocator.h
@@ -0,0 +1,364 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSALLOCATOR_H
#define PSFOUNDATION_PSALLOCATOR_H

#include "foundation/PxAllocatorCallback.h"
#include "foundation/PxFoundation.h"
#include "Ps.h"
#include "foundation/PxAssert.h"

#if(PX_WINDOWS_FAMILY || PX_XBOXONE)
#include <exception>
#include <typeinfo.h>
#endif
#if(PX_APPLE_FAMILY)
#include <typeinfo>
#endif

#include <new>

// Allocation macros going through user allocator
#if PX_CHECKED
#define PX_ALLOC(n, name) physx::shdfnd::NamedAllocator(name).allocate(n, __FILE__, __LINE__)
#else
#define PX_ALLOC(n, name) physx::shdfnd::NonTrackingAllocator().allocate(n, __FILE__, __LINE__)
#endif
#define PX_ALLOC_TEMP(n, name) PX_ALLOC(n, name)
#define PX_FREE(x) physx::shdfnd::NonTrackingAllocator().deallocate(x)
#define PX_FREE_AND_RESET(x) \
    { \
        PX_FREE(x); \
        x = 0; \
    }

// The following macros support plain-old-types and classes derived from UserAllocated.
#define PX_NEW(T) new (physx::shdfnd::ReflectionAllocator<T>(), __FILE__, __LINE__) T
#define PX_NEW_TEMP(T) PX_NEW(T)
#define PX_DELETE(x) delete x
#define PX_DELETE_AND_RESET(x) \
    { \
        PX_DELETE(x); \
        x = 0; \
    }
#define PX_DELETE_POD(x) \
    { \
        PX_FREE(x); \
        x = 0; \
    }
#define PX_DELETE_ARRAY(x) \
    { \
        PX_DELETE([] x); \
        x = 0; \
    }

// aligned allocation
#define PX_ALIGNED16_ALLOC(n) physx::shdfnd::AlignedAllocator<16>().allocate(n, __FILE__, __LINE__)
#define PX_ALIGNED16_FREE(x) physx::shdfnd::AlignedAllocator<16>().deallocate(x)

//! placement new macro to make it easy to spot bad use of 'new'
#define PX_PLACEMENT_NEW(p, T) new (p) T

#if PX_DEBUG || PX_CHECKED
#define PX_USE_NAMED_ALLOCATOR 1
#else
#define PX_USE_NAMED_ALLOCATOR 0
#endif

// Don't use inline for alloca !!!
#if PX_WINDOWS_FAMILY
#include <malloc.h>
#define PxAlloca(x) _alloca(x)
#elif PX_LINUX || PX_ANDROID
#include <malloc.h>
#define PxAlloca(x) alloca(x)
#elif PX_APPLE_FAMILY
#include <alloca.h>
#define PxAlloca(x) alloca(x)
#elif PX_PS4
#include <memory.h>
#define PxAlloca(x) alloca(x)
#elif PX_XBOXONE
#include <malloc.h>
#define PxAlloca(x) alloca(x)
#endif

#define PxAllocaAligned(x, alignment) ((size_t(PxAlloca(x + alignment)) + (alignment - 1)) & ~size_t(alignment - 1))

namespace physx
{
namespace shdfnd
{

PX_FOUNDATION_API PxAllocatorCallback& getAllocator();

/**
Allocator used to access the global PxAllocatorCallback instance without providing additional information.
*/

class PX_FOUNDATION_API Allocator
{
  public:
    Allocator(const char* = 0)
    {
    }
    void* allocate(size_t size, const char* file, int line);
    void deallocate(void* ptr);
};

/*
 * Bootstrap allocator using malloc/free.
 * Don't use unless your objects get allocated before foundation is initialized.
 */
class RawAllocator
{
  public:
    RawAllocator(const char* = 0)
    {
    }
    void* allocate(size_t size, const char*, int)
    {
        // malloc returns valid pointer for size==0, no need to check
        return ::malloc(size);
    }
    void deallocate(void* ptr)
    {
        // free(0) is guaranteed to have no side effect, no need to check
        ::free(ptr);
    }
};

/*
 * Allocator that simply calls straight back to the application without tracking.
 * This is used by the heap (Foundation::mNamedAllocMap) that tracks allocations
 * because it needs to be able to grow as a result of an allocation.
 * Making the hash table re-entrant to deal with this may not make sense.
 */
class NonTrackingAllocator
{
  public:
    PX_FORCE_INLINE NonTrackingAllocator(const char* = 0)
    {
    }
    PX_FORCE_INLINE void* allocate(size_t size, const char* file, int line)
    {
        return !size ? 0 : getAllocator().allocate(size, "NonTrackedAlloc", file, line);
    }
    PX_FORCE_INLINE void deallocate(void* ptr)
    {
        if(ptr)
            getAllocator().deallocate(ptr);
    }
};

/*
\brief Virtual allocator callback used to provide run-time defined allocators to foundation types like Array or Bitmap.
This is used by VirtualAllocator.
*/
class VirtualAllocatorCallback
{
  public:
    VirtualAllocatorCallback()
    {
    }
    virtual ~VirtualAllocatorCallback()
    {
    }
    virtual void* allocate(const size_t size, const char* file, const int line) = 0;
    virtual void deallocate(void* ptr) = 0;
};

/*
\brief Virtual allocator to be used by foundation types to provide run-time defined allocators.
Because Array extends its allocator rather than containing a reference/pointer to it, the
VirtualAllocator must be a concrete type containing a pointer to a virtual callback. The callback
may not be available at instantiation time, therefore methods are provided to set the callback later.
*/
class VirtualAllocator
{
  public:
    VirtualAllocator(VirtualAllocatorCallback* callback = NULL) : mCallback(callback)
    {
    }

    void* allocate(const size_t size, const char* file, const int line)
    {
        PX_ASSERT(mCallback);
        if(size)
            return mCallback->allocate(size, file, line);
        return NULL;
    }
    void deallocate(void* ptr)
    {
        PX_ASSERT(mCallback);
        if(ptr)
            mCallback->deallocate(ptr);
    }

    void setCallback(VirtualAllocatorCallback* callback)
    {
        mCallback = callback;
    }
    VirtualAllocatorCallback* getCallback()
    {
        return mCallback;
    }

  private:
    VirtualAllocatorCallback* mCallback;
    VirtualAllocator& operator=(const VirtualAllocator&);
};

#if PX_USE_NAMED_ALLOCATOR // can be slow, so only use in debug/checked
class PX_FOUNDATION_API NamedAllocator
{
  public:
    NamedAllocator(const PxEMPTY);
    NamedAllocator(const char* name = 0); // todo: should not have default argument!
    NamedAllocator(const NamedAllocator&);
    ~NamedAllocator();
    NamedAllocator& operator=(const NamedAllocator&);
    void* allocate(size_t size, const char* filename, int line);
    void deallocate(void* ptr);
};
#else
class NamedAllocator;
#endif // PX_DEBUG

/**
Allocator used to access the global PxAllocatorCallback instance using a static name derived from T.
*/

template <typename T>
class ReflectionAllocator
{
    static const char* getName()
    {
        if(!PxGetFoundation().getReportAllocationNames())
            return "<allocation names disabled>";
#if PX_GCC_FAMILY
        return __PRETTY_FUNCTION__;
#else
        // name() calls malloc(), raw_name() wouldn't
        return typeid(T).name();
#endif
    }

  public:
    ReflectionAllocator(const PxEMPTY)
    {
    }
    ReflectionAllocator(const char* = 0)
    {
    }
    inline ReflectionAllocator(const ReflectionAllocator&)
    {
    }
    void* allocate(size_t size, const char* filename, int line)
    {
        return size ? getAllocator().allocate(size, getName(), filename, line) : 0;
    }
    void deallocate(void* ptr)
    {
        if(ptr)
            getAllocator().deallocate(ptr);
    }
};

template <typename T>
struct AllocatorTraits
{
#if PX_USE_NAMED_ALLOCATOR
    typedef NamedAllocator Type;
#else
    typedef ReflectionAllocator<T> Type;
#endif
};

// if you get a build error here, you are trying to PX_NEW a class
// that is neither plain-old-type nor derived from UserAllocated
template <typename T, typename X>
union EnableIfPod
{
    int i;
    T t;
    typedef X Type;
};

} // namespace shdfnd
} // namespace physx

// Global placement new for ReflectionAllocator templated by
// plain-old-type. Allows using PX_NEW for pointers and built-in-types.
//
// ATTENTION: You need to use PX_DELETE_POD or PX_FREE to deallocate
// memory, not PX_DELETE. PX_DELETE_POD redirects to PX_FREE.
//
// Rationale: PX_DELETE uses global operator delete(void*), which we don't want to overload.
// Any other definition of PX_DELETE couldn't support array syntax 'PX_DELETE([]a);'.
// PX_DELETE_POD was preferred over PX_DELETE_ARRAY because it is used
// less often and applies to both single instances and arrays.
template <typename T>
PX_INLINE void* operator new(size_t size, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                             typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    return alloc.allocate(size, fileName, line);
}

template <typename T>
PX_INLINE void* operator new [](size_t size, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                                typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    return alloc.allocate(size, fileName, line);
}

// If construction after placement new throws, this placement delete is being called.
template <typename T>
PX_INLINE void operator delete(void* ptr, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                               typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    PX_UNUSED(fileName);
    PX_UNUSED(line);

    alloc.deallocate(ptr);
}

// If construction after placement new throws, this placement delete is being called.
template <typename T>
PX_INLINE void operator delete [](void* ptr, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
                                  typename physx::shdfnd::EnableIfPod<T, int>::Type line)
{
    PX_UNUSED(fileName);
    PX_UNUSED(line);

    alloc.deallocate(ptr);
}

#endif // #ifndef PSFOUNDATION_PSALLOCATOR_H
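A hedged usage sketch of the macros above. It assumes a foundation with a registered allocator callback is already up; `MyClass` is a hypothetical type derived from UserAllocated (declared in PsUserAllocated.h):

```cpp
#include "PsAllocator.h"
#include "PsUserAllocated.h"

// Hypothetical class; PX_NEW/PX_DELETE require UserAllocated (or a POD).
class MyClass : public physx::shdfnd::UserAllocated
{
  public:
    MyClass() : value(0)
    {
    }
    int value;
};

void allocatorExamples()
{
    // Raw bytes through the user allocator, name-tagged in checked builds.
    void* bytes = PX_ALLOC(256, "MyBuffer");
    PX_FREE(bytes);

    // Typed allocation routed through ReflectionAllocator/NamedAllocator.
    MyClass* obj = PX_NEW(MyClass);
    PX_DELETE(obj);

    // PODs created with PX_NEW must be released with PX_DELETE_POD (see the
    // rationale comment above), which redirects to PX_FREE.
    uint32_t* pod = PX_NEW(uint32_t);
    PX_DELETE_POD(pod);
}
```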
diff --git a/PxShared/src/foundation/include/PsAoS.h b/PxShared/src/foundation/include/PsAoS.h
new file mode 100644
index 00000000..f892a4d8
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAoS.h
@@ -0,0 +1,45 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSAOS_H
#define PSFOUNDATION_PSAOS_H

#include "foundation/Px.h"

#if PX_WINDOWS && !PX_NEON
#include "windows/PsWindowsAoS.h"
#elif(PX_UNIX_FAMILY || PX_PS4)
#include "unix/PsUnixAoS.h"
#elif PX_XBOXONE
#include "XboxOne/PsXboxOneAoS.h"
#else
#error "Platform not supported!"
#endif

#endif
diff --git a/PxShared/src/foundation/include/PsArray.h b/PxShared/src/foundation/include/PsArray.h
new file mode 100644
index 00000000..19ed9b05
--- /dev/null
+++ b/PxShared/src/foundation/include/PsArray.h
@@ -0,0 +1,802 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSARRAY_H
#define PSFOUNDATION_PSARRAY_H

#include "foundation/PxAssert.h"
#include "foundation/PxIntrinsics.h"
#include "PsAllocator.h"
#include "PsBasicTemplates.h"

#if PX_LIBCPP
#include <type_traits>
#else
#include <tr1/type_traits>
#endif

#if PX_VC == 9 || PX_VC == 10
#pragma warning(push)
#pragma warning(disable : 4347) // behavior change: 'function template' is called instead of 'function'
#endif

namespace physx
{
namespace shdfnd
{
template <class Serializer>
void exportArray(Serializer& stream, const void* data, uint32_t size, uint32_t sizeOfElement, uint32_t capacity);
char* importArray(char* address, void** data, uint32_t size, uint32_t sizeOfElement, uint32_t capacity);

/*!
An array is a sequential container.

Implementation note
* entries between 0 and size are valid objects
* we use inheritance to build this because the array is included inline in a lot
  of objects and we want the allocator to take no space if it's not stateful, which
  aggregation doesn't allow. Also, we want the metadata at the front for the inline
  case where the allocator contains some inline storage space
*/
template <class T, class Alloc = typename AllocatorTraits<T>::Type>
class Array : protected Alloc
{
  public:
    typedef T* Iterator;
    typedef const T* ConstIterator;

    explicit Array(const PxEMPTY v) : Alloc(v)
    {
        if(mData)
            mCapacity |= PX_SIGN_BITMASK;
    }

    /*!
    Default array constructor. Initialize an empty array
    */
    PX_INLINE explicit Array(const Alloc& alloc = Alloc()) : Alloc(alloc), mData(0), mSize(0), mCapacity(0)
    {
    }

    /*!
    Initialize array with given capacity
    */
    PX_INLINE explicit Array(uint32_t size, const T& a = T(), const Alloc& alloc = Alloc())
    : Alloc(alloc), mData(0), mSize(0), mCapacity(0)
    {
        resize(size, a);
    }

    /*!
    Copy-constructor. Copy all entries from other array
    */
    template <class A>
    PX_INLINE explicit Array(const Array<T, A>& other, const Alloc& alloc = Alloc())
    : Alloc(alloc)
    {
        copy(other);
    }

    // This is necessary else the basic default copy constructor is used in the case of both arrays being of the same
    // template instance
    // The C++ standard clearly states that a template constructor is never a copy constructor [2]. In other words,
    // the presence of a template constructor does not suppress the implicit declaration of the copy constructor.
    // Also never make a copy constructor explicit, or copy-initialization* will no longer work. This is because
    // 'binding an rvalue to a const reference requires an accessible copy constructor' (http://gcc.gnu.org/bugs/)
    // *http://stackoverflow.com/questions/1051379/is-there-a-difference-in-c-between-copy-initialization-and-assignment-initializ
    PX_INLINE Array(const Array& other, const Alloc& alloc = Alloc()) : Alloc(alloc)
    {
        copy(other);
    }

    /*!
    Initialize array with given length
    */
    PX_INLINE explicit Array(const T* first, const T* last, const Alloc& alloc = Alloc())
    : Alloc(alloc), mSize(last < first ? 0 : uint32_t(last - first)), mCapacity(mSize)
    {
        mData = allocate(mSize);
        copy(mData, mData + mSize, first);
    }

    /*!
    Destructor
    */
    PX_INLINE ~Array()
    {
        destroy(mData, mData + mSize);

        if(capacity() && !isInUserMemory())
            deallocate(mData);
    }

    /*!
    Assignment operator. Copy content (deep-copy)
    */
    template <class A>
    PX_INLINE Array& operator=(const Array<T, A>& rhs)
    {
        if(&rhs == this)
            return *this;

        clear();
        reserve(rhs.mSize);
        copy(mData, mData + rhs.mSize, rhs.mData);

        mSize = rhs.mSize;
        return *this;
    }

    PX_INLINE Array& operator=(const Array& t) // Needs to be declared, see comment at copy-constructor
    {
        return operator=<Alloc>(t);
    }

    PX_FORCE_INLINE static bool isArrayOfPOD()
    {
#if PX_LIBCPP
        return std::is_trivially_copyable<T>::value;
#else
        return std::tr1::is_pod<T>::value;
#endif
    }

    /*!
    Array indexing operator.
    \param i
    The index of the element that will be returned.
    \return
    The element i in the array.
    */
    PX_FORCE_INLINE const T& operator[](uint32_t i) const
    {
        PX_ASSERT(i < mSize);
        return mData[i];
    }

    /*!
    Array indexing operator.
    \param i
    The index of the element that will be returned.
    \return
    The element i in the array.
    */
    PX_FORCE_INLINE T& operator[](uint32_t i)
    {
        PX_ASSERT(i < mSize);
        return mData[i];
    }

    /*!
    Returns a pointer to the initial element of the array.
    \return
    a pointer to the initial element of the array.
    */
    PX_FORCE_INLINE ConstIterator begin() const
    {
        return mData;
    }

    PX_FORCE_INLINE Iterator begin()
    {
        return mData;
    }

    /*!
    Returns an iterator beyond the last element of the array. Do not dereference.
    \return
    a pointer to the element beyond the last element of the array.
    */
    PX_FORCE_INLINE ConstIterator end() const
    {
        return mData + mSize;
    }

    PX_FORCE_INLINE Iterator end()
    {
        return mData + mSize;
    }

    /*!
    Returns a reference to the first element of the array. Undefined if the array is empty.
    \return a reference to the first element of the array
    */
    PX_FORCE_INLINE const T& front() const
    {
        PX_ASSERT(mSize);
        return mData[0];
    }

    PX_FORCE_INLINE T& front()
    {
        PX_ASSERT(mSize);
        return mData[0];
    }

    /*!
    Returns a reference to the last element of the array. Undefined if the array is empty
    \return a reference to the last element of the array
    */
    PX_FORCE_INLINE const T& back() const
    {
        PX_ASSERT(mSize);
        return mData[mSize - 1];
    }

    PX_FORCE_INLINE T& back()
    {
        PX_ASSERT(mSize);
        return mData[mSize - 1];
    }

    /*!
    Returns the number of entries in the array. This can, and probably will,
    differ from the array capacity.
    \return
    The number of entries in the array.
    */
    PX_FORCE_INLINE uint32_t size() const
    {
        return mSize;
    }

    /*!
    Clears the array.
    */
    PX_INLINE void clear()
    {
        destroy(mData, mData + mSize);
        mSize = 0;
    }

    /*!
    Returns whether the array is empty (i.e. whether its size is 0).
    \return
    true if the array is empty
    */
    PX_FORCE_INLINE bool empty() const
    {
        return mSize == 0;
    }

    /*!
    Finds the first occurrence of an element in the array.
    \param a
    The element to find.
    */
    PX_INLINE Iterator find(const T& a)
    {
        uint32_t index;
        for(index = 0; index < mSize && mData[index] != a; index++)
            ;
        return mData + index;
    }

    PX_INLINE ConstIterator find(const T& a) const
    {
        uint32_t index;
        for(index = 0; index < mSize && mData[index] != a; index++)
            ;
        return mData + index;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Adds one element to the end of the array. Operation is O(1).
    \param a
    The element that will be added to this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_FORCE_INLINE T& pushBack(const T& a)
    {
        if(capacity() <= mSize)
            return growAndPushBack(a);

        PX_PLACEMENT_NEW(reinterpret_cast<void*>(mData + mSize), T)(a);

        return mData[mSize++];
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Returns the element at the end of the array. Only legal if the array is non-empty.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE T popBack()
    {
        PX_ASSERT(mSize);
        T t = mData[mSize - 1];

        if(!isArrayOfPOD())
        {
            mData[--mSize].~T();
        }
        else
        {
            --mSize;
        }

        return t;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Construct one element at the end of the array. Operation is O(1).
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE T& insert()
    {
        if(capacity() <= mSize)
            grow(capacityIncrement());

        T* ptr = mData + mSize++;
        new (ptr) T; // not 'T()' because PODs should not get default-initialized.
        return *ptr;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Removes the element at position i from the array and replaces it with
    the last element.
    Operation is O(1)
    \param i
    The position of the element that will be removed from this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE void replaceWithLast(uint32_t i)
    {
        PX_ASSERT(i < mSize);
        mData[i] = mData[--mSize];

        if(!isArrayOfPOD())
        {
            mData[mSize].~T();
        }
    }

    PX_INLINE void replaceWithLast(Iterator i)
    {
        replaceWithLast(static_cast<uint32_t>(i - mData));
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Replaces the first occurrence of the element a with the last element
    Operation is O(n)
    \param a
    The element that will be removed from this array.
    \return true if the element has been removed.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE bool findAndReplaceWithLast(const T& a)
    {
        uint32_t index = 0;
        while(index < mSize && mData[index] != a)
            ++index;
        if(index == mSize)
            return false;
        replaceWithLast(index);
        return true;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Removes the element at position i from the array, shifting the rest of
    the array down one step.
    Operation is O(n)
    \param i
    The position of the element that will be removed from this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE void remove(uint32_t i)
    {
        PX_ASSERT(i < mSize);

        if(isArrayOfPOD())
        {
            if(i + 1 != mSize)
            {
                physx::intrinsics::memMove(mData + i, mData + i + 1, (mSize - i - 1) * sizeof(T));
            }
        }
        else
        {
            T* it = mData + i++;
            it->~T();
            do
            {
                new (it) T(mData[i]);
                ++it;
                it->~T();
            } while(++i < mSize);
        }

        --mSize;
    }

    /////////////////////////////////////////////////////////////////////////
    /*!
    Removes a range from the array. Shifts the array so order is maintained.
    Operation is O(n)
    \param begin
    The starting position of the elements that will be removed from this array.
    \param count
    The number of elements that will be removed from this array.
    */
    /////////////////////////////////////////////////////////////////////////
    PX_INLINE void removeRange(uint32_t begin, uint32_t count)
    {
        PX_ASSERT(begin < mSize);
        PX_ASSERT((begin + count) <= mSize);

        if(!isArrayOfPOD())
        {
            for(uint32_t i = 0; i < count; i++)
            {
                mData[begin + i].~T(); // call the destructor on the ones being removed first.
            }
        }

        T* dest = &mData[begin];                       // location we are copying the tail end objects to
        T* src = &mData[begin + count];                // start of tail objects
        uint32_t move_count = mSize - (begin + count); // compute remainder that needs to be copied down

        if(isArrayOfPOD())
        {
            physx::intrinsics::memMove(dest, src, move_count * sizeof(T));
        }
        else
        {
            for(uint32_t i = 0; i < move_count; i++)
            {
                new (dest) T(*src); // copy the old one to the new location
                src->~T();          // call the destructor on the old location
                dest++;
                src++;
            }
        }
        mSize -= count;
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Resize array
    */
    //////////////////////////////////////////////////////////////////////////
    PX_NOINLINE void resize(const uint32_t size, const T& a = T());

    PX_NOINLINE void resizeUninitialized(const uint32_t size);

    //////////////////////////////////////////////////////////////////////////
    /*!
    Resize array such that only as much memory is allocated to hold the
    existing elements
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void shrink()
    {
        recreate(mSize);
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Deletes all array elements and frees memory.
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void reset()
    {
        resize(0);
        shrink();
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Ensure that the array has at least size capacity.
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void reserve(const uint32_t capacity)
    {
        if(capacity > this->capacity())
            grow(capacity);
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Query the capacity (allocated memory) for the array.
    */
    //////////////////////////////////////////////////////////////////////////
    PX_FORCE_INLINE uint32_t capacity() const
    {
        return mCapacity & ~PX_SIGN_BITMASK;
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Unsafe function to force the size of the array
    */
    //////////////////////////////////////////////////////////////////////////
    PX_FORCE_INLINE void forceSize_Unsafe(uint32_t size)
    {
        PX_ASSERT(size <= mCapacity);
        mSize = size;
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Swap contents of an array without allocating temporary storage
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void swap(Array<T, Alloc>& other)
    {
        shdfnd::swap(mData, other.mData);
        shdfnd::swap(mSize, other.mSize);
        shdfnd::swap(mCapacity, other.mCapacity);
    }

    //////////////////////////////////////////////////////////////////////////
    /*!
    Assign a range of values to this vector (resizes to length of range)
    */
    //////////////////////////////////////////////////////////////////////////
    PX_INLINE void assign(const T* first, const T* last)
    {
        resizeUninitialized(uint32_t(last - first));
        copy(begin(), end(), first);
    }

    // We need one bit to mark arrays that have been deserialized from a user-provided memory block.
    // For alignment & memory saving purpose we store that bit in the rarely used capacity member.
    PX_FORCE_INLINE uint32_t isInUserMemory() const
    {
        return mCapacity & PX_SIGN_BITMASK;
    }

    /// return reference to allocator
    PX_INLINE Alloc& getAllocator()
    {
        return *this;
    }

  protected:
    // constructor for where we don't own the memory
    Array(T* memory, uint32_t size, uint32_t capacity, const Alloc& alloc = Alloc())
    : Alloc(alloc), mData(memory), mSize(size), mCapacity(capacity | PX_SIGN_BITMASK)
    {
    }

    template <class A>
    PX_NOINLINE void copy(const Array<T, A>& other);

    PX_INLINE T* allocate(uint32_t size)
    {
        if(size > 0)
        {
            T* p = reinterpret_cast<T*>(Alloc::allocate(sizeof(T) * size, __FILE__, __LINE__));
/**
Mark a specified amount of memory with 0xcd pattern. This is used to check that the meta data
definition for serialized classes is complete in checked builds.
*/
#if PX_CHECKED
            if(p)
            {
                for(uint32_t i = 0; i < (sizeof(T) * size); ++i)
                    reinterpret_cast<uint8_t*>(p)[i] = 0xcd;
            }
#endif
            return p;
        }
        return 0;
    }

    PX_INLINE void deallocate(void* mem)
    {
        Alloc::deallocate(mem);
    }

    static PX_INLINE bool isZeroInit(const T& object)
    {
        char ZeroBuffOnStack[sizeof(object)] = {};
        return memcmp(&object, ZeroBuffOnStack, sizeof(object)) == 0;
    }

    static PX_INLINE void create(T* first, T* last, const T& a)
    {
        if(isArrayOfPOD() && isZeroInit(a))
        {
            if(last > first)
                physx::intrinsics::memZero(first, uint32_t((last - first) * sizeof(T)));
        }
        else
        {
            for(; first < last; ++first)
                ::new (first) T(a);
        }
    }

    static PX_INLINE void copy(T* first, T* last, const T* src)
    {
        if(last <= first)
            return;

        if(isArrayOfPOD())
        {
            physx::intrinsics::memCopy(first, src, uint32_t((last - first) * sizeof(T)));
        }
        else
        {
            for(; first < last; ++first, ++src)
                ::new (first) T(*src);
        }
    }

    static PX_INLINE void destroy(T* first, T* last)
    {
        if(!isArrayOfPOD())
        {
            for(; first < last; ++first)
                first->~T();
        }
    }

    /*!
    Called when pushBack() needs to grow the array.
    \param a The element that will be added to this array.
    */
    PX_NOINLINE T& growAndPushBack(const T& a);

    /*!
    Resizes the available memory for the array.

    \param capacity
    The number of entries that the set should be able to hold.
    */
    PX_INLINE void grow(uint32_t capacity)
    {
        PX_ASSERT(this->capacity() < capacity);
        recreate(capacity);
    }

    /*!
    Creates a new memory block, copies all entries to the new block and destroys old entries.

    \param capacity
    The number of entries that the set should be able to hold.
    */
    PX_NOINLINE void recreate(uint32_t capacity);

    // The idea here is to prevent accidental bugs with pushBack or insert. Unfortunately
    // it interacts badly with InlineArrays with smaller inline allocations.
    // TODO(dsequeira): policy template arg, this is exactly what they're for.
    PX_INLINE uint32_t capacityIncrement() const
    {
        const uint32_t capacity = this->capacity();
        return capacity == 0 ? 1 : capacity * 2;
    }

    T* mData;
    uint32_t mSize;
    uint32_t mCapacity;
};

template <class T, class Alloc>
PX_NOINLINE void Array<T, Alloc>::resize(const uint32_t size, const T& a)
{
    reserve(size);
    create(mData + mSize, mData + size, a);
    destroy(mData + size, mData + mSize);
    mSize = size;
}

template <class T, class Alloc>
template <class A>
PX_NOINLINE void Array<T, Alloc>::copy(const Array<T, A>& other)
{
    if(!other.empty())
    {
        mData = allocate(mSize = mCapacity = other.size());
        copy(mData, mData + mSize, other.begin());
    }
    else
    {
        mData = NULL;
        mSize = 0;
        mCapacity = 0;
    }
}

template <class T, class Alloc>
PX_NOINLINE void Array<T, Alloc>::resizeUninitialized(const uint32_t size)
{
    reserve(size);
    mSize = size;
}

template <class T, class Alloc>
PX_NOINLINE T& Array<T, Alloc>::growAndPushBack(const T& a)
{
    uint32_t capacity = capacityIncrement();

    T* newData = allocate(capacity);
    PX_ASSERT((!capacity) || (newData && (newData != mData)));
    copy(newData, newData + mSize, mData);

    // inserting element before destroying old array
    // avoids referencing destroyed object when duplicating array element.
    PX_PLACEMENT_NEW(reinterpret_cast<void*>(newData + mSize), T)(a);

    destroy(mData, mData + mSize);
    if(!isInUserMemory())
        deallocate(mData);

    mData = newData;
    mCapacity = capacity;

    return mData[mSize++];
}

template <class T, class Alloc>
PX_NOINLINE void Array<T, Alloc>::recreate(uint32_t capacity)
{
    T* newData = allocate(capacity);
    PX_ASSERT((!capacity) || (newData && (newData != mData)));

    copy(newData, newData + mSize, mData);
    destroy(mData, mData + mSize);
    if(!isInUserMemory())
        deallocate(mData);

    mData = newData;
    mCapacity = capacity;
}

template <class T, class Alloc>
PX_INLINE void swap(Array<T, Alloc>& x, Array<T, Alloc>& y)
{
    x.swap(y);
}

} // namespace shdfnd
} // namespace physx

#if PX_VC == 9 || PX_VC == 10
#pragma warning(pop)
#endif

#endif // #ifndef PSFOUNDATION_PSARRAY_H
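A hedged usage sketch of the container above, assuming an initialized foundation so the default allocator can reach the user callback:

```cpp
#include "PsArray.h"

void arrayExample()
{
    physx::shdfnd::Array<uint32_t> ids;
    ids.reserve(4); // one grow() up front instead of doubling on each pushBack
    for(uint32_t i = 0; i < 4; ++i)
        ids.pushBack(i * 10); // ids is {0, 10, 20, 30}

    ids.replaceWithLast(1); // O(1) unordered removal: ids is now {0, 30, 20}
    ids.remove(0);          // O(n) ordered removal:   ids is now {30, 20}

    uint32_t last = ids.popBack(); // last == 20
    PX_UNUSED(last);
}
```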
diff --git a/PxShared/src/foundation/include/PsAtomic.h b/PxShared/src/foundation/include/PsAtomic.h
new file mode 100644
index 00000000..4e24c84c
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAtomic.h
@@ -0,0 +1,63 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSATOMIC_H
#define PSFOUNDATION_PSATOMIC_H

#include "Ps.h"

namespace physx
{
namespace shdfnd
{
/* set *dest equal to val. Return the old value of *dest */
PX_FOUNDATION_API int32_t atomicExchange(volatile int32_t* dest, int32_t val);

/* if *dest == comp, replace with exch. Return original value of *dest */
PX_FOUNDATION_API int32_t atomicCompareExchange(volatile int32_t* dest, int32_t exch, int32_t comp);

/* if *dest == comp, replace with exch. Return original value of *dest */
PX_FOUNDATION_API void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp);

/* increment the specified location. Return the incremented value */
PX_FOUNDATION_API int32_t atomicIncrement(volatile int32_t* val);

/* decrement the specified location. Return the decremented value */
PX_FOUNDATION_API int32_t atomicDecrement(volatile int32_t* val);

/* add delta to *val. Return the new value */
PX_FOUNDATION_API int32_t atomicAdd(volatile int32_t* val, int32_t delta);

/* compute the maximum of dest and val. Return the new value */
PX_FOUNDATION_API int32_t atomicMax(volatile int32_t* val, int32_t val2);

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSATOMIC_H
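A minimal sketch of a common use of this API: an intrusive, thread-safe reference count. The class itself is illustrative; only the two atomic calls come from the header above. Because atomicDecrement returns the new value, exactly one thread observes 0 and performs the delete:

```cpp
#include "PsAtomic.h"

class RefCounted
{
  public:
    RefCounted() : mRefCount(1)
    {
    }
    void acquire()
    {
        physx::shdfnd::atomicIncrement(&mRefCount);
    }
    void release()
    {
        // the returned value is the decremented count, so the race on "who
        // saw zero" is resolved inside the atomic itself
        if(physx::shdfnd::atomicDecrement(&mRefCount) == 0)
            delete this;
    }

  protected:
    virtual ~RefCounted()
    {
    }

  private:
    volatile int32_t mRefCount;
};
```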
diff --git a/PxShared/src/foundation/include/PsBasicTemplates.h b/PxShared/src/foundation/include/PsBasicTemplates.h
new file mode 100644
index 00000000..ec8503e4
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBasicTemplates.h
@@ -0,0 +1,146 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSBASICTEMPLATES_H
#define PSFOUNDATION_PSBASICTEMPLATES_H

#include "Ps.h"

namespace physx
{
namespace shdfnd
{
template <typename A>
struct Equal
{
    bool operator()(const A& a, const A& b) const
    {
        return a == b;
    }
};

template <typename A>
struct Less
{
    bool operator()(const A& a, const A& b) const
    {
        return a < b;
    }
};

template <typename A>
struct Greater
{
    bool operator()(const A& a, const A& b) const
    {
        return a > b;
    }
};

template <class F, class S>
class Pair
{
  public:
    F first;
    S second;
    Pair() : first(F()), second(S())
    {
    }
    Pair(const F& f, const S& s) : first(f), second(s)
    {
    }
    Pair(const Pair& p) : first(p.first), second(p.second)
    {
    }
    // CN - fix for /.../PsBasicTemplates.h(61) : warning C4512: 'physx::shdfnd::Pair<F,S>' : assignment operator could
    // not be generated
    Pair& operator=(const Pair& p)
    {
        first = p.first;
        second = p.second;
        return *this;
    }
    bool operator==(const Pair& p) const
    {
        return first == p.first && second == p.second;
    }
    bool operator<(const Pair& p) const
    {
        if(first < p.first)
            return true;
        else
            return !(p.first < first) && (second < p.second);
    }
};

template <unsigned int A>
struct LogTwo
{
    static const unsigned int value = LogTwo<(A >> 1)>::value + 1;
};
template <>
struct LogTwo<1>
{
    static const unsigned int value = 0;
};

template <typename T>
struct UnConst
{
    typedef T Type;
};
template <typename T>
struct UnConst<const T>
{
    typedef T Type;
};

template <typename T>
T pointerOffset(void* p, ptrdiff_t offset)
{
    return reinterpret_cast<T>(reinterpret_cast<char*>(p) + offset);
}
template <typename T>
T pointerOffset(const void* p, ptrdiff_t offset)
{
    return reinterpret_cast<T>(reinterpret_cast<const char*>(p) + offset);
}

template <class T>
PX_CUDA_CALLABLE PX_INLINE void swap(T& x, T& y)
{
    const T tmp = x;
    x = y;
    y = tmp;
}

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSBASICTEMPLATES_H
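Two small illustrative uses of the templates above (PX_COMPILE_TIME_ASSERT and PX_UNUSED are assumed to be available from the foundation headers):

```cpp
#include "PsBasicTemplates.h"

// LogTwo is a compile-time log2 for powers of two: LogTwo<256> unrolls
// through eight recursive instantiations down to the LogTwo<1> base case.
PX_COMPILE_TIME_ASSERT(physx::shdfnd::LogTwo<256>::value == 8);

void templatesExample()
{
    // Pair orders lexicographically: first, then second.
    physx::shdfnd::Pair<uint32_t, float> a(7, 2.5f);
    physx::shdfnd::Pair<uint32_t, float> b(7, 3.5f);
    bool less = a < b; // true: equal firsts, then 2.5f < 3.5f
    PX_UNUSED(less);

    uint32_t x = 1, y = 2;
    physx::shdfnd::swap(x, y); // x == 2, y == 1
}
```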
diff --git a/PxShared/src/foundation/include/PsBitUtils.h b/PxShared/src/foundation/include/PsBitUtils.h
new file mode 100644
index 00000000..7bfef7a7
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBitUtils.h
@@ -0,0 +1,109 @@
// [NVIDIA license header and copyright notice, identical to Ps.h above]

#ifndef PSFOUNDATION_PSBITUTILS_H
#define PSFOUNDATION_PSBITUTILS_H

#include "foundation/PxIntrinsics.h"
#include "foundation/PxAssert.h"
#include "PsIntrinsics.h"
#include "Ps.h"

namespace physx
{
namespace shdfnd
{
PX_INLINE uint32_t bitCount(uint32_t v)
{
    // from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
    uint32_t const w = v - ((v >> 1) & 0x55555555);
    uint32_t const x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return (((x + (x >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
}

PX_INLINE bool isPowerOfTwo(uint32_t x)
{
    return x != 0 && (x & (x - 1)) == 0;
}

// "Next Largest Power of 2
// Given a binary integer value x, the next largest power of 2 can be computed by a SWAR algorithm
// that recursively "folds" the upper bits into the lower bits. This process yields a bit vector with
// the same most significant 1 as x, but all 1's below it. Adding 1 to that value yields the next
// largest power of 2. For a 32-bit value:"
PX_INLINE uint32_t nextPowerOfTwo(uint32_t x)
{
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    return x + 1;
}

/*!
Return the index of the lowest set bit. Not valid for zero arg.
*/
PX_INLINE uint32_t lowestSetBit(uint32_t x)
{
    PX_ASSERT(x);
    return lowestSetBitUnsafe(x);
}

/*!
Return the index of the highest set bit. Not valid for zero arg.
*/
PX_INLINE uint32_t highestSetBit(uint32_t x)
{
    PX_ASSERT(x);
    return highestSetBitUnsafe(x);
}

// Helper function to approximate log2 of an integer value
// assumes that the input is actually power of two.
// todo: replace 2 usages with 'highestSetBit'
PX_INLINE uint32_t ilog2(uint32_t num)
{
    for(uint32_t i = 0; i < 32; i++)
    {
        num >>= 1;
        if(num == 0)
            return i;
    }

    PX_ASSERT(0);
    return uint32_t(-1);
}

} // namespace shdfnd
} // namespace physx

#endif // #ifndef PSFOUNDATION_PSBITUTILS_H
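A standalone check of the two SWAR tricks above; the bodies are copied from the header and renamed so as not to suggest this is the PhysX API. Note that for an input that is already an exact power of two, nextPowerOfTwo returns the next one up:

```cpp
#include <cassert>
#include <cstdint>

static uint32_t bitCountRef(uint32_t v)
{
    uint32_t const w = v - ((v >> 1) & 0x55555555);
    uint32_t const x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return (((x + (x >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
}

static uint32_t nextPowerOfTwoRef(uint32_t x)
{
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    return x + 1;
}

int main()
{
    assert(bitCountRef(0xF0F0) == 8);       // eight set bits
    assert(nextPowerOfTwoRef(4) == 8);      // exact power of two -> next one up
    assert(nextPowerOfTwoRef(5) == 8);
    assert(nextPowerOfTwoRef(0) == 1);
    return 0;
}
```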
diff --git a/PxShared/src/foundation/include/PsBroadcast.h b/PxShared/src/foundation/include/PsBroadcast.h
new file mode 100644
index 00000000..a1e04c38
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBroadcast.h
@@ -0,0 +1,277 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXBROADCAST_H
+#define PXPVDSDK_PXBROADCAST_H
+
+#include "Ps.h"
+#include "PsInlineArray.h"
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxErrorCallback.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+/**
+\brief Abstract listener class that listens to allocation and deallocation events from the
+	foundation memory system.
+
+<b>Threading:</b> All methods of this class should be thread safe as it can be called from the user thread
+or the physics processing thread(s).
+*/
+class AllocationListener
+{
+  public:
+	/**
+	\brief callback when memory is allocated.
+	\param size Size of the allocation in bytes.
+	\param typeName Type this data is being allocated for.
+	\param filename File the allocation came from.
+	\param line Line number the allocation came from.
+	\param allocatedMemory Memory that will be returned from the allocation.
+	*/
+	virtual void onAllocation(size_t size, const char* typeName, const char* filename, int line,
+	                          void* allocatedMemory) = 0;
+
+	/**
+	\brief callback when memory is deallocated.
+	\param allocatedMemory Memory block that is about to be deallocated.
+	*/
+	virtual void onDeallocation(void* allocatedMemory) = 0;
+
+  protected:
+	virtual ~AllocationListener()
+	{
+	}
+};
+
+/**
+\brief Broadcast class implementation, registering listeners.
+
+<b>Threading:</b> All methods of this class should be thread safe as it can be called from the user thread
+or the physics processing thread(s). There is no internal locking.
+*/
+template <class Listener, class Base>
+class Broadcast : public Base
+{
+  public:
+	static const uint32_t MAX_NB_LISTENERS = 16;
+
+	/**
+	\brief The default constructor.
+	*/
+	Broadcast()
+	{
+	}
+
+	/**
+	\brief Register new listener.
+
+	\note It is NOT SAFE to register and deregister listeners while allocations may be taking place.
+	Moreover, there is no thread safety to registration/deregistration.
+
+	\param listener Listener to register.
+	*/
+	void registerListener(Listener& listener)
+	{
+		if(mListeners.size() < MAX_NB_LISTENERS)
+			mListeners.pushBack(&listener);
+	}
+
+	/**
+	\brief Deregister an existing listener.
+
+	\note It is NOT SAFE to register and deregister listeners while allocations may be taking place.
+	Moreover, there is no thread safety to registration/deregistration.
+
+	\param listener Listener to deregister.
+	*/
+	void deregisterListener(Listener& listener)
+	{
+		mListeners.findAndReplaceWithLast(&listener);
+	}
+
+	/**
+	\brief Get number of registered listeners.
+
+	\return Number of listeners.
+	*/
+	uint32_t getNbListeners() const
+	{
+		return mListeners.size();
+	}
+
+	/**
+	\brief Get an existing listener from given index.
+
+	\param index Index of the listener.
+	\return Listener on given index.
+	*/
+	Listener& getListener(uint32_t index)
+	{
+		PX_ASSERT(index < mListeners.size());
+		return *mListeners[index];
+	}
+
+  protected:
+	virtual ~Broadcast()
+	{
+	}
+
+	physx::shdfnd::InlineArray<Listener*, MAX_NB_LISTENERS, physx::shdfnd::NonTrackingAllocator> mListeners;
+};
+
+/**
+\brief Abstract base class for an application defined memory allocator that allows an external listener
+to audit the memory allocations.
+*/
+class BroadcastingAllocator : public Broadcast<AllocationListener, PxAllocatorCallback>
+{
+	PX_NOCOPY(BroadcastingAllocator)
+
+  public:
+	/**
+	\brief The default constructor.
+	*/
+	BroadcastingAllocator(PxAllocatorCallback& allocator, PxErrorCallback& error) : mAllocator(allocator), mError(error)
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief The destructor.
+	*/
+	virtual ~BroadcastingAllocator()
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief Allocates size bytes of memory, which must be 16-byte aligned.
+
+	This method should never return NULL. If you run out of memory, then
+	you should terminate the app or take some other appropriate action.
+
+	<b>Threading:</b> This function should be thread safe as it can be called in the context of the user thread
+	and physics processing thread(s).
+
+	\param size Number of bytes to allocate.
+	\param typeName Name of the datatype that is being allocated
+	\param filename The source file which allocated the memory
+	\param line The source line which allocated the memory
+	\return The allocated block of memory.
+	*/
+	void* allocate(size_t size, const char* typeName, const char* filename, int line)
+	{
+		void* mem = mAllocator.allocate(size, typeName, filename, line);
+
+		if(!mem)
+		{
+			mError.reportError(PxErrorCode::eABORT, "User allocator returned NULL.", __FILE__, __LINE__);
+			return NULL;
+		}
+
+		if((reinterpret_cast<size_t>(mem) & 15))
+		{
+			mError.reportError(PxErrorCode::eABORT, "Allocations must be 16-byte aligned.", __FILE__, __LINE__);
+			return NULL;
+		}
+
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+			mListeners[i]->onAllocation(size, typeName, filename, line, mem);
+
+		return mem;
+	}
+
+	/**
+	\brief Frees memory previously allocated by allocate().
+
+	<b>Threading:</b> This function should be thread safe as it can be called in the context of the user thread
+	and physics processing thread(s).
+
+	\param ptr Memory to free.
+	*/
+	void deallocate(void* ptr)
+	{
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+		{
+			mListeners[i]->onDeallocation(ptr);
+		}
+		mAllocator.deallocate(ptr);
+	}
+
+  private:
+	PxAllocatorCallback& mAllocator;
+	PxErrorCallback& mError;
+};
+
+/**
+\brief Abstract base class for an application defined error callback that allows an external listener
+to report errors.
+*/
+class BroadcastingErrorCallback : public Broadcast<PxErrorCallback, PxErrorCallback>
+{
+	PX_NOCOPY(BroadcastingErrorCallback)
+  public:
+	/**
+	\brief The default constructor.
+	*/
+	BroadcastingErrorCallback(PxErrorCallback& errorCallback)
+	{
+		registerListener(errorCallback);
+	}
+
+	/**
+	\brief The default destructor.
+	*/
+	virtual ~BroadcastingErrorCallback()
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief Reports an error code.
+	\param code Error code, see #PxErrorCode
+	\param message Message to display.
+	\param file File the error occurred in.
+	\param line Line number the error occurred on.
+	*/
+	void reportError(PxErrorCode::Enum code, const char* message, const char* file, int line)
+	{
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+			mListeners[i]->reportError(code, message, file, line);
+	}
+};
+}
+} // namespace physx
+
+#endif // PXPVDSDK_PXBROADCAST_H
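To see how the pieces fit together: BroadcastingAllocator forwards each request to the user's PxAllocatorCallback, validates the result, and then fans the event out to every registered AllocationListener. A minimal sketch (hypothetical; userAllocator and userErrors stand in for application-supplied callbacks):

    #include "PsBroadcast.h"

    using namespace physx;

    // Counts live allocations reported through the broadcaster.
    class CountingListener : public shdfnd::AllocationListener
    {
      public:
        CountingListener() : mLive(0)
        {
        }
        virtual void onAllocation(size_t, const char*, const char*, int, void*)
        {
            mLive++;
        }
        virtual void onDeallocation(void*)
        {
            mLive--;
        }
        int mLive;
    };

    void auditAllocations(PxAllocatorCallback& userAllocator, PxErrorCallback& userErrors)
    {
        shdfnd::BroadcastingAllocator broadcaster(userAllocator, userErrors);
        CountingListener counter;
        broadcaster.registerListener(counter); // not safe while allocations are in flight

        void* block = broadcaster.allocate(256, "demo", __FILE__, __LINE__);
        broadcaster.deallocate(block);         // counter.mLive drops back to 0
        broadcaster.deregisterListener(counter);
    }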
diff --git a/PxShared/src/foundation/include/PsCpu.h b/PxShared/src/foundation/include/PsCpu.h
new file mode 100644
index 00000000..52aef045
--- /dev/null
+++ b/PxShared/src/foundation/include/PsCpu.h
@@ -0,0 +1,47 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSCPU_H
+#define PSFOUNDATION_PSCPU_H
+
+#include "Ps.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+class Cpu
+{
+  public:
+	static uint8_t getCpuId();
+};
+}
+}
+
+#endif // #ifndef PSFOUNDATION_PSCPU_H
diff --git a/PxShared/src/foundation/include/PsFPU.h b/PxShared/src/foundation/include/PsFPU.h
new file mode 100644
index 00000000..d8f0df75
--- /dev/null
+++ b/PxShared/src/foundation/include/PsFPU.h
@@ -0,0 +1,103 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSFPU_H
+#define PSFOUNDATION_PSFPU_H
+
+#include "Ps.h"
+#include "PsIntrinsics.h"
+
+// Integer representation of a floating-point value.
+#define PX_IR(x) ((uint32_t&)(x))
+
+// Signed integer representation of a floating-point value.
+#define PX_SIR(x) ((int32_t&)(x))
+
+// Floating-point representation of an integer value.
+#define PX_FR(x) ((float&)(x))
+
+#define PX_SIGN_BITMASK 0x80000000
+
+#define PX_FPU_GUARD shdfnd::FPUGuard scopedFpGuard;
+#define PX_SIMD_GUARD shdfnd::SIMDGuard scopedFpGuard;
+
+#define PX_SUPPORT_GUARDS (PX_WINDOWS_FAMILY || PX_XBOXONE || (PX_LINUX && (PX_X86 || PX_X64)) || PX_PS4 || PX_OSX)
+
+namespace physx
+{
+namespace shdfnd
+{
+// sets the default SDK state for scalar and SIMD units
+class PX_FOUNDATION_API FPUGuard
+{
+  public:
+	FPUGuard();  // set fpu control word for PhysX
+	~FPUGuard(); // restore fpu control word
+  private:
+	uint32_t mControlWords[8];
+};
+
+// sets default SDK state for simd unit only, lighter weight than FPUGuard
+class SIMDGuard
+{
+  public:
+	PX_INLINE SIMDGuard();  // set simd control word for PhysX
+	PX_INLINE ~SIMDGuard(); // restore simd control word
+  private:
+#if PX_SUPPORT_GUARDS
+	uint32_t mControlWord;
+#endif
+};
+
+/**
+\brief Enables floating point exceptions for the scalar and SIMD unit
+*/
+PX_FOUNDATION_API void enableFPExceptions();
+
+/**
+\brief Disables floating point exceptions for the scalar and SIMD unit
+*/
+PX_FOUNDATION_API void disableFPExceptions();
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_WINDOWS_FAMILY || PX_XBOXONE
+#include "windows/PsWindowsFPU.h"
+#elif PX_LINUX || PX_PS4 || PX_OSX
+#include "unix/PsUnixFPU.h"
+#else
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{
+}
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{
+}
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSFPU_H
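Both guards are RAII scopes: the constructor saves the current floating-point control state and installs the state PhysX expects, and the destructor restores the caller's state, with the platform-specific work delegated to the per-platform headers included above. A usage sketch (hypothetical caller, not part of the commit):

    #include "PsFPU.h"

    using namespace physx;

    void simdHeavyWork()
    {
        PX_SIMD_GUARD // expands to: shdfnd::SIMDGuard scopedFpGuard;

        // ... SIMD math here runs under PhysX's control-word settings ...

    } // scopedFpGuard's destructor restores the previous control word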
diff --git a/PxShared/src/foundation/include/PsFoundation.h b/PxShared/src/foundation/include/PsFoundation.h
new file mode 100644
index 00000000..27e94321
--- /dev/null
+++ b/PxShared/src/foundation/include/PsFoundation.h
@@ -0,0 +1,216 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PX_FOUNDATION_PSFOUNDATION_H
+#define PX_FOUNDATION_PSFOUNDATION_H
+
+#include "foundation/PxFoundation.h"
+#include "foundation/PxErrors.h"
+#include "foundation/PxProfiler.h"
+
+#include "PsBroadcast.h"
+#include "PsAllocator.h"
+#include "PsTempAllocator.h"
+#include "PsMutex.h"
+#include "PsHashMap.h"
+#include "PsUserAllocated.h"
+
+#include <stdarg.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4251) // class needs to have dll-interface to be used by clients of class
+#endif
+
+class PX_FOUNDATION_API Foundation : public PxFoundation, public UserAllocated
+{
+	PX_NOCOPY(Foundation)
+
+  public:
+	typedef MutexT<Allocator> Mutex;
+
+	typedef HashMap<const NamedAllocator*, const char*, Hash<const NamedAllocator*>, NonTrackingAllocator> AllocNameMap;
+	typedef Array<TempAllocatorChunk*, Allocator> AllocFreeTable;
+
+  public:
+	// factory
+	// note, you MUST eventually call release if createInstance returned a valid pointer!
+	static Foundation* createInstance(PxU32 version, PxErrorCallback& errc, PxAllocatorCallback& alloc);
+	static Foundation& getInstance();
+	void release();
+	static void incRefCount(); // this call requires a foundation object to exist already
+	static void decRefCount(); // this call requires a foundation object to exist already
+
+	// Begin Errors
+	virtual PxErrorCallback& getErrorCallback()
+	{
+		return mErrorCallback;
+	} // Return the user's error callback
+	PxErrorCallback& getInternalErrorCallback()
+	{
+		return mBroadcastingError;
+	} // Return the broadcasting error callback
+
+	void registerErrorCallback(PxErrorCallback& listener);
+	void deregisterErrorCallback(PxErrorCallback& listener);
+
+	virtual void setErrorLevel(PxErrorCode::Enum mask)
+	{
+		mErrorMask = mask;
+	}
+	virtual PxErrorCode::Enum getErrorLevel() const
+	{
+		return mErrorMask;
+	}
+
+	void error(PxErrorCode::Enum, const char* file, int line, const char* messageFmt, ...); // Report errors with the
+	// broadcasting
+	void errorImpl(PxErrorCode::Enum, const char* file, int line, const char* messageFmt, va_list); // error callback
+	static PxU32 getWarnOnceTimestamp();
+
+	// End errors
+
+	// Begin Allocations
+	virtual PxAllocatorCallback& getAllocatorCallback()
+	{
+		return mAllocatorCallback;
+	} // Return the user's allocator callback
+	PxAllocatorCallback& getAllocator()
+	{
+		return mBroadcastingAllocator;
+	} // Return the broadcasting allocator
+
+	void registerAllocationListener(physx::shdfnd::AllocationListener& listener);
+	void deregisterAllocationListener(physx::shdfnd::AllocationListener& listener);
+
+	virtual bool getReportAllocationNames() const
+	{
+		return mReportAllocationNames;
+	}
+	virtual void setReportAllocationNames(bool value)
+	{
+		mReportAllocationNames = value;
+	}
+
+	PX_INLINE AllocNameMap& getNamedAllocMap()
+	{
+		return mNamedAllocMap;
+	}
+	PX_INLINE Mutex& getNamedAllocMutex()
+	{
+		return mNamedAllocMutex;
+	}
+
+	PX_INLINE AllocFreeTable& getTempAllocFreeTable()
+	{
+		return mTempAllocFreeTable;
+	}
+	PX_INLINE Mutex& getTempAllocMutex()
+	{
+		return mTempAllocMutex;
+	}
+	// End allocations
+
+  private:
+	static void destroyInstance();
+
+	Foundation(PxErrorCallback& errc, PxAllocatorCallback& alloc);
+	~Foundation();
+
+	// init order is tricky here: the mutexes require the allocator, the allocator may require the error stream
+	PxAllocatorCallback& mAllocatorCallback;
+	PxErrorCallback& mErrorCallback;
+
+	BroadcastingAllocator mBroadcastingAllocator;
+	BroadcastingErrorCallback mBroadcastingError;
+
+	bool mReportAllocationNames;
+
+	PxErrorCode::Enum mErrorMask;
+	Mutex mErrorMutex;
+
+	AllocNameMap mNamedAllocMap;
+	Mutex mNamedAllocMutex;
+
+	AllocFreeTable mTempAllocFreeTable;
+	Mutex mTempAllocMutex;
+
+	Mutex mListenerMutex;
+
+	static Foundation* mInstance;
+	static PxU32 mRefCount;
+	static PxU32 mWarnOnceTimestap;
+};
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+PX_INLINE Foundation& getFoundation()
+{
+	return Foundation::getInstance();
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+// shortcut macros:
+// usage: Foundation::error(PX_WARN, "static friction %f is lower than dynamic friction %f", sfr, dfr);
+#define PX_WARN ::physx::PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__
+#define PX_INFO ::physx::PxErrorCode::eDEBUG_INFO, __FILE__, __LINE__
+
+#if PX_DEBUG || PX_CHECKED
+#define PX_WARN_ONCE(string)                                        \
+	{                                                               \
+		static PxU32 timestamp = 0;                                 \
+		if(timestamp != Ps::getFoundation().getWarnOnceTimestamp()) \
+		{                                                           \
+			timestamp = Ps::getFoundation().getWarnOnceTimestamp(); \
+			Ps::getFoundation().error(PX_WARN, string);             \
+		}                                                           \
+	}
+#define PX_WARN_ONCE_IF(condition, string)                          \
+	{                                                               \
+		if(condition)                                               \
+		{                                                           \
+			PX_WARN_ONCE(string)                                    \
+		}                                                           \
+	}
+#else
+#define PX_WARN_ONCE(string) ((void)0)
+#define PX_WARN_ONCE_IF(condition, string) ((void)0)
+#endif
+
+#endif
diff --git a/PxShared/src/foundation/include/PsHash.h b/PxShared/src/foundation/include/PsHash.h
new file mode 100644
index 00000000..0c7ab22a
--- /dev/null
+++ b/PxShared/src/foundation/include/PsHash.h
@@ -0,0 +1,162 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSHASH_H
+#define PSFOUNDATION_PSHASH_H
+
+#include "Ps.h"
+#include "PsBasicTemplates.h"
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4302)
+#endif
+
+#if PX_LINUX
+#include "foundation/PxSimpleTypes.h"
+#endif
+
+/*!
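A plausible bring-up and tear-down sequence for this class (hypothetical sketch; userErrors and userAllocator stand in for application-supplied callbacks, and PX_FOUNDATION_VERSION is assumed to be the SDK's version constant):

    #include "PsFoundation.h"

    using namespace physx;

    void foundationLifetimeSketch(PxErrorCallback& userErrors, PxAllocatorCallback& userAllocator)
    {
        shdfnd::Foundation* foundation =
            shdfnd::Foundation::createInstance(PX_FOUNDATION_VERSION, userErrors, userAllocator);
        if(!foundation)
            return;

        // PX_WARN expands to (eDEBUG_WARNING, __FILE__, __LINE__), so the
        // call site only supplies the printf-style message and arguments.
        foundation->error(PX_WARN, "demo warning: value %d out of range", 42);

        foundation->release(); // pairs with the successful createInstance
    }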
+Central definition of hash functions +*/ + +namespace physx +{ +namespace shdfnd +{ +// Hash functions + +// Thomas Wang's 32 bit mix +// http://www.cris.com/~Ttwang/tech/inthash.htm +PX_FORCE_INLINE uint32_t hash(const uint32_t key) +{ + uint32_t k = key; + k += ~(k << 15); + k ^= (k >> 10); + k += (k << 3); + k ^= (k >> 6); + k += ~(k << 11); + k ^= (k >> 16); + return uint32_t(k); +} + +PX_FORCE_INLINE uint32_t hash(const int32_t key) +{ + return hash(uint32_t(key)); +} + +// Thomas Wang's 64 bit mix +// http://www.cris.com/~Ttwang/tech/inthash.htm +PX_FORCE_INLINE uint32_t hash(const uint64_t key) +{ + uint64_t k = key; + k += ~(k << 32); + k ^= (k >> 22); + k += ~(k << 13); + k ^= (k >> 8); + k += (k << 3); + k ^= (k >> 15); + k += ~(k << 27); + k ^= (k >> 31); + return uint32_t(UINT32_MAX & k); +} + +#if PX_APPLE_FAMILY +// hash for size_t, to make gcc happy +PX_INLINE uint32_t hash(const size_t key) +{ +#if PX_P64_FAMILY + return hash(uint64_t(key)); +#else + return hash(uint32_t(key)); +#endif +} +#endif + +// Hash function for pointers +PX_INLINE uint32_t hash(const void* ptr) +{ +#if PX_P64_FAMILY + return hash(uint64_t(ptr)); +#else + return hash(uint32_t(UINT32_MAX & size_t(ptr))); +#endif +} + +// Hash function for pairs +template <typename F, typename S> +PX_INLINE uint32_t hash(const Pair<F, S>& p) +{ + uint32_t seed = 0x876543; + uint32_t m = 1000007; + return hash(p.second) ^ (m * (hash(p.first) ^ (m * seed))); +} + +// hash object for hash map template parameter +template <class Key> +struct Hash +{ + uint32_t operator()(const Key& k) const + { + return hash(k); + } + bool equal(const Key& k0, const Key& k1) const + { + return k0 == k1; + } +}; + +// specialization for strings +template <> +struct Hash<const char*> +{ + public: + uint32_t operator()(const char* _string) const + { + // "DJB" string hash + const uint8_t* string = reinterpret_cast<const uint8_t*>(_string); + uint32_t h = 5381; + for(const uint8_t* ptr = string; *ptr; ptr++) + h = ((h << 5) + h) ^ uint32_t(*ptr); + return h; + } + bool equal(const char* string0, const char* string1) const + { + return !strcmp(string0, string1); + } +}; + +} // namespace shdfnd +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif + +#endif // #ifndef PSFOUNDATION_PSHASH_H diff --git a/PxShared/src/foundation/include/PsHashInternals.h b/PxShared/src/foundation/include/PsHashInternals.h new file mode 100644 index 00000000..5aa7adcf --- /dev/null +++ b/PxShared/src/foundation/include/PsHashInternals.h @@ -0,0 +1,795 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSHASHINTERNALS_H +#define PSFOUNDATION_PSHASHINTERNALS_H + +#include "PsBasicTemplates.h" +#include "PsArray.h" +#include "PsBitUtils.h" +#include "PsHash.h" +#include "foundation/PxIntrinsics.h" + +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4127) // conditional expression is constant +#endif +namespace physx +{ +namespace shdfnd +{ +namespace internal +{ +template <class Entry, class Key, class HashFn, class GetKey, class Allocator, bool compacting> +class HashBase : private Allocator +{ + void init(uint32_t initialTableSize, float loadFactor) + { + mBuffer = NULL; + mEntries = NULL; + mEntriesNext = NULL; + mHash = NULL; + mEntriesCapacity = 0; + mHashSize = 0; + mLoadFactor = loadFactor; + mFreeList = uint32_t(EOL); + mTimestamp = 0; + mEntriesCount = 0; + + if(initialTableSize) + reserveInternal(initialTableSize); + } + + public: + typedef Entry EntryType; + + HashBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : Allocator(PX_DEBUG_EXP("hashBase")) + { + init(initialTableSize, loadFactor); + } + + HashBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) : Allocator(alloc) + { + init(initialTableSize, loadFactor); + } + + HashBase(const Allocator& alloc) : Allocator(alloc) + { + init(64, 0.75f); + } + + ~HashBase() + { + destroy(); // No need to clear() + + if(mBuffer) + Allocator::deallocate(mBuffer); + } + + static const uint32_t EOL = 0xffffffff; + + PX_INLINE Entry* create(const Key& k, bool& exists) + { + uint32_t h = 0; + if(mHashSize) + { + h = hash(k); + uint32_t index = mHash[h]; + while(index != EOL && !HashFn().equal(GetKey()(mEntries[index]), k)) + index = mEntriesNext[index]; + exists = index != EOL; + if(exists) + return mEntries + index; + } + else + exists = false; + + if(freeListEmpty()) + { + grow(); + h = hash(k); + } + + uint32_t entryIndex = freeListGetNext(); + + mEntriesNext[entryIndex] = mHash[h]; + mHash[h] = entryIndex; + + mEntriesCount++; + mTimestamp++; + + return mEntries + entryIndex; + } + + PX_INLINE const Entry* find(const Key& k) const + { + if(!mEntriesCount) + return NULL; + + const uint32_t h = hash(k); + uint32_t index = mHash[h]; + while(index != EOL && !HashFn().equal(GetKey()(mEntries[index]), k)) + index = mEntriesNext[index]; + return index != EOL ? 
mEntries + index : NULL; + } + + PX_INLINE bool erase(const Key& k, Entry& e) + { + if(!mEntriesCount) + return false; + + const uint32_t h = hash(k); + uint32_t* ptr = mHash + h; + while(*ptr != EOL && !HashFn().equal(GetKey()(mEntries[*ptr]), k)) + ptr = mEntriesNext + *ptr; + + if(*ptr == EOL) + return false; + + PX_PLACEMENT_NEW(&e, Entry)(mEntries[*ptr]); + + return eraseInternal(ptr); + } + + PX_INLINE bool erase(const Key& k) + { + if(!mEntriesCount) + return false; + + const uint32_t h = hash(k); + uint32_t* ptr = mHash + h; + while(*ptr != EOL && !HashFn().equal(GetKey()(mEntries[*ptr]), k)) + ptr = mEntriesNext + *ptr; + + if(*ptr == EOL) + return false; + + return eraseInternal(ptr); + } + + PX_INLINE uint32_t size() const + { + return mEntriesCount; + } + + PX_INLINE uint32_t capacity() const + { + return mHashSize; + } + + void clear() + { + if(!mHashSize || mEntriesCount == 0) + return; + + destroy(); + + intrinsics::memSet(mHash, EOL, mHashSize * sizeof(uint32_t)); + + const uint32_t sizeMinus1 = mEntriesCapacity - 1; + for(uint32_t i = 0; i < sizeMinus1; i++) + { + prefetchLine(mEntriesNext + i, 128); + mEntriesNext[i] = i + 1; + } + mEntriesNext[mEntriesCapacity - 1] = uint32_t(EOL); + mFreeList = 0; + mEntriesCount = 0; + } + + void reserve(uint32_t size) + { + if(size > mHashSize) + reserveInternal(size); + } + + PX_INLINE const Entry* getEntries() const + { + return mEntries; + } + + PX_INLINE Entry* insertUnique(const Key& k) + { + PX_ASSERT(find(k) == NULL); + uint32_t h = hash(k); + + uint32_t entryIndex = freeListGetNext(); + + mEntriesNext[entryIndex] = mHash[h]; + mHash[h] = entryIndex; + + mEntriesCount++; + mTimestamp++; + + return mEntries + entryIndex; + } + + private: + void destroy() + { + for(uint32_t i = 0; i < mHashSize; i++) + { + for(uint32_t j = mHash[i]; j != EOL; j = mEntriesNext[j]) + mEntries[j].~Entry(); + } + } + + template <typename HK, typename GK, class A, bool comp> + PX_NOINLINE void copy(const HashBase<Entry, Key, HK, GK, A, comp>& other); + + // free list management - if we're coalescing, then we use mFreeList to hold + // the top of the free list and it should always be equal to size(). Otherwise, + // we build a free list in the next() pointers. 
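+	// (illustrative note, not part of the original comment: with compacting == true
+	// the entries stay dense in [0, mEntriesCount), so freeListGetNext() is simply
+	// mFreeList++ and erase() swaps the last entry into the hole; with
+	// compacting == false, mEntriesNext doubles as a singly linked free list
+	// threaded through the unused slots.)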
+ + PX_INLINE void freeListAdd(uint32_t index) + { + if(compacting) + { + mFreeList--; + PX_ASSERT(mFreeList == mEntriesCount); + } + else + { + mEntriesNext[index] = mFreeList; + mFreeList = index; + } + } + + PX_INLINE void freeListAdd(uint32_t start, uint32_t end) + { + if(!compacting) + { + for(uint32_t i = start; i < end - 1; i++) // add the new entries to the free list + mEntriesNext[i] = i + 1; + + // link in old free list + mEntriesNext[end - 1] = mFreeList; + PX_ASSERT(mFreeList != end - 1); + mFreeList = start; + } + else if(mFreeList == EOL) // don't reset the free ptr for the compacting hash unless it's empty + mFreeList = start; + } + + PX_INLINE uint32_t freeListGetNext() + { + PX_ASSERT(!freeListEmpty()); + if(compacting) + { + PX_ASSERT(mFreeList == mEntriesCount); + return mFreeList++; + } + else + { + uint32_t entryIndex = mFreeList; + mFreeList = mEntriesNext[mFreeList]; + return entryIndex; + } + } + + PX_INLINE bool freeListEmpty() const + { + if(compacting) + return mEntriesCount == mEntriesCapacity; + else + return mFreeList == EOL; + } + + PX_INLINE void replaceWithLast(uint32_t index) + { + PX_PLACEMENT_NEW(mEntries + index, Entry)(mEntries[mEntriesCount]); + mEntries[mEntriesCount].~Entry(); + mEntriesNext[index] = mEntriesNext[mEntriesCount]; + + uint32_t h = hash(GetKey()(mEntries[index])); + uint32_t* ptr; + for(ptr = mHash + h; *ptr != mEntriesCount; ptr = mEntriesNext + *ptr) + PX_ASSERT(*ptr != EOL); + *ptr = index; + } + + PX_INLINE uint32_t hash(const Key& k, uint32_t hashSize) const + { + return HashFn()(k) & (hashSize - 1); + } + + PX_INLINE uint32_t hash(const Key& k) const + { + return hash(k, mHashSize); + } + + PX_INLINE bool eraseInternal(uint32_t* ptr) + { + const uint32_t index = *ptr; + + *ptr = mEntriesNext[index]; + + mEntries[index].~Entry(); + + mEntriesCount--; + mTimestamp++; + + if (compacting && index != mEntriesCount) + replaceWithLast(index); + + freeListAdd(index); + return true; + } + + void reserveInternal(uint32_t size) + { + if(!isPowerOfTwo(size)) + size = nextPowerOfTwo(size); + + PX_ASSERT(!(size & (size - 1))); + + // decide whether iteration can be done on the entries directly + bool resizeCompact = compacting || freeListEmpty(); + + // define new table sizes + uint32_t oldEntriesCapacity = mEntriesCapacity; + uint32_t newEntriesCapacity = uint32_t(float(size) * mLoadFactor); + uint32_t newHashSize = size; + + // allocate new common buffer and setup pointers to new tables + uint8_t* newBuffer; + uint32_t* newHash; + uint32_t* newEntriesNext; + Entry* newEntries; + { + uint32_t newHashByteOffset = 0; + uint32_t newEntriesNextBytesOffset = newHashByteOffset + newHashSize * sizeof(uint32_t); + uint32_t newEntriesByteOffset = newEntriesNextBytesOffset + newEntriesCapacity * sizeof(uint32_t); + newEntriesByteOffset += (16 - (newEntriesByteOffset & 15)) & 15; + uint32_t newBufferByteSize = newEntriesByteOffset + newEntriesCapacity * sizeof(Entry); + + newBuffer = reinterpret_cast<uint8_t*>(Allocator::allocate(newBufferByteSize, __FILE__, __LINE__)); + PX_ASSERT(newBuffer); + + newHash = reinterpret_cast<uint32_t*>(newBuffer + newHashByteOffset); + newEntriesNext = reinterpret_cast<uint32_t*>(newBuffer + newEntriesNextBytesOffset); + newEntries = reinterpret_cast<Entry*>(newBuffer + newEntriesByteOffset); + } + + // initialize new hash table + intrinsics::memSet(newHash, uint32_t(EOL), newHashSize * sizeof(uint32_t)); + + // iterate over old entries, re-hash and create new entries + if(resizeCompact) + { + // check that old free list 
is empty - we don't need to copy the next entries + PX_ASSERT(compacting || mFreeList == EOL); + + for(uint32_t index = 0; index < mEntriesCount; ++index) + { + uint32_t h = hash(GetKey()(mEntries[index]), newHashSize); + newEntriesNext[index] = newHash[h]; + newHash[h] = index; + + PX_PLACEMENT_NEW(newEntries + index, Entry)(mEntries[index]); + mEntries[index].~Entry(); + } + } + else + { + // copy old free list, only required for non compact resizing + intrinsics::memCopy(newEntriesNext, mEntriesNext, mEntriesCapacity * sizeof(uint32_t)); + + for(uint32_t bucket = 0; bucket < mHashSize; bucket++) + { + uint32_t index = mHash[bucket]; + while(index != EOL) + { + uint32_t h = hash(GetKey()(mEntries[index]), newHashSize); + newEntriesNext[index] = newHash[h]; + PX_ASSERT(index != newHash[h]); + + newHash[h] = index; + + PX_PLACEMENT_NEW(newEntries + index, Entry)(mEntries[index]); + mEntries[index].~Entry(); + + index = mEntriesNext[index]; + } + } + } + + // swap buffer and pointers + Allocator::deallocate(mBuffer); + mBuffer = newBuffer; + mHash = newHash; + mHashSize = newHashSize; + mEntriesNext = newEntriesNext; + mEntries = newEntries; + mEntriesCapacity = newEntriesCapacity; + + freeListAdd(oldEntriesCapacity, newEntriesCapacity); + } + + void grow() + { + PX_ASSERT((mFreeList == EOL) || (compacting && (mEntriesCount == mEntriesCapacity))); + + uint32_t size = mHashSize == 0 ? 16 : mHashSize * 2; + reserve(size); + } + + uint8_t* mBuffer; + Entry* mEntries; + uint32_t* mEntriesNext; // same size as mEntries + uint32_t* mHash; + uint32_t mEntriesCapacity; + uint32_t mHashSize; + float mLoadFactor; + uint32_t mFreeList; + uint32_t mTimestamp; + uint32_t mEntriesCount; // number of entries + + public: + class Iter + { + public: + PX_INLINE Iter(HashBase& b) : mBucket(0), mEntry(uint32_t(b.EOL)), mTimestamp(b.mTimestamp), mBase(b) + { + if(mBase.mEntriesCapacity > 0) + { + mEntry = mBase.mHash[0]; + skip(); + } + } + + PX_INLINE void check() const + { + PX_ASSERT(mTimestamp == mBase.mTimestamp); + } + PX_INLINE const Entry& operator*() const + { + check(); + return mBase.mEntries[mEntry]; + } + PX_INLINE Entry& operator*() + { + check(); + return mBase.mEntries[mEntry]; + } + PX_INLINE const Entry* operator->() const + { + check(); + return mBase.mEntries + mEntry; + } + PX_INLINE Entry* operator->() + { + check(); + return mBase.mEntries + mEntry; + } + PX_INLINE Iter operator++() + { + check(); + advance(); + return *this; + } + PX_INLINE Iter operator++(int) + { + check(); + Iter i = *this; + advance(); + return i; + } + PX_INLINE bool done() const + { + check(); + return mEntry == mBase.EOL; + } + + private: + PX_INLINE void advance() + { + mEntry = mBase.mEntriesNext[mEntry]; + skip(); + } + PX_INLINE void skip() + { + while(mEntry == mBase.EOL) + { + if(++mBucket == mBase.mHashSize) + break; + mEntry = mBase.mHash[mBucket]; + } + } + + Iter& operator=(const Iter&); + + uint32_t mBucket; + uint32_t mEntry; + uint32_t mTimestamp; + HashBase& mBase; + }; + + /*! 
+ Iterate over entries in a hash base and allow entry erase while iterating + */ + class EraseIterator + { + public: + PX_INLINE EraseIterator(HashBase& b): mBase(b) + { + reset(); + } + + PX_INLINE Entry* eraseCurrentGetNext(bool eraseCurrent) + { + if(eraseCurrent && mCurrentEntryIndexPtr) + { + mBase.eraseInternal(mCurrentEntryIndexPtr); + // if next was valid return the same ptr, if next was EOL search new hash entry + if(*mCurrentEntryIndexPtr != mBase.EOL) + return mBase.mEntries + *mCurrentEntryIndexPtr; + else + return traverseHashEntries(); + } + + // traverse mHash to find next entry + if(mCurrentEntryIndexPtr == NULL) + return traverseHashEntries(); + + const uint32_t index = *mCurrentEntryIndexPtr; + if(mBase.mEntriesNext[index] == mBase.EOL) + { + return traverseHashEntries(); + } + else + { + mCurrentEntryIndexPtr = mBase.mEntriesNext + index; + return mBase.mEntries + *mCurrentEntryIndexPtr; + } + } + + PX_INLINE void reset() + { + mCurrentHashIndex = 0; + mCurrentEntryIndexPtr = NULL; + } + + private: + PX_INLINE Entry* traverseHashEntries() + { + mCurrentEntryIndexPtr = NULL; + while (mCurrentEntryIndexPtr == NULL && mCurrentHashIndex < mBase.mHashSize) + { + if (mBase.mHash[mCurrentHashIndex] != mBase.EOL) + { + mCurrentEntryIndexPtr = mBase.mHash + mCurrentHashIndex; + mCurrentHashIndex++; + return mBase.mEntries + *mCurrentEntryIndexPtr; + } + else + { + mCurrentHashIndex++; + } + } + return NULL; + } + + EraseIterator& operator=(const EraseIterator&); + private: + uint32_t* mCurrentEntryIndexPtr; + uint32_t mCurrentHashIndex; + HashBase& mBase; + }; +}; + +template <class Entry, class Key, class HashFn, class GetKey, class Allocator, bool compacting> +template <typename HK, typename GK, class A, bool comp> +PX_NOINLINE void +HashBase<Entry, Key, HashFn, GetKey, Allocator, compacting>::copy(const HashBase<Entry, Key, HK, GK, A, comp>& other) +{ + reserve(other.mEntriesCount); + + for(uint32_t i = 0; i < other.mEntriesCount; i++) + { + for(uint32_t j = other.mHash[i]; j != EOL; j = other.mEntriesNext[j]) + { + const Entry& otherEntry = other.mEntries[j]; + + bool exists; + Entry* newEntry = create(GK()(otherEntry), exists); + PX_ASSERT(!exists); + + PX_PLACEMENT_NEW(newEntry, Entry)(otherEntry); + } + } +} + +template <class Key, class HashFn, class Allocator = typename AllocatorTraits<Key>::Type, bool Coalesced = false> +class HashSetBase +{ + PX_NOCOPY(HashSetBase) + public: + struct GetKey + { + PX_INLINE const Key& operator()(const Key& e) + { + return e; + } + }; + + typedef HashBase<Key, Key, HashFn, GetKey, Allocator, Coalesced> BaseMap; + typedef typename BaseMap::Iter Iterator; + + HashSetBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : mBase(initialTableSize, loadFactor, alloc) + { + } + + HashSetBase(const Allocator& alloc) : mBase(64, 0.75f, alloc) + { + } + + HashSetBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : mBase(initialTableSize, loadFactor) + { + } + + bool insert(const Key& k) + { + bool exists; + Key* e = mBase.create(k, exists); + if(!exists) + PX_PLACEMENT_NEW(e, Key)(k); + return !exists; + } + + PX_INLINE bool contains(const Key& k) const + { + return mBase.find(k) != 0; + } + PX_INLINE bool erase(const Key& k) + { + return mBase.erase(k); + } + PX_INLINE uint32_t size() const + { + return mBase.size(); + } + PX_INLINE uint32_t capacity() const + { + return mBase.capacity(); + } + PX_INLINE void reserve(uint32_t size) + { + mBase.reserve(size); + } + PX_INLINE void clear() + { + mBase.clear(); + } + 
+ protected: + BaseMap mBase; +}; + +template <class Key, class Value, class HashFn, class Allocator = typename AllocatorTraits<Pair<const Key, Value> >::Type> +class HashMapBase +{ + PX_NOCOPY(HashMapBase) + public: + typedef Pair<const Key, Value> Entry; + + struct GetKey + { + PX_INLINE const Key& operator()(const Entry& e) + { + return e.first; + } + }; + + typedef HashBase<Entry, Key, HashFn, GetKey, Allocator, true> BaseMap; + typedef typename BaseMap::Iter Iterator; + typedef typename BaseMap::EraseIterator EraseIterator; + + HashMapBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : mBase(initialTableSize, loadFactor, alloc) + { + } + + HashMapBase(const Allocator& alloc) : mBase(64, 0.75f, alloc) + { + } + + HashMapBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : mBase(initialTableSize, loadFactor) + { + } + + bool insert(const Key /*&*/ k, const Value /*&*/ v) + { + bool exists; + Entry* e = mBase.create(k, exists); + if(!exists) + PX_PLACEMENT_NEW(e, Entry)(k, v); + return !exists; + } + + Value& operator[](const Key& k) + { + bool exists; + Entry* e = mBase.create(k, exists); + if(!exists) + PX_PLACEMENT_NEW(e, Entry)(k, Value()); + + return e->second; + } + + PX_INLINE const Entry* find(const Key& k) const + { + return mBase.find(k); + } + PX_INLINE bool erase(const Key& k) + { + return mBase.erase(k); + } + PX_INLINE bool erase(const Key& k, Entry& e) + { + return mBase.erase(k, e); + } + PX_INLINE uint32_t size() const + { + return mBase.size(); + } + PX_INLINE uint32_t capacity() const + { + return mBase.capacity(); + } + PX_INLINE Iterator getIterator() + { + return Iterator(mBase); + } + PX_INLINE EraseIterator getEraseIterator() + { + return EraseIterator(mBase); + } + PX_INLINE void reserve(uint32_t size) + { + mBase.reserve(size); + } + PX_INLINE void clear() + { + mBase.clear(); + } + + protected: + BaseMap mBase; +}; +} + +} // namespace shdfnd +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif +#endif // #ifndef PSFOUNDATION_PSHASHINTERNALS_H diff --git a/PxShared/src/foundation/include/PsHashMap.h b/PxShared/src/foundation/include/PsHashMap.h new file mode 100644 index 00000000..8af35174 --- /dev/null +++ b/PxShared/src/foundation/include/PsHashMap.h @@ -0,0 +1,118 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. 
Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSHASHMAP_H +#define PSFOUNDATION_PSHASHMAP_H + +#include "PsHashInternals.h" + +// TODO: make this doxy-format +// +// This header defines two hash maps. Hash maps +// * support custom initial table sizes (rounded up internally to power-of-2) +// * support custom static allocator objects +// * auto-resize, based on a load factor (i.e. a 64-entry .75 load factor hash will resize +// when the 49th element is inserted) +// * are based on open hashing +// * have O(1) contains, erase +// +// Maps have STL-like copying semantics, and properly initialize and destruct copies of objects +// +// There are two forms of map: coalesced and uncoalesced. Coalesced maps keep the entries in the +// initial segment of an array, so are fast to iterate over; however deletion is approximately +// twice as expensive. +// +// HashMap<T>: +// bool insert(const Key& k, const Value& v) O(1) amortized (exponential resize policy) +// Value & operator[](const Key& k) O(1) for existing objects, else O(1) amortized +// const Entry * find(const Key& k); O(1) +// bool erase(const T& k); O(1) +// uint32_t size(); constant +// void reserve(uint32_t size); O(MAX(currentOccupancy,size)) +// void clear(); O(currentOccupancy) (with zero constant for objects +// without +// destructors) +// Iterator getIterator(); +// +// operator[] creates an entry if one does not exist, initializing with the default constructor. 
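+//
+// A compact usage sketch (hypothetical, using the default NonTrackingAllocator):
+//
+//   shdfnd::HashMap<uint32_t, float> map;
+//   map.insert(7, 1.5f);    // true: key 7 was absent
+//   map[7] = 2.5f;          // operator[] reuses the existing entry
+//   if(map.find(7))         // find returns a const Pair<const Key, Value>*
+//       map.erase(7);       // true: the entry existed
+//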
+// CoalescedHashMap<T> does not support getIterator, but instead supports +// const Key *getEntries(); +// +// Use of iterators: +// +// for(HashMap::Iterator iter = test.getIterator(); !iter.done(); ++iter) +// myFunction(iter->first, iter->second); + +namespace physx +{ +namespace shdfnd +{ +template <class Key, class Value, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class HashMap : public internal::HashMapBase<Key, Value, HashFn, Allocator> +{ + public: + typedef internal::HashMapBase<Key, Value, HashFn, Allocator> HashMapBase; + typedef typename HashMapBase::Iterator Iterator; + + HashMap(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : HashMapBase(initialTableSize, loadFactor) + { + } + HashMap(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : HashMapBase(initialTableSize, loadFactor, alloc) + { + } + HashMap(const Allocator& alloc) : HashMapBase(64, 0.75f, alloc) + { + } + Iterator getIterator() + { + return Iterator(HashMapBase::mBase); + } +}; + +template <class Key, class Value, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class CoalescedHashMap : public internal::HashMapBase<Key, Value, HashFn, Allocator> +{ + public: + typedef internal::HashMapBase<Key, Value, HashFn, Allocator> HashMapBase; + + CoalescedHashMap(uint32_t initialTableSize = 64, float loadFactor = 0.75f) + : HashMapBase(initialTableSize, loadFactor) + { + } + const Pair<const Key, Value>* getEntries() const + { + return HashMapBase::mBase.getEntries(); + } +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSHASHMAP_H diff --git a/PxShared/src/foundation/include/PsHashSet.h b/PxShared/src/foundation/include/PsHashSet.h new file mode 100644 index 00000000..73fb7502 --- /dev/null +++ b/PxShared/src/foundation/include/PsHashSet.h @@ -0,0 +1,127 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. 
All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSHASHSET_H +#define PSFOUNDATION_PSHASHSET_H + +#include "PsHashInternals.h" + +// TODO: make this doxy-format + +// This header defines two hash sets. Hash sets +// * support custom initial table sizes (rounded up internally to power-of-2) +// * support custom static allocator objects +// * auto-resize, based on a load factor (i.e. a 64-entry .75 load factor hash will resize +// when the 49th element is inserted) +// * are based on open hashing +// +// Sets have STL-like copying semantics, and properly initialize and destruct copies of objects +// +// There are two forms of set: coalesced and uncoalesced. Coalesced sets keep the entries in the +// initial segment of an array, so are fast to iterate over; however deletion is approximately +// twice as expensive. +// +// HashSet<T>: +// bool insert(const T& k) amortized O(1) (exponential resize policy) +// bool contains(const T& k) const; O(1) +// bool erase(const T& k); O(1) +// uint32_t size() const; constant +// void reserve(uint32_t size); O(MAX(size, currentOccupancy)) +// void clear(); O(currentOccupancy) (with zero constant for objects without +// destructors) +// Iterator getIterator(); +// +// Use of iterators: +// +// for(HashSet::Iterator iter = test.getIterator(); !iter.done(); ++iter) +// myFunction(*iter); +// +// CoalescedHashSet<T> does not support getIterator, but instead supports +// const Key *getEntries(); +// +// insertion into a set already containing the element fails returning false, as does +// erasure of an element not in the set +// + +namespace physx +{ +namespace shdfnd +{ +template <class Key, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class HashSet : public internal::HashSetBase<Key, HashFn, Allocator, false> +{ + public: + typedef internal::HashSetBase<Key, HashFn, Allocator, false> HashSetBase; + typedef typename HashSetBase::Iterator Iterator; + + HashSet(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : HashSetBase(initialTableSize, loadFactor) + { + } + HashSet(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : HashSetBase(initialTableSize, loadFactor, alloc) + { + } + HashSet(const Allocator& alloc) : HashSetBase(64, 0.75f, alloc) + { + } + Iterator getIterator() + { + return Iterator(HashSetBase::mBase); + } +}; + +template <class Key, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator> +class CoalescedHashSet : public internal::HashSetBase<Key, HashFn, Allocator, true> +{ + public: + typedef typename internal::HashSetBase<Key, HashFn, Allocator, true> HashSetBase; + + CoalescedHashSet(uint32_t initialTableSize = 64, float loadFactor = 0.75f) + : HashSetBase(initialTableSize, loadFactor) + { + } + + CoalescedHashSet(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) + : HashSetBase(initialTableSize, loadFactor, alloc) + { + } + CoalescedHashSet(const Allocator& alloc) : HashSetBase(64, 0.75f, alloc) + { + } + + const Key* getEntries() const + { + return HashSetBase::mBase.getEntries(); + } +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSHASHSET_H diff --git a/PxShared/src/foundation/include/PsInlineAllocator.h b/PxShared/src/foundation/include/PsInlineAllocator.h new file mode 100644 index 00000000..9e129f37 --- /dev/null +++ b/PxShared/src/foundation/include/PsInlineAllocator.h @@ -0,0 +1,91 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you 
+// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINLINEALLOCATOR_H +#define PSFOUNDATION_PSINLINEALLOCATOR_H + +#include "PsUserAllocated.h" + +namespace physx +{ +namespace shdfnd +{ +// this is used by the array class to allocate some space for a small number +// of objects along with the metadata +template <uint32_t N, typename BaseAllocator> +class InlineAllocator : private BaseAllocator +{ + public: + InlineAllocator(const PxEMPTY v) : BaseAllocator(v) + { + } + + InlineAllocator(const BaseAllocator& alloc = BaseAllocator()) : BaseAllocator(alloc), mBufferUsed(false) + { + } + + InlineAllocator(const InlineAllocator& aloc) : BaseAllocator(aloc), mBufferUsed(false) + { + } + + void* allocate(uint32_t size, const char* filename, int line) + { + if(!mBufferUsed && size <= N) + { + mBufferUsed = true; + return mBuffer; + } + return BaseAllocator::allocate(size, filename, line); + } + + void deallocate(void* ptr) + { + if(ptr == mBuffer) + mBufferUsed = false; + else + BaseAllocator::deallocate(ptr); + } + + PX_FORCE_INLINE uint8_t* getInlineBuffer() + { + return mBuffer; + } + PX_FORCE_INLINE bool isBufferUsed() const + { + return mBufferUsed; + } + + protected: + uint8_t mBuffer[N]; + bool mBufferUsed; +}; +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSINLINEALLOCATOR_H diff --git a/PxShared/src/foundation/include/PsInlineAoS.h b/PxShared/src/foundation/include/PsInlineAoS.h new file mode 100644 index 00000000..cd051180 --- /dev/null +++ b/PxShared/src/foundation/include/PsInlineAoS.h @@ -0,0 +1,48 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINLINEAOS_H +#define PSFOUNDATION_PSINLINEAOS_H + +#include "foundation/PxPreprocessor.h" + +#if PX_WINDOWS +#include "windows/PsWindowsTrigConstants.h" +#include "windows/PsWindowsInlineAoS.h" +#elif(PX_UNIX_FAMILY || PX_PS4) +#include "unix/PsUnixTrigConstants.h" +#include "unix/PsUnixInlineAoS.h" +#elif PX_XBOXONE +#include "XboxOne/PsXboxOneTrigConstants.h" +#include "XboxOne/PsXboxOneInlineAoS.h" +#else +#error "Platform not supported!" +#endif + +#endif diff --git a/PxShared/src/foundation/include/PsInlineArray.h b/PxShared/src/foundation/include/PsInlineArray.h new file mode 100644 index 00000000..10434f91 --- /dev/null +++ b/PxShared/src/foundation/include/PsInlineArray.h @@ -0,0 +1,68 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINLINEARRAY_H +#define PSFOUNDATION_PSINLINEARRAY_H + +#include "PsArray.h" +#include "PsInlineAllocator.h" + +namespace physx +{ +namespace shdfnd +{ + +// array that pre-allocates for N elements +template <typename T, uint32_t N, typename Alloc = typename AllocatorTraits<T>::Type> +class InlineArray : public Array<T, InlineAllocator<N * sizeof(T), Alloc> > +{ + typedef InlineAllocator<N * sizeof(T), Alloc> Allocator; + + public: + InlineArray(const PxEMPTY v) : Array<T, Allocator>(v) + { + if(isInlined()) + this->mData = reinterpret_cast<T*>(Array<T, Allocator>::getInlineBuffer()); + } + + PX_INLINE bool isInlined() const + { + return Allocator::isBufferUsed(); + } + + PX_INLINE explicit InlineArray(const Alloc& alloc = Alloc()) : Array<T, Allocator>(alloc) + { + this->mData = this->allocate(N); + this->mCapacity = N; + } +}; +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSINLINEARRAY_H diff --git a/PxShared/src/foundation/include/PsIntrinsics.h b/PxShared/src/foundation/include/PsIntrinsics.h new file mode 100644 index 00000000..2657d2a6 --- /dev/null +++ b/PxShared/src/foundation/include/PsIntrinsics.h @@ -0,0 +1,45 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSINTRINSICS_H +#define PSFOUNDATION_PSINTRINSICS_H + +#include "foundation/PxPreprocessor.h" + +#if PX_WINDOWS_FAMILY +#include "windows/PsWindowsIntrinsics.h" +#elif(PX_LINUX || PX_ANDROID || PX_APPLE_FAMILY || PX_PS4) +#include "unix/PsUnixIntrinsics.h" +#elif PX_XBOXONE +#include "XboxOne/PsXboxOneIntrinsics.h" +#else +#error "Platform not supported!" +#endif + +#endif // #ifndef PSFOUNDATION_PSINTRINSICS_H diff --git a/PxShared/src/foundation/include/PsMathUtils.h b/PxShared/src/foundation/include/PsMathUtils.h new file mode 100644 index 00000000..e3c6e8c9 --- /dev/null +++ b/PxShared/src/foundation/include/PsMathUtils.h @@ -0,0 +1,692 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSMATHUTILS_H +#define PSFOUNDATION_PSMATHUTILS_H + +#include "foundation/PxPreprocessor.h" +#include "foundation/PxTransform.h" +#include "foundation/PxMat33.h" +#include "Ps.h" +#include "PsIntrinsics.h" + +// General guideline is: if it's an abstract math function, it belongs here. +// If it's a math function where the inputs have specific semantics (e.g. +// separateSwingTwist) it doesn't. + +namespace physx +{ +namespace shdfnd +{ +/** +\brief sign returns the sign of its argument. The sign of zero is undefined. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 sign(const PxF32 a) +{ + return intrinsics::sign(a); +} + +/** +\brief sign returns the sign of its argument. The sign of zero is undefined. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 sign(const PxF64 a) +{ + return (a >= 0.0) ? 1.0 : -1.0; +} + +/** +\brief sign returns the sign of its argument. The sign of zero is undefined. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxI32 sign(const PxI32 a) +{ + return (a >= 0) ? 
1 : -1; +} + +/** +\brief Returns true if the two numbers are within eps of each other. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE bool equals(const PxF32 a, const PxF32 b, const PxF32 eps) +{ + return (PxAbs(a - b) < eps); +} + +/** +\brief Returns true if the two numbers are within eps of each other. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE bool equals(const PxF64 a, const PxF64 b, const PxF64 eps) +{ + return (PxAbs(a - b) < eps); +} + +/** +\brief The floor function returns a floating-point value representing the largest integer that is less than or equal to +x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 floor(const PxF32 a) +{ + return floatFloor(a); +} + +/** +\brief The floor function returns a floating-point value representing the largest integer that is less than or equal to +x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 floor(const PxF64 a) +{ + return ::floor(a); +} + +/** +\brief The ceil function returns a single-precision value representing the smallest integer that is greater than or equal to x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 ceil(const PxF32 a) +{ + return ::ceilf(a); +} + +/** +\brief The ceil function returns a double value representing the smallest integer that is greater than or equal to x. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 ceil(const PxF64 a) +{ + return ::ceil(a); +} + +/** +\brief mod returns the floating-point remainder of x / y. + +If the value of y is 0.0, mod returns a quiet NaN. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 mod(const PxF32 x, const PxF32 y) +{ + return PxF32(::fmodf(x, y)); +} + +/** +\brief mod returns the floating-point remainder of x / y. + +If the value of y is 0.0, mod returns a quiet NaN. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 mod(const PxF64 x, const PxF64 y) +{ + return ::fmod(x, y); +} + +/** +\brief Square. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 sqr(const PxF32 a) +{ + return a * a; +} + +/** +\brief Square. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 sqr(const PxF64 a) +{ + return a * a; +} + +/** +\brief Calculates x raised to the power of y. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 pow(const PxF32 x, const PxF32 y) +{ + return ::powf(x, y); +} + +/** +\brief Calculates x raised to the power of y. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 pow(const PxF64 x, const PxF64 y) +{ + return ::pow(x, y); +} + +/** +\brief Calculates e^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 exp(const PxF32 a) +{ + return ::expf(a); +} +/** +\brief Calculates e^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 exp(const PxF64 a) +{ + return ::exp(a); +} + +/** +\brief Calculates 2^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 exp2(const PxF32 a) +{ + return ::expf(a * 0.693147180559945309417f); +} +/** +\brief Calculates 2^n +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 exp2(const PxF64 a) +{ + return ::exp(a * 0.693147180559945309417); +} + +/** +\brief Calculates the natural (base-e) logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 logE(const PxF32 a) +{ + return ::logf(a); +} + +/** +\brief Calculates the natural (base-e) logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 logE(const PxF64 a) +{ + return ::log(a); +} + +/** +\brief Calculates the base-2 logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 log2(const PxF32 a) +{ + return ::logf(a) / 0.693147180559945309417f; +} + +/** +\brief Calculates the base-2 logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 log2(const PxF64 a) +{ + return ::log(a) / 0.693147180559945309417; +} + +/** +\brief Calculates the base-10 logarithm.
+*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 log10(const PxF32 a) +{ + return ::log10f(a); +} + +/** +\brief Calculates the base-10 logarithm. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 log10(const PxF64 a) +{ + return ::log10(a); +} + +/** +\brief Converts degrees to radians. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 degToRad(const PxF32 a) +{ + return 0.01745329251994329547f * a; +} + +/** +\brief Converts degrees to radians. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 degToRad(const PxF64 a) +{ + return 0.01745329251994329547 * a; +} + +/** +\brief Converts radians to degrees. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 radToDeg(const PxF32 a) +{ + return 57.29577951308232286465f * a; +} + +/** +\brief Converts radians to degrees. +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 radToDeg(const PxF64 a) +{ + return 57.29577951308232286465 * a; +} + +//! \brief compute sine and cosine at the same time. There is a 'fsincos' on PC that we probably want to use here +PX_CUDA_CALLABLE PX_FORCE_INLINE void sincos(const PxF32 radians, PxF32& sin, PxF32& cos) +{ + /* something like: + _asm fld Local + _asm fsincos + _asm fstp LocalCos + _asm fstp LocalSin + */ + sin = PxSin(radians); + cos = PxCos(radians); +} + +/** +\brief uniform random number in [a,b] +*/ +PX_FORCE_INLINE PxI32 rand(const PxI32 a, const PxI32 b) +{ + return a + PxI32(::rand() % (b - a + 1)); +} + +/** +\brief uniform random number in [a,b] +*/ +PX_FORCE_INLINE PxF32 rand(const PxF32 a, const PxF32 b) +{ + return a + (b - a) * ::rand() / RAND_MAX; +} + +//! \brief return angle between two vectors in radians +PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 angle(const PxVec3& v0, const PxVec3& v1) +{ + const PxF32 cos = v0.dot(v1); // |v0|*|v1|*Cos(Angle) + const PxF32 sin = (v0.cross(v1)).magnitude(); // |v0|*|v1|*Sin(Angle) + return PxAtan2(sin, cos); +} + +//! If possible use fsel on the dot product instead /*fsel(d.dot(p),onething,anotherthing);*/ +//! Compares orientations (more readable, user-friendly function) +PX_CUDA_CALLABLE PX_FORCE_INLINE bool sameDirection(const PxVec3& d, const PxVec3& p) +{ + return d.dot(p) >= 0.0f; +} + +//! Checks whether 2 values have different signs +PX_CUDA_CALLABLE PX_FORCE_INLINE IntBool differentSign(PxReal f0, PxReal f1) +{ + union + { + PxU32 u; + PxReal f; + } u1, u2; + u1.f = f0; + u2.f = f1; + return IntBool((u1.u ^ u2.u) & PX_SIGN_BITMASK); +} + +PX_CUDA_CALLABLE PX_FORCE_INLINE PxMat33 star(const PxVec3& v) +{ + return PxMat33(PxVec3(0, v.z, -v.y), PxVec3(-v.z, 0, v.x), PxVec3(v.y, -v.x, 0)); +} + +PX_CUDA_CALLABLE PX_INLINE PxVec3 log(const PxQuat& q) +{ + const PxReal s = q.getImaginaryPart().magnitude(); + if(s < 1e-12f) + return PxVec3(0.0f); + // force the half-angle to have magnitude <= pi/2 + PxReal halfAngle = q.w < 0 ? PxAtan2(-s, -q.w) : PxAtan2(s, q.w); + PX_ASSERT(halfAngle >= -PxPi / 2 && halfAngle <= PxPi / 2); + + return q.getImaginaryPart().getNormalized() * 2.f * halfAngle; +} + +PX_CUDA_CALLABLE PX_INLINE PxQuat exp(const PxVec3& v) +{ + const PxReal m = v.magnitudeSquared(); + return m < 1e-24f ? PxQuat(PxIdentity) : PxQuat(PxSqrt(m), v * PxRecipSqrt(m)); +} + +// quat to rotate v0 to v1 +PX_CUDA_CALLABLE PX_INLINE PxQuat rotationArc(const PxVec3& v0, const PxVec3& v1) +{ + const PxVec3 cross = v0.cross(v1); + const PxReal d = v0.dot(v1); + if(d <= -0.99999f) + return (PxAbs(v0.x) < 0.1f ?
PxQuat(0.0f, v0.z, -v0.y, 0.0f) : PxQuat(v0.y, -v0.x, 0.0, 0.0)).getNormalized(); + + const PxReal s = PxSqrt((1 + d) * 2), r = 1 / s; + + return PxQuat(cross.x * r, cross.y * r, cross.z * r, s * 0.5f).getNormalized(); +} + +/** +\brief returns the index of the largest axis +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 largestAxis(const PxVec3& v) +{ + PxU32 m = PxU32(v.y > v.x ? 1 : 0); + return v.z > v[m] ? 2 : m; +} + +/** +\brief returns the index of the largest axis and the indices of the two other axes +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 largestAxis(const PxVec3& v, PxU32& other1, PxU32& other2) +{ + if(v.x >= PxMax(v.y, v.z)) + { + other1 = 1; + other2 = 2; + return 0; + } + else if(v.y >= v.z) + { + other1 = 0; + other2 = 2; + return 1; + } + else + { + other1 = 0; + other2 = 1; + return 2; + } +} + +/** +\brief returns the index of the axis with the smallest absolute value +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 closestAxis(const PxVec3& v) +{ + PxU32 m = PxU32(PxAbs(v.y) > PxAbs(v.x) ? 1 : 0); + return PxAbs(v.z) > PxAbs(v[m]) ? 2 : m; +} + +PX_CUDA_CALLABLE PX_INLINE PxU32 closestAxis(const PxVec3& v, PxU32& j, PxU32& k) +{ + // find largest 2D plane projection + const PxF32 absPx = PxAbs(v.x); + const PxF32 absNy = PxAbs(v.y); + const PxF32 absNz = PxAbs(v.z); + + PxU32 m = 0; // x biggest axis + j = 1; + k = 2; + if(absNy > absPx && absNy > absNz) + { + // y biggest + j = 2; + k = 0; + m = 1; + } + else if(absNz > absPx) + { + // z biggest + j = 0; + k = 1; + m = 2; + } + return m; +} + +/*! +Extend an edge along its length by a factor +*/ +PX_CUDA_CALLABLE PX_FORCE_INLINE void makeFatEdge(PxVec3& p0, PxVec3& p1, PxReal fatCoeff) +{ + PxVec3 delta = p1 - p0; + + const PxReal m = delta.magnitude(); + if(m > 0.0f) + { + delta *= fatCoeff / m; + p0 -= delta; + p1 += delta; + } +} + +//! Compute point as combination of barycentric coordinates +PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 +computeBarycentricPoint(const PxVec3& p0, const PxVec3& p1, const PxVec3& p2, PxReal u, PxReal v) +{ + // This seems to confuse the compiler... + // return (1.0f - u - v)*p0 + u*p1 + v*p2; + const PxF32 w = 1.0f - u - v; + return PxVec3(w * p0.x + u * p1.x + v * p2.x, w * p0.y + u * p1.y + v * p2.y, w * p0.z + u * p1.z + v * p2.z); +} + +// generates a pair of quaternions (swing, twist) such that in = swing * twist, with +// swing.x = 0 +// twist.y = twist.z = 0, and twist is a unit quat +PX_FORCE_INLINE void separateSwingTwist(const PxQuat& q, PxQuat& swing, PxQuat& twist) +{ + twist = q.x != 0.0f ? PxQuat(q.x, 0, 0, q.w).getNormalized() : PxQuat(PxIdentity); + swing = q * twist.getConjugate(); +} + +// generate two tangent vectors to a given normal +PX_FORCE_INLINE void normalToTangents(const PxVec3& normal, PxVec3& tangent0, PxVec3& tangent1) +{ + tangent0 = PxAbs(normal.x) < 0.70710678f ? PxVec3(0, -normal.z, normal.y) : PxVec3(-normal.y, normal.x, 0); + tangent0.normalize(); + tangent1 = normal.cross(tangent0); +} + +// todo: what is this function doing? +PX_FOUNDATION_API PxQuat computeQuatFromNormal(const PxVec3& n); + +/** +\brief computes an oriented bounding box around the scaled basis. +\param basis Input = skewed basis, Output = (normalized) orthogonal basis. +\return Bounding box extent.
+*/ +PX_FOUNDATION_API PxVec3 optimizeBoundingBox(PxMat33& basis); + +PX_FOUNDATION_API PxQuat slerp(const PxReal t, const PxQuat& left, const PxQuat& right); + +PX_CUDA_CALLABLE PX_INLINE PxVec3 ellipseClamp(const PxVec3& point, const PxVec3& radii) +{ + // This function needs to be implemented in the header file because + // it is included in an SPU shader program. + + // finds the closest point on the ellipse to a given point + + // (p.y, p.z) is the input point + // (e.y, e.z) are the radii of the ellipse + + // Lagrange multiplier method with Newton/Halley hybrid root-finder. + // see http://www.geometrictools.com/Documentation/DistancePointToEllipse2.pdf + // for proof of Newton step robustness and initial estimate. + // Halley converges much faster but sometimes overshoots - when that happens we take + // a Newton step instead + + // converges in 1-2 iterations where D&C works well, and within 4 iterations + // for any ellipse that isn't extremely degenerate + + const PxU32 MAX_ITERATIONS = 20; + const PxReal convergenceThreshold = 1e-4f; + + // iteration requires first quadrant but we recover generality later + + PxVec3 q(0, PxAbs(point.y), PxAbs(point.z)); + const PxReal tinyEps = 1e-6f; // very close to minor axis is numerically problematic but trivial + if(radii.y >= radii.z) + { + if(q.z < tinyEps) + return PxVec3(0, point.y > 0 ? radii.y : -radii.y, 0); + } + else + { + if(q.y < tinyEps) + return PxVec3(0, 0, point.z > 0 ? radii.z : -radii.z); + } + + PxVec3 denom, e2 = radii.multiply(radii), eq = radii.multiply(q); + + // we can use any initial guess which is > maximum(-e.y^2,-e.z^2) and for which f(t) is > 0. + // this guess works well near the axes, but is weak along the diagonals. + + PxReal t = PxMax(eq.y - e2.y, eq.z - e2.z); + + for(PxU32 i = 0; i < MAX_ITERATIONS; i++) + { + denom = PxVec3(0, 1 / (t + e2.y), 1 / (t + e2.z)); + PxVec3 denom2 = eq.multiply(denom); + + PxVec3 fv = denom2.multiply(denom2); + PxReal f = fv.y + fv.z - 1; + + // although in exact arithmetic we are guaranteed f>0, we can get here + // on the first iteration via catastrophic cancellation if the point is + // very close to the origin. In that case we just behave as if f=0 + + if(f < convergenceThreshold) + return e2.multiply(point).multiply(denom); + + PxReal df = fv.dot(denom) * -2.0f; + t = t - f / df; + } + + // we didn't converge, so clamp what we have + PxVec3 r = e2.multiply(point).multiply(denom); + return r * PxRecipSqrt(sqr(r.y / radii.y) + sqr(r.z / radii.z)); +} + +PX_CUDA_CALLABLE PX_INLINE PxReal tanHalf(PxReal sin, PxReal cos) +{ + return sin / (1 + cos); +} + +PX_INLINE PxQuat quatFromTanQVector(const PxVec3& v) +{ + PxReal v2 = v.dot(v); + if(v2 < 1e-12f) + return PxQuat(PxIdentity); + PxReal d = 1 / (1 + v2); + return PxQuat(v.x * 2, v.y * 2, v.z * 2, 1 - v2) * d; +} + +PX_FORCE_INLINE PxVec3 cross100(const PxVec3& b) +{ + return PxVec3(0.0f, -b.z, b.y); +} +PX_FORCE_INLINE PxVec3 cross010(const PxVec3& b) +{ + return PxVec3(b.z, 0.0f, -b.x); +} +PX_FORCE_INLINE PxVec3 cross001(const PxVec3& b) +{ + return PxVec3(-b.y, b.x, 0.0f); +} + +PX_INLINE void decomposeVector(PxVec3& normalCompo, PxVec3& tangentCompo, const PxVec3& outwardDir, + const PxVec3& outwardNormal) +{ + normalCompo = outwardNormal * (outwardDir.dot(outwardNormal)); + tangentCompo = outwardDir - normalCompo; +} + +//!
\brief Return (i+1)%3 +// Avoid variable shift for XBox: +// PX_INLINE PxU32 Ps::getNextIndex3(PxU32 i) { return (1<<i) & 3; } +PX_INLINE PxU32 getNextIndex3(PxU32 i) +{ + return (i + 1 + (i >> 1)) & 3; +} + +PX_INLINE PxMat33 rotFrom2Vectors(const PxVec3& from, const PxVec3& to) +{ + // See bottom of http://www.euclideanspace.com/maths/algebra/matrix/orthogonal/rotation/index.htm + + // Early exit if to = from + if((from - to).magnitudeSquared() < 1e-4f) + return PxMat33(PxIdentity); + + // Early exit if to = -from + if((from + to).magnitudeSquared() < 1e-4f) + return PxMat33::createDiagonal(PxVec3(1.0f, -1.0f, -1.0f)); + + PxVec3 n = from.cross(to); + + PxReal C = from.dot(to), S = PxSqrt(1 - C * C), CC = 1 - C; + + PxReal xx = n.x * n.x, yy = n.y * n.y, zz = n.z * n.z, xy = n.x * n.y, yz = n.y * n.z, xz = n.x * n.z; + + PxMat33 R; + + R(0, 0) = 1 + CC * (xx - 1); + R(0, 1) = -n.z * S + CC * xy; + R(0, 2) = n.y * S + CC * xz; + + R(1, 0) = n.z * S + CC * xy; + R(1, 1) = 1 + CC * (yy - 1); + R(1, 2) = -n.x * S + CC * yz; + + R(2, 0) = -n.y * S + CC * xz; + R(2, 1) = n.x * S + CC * yz; + R(2, 2) = 1 + CC * (zz - 1); + + return R; +} + +PX_FOUNDATION_API void integrateTransform(const PxTransform& curTrans, const PxVec3& linvel, const PxVec3& angvel, + PxReal timeStep, PxTransform& result); + +PX_INLINE void computeBasis(const PxVec3& dir, PxVec3& right, PxVec3& up) +{ + // Derive two remaining vectors + if(PxAbs(dir.y) <= 0.9999f) + { + right = PxVec3(dir.z, 0.0f, -dir.x); + right.normalize(); + + // PT: normalize not needed for 'up' because dir & right are unit vectors, + // and by construction the angle between them is 90 degrees (i.e. sin(angle)=1) + up = PxVec3(dir.y * right.z, dir.z * right.x - dir.x * right.z, -dir.y * right.x); + } + else + { + right = PxVec3(1.0f, 0.0f, 0.0f); + + up = PxVec3(0.0f, dir.z, -dir.y); + up.normalize(); + } +} + +PX_INLINE void computeBasis(const PxVec3& p0, const PxVec3& p1, PxVec3& dir, PxVec3& right, PxVec3& up) +{ + // Compute the new direction vector + dir = p1 - p0; + dir.normalize(); + + // Derive two remaining vectors + computeBasis(dir, right, up); +} + +PX_FORCE_INLINE bool isAlmostZero(const PxVec3& v) +{ + if(PxAbs(v.x) > 1e-6f || PxAbs(v.y) > 1e-6f || PxAbs(v.z) > 1e-6f) + return false; + return true; +} + +} // namespace shdfnd +} // namespace physx + +#endif diff --git a/PxShared/src/foundation/include/PsMutex.h b/PxShared/src/foundation/include/PsMutex.h new file mode 100644 index 00000000..a75d5567 --- /dev/null +++ b/PxShared/src/foundation/include/PsMutex.h @@ -0,0 +1,330 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSMUTEX_H +#define PSFOUNDATION_PSMUTEX_H + +#include "PsAllocator.h" + +/* + * This <new> inclusion is the best-known fix for the gcc 4.4.1 error: + * Creating object file for apex/src/PsAllocator.cpp ... + * In file included from apex/include/PsFoundation.h:30, + * from apex/src/PsAllocator.cpp:26: + * apex/include/PsMutex.h: In constructor 'physx::shdfnd::MutexT<Alloc>::MutexT(const Alloc&)': + * apex/include/PsMutex.h:92: error: no matching function for call to 'operator new(unsigned int, + * physx::shdfnd::MutexImpl*&)' + * <built-in>:0: note: candidates are: void* operator new(unsigned int) + */ +#include <new> + +namespace physx +{ +namespace shdfnd +{ +class PX_FOUNDATION_API MutexImpl +{ + public: + /** + The constructor for Mutex creates a mutex. It is initially unlocked. + */ + MutexImpl(); + + /** + The destructor for Mutex deletes the mutex. + */ + ~MutexImpl(); + + /** + Acquire (lock) the mutex. If the mutex is already locked + by another thread, this method blocks until the mutex is + unlocked. + */ + void lock(); + + /** + Acquire (lock) the mutex. If the mutex is already locked + by another thread, this method returns false immediately without + blocking; it returns true if the mutex was acquired. + */ + bool trylock(); + + /** + Release (unlock) the mutex. + */ + void unlock(); + + /** + Size of this class. + */ + static const uint32_t& getSize(); +}; + +template <typename Alloc = ReflectionAllocator<MutexImpl> > +class MutexT : protected Alloc +{ + PX_NOCOPY(MutexT) + public: + class ScopedLock + { + MutexT<Alloc>& mMutex; + PX_NOCOPY(ScopedLock) + public: + PX_INLINE ScopedLock(MutexT<Alloc>& mutex) : mMutex(mutex) + { + mMutex.lock(); + } + PX_INLINE ~ScopedLock() + { + mMutex.unlock(); + } + }; + + /** + The constructor for Mutex creates a mutex. It is initially unlocked. + */ + MutexT(const Alloc& alloc = Alloc()) : Alloc(alloc) + { + mImpl = reinterpret_cast<MutexImpl*>(Alloc::allocate(MutexImpl::getSize(), __FILE__, __LINE__)); + PX_PLACEMENT_NEW(mImpl, MutexImpl)(); + } + + /** + The destructor for Mutex deletes the mutex. + */ + ~MutexT() + { + mImpl->~MutexImpl(); + Alloc::deallocate(mImpl); + } + + /** + Acquire (lock) the mutex. If the mutex is already locked + by another thread, this method blocks until the mutex is + unlocked. + */ + void lock() const + { + mImpl->lock(); + } + + /** + Acquire (lock) the mutex.
If the mutex is already locked + by another thread, this method returns false without blocking; + it returns true if the lock is successfully acquired. + */ + bool trylock() const + { + return mImpl->trylock(); + } + + /** + Release (unlock) the mutex. The calling thread must have + previously called lock(), otherwise an error occurs. + */ + void unlock() const + { + mImpl->unlock(); + } + + private: + MutexImpl* mImpl; +}; + +class PX_FOUNDATION_API ReadWriteLock +{ + PX_NOCOPY(ReadWriteLock) + public: + ReadWriteLock(); + ~ReadWriteLock(); + + void lockReader(); + void lockWriter(); + + void unlockReader(); + void unlockWriter(); + + private: + class ReadWriteLockImpl* mImpl; +}; + +class ScopedReadLock +{ + PX_NOCOPY(ScopedReadLock) + public: + PX_INLINE ScopedReadLock(ReadWriteLock& lock) : mLock(lock) + { + mLock.lockReader(); + } + PX_INLINE ~ScopedReadLock() + { + mLock.unlockReader(); + } + + private: + ReadWriteLock& mLock; +}; + +class ScopedWriteLock +{ + PX_NOCOPY(ScopedWriteLock) + public: + PX_INLINE ScopedWriteLock(ReadWriteLock& lock) : mLock(lock) + { + mLock.lockWriter(); + } + PX_INLINE ~ScopedWriteLock() + { + mLock.unlockWriter(); + } + + private: + ReadWriteLock& mLock; +}; + +typedef MutexT<> Mutex; + +/* + * Use this type of lock for mutex behaviour that must operate on SPU and PPU. + * On non-PS3 platforms, it is implemented using Mutex + */ +class AtomicLock +{ + Mutex mMutex; + PX_NOCOPY(AtomicLock) + + public: + AtomicLock() + { + } + + bool lock() + { + mMutex.lock(); + return true; + } + + bool trylock() + { + return mMutex.trylock(); + } + + bool unlock() + { + mMutex.unlock(); + return true; + } +}; + +class AtomicLockCopy +{ + AtomicLock* pLock; + + public: + AtomicLockCopy() : pLock(NULL) + { + } + + AtomicLockCopy& operator=(AtomicLock& lock) + { + pLock = &lock; + return *this; + } + + bool lock() + { + return pLock->lock(); + } + + bool trylock() + { + return pLock->trylock(); + } + + bool unlock() + { + return pLock->unlock(); + } +}; + +class AtomicRwLock +{ + ReadWriteLock m_Lock; + PX_NOCOPY(AtomicRwLock) + + public: + AtomicRwLock() + { + } + + void lockReader() + { + m_Lock.lockReader(); + } + void lockWriter() + { + m_Lock.lockWriter(); + } + + bool tryLockReader() + { + // Todo - implement this + m_Lock.lockReader(); + return true; + } + + void unlockReader() + { + m_Lock.unlockReader(); + } + void unlockWriter() + { + m_Lock.unlockWriter(); + } +}; + +class ScopedAtomicLock +{ + public: + PX_INLINE ScopedAtomicLock(AtomicLock& lock) : mLock(lock) + { + mLock.lock(); + } + PX_INLINE ~ScopedAtomicLock() + { + mLock.unlock(); + } + + PX_NOCOPY(ScopedAtomicLock) + private: + AtomicLock& mLock; +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSMUTEX_H diff --git a/PxShared/src/foundation/include/PsPool.h b/PxShared/src/foundation/include/PsPool.h new file mode 100644 index 00000000..48280100 --- /dev/null +++ b/PxShared/src/foundation/include/PsPool.h @@ -0,0 +1,298 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited.
+// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSPOOL_H +#define PSFOUNDATION_PSPOOL_H + +#include "PsArray.h" +#include "PsSort.h" +#include "PsBasicTemplates.h" +#include "PsInlineArray.h" + +namespace physx +{ +namespace shdfnd +{ + +/*! +Simple allocation pool +*/ +template <class T, class Alloc = typename AllocatorTraits<T>::Type> +class PoolBase : public UserAllocated, public Alloc +{ + PX_NOCOPY(PoolBase) + protected: + PoolBase(const Alloc& alloc, uint32_t elementsPerSlab, uint32_t slabSize) + : Alloc(alloc), mSlabs(alloc), mElementsPerSlab(elementsPerSlab), mUsed(0), mSlabSize(slabSize), mFreeElement(0) + { + PX_COMPILE_TIME_ASSERT(sizeof(T) >= sizeof(size_t)); + } + + public: + ~PoolBase() + { + if(mUsed) + disposeElements(); + + for(void** slabIt = mSlabs.begin(), *slabEnd = mSlabs.end(); slabIt != slabEnd; ++slabIt) + Alloc::deallocate(*slabIt); + } + + // Allocate space for single object + PX_INLINE T* allocate() + { + if(mFreeElement == 0) + allocateSlab(); + T* p = reinterpret_cast<T*>(mFreeElement); + mFreeElement = mFreeElement->mNext; + mUsed++; +/** +Mark a specified amount of memory with 0xcd pattern. This is used to check that the meta data +definition for serialized classes is complete in checked builds. +*/ +#if PX_CHECKED + for(uint32_t i = 0; i < sizeof(T); ++i) + reinterpret_cast<uint8_t*>(p)[i] = 0xcd; +#endif + return p; + } + + // Put space for a single element back in the lists + PX_INLINE void deallocate(T* p) + { + if(p) + { + PX_ASSERT(mUsed); + mUsed--; + push(reinterpret_cast<FreeList*>(p)); + } + } + + PX_INLINE T* construct() + { + T* t = allocate(); + return t ? new (t) T() : 0; + } + + template <class A1> + PX_INLINE T* construct(A1& a) + { + T* t = allocate(); + return t ? new (t) T(a) : 0; + } + + template <class A1, class A2> + PX_INLINE T* construct(A1& a, A2& b) + { + T* t = allocate(); + return t ? new (t) T(a, b) : 0; + } + + template <class A1, class A2, class A3> + PX_INLINE T* construct(A1& a, A2& b, A3& c) + { + T* t = allocate(); + return t ? new (t) T(a, b, c) : 0; + } + + template <class A1, class A2, class A3> + PX_INLINE T* construct(A1* a, A2& b, A3& c) + { + T* t = allocate(); + return t ? new (t) T(a, b, c) : 0; + } + + template <class A1, class A2, class A3, class A4> + PX_INLINE T* construct(A1& a, A2& b, A3& c, A4& d) + { + T* t = allocate(); + return t ? 
new (t) T(a, b, c, d) : 0; + } + + template <class A1, class A2, class A3, class A4, class A5> + PX_INLINE T* construct(A1& a, A2& b, A3& c, A4& d, A5& e) + { + T* t = allocate(); + return t ? new (t) T(a, b, c, d, e) : 0; + } + + PX_INLINE void destroy(T* const p) + { + if(p) + { + p->~T(); + deallocate(p); + } + } + + protected: + struct FreeList + { + FreeList* mNext; + }; + + // All the allocated slabs, sorted by pointer + InlineArray<void*, 64, Alloc> mSlabs; + + uint32_t mElementsPerSlab; + uint32_t mUsed; + uint32_t mSlabSize; + + FreeList* mFreeElement; // Head of free-list + + // Helper function to push an element onto the head of the free-list + + void push(FreeList* p) + { + p->mNext = mFreeElement; + mFreeElement = p; + } + + // Allocate a slab and segregate it into the freelist + void allocateSlab() + { + T* slab = reinterpret_cast<T*>(Alloc::allocate(mSlabSize, __FILE__, __LINE__)); + + mSlabs.pushBack(slab); + + // Build a chain of nodes for the freelist + T* it = slab + mElementsPerSlab; + while(--it >= slab) + push(reinterpret_cast<FreeList*>(it)); + } + + /* + Cleanup method. Go through all active slabs and call the destructor for live objects; + the slab memory itself is freed in the PoolBase destructor + */ + void disposeElements() + { + Array<void*, Alloc> freeNodes(*this); + while(mFreeElement) + { + freeNodes.pushBack(mFreeElement); + mFreeElement = mFreeElement->mNext; + } + Alloc& alloc(*this); + sort(freeNodes.begin(), freeNodes.size(), Less<void*>(), alloc); + sort(mSlabs.begin(), mSlabs.size(), Less<void*>(), alloc); + + typename Array<void*, Alloc>::Iterator slabIt = mSlabs.begin(), slabEnd = mSlabs.end(); + for(typename Array<void*, Alloc>::Iterator freeIt = freeNodes.begin(); slabIt != slabEnd; ++slabIt) + { + for(T* tIt = reinterpret_cast<T*>(*slabIt), *tEnd = tIt + mElementsPerSlab; tIt != tEnd; ++tIt) + { + if(freeIt != freeNodes.end() && *freeIt == tIt) + ++freeIt; + else + tIt->~T(); + } + } + } + + /* + Go through all slabs and deallocate those whose elements are all on the free-list; + the remaining free elements are pushed back onto the list + */ + void releaseEmptySlabs() + { + Array<void*, Alloc> freeNodes(*this); + Array<void*, Alloc> slabNodes(mSlabs, *this); + while(mFreeElement) + { + freeNodes.pushBack(mFreeElement); + mFreeElement = mFreeElement->mNext; + } + + typename Array<void*, Alloc>::Iterator freeIt = freeNodes.begin(), freeEnd = freeNodes.end(), + lastCheck = freeNodes.end() - mElementsPerSlab; + + if(freeNodes.size() > mElementsPerSlab) + { + Alloc& alloc(*this); + sort(freeNodes.begin(), freeNodes.size(), Less<void*>(), alloc); + sort(slabNodes.begin(), slabNodes.size(), Less<void*>(), alloc); + + mSlabs.clear(); + for(void** slabIt = slabNodes.begin(), *slabEnd = slabNodes.end(); slabIt != slabEnd; ++slabIt) + { + while((freeIt < lastCheck) && (*slabIt > (*freeIt))) + { + push(reinterpret_cast<FreeList*>(*freeIt)); + freeIt++; + } + + if(*slabIt == (*freeIt)) // the slab's first element is in the freeList + { + const size_t endSlabAddress = size_t(*slabIt) + mSlabSize; + const size_t endFreeAddress = size_t(*(freeIt + mElementsPerSlab - 1)); + if(endFreeAddress + sizeof(T) == endSlabAddress) + { // all of the slab's elements are in the freeList + Alloc::deallocate(*slabIt); + freeIt += mElementsPerSlab; + continue; + } + } + + mSlabs.pushBack(*slabIt); + } + } + + while(freeIt != freeEnd) + { + push(reinterpret_cast<FreeList*>(*freeIt)); + ++freeIt; + } + } +}; + +// original pool implementation +template <class T, class Alloc = typename AllocatorTraits<T>::Type> +class Pool : public PoolBase<T, Alloc> +{ + public: + Pool(const Alloc& alloc = Alloc(), uint32_t elementsPerSlab = 32) + :
PoolBase<T, Alloc>(alloc, elementsPerSlab, elementsPerSlab * sizeof(T)) + { + } +}; + +// allows specification of the slab size instead of the occupancy +template <class T, uint32_t slabSize, class Alloc = typename AllocatorTraits<T>::Type> +class Pool2 : public PoolBase<T, Alloc> +{ + public: + Pool2(const Alloc& alloc = Alloc()) : PoolBase<T, Alloc>(alloc, slabSize / sizeof(T), slabSize) + { + } +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSPOOL_H diff --git a/PxShared/src/foundation/include/PsSList.h b/PxShared/src/foundation/include/PsSList.h new file mode 100644 index 00000000..70db740d --- /dev/null +++ b/PxShared/src/foundation/include/PsSList.h @@ -0,0 +1,140 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSSLIST_H +#define PSFOUNDATION_PSSLIST_H + +#include "foundation/Px.h" +#include "foundation/PxAssert.h" +#include "PsAlignedMalloc.h" + +#if PX_P64_FAMILY +#define PX_SLIST_ALIGNMENT 16 +#else +#define PX_SLIST_ALIGNMENT 8 +#endif + +namespace physx +{ +namespace shdfnd +{ + +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4324) // Padding was added at the end of a structure because of a __declspec(align) value. +#endif + +#if !PX_GCC_FAMILY +__declspec(align(PX_SLIST_ALIGNMENT)) +#endif + class SListEntry +{ + friend struct SListImpl; + + public: + SListEntry() : mNext(NULL) + { + PX_ASSERT((size_t(this) & (PX_SLIST_ALIGNMENT - 1)) == 0); + } + + // Only use on elements returned by SList::flush() + // because the operation is not atomic. 
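+ // + // Example drain pattern (illustrative sketch only; 'list' is an SList and + // 'process' is a placeholder for user code). Save the next pointer before + // handing the entry to code that may free or re-push it: + // + //     for(SListEntry* e = list.flush(); e != NULL;) + //     { + //         SListEntry* nxt = e->next(); // safe: 'e' is detached from the list + //         process(e); + //         e = nxt; + //     }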
+ SListEntry* next() + { + return mNext; + } + + private: + SListEntry* mNext; +} +#if PX_GCC_FAMILY +__attribute__((aligned(PX_SLIST_ALIGNMENT))); +#else +; +#endif + +#if PX_VC +#pragma warning(pop) +#endif + +// template-less implementation +struct PX_FOUNDATION_API SListImpl +{ + SListImpl(); + ~SListImpl(); + void push(SListEntry* entry); + SListEntry* pop(); + SListEntry* flush(); + static const uint32_t& getSize(); +}; + +template <typename Alloc = ReflectionAllocator<SListImpl> > +class SListT : protected Alloc +{ + public: + SListT(const Alloc& alloc = Alloc()) : Alloc(alloc) + { + mImpl = reinterpret_cast<SListImpl*>(Alloc::allocate(SListImpl::getSize(), __FILE__, __LINE__)); + PX_ASSERT((size_t(mImpl) & (PX_SLIST_ALIGNMENT - 1)) == 0); + PX_PLACEMENT_NEW(mImpl, SListImpl)(); + } + ~SListT() + { + mImpl->~SListImpl(); + Alloc::deallocate(mImpl); + } + + // pushes a new element to the list + void push(SListEntry& entry) + { + mImpl->push(&entry); + } + + // pops an element from the list + SListEntry* pop() + { + return mImpl->pop(); + } + + // removes all items from list, returns pointer to first element + SListEntry* flush() + { + return mImpl->flush(); + } + + private: + SListImpl* mImpl; +}; + +typedef SListT<> SList; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSSLIST_H diff --git a/PxShared/src/foundation/include/PsSocket.h b/PxShared/src/foundation/include/PsSocket.h new file mode 100644 index 00000000..5f65b9fb --- /dev/null +++ b/PxShared/src/foundation/include/PsSocket.h @@ -0,0 +1,186 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
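+ +// Example client-side usage of the Socket class declared below (illustrative +// sketch only; the host and port values are placeholders and error handling +// is omitted): +// +//     shdfnd::Socket s; +//     if(s.connect("127.0.0.1", 5425, 1000)) +//     { +//         const uint8_t msg[] = { 'p', 'i', 'n', 'g' }; +//         s.write(msg, 4); +//         s.flush();       // blocks until all buffered data is sent +//         s.disconnect(); +//     }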
+ +#ifndef PSFOUNDATION_PSSOCKET_H +#define PSFOUNDATION_PSSOCKET_H + +#include "PsUserAllocated.h" + +namespace physx +{ +namespace shdfnd +{ +/** +Socket abstraction API +*/ + +class PX_FOUNDATION_API Socket : public UserAllocated +{ + public: + static const uint32_t DEFAULT_BUFFER_SIZE; + + Socket(bool inEnableBuffering = true, bool blocking = true); + + virtual ~Socket(); + + /*! + Opens a network socket for input and/or output + + \param host + Name of the host to connect to. This can be an IP address, URL, etc. + + \param port + The port to connect to on the remote host + + \param timeout + Timeout in ms until the connection must be established. + + \return + True if the connection was successful, false otherwise + */ + bool connect(const char* host, uint16_t port, uint32_t timeout = 1000); + + /*! + Opens a network socket for input and/or output as a server and puts the connection in listening mode + + \param port + The port on which the socket listens + */ + bool listen(uint16_t port); + + /*! + Accept a connection on a socket that is in listening mode + + \note + This method only supports a single client connection. Additional clients + that connect to the listening port will overwrite the existing socket handle. + + \param block + whether or not the call should block + + \return whether a connection was established + */ + bool accept(bool block); + + /*! + Disconnects an open socket + */ + void disconnect(); + + /*! + Returns whether the socket is currently open (connected) or not. + + \return + True if the socket is connected, false otherwise + */ + bool isConnected() const; + + /*! + Returns the name of the connected host. This is the same as the string + that was supplied to the connect call. + + \return + The name of the connected host + */ + const char* getHost() const; + + /*! + Returns the port of the connected host. This is the same as the port + that was supplied to the connect call. + + \return + The port of the connected host + */ + uint16_t getPort() const; + + /*! + Flushes the output stream. Until the stream is flushed, there is no + guarantee that the written data has actually reached the destination + storage. Flush forces all buffered data to be sent to the output. + + \note flush always blocks. If the socket is in non-blocking mode, this will result in + the thread spinning. + + \return + True if the flush was successful, false otherwise + */ + bool flush(); + + /*! + Writes data to the output stream. + + \param data + Pointer to a block of data to write to the stream + + \param length + Amount of data to write, in bytes + + \return + Number of bytes actually written. This could be lower than length if the socket is non-blocking. + */ + + uint32_t write(const uint8_t* data, uint32_t length); + + /*! + Reads data from the input stream. + + \param data + Pointer to a buffer where the read data will be stored. + + \param length + Amount of data to read, in bytes. + + \return + Number of bytes actually read. This could be lower than length if the stream end is + encountered or the socket is non-blocking. + */ + uint32_t read(uint8_t* data, uint32_t length); + + /*! + Sets blocking mode of the socket. + The socket must be connected, otherwise calling this method has no effect. + */ + void setBlocking(bool blocking); + + /*! + Returns whether read/write/flush calls to the socket are blocking. + + \return + True if the socket is blocking, false otherwise.
+ */ + bool isBlocking() const; + + private: + class SocketImpl* mImpl; +}; + +} // namespace shdfnd +} // namespace physx + +#endif // PSFOUNDATION_PSSOCKET_H diff --git a/PxShared/src/foundation/include/PsSort.h b/PxShared/src/foundation/include/PsSort.h new file mode 100644 index 00000000..14fa5732 --- /dev/null +++ b/PxShared/src/foundation/include/PsSort.h @@ -0,0 +1,130 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSSORT_H +#define PSFOUNDATION_PSSORT_H + +/** \addtogroup foundation +@{ +*/ + +#include "PsSortInternals.h" +#include "PsAlloca.h" + +#define PX_SORT_PARANOIA PX_DEBUG + +/** +\brief Sorts an array of objects in ascending order, assuming +that the predicate implements the < operator: + +\see Less, Greater +*/ + +#if PX_VC +#pragma warning(push) +#pragma warning(disable : 4706) // disable the warning that we did an assignment within a conditional expression, as +// this was intentional. 
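+ +// Example usage of the sort functions defined below (illustrative sketch only): +// +//     float values[] = { 3.0f, 1.0f, 2.0f }; +//     shdfnd::sort(values, 3);                           // ascending, uses Less<float> +//     shdfnd::sort(values, 3, shdfnd::Greater<float>()); // descending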
+#endif + +namespace physx +{ +namespace shdfnd +{ +template <class T, class Predicate, class Allocator> +void sort(T* elements, uint32_t count, const Predicate& compare, const Allocator& inAllocator, + const uint32_t initialStackSize = 32) +{ + static const uint32_t SMALL_SORT_CUTOFF = 5; // must be >= 3 since we need 3 for median + + PX_ALLOCA(stackMem, int32_t, initialStackSize); + internal::Stack<Allocator> stack(stackMem, initialStackSize, inAllocator); + + int32_t first = 0, last = int32_t(count - 1); + if(last > first) + { + for(;;) + { + while(last > first) + { + PX_ASSERT(first >= 0 && last < int32_t(count)); + if(uint32_t(last - first) < SMALL_SORT_CUTOFF) + { + internal::smallSort(elements, first, last, compare); + break; + } + else + { + const int32_t partIndex = internal::partition(elements, first, last, compare); + + // push smaller sublist to minimize stack usage + if((partIndex - first) < (last - partIndex)) + { + stack.push(first, partIndex - 1); + first = partIndex + 1; + } + else + { + stack.push(partIndex + 1, last); + last = partIndex - 1; + } + } + } + + if(stack.empty()) + break; + + stack.pop(first, last); + } + } +#if PX_SORT_PARANOIA + for(uint32_t i = 1; i < count; i++) + PX_ASSERT(!compare(elements[i], elements[i - 1])); +#endif +} + +template <class T, class Predicate> +void sort(T* elements, uint32_t count, const Predicate& compare) +{ + sort(elements, count, compare, typename shdfnd::AllocatorTraits<T>::Type()); +} + +template <class T> +void sort(T* elements, uint32_t count) +{ + sort(elements, count, shdfnd::Less<T>(), typename shdfnd::AllocatorTraits<T>::Type()); +} + +} // namespace shdfnd +} // namespace physx + +#if PX_VC +#pragma warning(pop) +#endif + +#endif // #ifndef PSFOUNDATION_PSSORT_H diff --git a/PxShared/src/foundation/include/PsSortInternals.h b/PxShared/src/foundation/include/PsSortInternals.h new file mode 100644 index 00000000..7e9ee48e --- /dev/null +++ b/PxShared/src/foundation/include/PsSortInternals.h @@ -0,0 +1,188 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSORTINTERNALS_H
+#define PSFOUNDATION_PSSORTINTERNALS_H
+
+/** \addtogroup foundation
+@{
+*/
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxIntrinsics.h"
+#include "PsBasicTemplates.h"
+#include "PsUserAllocated.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace internal
+{
+template <class T, class Predicate>
+PX_INLINE void median3(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+    /*
+    This creates sentinels because we know there is an element at the start less than
+    (or equal to) the pivot and an element at the end greater than (or equal to) the
+    pivot. Plus the median of 3 reduces the chance of degenerate behaviour.
+    */
+
+    int32_t mid = (first + last) / 2;
+
+    if(compare(elements[mid], elements[first]))
+        swap(elements[first], elements[mid]);
+
+    if(compare(elements[last], elements[first]))
+        swap(elements[first], elements[last]);
+
+    if(compare(elements[last], elements[mid]))
+        swap(elements[mid], elements[last]);
+
+    // keep the pivot at last-1
+    swap(elements[mid], elements[last - 1]);
+}
+
+template <class T, class Predicate>
+PX_INLINE int32_t partition(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+    median3(elements, first, last, compare);
+
+    /*
+    WARNING: using the line:
+
+    T partValue = elements[last-1];
+
+    and changing the scan loops to:
+
+    while(comparator.greater(partValue, elements[++i]));
+    while(comparator.greater(elements[--j], partValue));
+
+    triggers a compiler optimizer bug on xenon where it stores a double to the stack for partValue
+    then loads it as a single...:-(
+    */
+
+    int32_t i = first;    // we know first is less than pivot (but i gets pre-incremented)
+    int32_t j = last - 1; // pivot is in last-1 (but j gets pre-decremented)
+
+    for(;;)
+    {
+        while(compare(elements[++i], elements[last - 1]))
+            ;
+        while(compare(elements[last - 1], elements[--j]))
+            ;
+
+        if(i >= j)
+            break;
+
+        PX_ASSERT(i <= last && j >= first);
+        swap(elements[i], elements[j]);
+    }
+    // put the pivot in place
+
+    PX_ASSERT(i <= last && first <= (last - 1));
+    swap(elements[i], elements[last - 1]);
+
+    return i;
+}
+
+template <class T, class Predicate>
+PX_INLINE void smallSort(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+    // selection sort - could reduce to fsel on 360 with floats.
+ + for(int32_t i = first; i < last; i++) + { + int32_t m = i; + for(int32_t j = i + 1; j <= last; j++) + if(compare(elements[j], elements[m])) + m = j; + + if(m != i) + swap(elements[m], elements[i]); + } +} + +template <class Allocator> +class Stack +{ + Allocator mAllocator; + uint32_t mSize, mCapacity; + int32_t* mMemory; + bool mRealloc; + + public: + Stack(int32_t* memory, uint32_t capacity, const Allocator& inAllocator) + : mAllocator(inAllocator), mSize(0), mCapacity(capacity), mMemory(memory), mRealloc(false) + { + } + ~Stack() + { + if(mRealloc) + mAllocator.deallocate(mMemory); + } + + void grow() + { + mCapacity *= 2; + int32_t* newMem = + reinterpret_cast<int32_t*>(mAllocator.allocate(sizeof(int32_t) * mCapacity, __FILE__, __LINE__)); + intrinsics::memCopy(newMem, mMemory, mSize * sizeof(int32_t)); + if(mRealloc) + mAllocator.deallocate(mMemory); + mRealloc = true; + mMemory = newMem; + } + + PX_INLINE void push(int32_t start, int32_t end) + { + if(mSize >= mCapacity - 1) + grow(); + mMemory[mSize++] = start; + mMemory[mSize++] = end; + } + + PX_INLINE void pop(int32_t& start, int32_t& end) + { + PX_ASSERT(!empty()); + end = mMemory[--mSize]; + start = mMemory[--mSize]; + } + + PX_INLINE bool empty() + { + return mSize == 0; + } +}; +} // namespace internal + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSSORTINTERNALS_H diff --git a/PxShared/src/foundation/include/PsString.h b/PxShared/src/foundation/include/PsString.h new file mode 100644 index 00000000..7a69264a --- /dev/null +++ b/PxShared/src/foundation/include/PsString.h @@ -0,0 +1,90 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#ifndef PSFOUNDATION_PSSTRING_H
+#define PSFOUNDATION_PSSTRING_H
+
+#include "foundation/PxPreprocessor.h"
+#include "foundation/PxSimpleTypes.h"
+#include <stdarg.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// the following functions have C99 semantics. Note that C99 requires for snprintf and vsnprintf:
+// * the resulting string is always NULL-terminated regardless of truncation.
+// * in the case of truncation the return value is the number of characters that would have been created.
+
+PX_FOUNDATION_API int32_t sscanf(const char* buffer, const char* format, ...);
+PX_FOUNDATION_API int32_t strcmp(const char* str1, const char* str2);
+PX_FOUNDATION_API int32_t strncmp(const char* str1, const char* str2, size_t count);
+PX_FOUNDATION_API int32_t snprintf(char* dst, size_t dstSize, const char* format, ...);
+PX_FOUNDATION_API int32_t vsnprintf(char* dst, size_t dstSize, const char* format, va_list arg);
+
+// strlcat and strlcpy have BSD semantics:
+// * dstSize is always the size of the destination buffer
+// * the resulting string is always NULL-terminated regardless of truncation
+// * in the case of truncation the return value is the length of the string that would have been created
+
+PX_FOUNDATION_API size_t strlcat(char* dst, size_t dstSize, const char* src);
+PX_FOUNDATION_API size_t strlcpy(char* dst, size_t dstSize, const char* src);
+
+// case-insensitive string comparison
+PX_FOUNDATION_API int32_t stricmp(const char* str1, const char* str2);
+PX_FOUNDATION_API int32_t strnicmp(const char* str1, const char* str2, size_t count);
+
+// in-place string case conversion
+PX_FOUNDATION_API void strlwr(char* str);
+PX_FOUNDATION_API void strupr(char* str);
+
+/**
+\brief The maximum supported formatted output string length
+(number of characters after replacement).
+
+@see printFormatted()
+*/
+static const size_t MAX_PRINTFORMATTED_LENGTH = 1024;
+
+/**
+\brief Prints the formatted data, trying to make sure it's visible to the app programmer
+
+@see MAX_PRINTFORMATTED_LENGTH
+*/
+PX_FOUNDATION_API void printFormatted(const char*, ...);
+
+/**
+\brief Prints the string literally (does not consume % specifier), trying to make sure it's visible to the app
+programmer
+*/
+PX_FOUNDATION_API void printString(const char*);
+}
+}
+#endif // #ifndef PSFOUNDATION_PSSTRING_H
diff --git a/PxShared/src/foundation/include/PsSync.h b/PxShared/src/foundation/include/PsSync.h
new file mode 100644
index 00000000..e1db6cea
--- /dev/null
+++ b/PxShared/src/foundation/include/PsSync.h
@@ -0,0 +1,138 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSSYNC_H +#define PSFOUNDATION_PSSYNC_H + +#include "PsAllocator.h" + +namespace physx +{ +namespace shdfnd +{ +/*! +Implementation notes: +* - Calling set() on an already signaled Sync does not change its state. +* - Calling reset() on an already reset Sync does not change its state. +* - Calling set() on a reset Sync wakes all waiting threads (potential for thread contention). +* - Calling wait() on an already signaled Sync will return true immediately. +* - NOTE: be careful when pulsing an event with set() followed by reset(), because a +* thread that is not waiting on the event will miss the signal. +*/ +class PX_FOUNDATION_API SyncImpl +{ + public: + static const uint32_t waitForever = 0xffffffff; + + SyncImpl(); + + ~SyncImpl(); + + /** Wait on the object for at most the given number of ms. Returns + * true if the object is signaled. Sync::waitForever will block forever + * or until the object is signaled. + */ + + bool wait(uint32_t milliseconds = waitForever); + + /** Signal the synchronization object, waking all threads waiting on it */ + + void set(); + + /** Reset the synchronization object */ + + void reset(); + + /** + Size of this class. + */ + static const uint32_t& getSize(); +}; + +/*! +Implementation notes: +* - Calling set() on an already signaled Sync does not change its state. +* - Calling reset() on an already reset Sync does not change its state. +* - Calling set() on a reset Sync wakes all waiting threads (potential for thread contention). +* - Calling wait() on an already signaled Sync will return true immediately. +* - NOTE: be careful when pulsing an event with set() followed by reset(), because a +* thread that is not waiting on the event will miss the signal. +*/ +template <typename Alloc = ReflectionAllocator<SyncImpl> > +class SyncT : protected Alloc +{ + public: + static const uint32_t waitForever = SyncImpl::waitForever; + + SyncT(const Alloc& alloc = Alloc()) : Alloc(alloc) + { + mImpl = reinterpret_cast<SyncImpl*>(Alloc::allocate(SyncImpl::getSize(), __FILE__, __LINE__)); + PX_PLACEMENT_NEW(mImpl, SyncImpl)(); + } + + ~SyncT() + { + mImpl->~SyncImpl(); + Alloc::deallocate(mImpl); + } + + /** Wait on the object for at most the given number of ms. Returns + * true if the object is signaled. Sync::waitForever will block forever + * or until the object is signaled. 
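+ *
+ * Illustrative sketch (hypothetical producer/consumer code, not part of this
+ * header), using the Sync typedef declared below:
+ *
+ *   Sync workReady;
+ *   // producer thread: publishWork(); workReady.set();
+ *   // consumer thread: workReady.wait(); workReady.reset(); consumeWork();
+ *
+ * publishWork()/consumeWork() are placeholders; note that a set()/reset()
+ * pulse is missed by any thread that is not already blocked in wait().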
+ */ + + bool wait(uint32_t milliseconds = SyncImpl::waitForever) + { + return mImpl->wait(milliseconds); + } + + /** Signal the synchronization object, waking all threads waiting on it */ + + void set() + { + mImpl->set(); + } + + /** Reset the synchronization object */ + + void reset() + { + mImpl->reset(); + } + + private: + class SyncImpl* mImpl; +}; + +typedef SyncT<> Sync; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSSYNC_H diff --git a/PxShared/src/foundation/include/PsTempAllocator.h b/PxShared/src/foundation/include/PsTempAllocator.h new file mode 100644 index 00000000..40c029d0 --- /dev/null +++ b/PxShared/src/foundation/include/PsTempAllocator.h @@ -0,0 +1,62 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#ifndef PSFOUNDATION_PSTEMPALLOCATOR_H +#define PSFOUNDATION_PSTEMPALLOCATOR_H + +#include "PsAllocator.h" + +namespace physx +{ +namespace shdfnd +{ +union TempAllocatorChunk +{ + TempAllocatorChunk() : mNext(0) + { + } + TempAllocatorChunk* mNext; // while chunk is free + uint32_t mIndex; // while chunk is allocated + uint8_t mPad[16]; // 16 byte aligned allocations +}; + +class TempAllocator +{ + public: + PX_FORCE_INLINE TempAllocator(const char* = 0) + { + } + PX_FOUNDATION_API void* allocate(size_t size, const char* file, int line); + PX_FOUNDATION_API void deallocate(void* ptr); +}; + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSTEMPALLOCATOR_H diff --git a/PxShared/src/foundation/include/PsThread.h b/PxShared/src/foundation/include/PsThread.h new file mode 100644 index 00000000..950d5381 --- /dev/null +++ b/PxShared/src/foundation/include/PsThread.h @@ -0,0 +1,382 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSTHREAD_H +#define PSFOUNDATION_PSTHREAD_H + +#include "PsUserAllocated.h" + +// dsequeira: according to existing comment here (David Black would be my guess) +// "This is useful to reduce bus contention on tight spin locks. And it needs +// to be a macro as the xenon compiler often ignores even __forceinline." What's not +// clear is why a pause function needs inlining...? (TODO: check with XBox team) + +// todo: these need to go somewhere else + +#if PX_WINDOWS_FAMILY || PX_XBOXONE +#define PxSpinLockPause() __asm pause +#elif PX_LINUX || PX_ANDROID || PX_PS4 || PX_APPLE_FAMILY +#define PxSpinLockPause() asm("nop") +#else +#error "Platform not supported!" 
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+struct ThreadPriority // todo: put in some other header file
+{
+    enum Enum
+    {
+        /**
+        \brief High priority
+        */
+        eHIGH = 0,
+
+        /**
+        \brief Above Normal priority
+        */
+        eABOVE_NORMAL = 1,
+
+        /**
+        \brief Normal/default priority
+        */
+        eNORMAL = 2,
+
+        /**
+        \brief Below Normal priority
+        */
+        eBELOW_NORMAL = 3,
+
+        /**
+        \brief Low priority.
+        */
+        eLOW = 4,
+        eFORCE_DWORD = 0xffFFffFF
+    };
+};
+
+class Runnable
+{
+  public:
+    Runnable()
+    {
+    }
+    virtual ~Runnable()
+    {
+    }
+    virtual void execute(void)
+    {
+    }
+};
+
+class PX_FOUNDATION_API ThreadImpl
+{
+  public:
+    typedef size_t Id; // space for a pointer or an integer
+    typedef void* (*ExecuteFn)(void*);
+
+    static uint32_t getDefaultStackSize();
+    static Id getId();
+
+    /**
+    Construct (but do not start) the thread object. The OS thread object will not be created
+    until start() is called. Executes in the context
+    of the spawning thread.
+    */
+
+    ThreadImpl();
+
+    /**
+    Construct and start the thread, passing the given arg to the given fn. (pthread style)
+    */
+
+    ThreadImpl(ExecuteFn fn, void* arg);
+
+    /**
+    Deallocate all resources associated with the thread. Should be called in the
+    context of the spawning thread.
+    */
+
+    ~ThreadImpl();
+
+    /**
+    Create the OS thread and start it running. Called in the context of the spawning thread.
+    If an affinity mask has previously been set then it will be applied after the
+    thread has been created.
+    */
+
+    void start(uint32_t stackSize, Runnable* r);
+
+    /**
+    Violently kill the current thread. Blunt instrument, not recommended since
+    it can leave all kinds of things unreleased (stack, memory, mutexes...) Should
+    be called in the context of the spawning thread.
+    */
+
+    void kill();
+
+    /**
+    Stop the thread. Signals the spawned thread that it should stop, so the
+    spawned thread should check quitIsSignalled() regularly.
+    */
+
+    void signalQuit();
+
+    /**
+    Wait for a thread to stop. Should be called in the context of the spawning
+    thread. Returns false if the thread has not been started.
+    */
+
+    bool waitForQuit();
+
+    /**
+    check whether the thread is signalled to quit. Called in the context of the
+    spawned thread.
+    */
+
+    bool quitIsSignalled();
+
+    /**
+    Cleanly shut down this thread. Called in the context of the spawned thread.
+    */
+    void quit();
+
+    /**
+    Change the affinity mask for this thread. The mask is a platform
+    specific value.
+
+    On Windows, Linux, PS4 and XboxOne platforms, each set mask bit represents
+    the index of a logical processor that the OS may schedule thread execution on.
+    Bits outside the range of valid logical processors may be ignored or cause
+    the function to return an error.
+
+    On Apple platforms, this function has no effect.
+
+    If the thread has not yet been started then the mask is stored
+    and applied when the thread is started.
+
+    If the thread has already been started then this method returns the
+    previous affinity mask on success, otherwise it returns zero.
+    */
+    uint32_t setAffinityMask(uint32_t mask);
+
+    static ThreadPriority::Enum getPriority(Id threadId);
+
+    /** Set thread priority. */
+    void setPriority(ThreadPriority::Enum prio);
+
+    /** set the thread's name */
+    void setName(const char* name);
+
+    /** Put the current thread to sleep for the given number of milliseconds */
+    static void sleep(uint32_t ms);
+
+    /** Yield the current thread's slot on the CPU */
+    static void yield();
+
+    /** Return the number of physical cores (does not include hyper-threaded cores), returns 0 on failure */
+    static uint32_t getNbPhysicalCores();
+
+    /**
+    Size of this class.
+    */
+    static const uint32_t& getSize();
+};
+
+/**
+Thread abstraction API
+*/
+template <typename Alloc = ReflectionAllocator<ThreadImpl> >
+class ThreadT : protected Alloc, public UserAllocated, public Runnable
+{
+  public:
+    typedef ThreadImpl::Id Id; // space for a pointer or an integer
+
+    /**
+    Construct (but do not start) the thread object. Executes in the context
+    of the spawning thread.
+    */
+    ThreadT(const Alloc& alloc = Alloc()) : Alloc(alloc)
+    {
+        mImpl = reinterpret_cast<ThreadImpl*>(Alloc::allocate(ThreadImpl::getSize(), __FILE__, __LINE__));
+        PX_PLACEMENT_NEW(mImpl, ThreadImpl)();
+    }
+
+    /**
+    Construct and start the thread, passing the given arg to the given fn. (pthread style)
+    */
+    ThreadT(ThreadImpl::ExecuteFn fn, void* arg, const Alloc& alloc = Alloc()) : Alloc(alloc)
+    {
+        mImpl = reinterpret_cast<ThreadImpl*>(Alloc::allocate(ThreadImpl::getSize(), __FILE__, __LINE__));
+        PX_PLACEMENT_NEW(mImpl, ThreadImpl)(fn, arg);
+    }
+
+    /**
+    Deallocate all resources associated with the thread. Should be called in the
+    context of the spawning thread.
+    */
+    virtual ~ThreadT()
+    {
+        mImpl->~ThreadImpl();
+        Alloc::deallocate(mImpl);
+    }
+
+    /**
+    start the thread running. Called in the context of the spawning thread.
+    */
+
+    void start(uint32_t stackSize = ThreadImpl::getDefaultStackSize())
+    {
+        mImpl->start(stackSize, this);
+    }
+
+    /**
+    Violently kill the current thread. Blunt instrument, not recommended since
+    it can leave all kinds of things unreleased (stack, memory, mutexes...) Should
+    be called in the context of the spawning thread.
+    */
+
+    void kill()
+    {
+        mImpl->kill();
+    }
+
+    /**
+    The virtual execute() method is the user-defined function that will
+    run in the new thread. Called in the context of the spawned thread.
+    */
+
+    virtual void execute(void)
+    {
+    }
+
+    /**
+    stop the thread. Signals the spawned thread that it should stop, so the
+    spawned thread should check quitIsSignalled() regularly.
+    */
+
+    void signalQuit()
+    {
+        mImpl->signalQuit();
+    }
+
+    /**
+    Wait for a thread to stop. Should be called in the context of the spawning
+    thread. Returns false if the thread has not been started.
+    */
+
+    bool waitForQuit()
+    {
+        return mImpl->waitForQuit();
+    }
+
+    /**
+    check whether the thread is signalled to quit. Called in the context of the
+    spawned thread.
+    */
+
+    bool quitIsSignalled()
+    {
+        return mImpl->quitIsSignalled();
+    }
+
+    /**
+    Cleanly shut down this thread. Called in the context of the spawned thread.
+    */
+    void quit()
+    {
+        mImpl->quit();
+    }
+
+    uint32_t setAffinityMask(uint32_t mask)
+    {
+        return mImpl->setAffinityMask(mask);
+    }
+
+    static ThreadPriority::Enum getPriority(ThreadImpl::Id threadId)
+    {
+        return ThreadImpl::getPriority(threadId);
+    }
+
+    /** Set thread priority.
*/ + void setPriority(ThreadPriority::Enum prio) + { + mImpl->setPriority(prio); + } + + /** set the thread's name */ + void setName(const char* name) + { + mImpl->setName(name); + } + + /** Put the current thread to sleep for the given number of milliseconds */ + static void sleep(uint32_t ms) + { + ThreadImpl::sleep(ms); + } + + /** Yield the current thread's slot on the CPU */ + static void yield() + { + ThreadImpl::yield(); + } + + static uint32_t getDefaultStackSize() + { + return ThreadImpl::getDefaultStackSize(); + } + + static ThreadImpl::Id getId() + { + return ThreadImpl::getId(); + } + + static uint32_t getNbPhysicalCores() + { + return ThreadImpl::getNbPhysicalCores(); + } + + private: + class ThreadImpl* mImpl; +}; + +typedef ThreadT<> Thread; + +PX_FOUNDATION_API uint32_t TlsAlloc(); +PX_FOUNDATION_API void TlsFree(uint32_t index); +PX_FOUNDATION_API void* TlsGet(uint32_t index); +PX_FOUNDATION_API uint32_t TlsSet(uint32_t index, void* value); + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSTHREAD_H diff --git a/PxShared/src/foundation/include/PsTime.h b/PxShared/src/foundation/include/PsTime.h new file mode 100644 index 00000000..09631e64 --- /dev/null +++ b/PxShared/src/foundation/include/PsTime.h @@ -0,0 +1,95 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSTIME_H +#define PSFOUNDATION_PSTIME_H + +#include "Ps.h" + +#if PX_LINUX || PX_ANDROID +#include <time.h> +#endif + +namespace physx +{ +namespace shdfnd +{ + +struct CounterFrequencyToTensOfNanos +{ + uint64_t mNumerator; + uint64_t mDenominator; + CounterFrequencyToTensOfNanos(uint64_t inNum, uint64_t inDenom) : mNumerator(inNum), mDenominator(inDenom) + { + } + + // quite slow. 
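+    // Worked example (illustrative, assuming a 3 GHz counter): with
+    // mNumerator = 100000000 (tens of nanoseconds per second) and
+    // mDenominator = 3000000000 (counts per second),
+    // toTensOfNanos(c) = c * 100000000 / 3000000000 = c / 30.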
+ uint64_t toTensOfNanos(uint64_t inCounter) const + { + return (inCounter * mNumerator) / mDenominator; + } +}; + +class PX_FOUNDATION_API Time +{ + public: + typedef double Second; + static const uint64_t sNumTensOfNanoSecondsInASecond = 100000000; + // This is supposedly guaranteed to not change after system boot + // regardless of processors, speedstep, etc. + static const CounterFrequencyToTensOfNanos& getBootCounterFrequency(); + + static CounterFrequencyToTensOfNanos getCounterFrequency(); + + static uint64_t getCurrentCounterValue(); + + // SLOW!! + // Thar be a 64 bit divide in thar! + static uint64_t getCurrentTimeInTensOfNanoSeconds() + { + uint64_t ticks = getCurrentCounterValue(); + return getBootCounterFrequency().toTensOfNanos(ticks); + } + + Time(); + Second getElapsedSeconds(); + Second peekElapsedSeconds(); + Second getLastTime() const; + + private: +#if PX_LINUX || PX_ANDROID || PX_APPLE_FAMILY || PX_PS4 + Second mLastTime; +#else + int64_t mTickCount; +#endif +}; +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSTIME_H diff --git a/PxShared/src/foundation/include/PsUserAllocated.h b/PxShared/src/foundation/include/PsUserAllocated.h new file mode 100644 index 00000000..56b2e418 --- /dev/null +++ b/PxShared/src/foundation/include/PsUserAllocated.h @@ -0,0 +1,92 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUSERALLOCATED_H +#define PSFOUNDATION_PSUSERALLOCATED_H + +#include "PsAllocator.h" + +namespace physx +{ +namespace shdfnd +{ +/** +Provides new and delete using a UserAllocator. +Guarantees that 'delete x;' uses the UserAllocator too. 
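+
+Illustrative sketch (hypothetical class; PX_NEW is assumed from PsAllocator.h):
+
+    class MyObject : public UserAllocated { };
+    // MyObject* o = PX_NEW(MyObject)(); // allocated via the user allocator
+    // delete o;                         // released via the operator delete below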
+*/
+class UserAllocated
+{
+  public:
+    // PX_SERIALIZATION
+    PX_INLINE void* operator new(size_t, void* address)
+    {
+        return address;
+    }
+    //~PX_SERIALIZATION
+    // Matching operator delete to the above operator new. Don't ask me
+    // how this makes any sense - Nuernberger.
+    PX_INLINE void operator delete(void*, void*)
+    {
+    }
+
+    template <typename Alloc>
+    PX_INLINE void* operator new(size_t size, Alloc alloc, const char* fileName, int line)
+    {
+        return alloc.allocate(size, fileName, line);
+    }
+    template <typename Alloc>
+    PX_INLINE void* operator new [](size_t size, Alloc alloc, const char* fileName, int line)
+    {
+        return alloc.allocate(size, fileName, line);
+    }
+
+    // placement delete
+    template <typename Alloc>
+    PX_INLINE void operator delete(void* ptr, Alloc alloc, const char* fileName, int line)
+    {
+        PX_UNUSED(fileName);
+        PX_UNUSED(line);
+        alloc.deallocate(ptr);
+    }
+    template <typename Alloc>
+    PX_INLINE void operator delete [](void* ptr, Alloc alloc, const char* fileName, int line)
+    {
+        PX_UNUSED(fileName);
+        PX_UNUSED(line);
+        alloc.deallocate(ptr);
+    }
+
+    PX_INLINE void operator delete(void* ptr)
+    {
+        NonTrackingAllocator().deallocate(ptr);
+    }
+    PX_INLINE void operator delete [](void* ptr)
+    {
+        NonTrackingAllocator().deallocate(ptr);
+    }
+};
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSUSERALLOCATED_H
diff --git a/PxShared/src/foundation/include/PsUtilities.h b/PxShared/src/foundation/include/PsUtilities.h
new file mode 100644
index 00000000..34c5e5c6
--- /dev/null
+++ b/PxShared/src/foundation/include/PsUtilities.h
@@ -0,0 +1,165 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#ifndef PSFOUNDATION_PSUTILITIES_H +#define PSFOUNDATION_PSUTILITIES_H + +#include "foundation/PxVec3.h" +#include "foundation/PxAssert.h" +#include "Ps.h" +#include "PsIntrinsics.h" +#include "PsBasicTemplates.h" + +namespace physx +{ +namespace shdfnd +{ +PX_INLINE char littleEndian() +{ + int i = 1; + return *(reinterpret_cast<char*>(&i)); +} + +// PT: checked casts +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 to32(PxU64 value) +{ + PX_ASSERT(value <= 0xffffffff); + return PxU32(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU16 to16(PxU32 value) +{ + PX_ASSERT(value <= 0xffff); + return PxU16(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU16 value) +{ + PX_ASSERT(value <= 0xff); + return PxU8(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU32 value) +{ + PX_ASSERT(value <= 0xff); + return PxU8(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxI32 value) +{ + PX_ASSERT(value <= 0xff); + PX_ASSERT(value >= 0); + return PxU8(value); +} +PX_CUDA_CALLABLE PX_FORCE_INLINE PxI8 toI8(PxU32 value) +{ + PX_ASSERT(value <= 0x7f); + return PxI8(value); +} + +/*! +Get number of elements in array +*/ +template <typename T, size_t N> +char (&ArraySizeHelper(T (&array)[N]))[N]; +#define PX_ARRAY_SIZE(_array) (sizeof(physx::shdfnd::ArraySizeHelper(_array))) + +/*! +Sort two elements using operator< + +On return x will be the smaller of the two +*/ +template <class T> +PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y) +{ + if(y < x) + swap(x, y); +} + +// most architectures can do predication on real comparisons, and on VMX, it matters + +PX_CUDA_CALLABLE PX_FORCE_INLINE void order(PxReal& x, PxReal& y) +{ + PxReal newX = PxMin(x, y); + PxReal newY = PxMax(x, y); + x = newX; + y = newY; +} + +/*! +Sort two elements using operator< and also keep order +of any extra data +*/ +template <class T, class E1> +PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y, E1& xe1, E1& ye1) +{ + if(y < x) + { + swap(x, y); + swap(xe1, ye1); + } +} + +#if PX_GCC_FAMILY && !PX_EMSCRIPTEN +__attribute__((noreturn)) +#endif + PX_INLINE void debugBreak() +{ +#if PX_WINDOWS || PX_XBOXONE + __debugbreak(); +#elif PX_ANDROID + raise(SIGTRAP); // works better than __builtin_trap. Proper call stack and can be continued. +#elif PX_LINUX + asm("int $3"); +#elif PX_GCC_FAMILY + __builtin_trap(); +#else + PX_ASSERT(false); +#endif +} + +bool checkValid(const float&); +bool checkValid(const PxVec3&); +bool checkValid(const PxQuat&); +bool checkValid(const PxMat33&); +bool checkValid(const PxTransform&); +bool checkValid(const char*); + +// equivalent to std::max_element +template <typename T> +inline const T* maxElement(const T* first, const T* last) +{ + const T* m = first; + for(const T* it = first + 1; it < last; ++it) + if(*m < *it) + m = it; + + return m; +} + +} // namespace shdfnd +} // namespace physx + +#endif diff --git a/PxShared/src/foundation/include/PsVecMath.h b/PxShared/src/foundation/include/PsVecMath.h new file mode 100644 index 00000000..25054aed --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMath.h @@ -0,0 +1,1330 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECMATH_H
+#define PSFOUNDATION_PSVECMATH_H
+
+#include "Ps.h"
+#include "PsIntrinsics.h"
+#include "foundation/PxVec3.h"
+#include "foundation/PxVec4.h"
+#include "foundation/PxMat33.h"
+#include "foundation/PxUnionCast.h"
+
+// We can opt to use the scalar version of vectorised functions.
+// This can catch type safety issues and might even work out more optimal on pc.
+// It will also be useful for benchmarking and testing.
+// NEVER submit with vector intrinsics deactivated without good reason.
+// AM: deactivating SIMD for debug win64 just so autobuild will also exercise
+// the non-SIMD path, until a dedicated non-SIMD platform such as Arm comes online.
+// TODO: dima: reference all platforms with SIMD support here;
+// all unknown/experimental cases should default to NO SIMD.
+
+// enable/disable SIMD
+#if PX_INTEL_FAMILY
+#define COMPILE_VECTOR_INTRINSICS 1
+#elif PX_ANDROID && PX_NEON
+#define COMPILE_VECTOR_INTRINSICS 1
+#elif PX_IOS && PX_NEON
+#define COMPILE_VECTOR_INTRINSICS 1
+#else
+#define COMPILE_VECTOR_INTRINSICS 0
+#endif
+
+#if COMPILE_VECTOR_INTRINSICS && PX_INTEL_FAMILY && (PX_UNIX_FAMILY || PX_PS4)
+// only SSE2 compatible platforms should reach this
+#if PX_EMSCRIPTEN
+#include <emmintrin.h>
+#else
+#include <xmmintrin.h>
+#endif
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace aos
+{
+
+// Basic AoS types are
+// FloatV - 16-byte aligned representation of float.
+// Vec3V - 16-byte aligned representation of PxVec3 stored as (x y z 0).
+// Vec4V - 16-byte aligned representation of vector of 4 floats stored as (x y z w).
+// BoolV - 16-byte aligned representation of vector of 4 bools stored as (x y z w).
+// VecU32V - 16-byte aligned representation of 4 unsigned ints stored as (x y z w).
+// VecI32V - 16-byte aligned representation of 4 signed ints stored as (x y z w).
+// Mat33V - 16-byte aligned representation of any 3x3 matrix.
+// Mat34V - 16-byte aligned representation of transformation matrix (rotation in col1,col2,col3 and translation in
+// col4).
+// Mat44V - 16-byte aligned representation of any 4x4 matrix.
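+
+// Illustrative workflow sketch (hypothetical caller code, not part of this
+// header): the load / operate / store pattern used with the declarations
+// below.
+//
+//   PxVec3 a(1.0f, 2.0f, 3.0f), b(4.0f, 5.0f, 6.0f), out;
+//   Vec3V av = V3LoadU(a);      // (a.x, a.y, a.z, 0)
+//   Vec3V bv = V3LoadU(b);
+//   Vec3V sum = V3Add(av, bv);  // per-component add
+//   V3StoreU(sum, out);         // write the xyz lanes back to a PxVec3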
+
+#if COMPILE_VECTOR_INTRINSICS
+#include "PsAoS.h"
+#else
+#include "PsVecMathAoSScalar.h"
+#endif
+
+//////////////////////////////////////////
+// Construct a simd type from a scalar type
+//////////////////////////////////////////
+
+// FloatV
+//(f,f,f,f)
+PX_FORCE_INLINE FloatV FLoad(const PxF32 f);
+
+// Vec3V
+//(f,f,f,0)
+PX_FORCE_INLINE Vec3V V3Load(const PxF32 f);
+//(f.x,f.y,f.z,0)
+PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f);
+//(f.x,f.y,f.z,0), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f);
+//(f.x,f.y,f.z,w_undefined), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f);
+//(f.x,f.y,f.z,0)
+PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* f);
+//(f.x,f.y,f.z,0), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* f);
+
+// Vec4V
+//(f,f,f,f)
+PX_FORCE_INLINE Vec4V V4Load(const PxF32 f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f);
+//(f[0],f[1],f[2],f[3]), f must be 16-byte aligned
+PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f);
+//(x,y,z,w)
+PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w);
+
+// BoolV
+//(f,f,f,f)
+PX_FORCE_INLINE BoolV BLoad(const bool f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE BoolV BLoad(const bool* const f);
+
+// VecU32V
+//(f,f,f,f)
+PX_FORCE_INLINE VecU32V U4Load(const PxU32 f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* f);
+//(f[0],f[1],f[2],f[3]), f must be 16-byte aligned
+PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* f);
+//((U32)x, (U32)y, (U32)z, (U32)w)
+PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w);
+
+// VecI32V
+//(i,i,i,i)
+PX_FORCE_INLINE VecI32V I4Load(const PxI32 i);
+//(i[0],i[1],i[2],i[3])
+PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i);
+//(i[0],i[1],i[2],i[3]), i must be 16-byte aligned
+PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i);
+
+// QuatV
+//(x = v[0], y = v[1], z = v[2], w = v[3]); the array does not need to be aligned
+PX_FORCE_INLINE QuatV QuatVLoadU(const PxF32* v);
+//(x = v[0], y = v[1], z = v[2], w = v[3]); the array must be 16-byte aligned (fast load)
+PX_FORCE_INLINE QuatV QuatVLoadA(const PxF32* v);
+//(x, y, z, w)
+PX_FORCE_INLINE QuatV QuatVLoadXYZW(const PxF32 x, const PxF32 y, const PxF32 z, const PxF32 w);
+
+// not added to public api
+Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& v);
+
+///////////////////////////////////////////////////
+// Construct a simd type from a different simd type
+///////////////////////////////////////////////////
+
+// Vec3V
+//(v.x,v.y,v.z,0)
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v);
+//(v.x,v.y,v.z,undefined) - be very careful with w!=0 because many functions require w==0 for correct operation,
+// e.g. V3Dot, V3Length, V3Cross etc.
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v);
+
+// Vec4V
+//(f.x,f.y,f.z,f.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f);
+//((PxF32)a.x, (PxF32)a.y, (PxF32)a.z, (PxF32)a.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a);
+//((PxF32)a.x, (PxF32)a.y, (PxF32)a.z, (PxF32)a.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a);
+//(*reinterpret_cast<PxF32*>(&a.x), *reinterpret_cast<PxF32*>(&a.y), *reinterpret_cast<PxF32*>(&a.z),
+// *reinterpret_cast<PxF32*>(&a.w))
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a);
+//(*reinterpret_cast<PxF32*>(&a.x), *reinterpret_cast<PxF32*>(&a.y), *reinterpret_cast<PxF32*>(&a.z),
+// *reinterpret_cast<PxF32*>(&a.w))
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a);
+
+// VecU32V
+//(*reinterpret_cast<PxU32*>(&a.x), *reinterpret_cast<PxU32*>(&a.y), *reinterpret_cast<PxU32*>(&a.z),
+// *reinterpret_cast<PxU32*>(&a.w))
+PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a);
+//(b[0], b[1], b[2], b[3])
+PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg b);
+
+// VecI32V
+//(*reinterpret_cast<PxI32*>(&a.x), *reinterpret_cast<PxI32*>(&a.y), *reinterpret_cast<PxI32*>(&a.z),
+// *reinterpret_cast<PxI32*>(&a.w))
+PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a);
+//((I32)a.x, (I32)a.y, (I32)a.z, (I32)a.w)
+PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a);
+//((I32)b.x, (I32)b.y, (I32)b.z, (I32)b.w)
+PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg b);
+
+///////////////////////////////////////////////////
+// Convert from a simd type back to a scalar type
+///////////////////////////////////////////////////
+
+// FloatV
+// a.x
+PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f);
+
+// Vec3V
+//(a.x,a.y,a.z)
+PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f);
+//(a.x,a.y,a.z)
+PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f);
+
+// Vec4V
+PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f);
+PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f);
+
+// BoolV
+PX_FORCE_INLINE void BStoreA(const BoolV b, PxU32* f);
+
+// VecU32V
+PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u);
+
+// VecI32V
+PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i);
+
+//////////////////////////////////////////////////////////////////
+// Test that simd types have elements in the floating point range
+//////////////////////////////////////////////////////////////////
+
+// check that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a);
+// check that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a);
+// check that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a);
+
+// Check that w-component is zero.
+PX_FORCE_INLINE bool isValidVec3V(const Vec3V a);
+
+//////////////////////////////////////////////////////////////////
+// Tests that all elements of two 16-byte types are completely equivalent.
+// Use these tests for unit testing and asserts only.
+////////////////////////////////////////////////////////////////// + +namespace _VecMathTests +{ +PX_FORCE_INLINE Vec3V getInvalidVec3V(); +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b); +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b); +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b); +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b); +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b); +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b); + +PX_FORCE_INLINE bool allElementsEqualMat33V(const Mat33V& a, const Mat33V& b) +{ + return (allElementsEqualVec3V(a.col0, b.col0) && allElementsEqualVec3V(a.col1, b.col1) && + allElementsEqualVec3V(a.col2, b.col2)); +} +PX_FORCE_INLINE bool allElementsEqualMat34V(const Mat34V& a, const Mat34V& b) +{ + return (allElementsEqualVec3V(a.col0, b.col0) && allElementsEqualVec3V(a.col1, b.col1) && + allElementsEqualVec3V(a.col2, b.col2) && allElementsEqualVec3V(a.col3, b.col3)); +} +PX_FORCE_INLINE bool allElementsEqualMat44V(const Mat44V& a, const Mat44V& b) +{ + return (allElementsEqualVec4V(a.col0, b.col0) && allElementsEqualVec4V(a.col1, b.col1) && + allElementsEqualVec4V(a.col2, b.col2) && allElementsEqualVec4V(a.col3, b.col3)); +} + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b); +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b); +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b); +PX_FORCE_INLINE bool allElementsNearEqualMat33V(const Mat33V& a, const Mat33V& b) +{ + return (allElementsNearEqualVec3V(a.col0, b.col0) && allElementsNearEqualVec3V(a.col1, b.col1) && + allElementsNearEqualVec3V(a.col2, b.col2)); +} +PX_FORCE_INLINE bool allElementsNearEqualMat34V(const Mat34V& a, const Mat34V& b) +{ + return (allElementsNearEqualVec3V(a.col0, b.col0) && allElementsNearEqualVec3V(a.col1, b.col1) && + allElementsNearEqualVec3V(a.col2, b.col2) && allElementsNearEqualVec3V(a.col3, b.col3)); +} +PX_FORCE_INLINE bool allElementsNearEqualMat44V(const Mat44V& a, const Mat44V& b) +{ + return (allElementsNearEqualVec4V(a.col0, b.col0) && allElementsNearEqualVec4V(a.col1, b.col1) && + allElementsNearEqualVec4V(a.col2, b.col2) && allElementsNearEqualVec4V(a.col3, b.col3)); +} +} + +////////////////////////////////////////////////////////////////// +// Math operations on FloatV +////////////////////////////////////////////////////////////////// + +//(0,0,0,0) +PX_FORCE_INLINE FloatV FZero(); +//(1,1,1,1) +PX_FORCE_INLINE FloatV FOne(); +//(0.5,0.5,0.5,0.5) +PX_FORCE_INLINE FloatV FHalf(); +//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL) +PX_FORCE_INLINE FloatV FEps(); +//(PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL PX_MAX_REAL) +PX_FORCE_INLINE FloatV FMax(); +//(-PX_MAX_REAL, -PX_MAX_REAL, -PX_MAX_REAL -PX_MAX_REAL) +PX_FORCE_INLINE FloatV FNegMax(); +//(1e-6f, 1e-6f, 1e-6f, 1e-6f) +PX_FORCE_INLINE FloatV FEps6(); +//((PxF32*)&1, (PxF32*)&1, (PxF32*)&1, (PxF32*)&1) + +//-f (per component) +PX_FORCE_INLINE FloatV FNeg(const FloatV f); +// a+b (per component) +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b); +// a-b (per component) +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b); +// a*b (per component) +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE FloatV 
FDivFast(const FloatV a, const FloatV b); +// 1.0f/a +PX_FORCE_INLINE FloatV FRecip(const FloatV a); +// 1.0f/a +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a); +// sqrt(a) +PX_FORCE_INLINE FloatV FSqrt(const FloatV a); +// a*b+c +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c); +// c-a*b +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c); +// fabs(a) +PX_FORCE_INLINE FloatV FAbs(const FloatV a); +// c ? a : b (per component) +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b); +// a>b (per component) +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b); +// a>=b (per component) +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b); +// a==b (per component) +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b); +// Max(a,b) (per component) +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b); +// Min(a,b) (per component) +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b); +// Clamp(a,b) (per component) +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV); + +// a.x>b.x +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b); +// a.x>=b.x +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b); +// a.x==b.x +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b); +// a<min || a>max +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max); +// a>=min && a<=max +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max); +// a<-bounds || a>bounds +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds); +// a>=-bounds && a<=bounds +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds); + +// round float a to the near int +PX_FORCE_INLINE FloatV FRound(const FloatV a); +// calculate the sin of float a +PX_FORCE_INLINE FloatV FSin(const FloatV a); +// calculate the cos of float b +PX_FORCE_INLINE FloatV FCos(const FloatV a); + +////////////////////////////////////////////////////////////////// +// Math operations on Vec3V +////////////////////////////////////////////////////////////////// + +//(f,f,f,f) +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f); + +//(x,y,z) +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z); + +//(1,0,0,0) +PX_FORCE_INLINE Vec3V V3UnitX(); +//(0,1,0,0) +PX_FORCE_INLINE Vec3V V3UnitY(); +//(0,0,1,0) +PX_FORCE_INLINE Vec3V V3UnitZ(); + +//(f.x,f.x,f.x,f.x) +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f); +//(f.y,f.y,f.y,f.y) +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f); +//(f.z,f.z,f.z,f.z) +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f); + +//(f,v.y,v.z,v.w) +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f); +//(v.x,f,v.z,v.w) +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f); +//(v.x,v.y,f,v.w) +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f); + +// v.x=f +PX_FORCE_INLINE void V3WriteX(Vec3V& v, const PxF32 f); +// v.y=f +PX_FORCE_INLINE void V3WriteY(Vec3V& v, const PxF32 f); +// v.z=f +PX_FORCE_INLINE void V3WriteZ(Vec3V& v, const PxF32 f); +// v.x=f.x, v.y=f.y, v.z=f.z +PX_FORCE_INLINE void V3WriteXYZ(Vec3V& v, const PxVec3& f); +// return v.x +PX_FORCE_INLINE PxF32 V3ReadX(const Vec3V& v); +// return v.y +PX_FORCE_INLINE PxF32 V3ReadY(const Vec3V& v); +// return v.y +PX_FORCE_INLINE PxF32 
V3ReadZ(const Vec3V& v); +// return (v.x,v.y,v.z) +PX_FORCE_INLINE const PxVec3& V3ReadXYZ(const Vec3V& v); + +//(a.x, b.x, c.x) +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c); +//(a.y, b.y, c.y) +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c); +//(a.z, b.z, c.z) +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c); + +//(0,0,0,0) +PX_FORCE_INLINE Vec3V V3Zero(); +//(1,1,1,1) +PX_FORCE_INLINE Vec3V V3One(); +//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL) +PX_FORCE_INLINE Vec3V V3Eps(); +//-c (per component) +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V c); +// a+b (per component) +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b); +// a-b (per component) +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b); +// a*b (per component) +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b); +// a*b (per component) +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b); +// 1.0f/a +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a); +// 1.0f/a +PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a); +// a*b+c +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c); +// c-a*b +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c); +// a*b+c +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c); +// c-a*b +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c); +// fabs(a) +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a); + +// a.b +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b); +// aXb +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b); +// |a.a|^1/2 +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3Length(const Vec3V a); +// a.a +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a); +// a*|a.a|^-1/2 +// Note: a.w must have value zero +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a); +// a.a>0 ? a*|a.a|^-1/2 : (0,0,0,0) +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3Length(const Vec3V a); +// a.a>0 ? a*|a.a|^-1/2 : unsafeReturnValue +// Note: a.w must have value zero +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue); +// a.x + a.y + a.z +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a); + +// c ? 
+// a>b (per component) +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b); +// a>=b (per component) +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b); +// a==b (per component) +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b); +// Max(a,b) (per component) +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b); +// Min(a,b) (per component) +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b); + +// Extract the maximum value from a +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a); + +// Extract the minimum value from a +// Note: a.w must have value zero +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a); + +// Clamp(a, minV, maxV) (per component) +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV); + +// Extract the sign for each component +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a); + +// Test all components. +// (a.x>b.x && a.y>b.y && a.z>b.z) +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b); +// (a.x>=b.x && a.y>=b.y && a.z>=b.z) +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b); +// (a.x==b.x && a.y==b.y && a.z==b.z) +// Note: a.w and b.w must have value zero +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b); +// a.x<min.x || a.y<min.y || a.z<min.z || a.x>max.x || a.y>max.y || a.z>max.z +// Note: a.w and min.w and max.w must have value zero +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max); +// a.x>=min.x && a.y>=min.y && a.z>=min.z && a.x<=max.x && a.y<=max.y && a.z<=max.z +// Note: a.w and min.w and max.w must have value zero +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max); +// a.x<-bounds.x || a.y<-bounds.y || a.z<-bounds.z || a.x>bounds.x || a.y>bounds.y || a.z>bounds.z +// Note: a.w and bounds.w must have value zero +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds); +// a.x>=-bounds.x && a.y>=-bounds.y && a.z>=-bounds.z && a.x<=bounds.x && a.y<=bounds.y && a.z<=bounds.z +// Note: a.w and bounds.w must have value zero +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds); + +//(floor(a.x + 0.5f), floor(a.y + 0.5f), floor(a.z + 0.5f)) +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a); + +//(sinf(a.x), sinf(a.y), sinf(a.z)) +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a); +//(cosf(a.x), cosf(a.y), cosf(a.z)) +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a); + +//(a.y,a.z,a.z) +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a); +//(a.x,a.y,a.x) +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a); +//(a.y,a.z,a.x) +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a); +//(a.z, a.x, a.y) +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a); +//(a.z,a.z,a.y) +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a); +//(a.y,a.x,a.x) +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a); +//(0, v1.z, v0.y) +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1); +//(v0.z, 0, v1.x) +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1); +//(v1.y, v0.x, 0) +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1); + +// Transpose 3 Vec3Vs in place. Sets the w component to zero +// [ x0, y0, z0, w0] [ x1, y1, z1, w1] [ x2, y2, z2, w2] -> [x0 x1 x2 0] [y0 y1 y2 0] [z0 z1 z2 0] +PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2); + +////////////////////////////////////////////////////////////////// +// Math operations on Vec4V +////////////////////////////////////////////////////////////////// + +//(f,f,f,f) +PX_FORCE_INLINE Vec4V V4Splat(const FloatV f); + +//(f[0],f[1],f[2],f[3]) +PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const f); +//(x,y,z,w) +PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w); +//(x.w, y.w, z.w, w.w) +PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); +//(x.z, y.z, z.z, w.z) +PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); +//(x.y, y.y, z.y, w.y) +PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); +//(x.x, y.x, z.x, w.x) +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w); + +//(a.x, b.x, a.y, b.y) +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b); +//(a.z, b.z, a.w, b.w) +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b); + +//(1,0,0,0) +PX_FORCE_INLINE Vec4V V4UnitX(); +//(0,1,0,0) +PX_FORCE_INLINE Vec4V V4UnitY(); +//(0,0,1,0) +PX_FORCE_INLINE Vec4V V4UnitZ(); +//(0,0,0,1) +PX_FORCE_INLINE Vec4V V4UnitW(); + +//(f.x,f.x,f.x,f.x) +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f); +//(f.y,f.y,f.y,f.y) +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f); +//(f.z,f.z,f.z,f.z) +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f); +//(f.w,f.w,f.w,f.w) +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f); + +//(f,v.y,v.z,v.w) +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f); +//(v.x,f,v.z,v.w) +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f); +//(v.x,v.y,f,v.w) +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f); +//(v.x,v.y,v.z,f) +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f); + +//(v.x,v.y,v.z,0) +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v); + +//(a[elementIndex], a[elementIndex], a[elementIndex], a[elementIndex]) +template <int elementIndex> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a); + +// v.x=f +PX_FORCE_INLINE void V4WriteX(Vec4V& v, const PxF32 f); +// v.y=f +PX_FORCE_INLINE void V4WriteY(Vec4V& v, const PxF32 f); +// v.z=f +PX_FORCE_INLINE void V4WriteZ(Vec4V& v, const PxF32 f); +// v.w=f +PX_FORCE_INLINE void V4WriteW(Vec4V& v, const PxF32 f); +// v.x=f.x, v.y=f.y, v.z=f.z +PX_FORCE_INLINE void V4WriteXYZ(Vec4V& v, const PxVec3& f); +// return v.x +PX_FORCE_INLINE PxF32 V4ReadX(const Vec4V& v); +// return v.y +PX_FORCE_INLINE PxF32 V4ReadY(const Vec4V& v); +// return v.z +PX_FORCE_INLINE PxF32 V4ReadZ(const Vec4V& v); +// return v.w +PX_FORCE_INLINE PxF32 V4ReadW(const Vec4V& v); +// return (v.x,v.y,v.z) +PX_FORCE_INLINE const PxVec3& V4ReadXYZ(const Vec4V& v); + +//(0,0,0,0) +PX_FORCE_INLINE Vec4V V4Zero(); +//(1,1,1,1) +PX_FORCE_INLINE Vec4V V4One(); +//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL) +PX_FORCE_INLINE Vec4V V4Eps(); + +//-c (per component) +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V c); +// a+b (per component) +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b); +// a-b (per component) +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b); +// a*b (per component) +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b);
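
// Usage sketches (illustrative only, not part of the original header): typical
// compositions of the declarations above. 'pointPlaneDistance' and 'lerpClamped'
// are hypothetical helpers; FZero()/FOne() are the FloatV constants declared earlier.
PX_FORCE_INLINE FloatV pointPlaneDistance(const Vec3V p, const Vec3V n, const FloatV d)
{
	// Signed distance from p to the plane {x : dot(n, x) + d = 0}; requires p.w == n.w == 0.
	return FAdd(V3Dot(n, p), d);
}

PX_FORCE_INLINE Vec4V lerpClamped(const Vec4V a, const Vec4V b, const FloatV t)
{
	const FloatV tc = FClamp(t, FZero(), FOne()); // clamp the interpolation parameter to [0,1]
	return V4Add(V4Scale(V4Sub(b, a), tc), a);    // a + tc*(b - a), per component
}
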
+// a*b (per component) +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b); +// a/b (per component) +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b); +// 1.0f/a +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a); +// 1.0f/a +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a); +// 1.0f/sqrt(a) +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a); +// a*b+c +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c); +// c-a*b +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c); +// a*b+c +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c); +// c-a*b +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c); + +// fabs(a) +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a); +// bitwise a & ~b +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b); + +// a.b (W is taken into account) +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b); +// a.b (same computation as V3Dot. W is ignored in input) +PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b); +// aXb (same computation as V3Cross. W is ignored in input and undefined in output) +PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b); + +//|a.a|^1/2 +PX_FORCE_INLINE FloatV V4Length(const Vec4V a); +// a.a +PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a); + +// a*|a.a|^-1/2 +PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a); +// a.a>0 ? a*|a.a|^-1/2 : unsafeReturnValue +PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue); +// a*|a.a|^-1/2 +PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a); + +// c ? a : b (per component) +PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b); +// a>b (per component) +PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b); +// a>=b (per component) +PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b); +// a==b (per component) +PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b); +// Max(a,b) (per component) +PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b); +// Min(a,b) (per component) +PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b); +// Get the maximum component from a +PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a); +// Get the minimum component from a +PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a); + +// Clamp(a, minV, maxV) (per component) +PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV); + +// return 1 if all components of a are greater than the corresponding components of b. +PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b); +// return 1 if all components of a are greater than or equal to the corresponding components of b +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b); +// return 1 if the XYZ components of a are greater than or equal to the corresponding XYZ components of b. W is ignored. +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b); +// return 1 if all components of a are equal to the corresponding components of b +PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b); +// return 1 if any XYZ component of a is greater than the corresponding component of b. W is ignored. +PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b);
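
// Usage sketch (illustrative only, not part of the original header): an AABB overlap
// test built from the comparison helpers above. 'aabbOverlap' is a hypothetical helper.
PX_FORCE_INLINE PxU32 aabbOverlap(const Vec4V minA, const Vec4V maxA, const Vec4V minB, const Vec4V maxB)
{
	// The boxes overlap if, on each of x, y and z, neither box lies strictly beyond the other.
	return V4AllGrtrOrEq3(maxA, minB) & V4AllGrtrOrEq3(maxB, minA);
}
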
+ +// round(a) (per component) +PX_FORCE_INLINE Vec4V V4Round(const Vec4V a); +// sin(a) (per component) +PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a); +// cos(a) (per component) +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a); + +// Permute v into a new vec4v with YXWZ format +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V v); +// Permute v into a new vec4v with XZXZ format +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V v); +// Permute v into a new vec4v with YWYW format +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V v); +// Permute v into a new vec4v with YZXW format +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V v); + +// Permute v into a new vec4v with format {a[x], a[y], a[z], a[w]} +// V4Perm<1,3,1,3> is equal to V4PermYWYW +// V4Perm<0,2,0,2> is equal to V4PermXZXZ +// V4Perm<1,0,3,2> is equal to V4PermYXWZ +template <PxU8 x, PxU8 y, PxU8 z, PxU8 w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a); + +// Transpose 4 Vec4Vs in place. +// [ x0, y0, z0, w0] [ x1, y1, z1, w1] [ x2, y2, z2, w2] [ x3, y3, z3, w3] -> +// [ x0, x1, x2, x3] [ y0, y1, y2, y3] [ z0, z1, z2, z3] [ w0, w1, w2, w3] +PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3); + +// q = cos(a/2) + u*sin(a/2) +PX_FORCE_INLINE QuatV QuatV_From_RotationAxisAngle(const Vec3V u, const FloatV a); +// convert q to a unit quaternion +PX_FORCE_INLINE QuatV QuatNormalize(const QuatV q); +//|q.q|^1/2 +PX_FORCE_INLINE FloatV QuatLength(const QuatV q); +// q.q +PX_FORCE_INLINE FloatV QuatLengthSq(const QuatV q); +// a.b +PX_FORCE_INLINE FloatV QuatDot(const QuatV a, const QuatV b); +//(-q.x, -q.y, -q.z, q.w) +PX_FORCE_INLINE QuatV QuatConjugate(const QuatV q); +//(q.x, q.y, q.z) +PX_FORCE_INLINE Vec3V QuatGetImaginaryPart(const QuatV q); +// convert quaternion to matrix 33 +PX_FORCE_INLINE Mat33V QuatGetMat33V(const QuatVArg q); +// convert quaternion to matrix 33 +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2); +// convert matrix 33 to quaternion +PX_FORCE_INLINE QuatV Mat33GetQuatV(const Mat33V& a); +// computes the rotation of the x-axis +PX_FORCE_INLINE Vec3V QuatGetBasisVector0(const QuatV q); +// computes the rotation of the y-axis +PX_FORCE_INLINE Vec3V QuatGetBasisVector1(const QuatV q); +// computes the rotation of the z-axis +PX_FORCE_INLINE Vec3V QuatGetBasisVector2(const QuatV q); +// rotate v by q +PX_FORCE_INLINE Vec3V QuatRotate(const QuatV q, const Vec3V v); +// rotate v by the conjugate of q (the inverse rotation) +PX_FORCE_INLINE Vec3V QuatRotateInv(const QuatV q, const Vec3V v); +// quaternion multiplication +PX_FORCE_INLINE QuatV QuatMul(const QuatV a, const QuatV b); +// quaternion add +PX_FORCE_INLINE QuatV QuatAdd(const QuatV a, const QuatV b); +// (-q.x, -q.y, -q.z, -q.w) +PX_FORCE_INLINE QuatV QuatNeg(const QuatV q); +// (a.x - b.x, a.y-b.y, a.z-b.z, a.w-b.w ) +PX_FORCE_INLINE QuatV QuatSub(const QuatV a, const QuatV b); +// (a.x*b, a.y*b, a.z*b, a.w*b) +PX_FORCE_INLINE QuatV QuatScale(const QuatV a, const FloatV b); +// (x = v[0], y = v[1], z = v[2], w = v[3]) +PX_FORCE_INLINE QuatV QuatMerge(const FloatV* const v); +// (x = v[0], y = v[1], z = v[2], w = v[3]) +PX_FORCE_INLINE QuatV QuatMerge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w); +// (x = 0.f, y = 0.f, z = 0.f, w = 1.f) +PX_FORCE_INLINE QuatV QuatIdentity(); +// check that each component is finite +PX_FORCE_INLINE bool isFiniteQuatV(const QuatV q); +// check that each component is valid +PX_FORCE_INLINE bool isValidQuatV(const QuatV q); +// check that each component is valid +PX_FORCE_INLINE bool isSaneQuatV(const QuatV q); + +// Math operations on 16-byte aligned booleans. +// x=false y=false z=false w=false +PX_FORCE_INLINE BoolV BFFFF(); +// x=false y=false z=false w=true +PX_FORCE_INLINE BoolV BFFFT(); +// x=false y=false z=true w=false +PX_FORCE_INLINE BoolV BFFTF(); +// x=false y=false z=true w=true +PX_FORCE_INLINE BoolV BFFTT(); +// x=false y=true z=false w=false +PX_FORCE_INLINE BoolV BFTFF(); +// x=false y=true z=false w=true +PX_FORCE_INLINE BoolV BFTFT(); +// x=false y=true z=true w=false +PX_FORCE_INLINE BoolV BFTTF(); +// x=false y=true z=true w=true +PX_FORCE_INLINE BoolV BFTTT(); +// x=true y=false z=false w=false +PX_FORCE_INLINE BoolV BTFFF(); +// x=true y=false z=false w=true +PX_FORCE_INLINE BoolV BTFFT(); +// x=true y=false z=true w=false +PX_FORCE_INLINE BoolV BTFTF(); +// x=true y=false z=true w=true +PX_FORCE_INLINE BoolV BTFTT(); +// x=true y=true z=false w=false +PX_FORCE_INLINE BoolV BTTFF(); +// x=true y=true z=false w=true +PX_FORCE_INLINE BoolV BTTFT(); +// x=true y=true z=true w=false +PX_FORCE_INLINE BoolV BTTTF(); +// x=true y=true z=true w=true +PX_FORCE_INLINE BoolV BTTTT(); + +// x=false y=false z=false w=true +PX_FORCE_INLINE BoolV BWMask(); +// x=true y=false z=false w=false +PX_FORCE_INLINE BoolV BXMask(); +// x=false y=true z=false w=false +PX_FORCE_INLINE BoolV BYMask(); +// x=false y=false z=true w=false +PX_FORCE_INLINE BoolV BZMask(); + +// get x component +PX_FORCE_INLINE BoolV BGetX(const BoolV f); +// get y component +PX_FORCE_INLINE BoolV BGetY(const BoolV f); +// get z component +PX_FORCE_INLINE BoolV BGetZ(const BoolV f); +// get w component +PX_FORCE_INLINE BoolV BGetW(const BoolV f); + +// Use elementIndex to splat xxxx or yyyy or zzzz or wwww +template <int elementIndex> +PX_FORCE_INLINE BoolV BSplatElement(Vec4V a); + +// component-wise && (AND) +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b); +// component-wise || (OR) +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b); +// component-wise ! (NOT) +PX_FORCE_INLINE BoolV BNot(const BoolV a); + +// if all four components are true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a); + +// if any of the four components is true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a); + +// if all three (0, 1, 2) components are true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a); + +// if any of the three (0, 1, 2) components is true, return true, otherwise return false +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a); + +// Return 1 if all components are equal, zero otherwise. +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b); + +// Specialized/faster BAllEq function for b==TTTT +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a); +// Specialized/faster BAllEq function for b==FFFF +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a); + +/// Get a BoolV as bits set in a PxU32. A bit in the output is set if the corresponding element of the input is 'true'. +/// There is a bit for each element in a, with element 0's value held in bit 0, element 1's in bit 1, and so forth. +/// If nothing is true in the input it will return 0, and if all elements are true it will return 0xf. +/// NOTE: the performance of this function varies considerably by platform, so it is recommended to use it only +/// where your algorithm really needs a BoolV in an integer variable. +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a);
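
// Usage sketch (illustrative only, not part of the original header): BGetBitMask packs a
// BoolV into the low four bits of a PxU32, e.g. BGetBitMask(BTTFF()) == 0x3 (bit 0 = x,
// bit 1 = y, bit 2 = z, bit 3 = w). 'anyXYZTrue' is a hypothetical helper.
PX_FORCE_INLINE bool anyXYZTrue(const BoolV b)
{
	return (BGetBitMask(b) & 0x7u) != 0; // test the x, y and z bits, ignoring w
}
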
+ +// VecI32V stuff + +PX_FORCE_INLINE VecI32V VecI32V_Zero(); + +PX_FORCE_INLINE VecI32V VecI32V_One(); + +PX_FORCE_INLINE VecI32V VecI32V_Two(); + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne(); + +// Compute a shift parameter for VecI32V_LeftShift and VecI32V_RightShift (a usage sketch follows these declarations) +// Each element of shift must be identical, i.e. the vector must have the form {count, count, count, count} with count>=0 +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift); + +// Shift each element of a leftwards by the same amount +// Compute shift with VecI32V_PrepareShift +//{a.x<<shift[0], a.y<<shift[0], a.z<<shift[0], a.w<<shift[0]} +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg shift); + +// Shift each element of a rightwards by the same amount +// Compute shift with VecI32V_PrepareShift +//{a.x>>shift[0], a.y>>shift[0], a.z>>shift[0], a.w>>shift[0]} +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg shift); + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a); + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b); + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b); + +// VecU32V stuff + +PX_FORCE_INLINE VecU32V U4Zero(); + +PX_FORCE_INLINE VecU32V U4One(); + +PX_FORCE_INLINE VecU32V U4Two(); + +PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b); + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b); +
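
// Usage sketch (illustrative only, not part of the original header): the documented
// pattern for the shift helpers above -- prepare the (uniform) shift count once, then
// reuse it. 'shiftLeftByTwo' is a hypothetical helper.
PX_FORCE_INLINE VecI32V shiftLeftByTwo(const VecI32V v)
{
	const VecShiftV shift = VecI32V_PrepareShift(VecI32V_Two()); // {2, 2, 2, 2}
	return VecI32V_LeftShift(v, shift);                          // each lane shifted left by 2
}
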
+// VecU32 - why does this not return a bool? +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b); + +// Math operations on 16-byte aligned Mat33s (represents any 3x3 matrix; a usage sketch follows these declarations) +// a*b +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b); +// A*b + c +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c); +// transpose(a) * b +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b); +// a*b +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b); +// a+b +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b); +// a-b +PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b); +//-a +PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a); +// absolute value of the matrix +PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a); +// inverse mat +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a); +// transpose(a) +PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a); +// create an identity matrix +PX_FORCE_INLINE Mat33V M33Identity(); + +// create a Mat33V with the elements of the given Vec3V on its diagonal +PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg); + +// Not implemented +// return 1 if all components of a are equal to all components of b +// PX_FORCE_INLINE PxU32 V4U32AllEq(const VecU32V a, const VecU32V b); +// v.w=f +// PX_FORCE_INLINE void V3WriteW(Vec3V& v, const PxF32 f); +// PX_FORCE_INLINE PxF32 V3ReadW(const Vec3V& v); + +// Not used +// PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr); +// PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr); +// floor(a)(per component) +// PX_FORCE_INLINE Vec4V V4Floor(Vec4V a); +// ceil(a) (per component) +// PX_FORCE_INLINE Vec4V V4Ceil(Vec4V a); +// PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V a, PxU32 power); +
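
// Usage sketch (illustrative only, not part of the original header): using the Mat33V
// helpers above. 'worldToLocal' is a hypothetical helper; 'rot' is assumed to be a pure
// rotation matrix, for which the inverse equals the transpose.
PX_FORCE_INLINE Vec3V worldToLocal(const Mat33V& rot, const Vec3V pWorld)
{
	return M33TrnspsMulV3(rot, pWorld); // transpose(rot)*p maps world space into the local frame
}
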
+// Math operations on 16-byte aligned Mat34s (represents transformation matrix - rotation and translation). +// namespace _Mat34V +//{ +// //a*b +// PX_FORCE_INLINE Vec3V multiplyV(const Mat34V& a, const Vec3V b); +// //a_rotation * b +// PX_FORCE_INLINE Vec3V multiply3X3V(const Mat34V& a, const Vec3V b); +// //transpose(a_rotation)*b +// PX_FORCE_INLINE Vec3V multiplyTranspose3X3V(const Mat34V& a, const Vec3V b); +// //a*b +// PX_FORCE_INLINE Mat34V multiplyV(const Mat34V& a, const Mat34V& b); +// //a_rotation*b +// PX_FORCE_INLINE Mat33V multiply3X3V(const Mat34V& a, const Mat33V& b); +// //a_rotation*b_rotation +// PX_FORCE_INLINE Mat33V multiply3X3V(const Mat34V& a, const Mat34V& b); +// //a+b +// PX_FORCE_INLINE Mat34V addV(const Mat34V& a, const Mat34V& b); +// //a^-1 +// PX_FORCE_INLINE Mat34V getInverseV(const Mat34V& a); +// //transpose(a_rotation) +// PX_FORCE_INLINE Mat33V getTranspose3X3(const Mat34V& a); +//}; //namespace _Mat34V + +// a*b +//#define M34MulV3(a,b) (M34MulV3(a,b)) +////a_rotation * b +//#define M34Mul33V3(a,b) (M34Mul33V3(a,b)) +////transpose(a_rotation)*b +//#define M34TrnspsMul33V3(a,b) (M34TrnspsMul33V3(a,b)) +////a*b +//#define M34MulM34(a,b) (_Mat34V::multiplyV(a,b)) +// a_rotation*b +//#define M34MulM33(a,b) (M34MulM33(a,b)) +// a_rotation*b_rotation +//#define M34Mul33MM34(a,b) (M34MulM33(a,b)) +// a+b +//#define M34Add(a,b) (M34Add(a,b)) +////a^-1 +//#define M34Inverse(a,b) (M34Inverse(a)) +// transpose(a_rotation) +//#define M34Trnsps33(a) (M33Trnsps3X3(a)) + +// Math operations on 16-byte aligned Mat44s (represents any 4x4 matrix) +// namespace _Mat44V +//{ +// //a*b +// PX_FORCE_INLINE Vec4V multiplyV(const Mat44V& a, const Vec4V b); +// //transpose(a)*b +// PX_FORCE_INLINE Vec4V multiplyTransposeV(const Mat44V& a, const Vec4V b); +// //a*b +// PX_FORCE_INLINE Mat44V multiplyV(const Mat44V& a, const Mat44V& b); +// //a+b +// PX_FORCE_INLINE Mat44V addV(const Mat44V& a, const Mat44V& b); +// //a^-1 +// PX_FORCE_INLINE Mat44V getInverseV(const Mat44V& a); +// //transpose(a) +// PX_FORCE_INLINE Mat44V getTransposeV(const Mat44V& a); +//}; //namespace _Mat44V + +// namespace _VecU32V +//{ +// // pack 8 U32s to 8 U16s with saturation +// PX_FORCE_INLINE VecU16V pack2U32VToU16VSaturate(VecU32V a, VecU32V b); +// PX_FORCE_INLINE VecU32V orV(VecU32V a, VecU32V b); +// PX_FORCE_INLINE VecU32V andV(VecU32V a, VecU32V b); +// PX_FORCE_INLINE VecU32V andcV(VecU32V a, VecU32V b); +// // conversion from integer to float +// PX_FORCE_INLINE Vec4V convertToVec4V(VecU32V a); +// // splat a[elementIndex] into all fields of a +// template<int elementIndex> +// PX_FORCE_INLINE VecU32V splatElement(VecU32V a); +// PX_FORCE_INLINE void storeAligned(VecU32V a, VecU32V* address); +//}; + +// namespace _VecI32V +//{ +// template<int a> PX_FORCE_INLINE VecI32V splatI32(); +//}; +// +// namespace _VecU16V +//{ +// PX_FORCE_INLINE VecU16V orV(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU16V andV(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU16V andcV(VecU16V a, VecU16V b); +// PX_FORCE_INLINE void storeAligned(VecU16V val, VecU16V *address); +// PX_FORCE_INLINE VecU16V loadAligned(VecU16V* addr); +// PX_FORCE_INLINE VecU16V loadUnaligned(VecU16V* addr); +// PX_FORCE_INLINE VecU16V compareGt(VecU16V a, VecU16V b); +// template<int elementIndex> +// PX_FORCE_INLINE VecU16V splatElement(VecU16V a); +// PX_FORCE_INLINE VecU16V subtractModulo(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU16V addModulo(VecU16V a, VecU16V b); +// PX_FORCE_INLINE VecU32V getLo16(VecU16V a); // [0,2,4,6] 16-bit values to [0,1,2,3] 32-bit vector +// PX_FORCE_INLINE VecU32V getHi16(VecU16V a); // [1,3,5,7]
16-bit values to [0,1,2,3] 32-bit vector +//}; +// +// namespace _VecI16V +//{ +// template <int val> PX_FORCE_INLINE VecI16V splatImmediate(); +//}; +// +// namespace _VecU8V +//{ +//}; + +// a*b +//#define M44MulV4(a,b) (M44MulV4(a,b)) +////transpose(a)*b +//#define M44TrnspsMulV4(a,b) (M44TrnspsMulV4(a,b)) +////a*b +//#define M44MulM44(a,b) (M44MulM44(a,b)) +////a+b +//#define M44Add(a,b) (M44Add(a,b)) +////a^-1 +//#define M44Inverse(a) (M44Inverse(a)) +////transpose(a) +//#define M44Trnsps(a) (M44Trnsps(a)) + +// dsequeira: these used to be assert'd out in SIMD builds, but they're necessary if +// we want to be able to write some scalar functions which run using SIMD data structures + +PX_FORCE_INLINE void V3WriteX(Vec3V& v, const PxF32 f) +{ + reinterpret_cast<PxVec3&>(v).x = f; +} + +PX_FORCE_INLINE void V3WriteY(Vec3V& v, const PxF32 f) +{ + reinterpret_cast<PxVec3&>(v).y = f; +} + +PX_FORCE_INLINE void V3WriteZ(Vec3V& v, const PxF32 f) +{ + reinterpret_cast<PxVec3&>(v).z = f; +} + +PX_FORCE_INLINE void V3WriteXYZ(Vec3V& v, const PxVec3& f) +{ + reinterpret_cast<PxVec3&>(v) = f; +} + +PX_FORCE_INLINE PxF32 V3ReadX(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v).x; +} + +PX_FORCE_INLINE PxF32 V3ReadY(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v).y; +} + +PX_FORCE_INLINE PxF32 V3ReadZ(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v).z; +} + +PX_FORCE_INLINE const PxVec3& V3ReadXYZ(const Vec3V& v) +{ + return reinterpret_cast<const PxVec3&>(v); +} + +PX_FORCE_INLINE void V4WriteX(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).x = f; +} + +PX_FORCE_INLINE void V4WriteY(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).y = f; +} + +PX_FORCE_INLINE void V4WriteZ(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).z = f; +} + +PX_FORCE_INLINE void V4WriteW(Vec4V& v, const PxF32 f) +{ + reinterpret_cast<PxVec4&>(v).w = f; +} + +PX_FORCE_INLINE void V4WriteXYZ(Vec4V& v, const PxVec3& f) +{ + reinterpret_cast<PxVec3&>(v) = f; +} + +PX_FORCE_INLINE PxF32 V4ReadX(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).x; +} + +PX_FORCE_INLINE PxF32 V4ReadY(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).y; +} + +PX_FORCE_INLINE PxF32 V4ReadZ(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).z; +} + +PX_FORCE_INLINE PxF32 V4ReadW(const Vec4V& v) +{ + return reinterpret_cast<const PxVec4&>(v).w; +} + +PX_FORCE_INLINE const PxVec3& V4ReadXYZ(const Vec4V& v) +{ + return reinterpret_cast<const PxVec3&>(v); +} + +// this macro transposes 4 Vec4V into 3 Vec4V (assuming that the W component can be ignored) +#define PX_TRANSPOSE_44_34(inA, inB, inC, inD, outA, outB, outC) \ + outA = V4UnpackXY(inA, inC); \ + inA = V4UnpackZW(inA, inC); \ + inC = V4UnpackXY(inB, inD); \ + inB = V4UnpackZW(inB, inD); \ + outB = V4UnpackZW(outA, inC); \ + outA = V4UnpackXY(outA, inC); \ + outC = V4UnpackXY(inA, inB);
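
// Usage sketch (illustrative only, not part of the original header): gathering the x, y
// and z lanes of four xyzw vectors with PX_TRANSPOSE_44_34. 'gatherXYZ' is a hypothetical
// helper; note that the macro reuses its first four arguments as scratch, so the inputs
// are passed by value here and clobbered.
PX_FORCE_INLINE void gatherXYZ(Vec4V p0, Vec4V p1, Vec4V p2, Vec4V p3, Vec4V& xs, Vec4V& ys, Vec4V& zs)
{
	// xs = (p0.x, p1.x, p2.x, p3.x); ys and zs hold the y and z lanes likewise.
	PX_TRANSPOSE_44_34(p0, p1, p2, p3, xs, ys, zs);
}
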
+// this macro transposes 3 Vec4V into 4 Vec4V (with W components as garbage!) +#define PX_TRANSPOSE_34_44(inA, inB, inC, outA, outB, outC, outD) \ + outA = V4UnpackXY(inA, inC); \ + inA = V4UnpackZW(inA, inC); \ + outC = V4UnpackXY(inB, inB); \ + inC = V4UnpackZW(inB, inB); \ + outB = V4UnpackZW(outA, outC); \ + outA = V4UnpackXY(outA, outC); \ + outC = V4UnpackXY(inA, inC); \ + outD = V4UnpackZW(inA, inC); + +#define PX_TRANSPOSE_44(inA, inB, inC, inD, outA, outB, outC, outD) \ + outA = V4UnpackXY(inA, inC); \ + inA = V4UnpackZW(inA, inC); \ + inC = V4UnpackXY(inB, inD); \ + inB = V4UnpackZW(inB, inD); \ + outB = V4UnpackZW(outA, inC); \ + outA = V4UnpackXY(outA, inC); \ + outC = V4UnpackXY(inA, inB); \ + outD = V4UnpackZW(inA, inB); + +// This function returns a Vec4V, where each element is the dot product of one pair of Vec3Vs. On PC, each element in +// the result should be identical to the result of performing V3Dot on the corresponding pair of Vec3Vs. +// However, on other platforms the results might diverge by some small margin due to differences in FP rounding, e.g. if +// _mm_dp_ps or some other approximate dot-product or fused multiply-add operation was used. +// Where no hw-accelerated dot-product operation exists, this approach should be the fastest way to compute +// the dot products of 4 pairs of vectors. +PX_FORCE_INLINE Vec4V V3Dot4(const Vec3VArg a0, const Vec3VArg b0, const Vec3VArg a1, const Vec3VArg b1, + const Vec3VArg a2, const Vec3VArg b2, const Vec3VArg a3, const Vec3VArg b3) +{ + Vec4V a0b0 = Vec4V_From_Vec3V(V3Mul(a0, b0)); + Vec4V a1b1 = Vec4V_From_Vec3V(V3Mul(a1, b1)); + Vec4V a2b2 = Vec4V_From_Vec3V(V3Mul(a2, b2)); + Vec4V a3b3 = Vec4V_From_Vec3V(V3Mul(a3, b3)); + + Vec4V aTrnsps, bTrnsps, cTrnsps; + + PX_TRANSPOSE_44_34(a0b0, a1b1, a2b2, a3b3, aTrnsps, bTrnsps, cTrnsps); + + return V4Add(V4Add(aTrnsps, bTrnsps), cTrnsps); +} + +//(f.x,f.y,f.z,0) - Alternative/faster V3LoadU implementation when it is safe to read "W", i.e. the 32 bits after the PxVec3. +PX_FORCE_INLINE Vec3V V3LoadU_SafeReadW(const PxVec3& f) +{ + return Vec3V_From_Vec4V(V4LoadU(&f.x)); +} + +// Now for the cross-platform implementations of the 16-byte aligned maths functions (win32/360/ppu/spu etc). +#if COMPILE_VECTOR_INTRINSICS +#include "PsInlineAoS.h" +#else // #if COMPILE_VECTOR_INTRINSICS +#include "PsVecMathAoSScalarInline.h" +#endif // #if COMPILE_VECTOR_INTRINSICS +#include "PsVecQuat.h" + +} // namespace aos +} // namespace shdfnd +} // namespace physx + +#endif // PSFOUNDATION_PSVECMATH_H diff --git a/PxShared/src/foundation/include/PsVecMathAoSScalar.h b/PxShared/src/foundation/include/PsVecMathAoSScalar.h new file mode 100644 index 00000000..e3a72a12 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathAoSScalar.h @@ -0,0 +1,242 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.".
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHAOSSCALAR_H +#define PSFOUNDATION_PSVECMATHAOSSCALAR_H + +#if COMPILE_VECTOR_INTRINSICS +#error Scalar version should not be included when using vector intrinsics. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +struct VecI16V; +struct VecU16V; +struct VecI32V; +struct VecU32V; +struct Vec4V; +typedef Vec4V QuatV; + +PX_ALIGN_PREFIX(16) +struct FloatV +{ + PxF32 x; + PxF32 pad[3]; + FloatV() + { + } + FloatV(const PxF32 _x) : x(_x) + { + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Vec4V +{ + PxF32 x, y, z, w; + Vec4V() + { + } + Vec4V(const PxF32 _x, const PxF32 _y, const PxF32 _z, const PxF32 _w) : x(_x), y(_y), z(_z), w(_w) + { + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Vec3V +{ + PxF32 x, y, z; + PxF32 pad; + Vec3V() + { + } + Vec3V(const PxF32 _x, const PxF32 _y, const PxF32 _z) : x(_x), y(_y), z(_z), pad(0.0f) + { + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct BoolV +{ + PxU32 ux, uy, uz, uw; + BoolV() + { + } + BoolV(const PxU32 _x, const PxU32 _y, const PxU32 _z, const PxU32 _w) : ux(_x), uy(_y), uz(_z), uw(_w) + { + } +} PX_ALIGN_SUFFIX(16); + +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V col0; + Vec3V col1; + Vec3V col2; +}; + +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V col0; + Vec3V col1; + Vec3V col2; + Vec3V col3; +}; + +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V col0; + Vec4V col1; + Vec4V col2; +}; + +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V col0; + Vec4V col1; + Vec4V col2; + Vec4V col3; +}; + +PX_ALIGN_PREFIX(16) +struct VecU32V +{ + PxU32 u32[4]; + PX_FORCE_INLINE VecU32V() + { + } + PX_FORCE_INLINE VecU32V(PxU32 a, PxU32 b, PxU32 c, PxU32 d) + { + u32[0] = a; + u32[1] = b; + u32[2] = c; + u32[3] = d; + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct VecI32V +{ + PxI32 i32[4]; + PX_FORCE_INLINE VecI32V() + { + } + PX_FORCE_INLINE VecI32V(PxI32 a, PxI32 b, PxI32 c, PxI32 
d) + { + i32[0] = a; + i32[1] = b; + i32[2] = c; + i32[3] = d; + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct VecI16V +{ + PxI16 i16[8]; + PX_FORCE_INLINE VecI16V() + { + } + PX_FORCE_INLINE VecI16V(PxI16 a, PxI16 b, PxI16 c, PxI16 d, PxI16 e, PxI16 f, PxI16 g, PxI16 h) + { + i16[0] = a; + i16[1] = b; + i16[2] = c; + i16[3] = d; + i16[4] = e; + i16[5] = f; + i16[6] = g; + i16[7] = h; + } +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct VecU16V +{ + union + { + PxU16 u16[8]; + PxI16 i16[8]; + }; + PX_FORCE_INLINE VecU16V() + { + } + PX_FORCE_INLINE VecU16V(PxU16 a, PxU16 b, PxU16 c, PxU16 d, PxU16 e, PxU16 f, PxU16 g, PxU16 h) + { + u16[0] = a; + u16[1] = b; + u16[2] = c; + u16[3] = d; + u16[4] = e; + u16[5] = f; + u16[6] = g; + u16[7] = h; + } +} PX_ALIGN_SUFFIX(16); + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define QuatVArg QuatV & + +#define VecCrossV Vec3V + +typedef VecI32V VecShiftV; +#define VecShiftVArg VecShiftV & + +#endif // PX_PHYSICS_COMMON_VECMATH_INLINE_SCALAR diff --git a/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h b/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h new file mode 100644 index 00000000..d1061290 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h @@ -0,0 +1,2254 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H +#define PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H + +#if COMPILE_VECTOR_INTRINSICS +#error Scalar version should not be included when using vector intrinsics. 
+#endif + +#define BOOL_TO_U32(b) (PxU32)(- PxI32(b)) +#define TRUE_TO_U32 (PxU32)(-1) +#define FALSE_TO_U32 (PxU32)(0) + +#define BOOL_TO_U16(b) (PxU16)(- PxI32(b)) + + +#define VECMATHAOS_ASSERT(x) { PX_ASSERT(x); } + +///////////////////////////////////////////////////////////////////// +////INTERNAL USE ONLY AND TESTS +///////////////////////////////////////////////////////////////////// + +namespace internalScalarSimd +{ +PX_FORCE_INLINE PxF32 FStore(const FloatV a) +{ + return a.x; +} + +PX_FORCE_INLINE bool hasZeroElementInFloatV(const FloatV a) +{ + return (0 == a.x); +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + return (0 == a.x || 0 == a.y || 0 == a.z); +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + return (0 == a.x || 0 == a.y || 0 == a.z || 0 == a.w); +} +} + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + Vec3V tmp; + tmp.x = tmp.y = tmp.z = 0.0f; + tmp.pad = 1.0f; + return tmp; +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + return (a.x == b.x); +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + return (a.x == b.x && a.y == b.y && a.z == b.z); +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return (a.ux == b.ux && a.uy == b.uy && a.uz == b.uz && a.uw == b.uw); +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return (a.u32[0] == b.u32[0] && a.u32[1] == b.u32[1] && a.u32[2] == b.u32[2] && a.u32[3] == b.u32[3]); +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + return (a.i32[0] == b.i32[0] && a.i32[1] == b.i32[1] && a.i32[2] == b.i32[2] && a.i32[3] == b.i32[3]); +} + +#define VECMATH_AOS_EPSILON (1e-3f) + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + const PxF32 cx = a.x - b.x; + return (cx > -VECMATH_AOS_EPSILON && cx < VECMATH_AOS_EPSILON); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + const PxF32 cx = a.x - b.x; + const PxF32 cy = a.y - b.y; + const PxF32 cz = a.z - b.z; + return (cx > -VECMATH_AOS_EPSILON && cx < VECMATH_AOS_EPSILON && cy > -VECMATH_AOS_EPSILON && + cy < VECMATH_AOS_EPSILON && cz > -VECMATH_AOS_EPSILON && cz < VECMATH_AOS_EPSILON); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const PxF32 cx = a.x - b.x; + const PxF32 cy = a.y - b.y; + const PxF32 cz = a.z - b.z; + const PxF32 cw = a.w - b.w; + return (cx > -VECMATH_AOS_EPSILON && cx < VECMATH_AOS_EPSILON && cy > -VECMATH_AOS_EPSILON && + cy < VECMATH_AOS_EPSILON && cz > -VECMATH_AOS_EPSILON && cz < VECMATH_AOS_EPSILON && + cw > -VECMATH_AOS_EPSILON && cw < VECMATH_AOS_EPSILON); +} +} + +/////////////////////////////////////////////////////// + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + return a.pad == 0.f; +} + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + return PxIsFinite(a.x); +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + return PxIsFinite(a.x) && PxIsFinite(a.y) && PxIsFinite(a.z); +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + return PxIsFinite(a.x) && PxIsFinite(a.y) && PxIsFinite(a.z) && PxIsFinite(a.w); +} + 
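
// Usage sketch (illustrative only, not part of the original source): exercising the
// helpers above in a unit-test style check of the Vec3V padding invariant.
// 'checkVec3VPadding' is a hypothetical test function.
PX_FORCE_INLINE bool checkVec3VPadding()
{
	const Vec3V good = Vec3V(1.0f, 2.0f, 3.0f);         // the constructor sets pad to 0.0f
	const Vec3V bad = _VecMathTests::getInvalidVec3V(); // pad deliberately set to 1.0f
	return isValidVec3V(good) && !isValidVec3V(bad) && isFiniteVec3V(good);
}
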
+///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return FloatV(f); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + return Vec3V(f, f, f); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return Vec4V(f, f, f, f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ +#if PX_ARM + // SD: Android ARM builds fail if this is done with a cast. + // Might also fail because of something else but the select + // operator here seems to fix everything that failed in release builds. + return f ? BTTTT() : BFFFF(); +#else + return BoolV(BOOL_TO_U32(f), BOOL_TO_U32(f), BOOL_TO_U32(f), BOOL_TO_U32(f)); +#endif +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) +{ + return Vec3V(f[0], f[1], f[2]); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const f) +{ + return Vec3V(f[0], f[1], f[2]); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V f) +{ + return Vec3V(f.x, f.y, f.z); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) +{ + return Vec3V(v.x, v.y, v.z); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + return Vec4V(f.x, f.y, f.z, 0.0f); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return Vec4V(f.x, f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + return Vec3V(f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + return Vec3V(f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + return Vec4V(f[0], f[1], f[2], f[3]); +} + +PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f) +{ + *reinterpret_cast<Vec4V*>(f) = a; +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + *reinterpret_cast<PxVec4*>(f) = *reinterpret_cast<const PxVec4*>(&a.x); +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) +{ + *reinterpret_cast<BoolV*>(f) = a; +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + *reinterpret_cast<VecU32V*>(u) = uv; +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + *reinterpret_cast<VecI32V*>(i) = iv; +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return Vec4V(f[0], f[1], f[2], f[3]); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + return Vec4V(f[0], f[1], f[2], 0.0f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + return BoolV(BOOL_TO_U32(f[0]), BOOL_TO_U32(f[1]), BOOL_TO_U32(f[2]), BOOL_TO_U32(f[3])); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + *f = a.x; +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + f = PxVec3(a.x, a.y, a.z); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + f = PxVec3(a.x, a.y, a.z); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) +{ + *b2 = b.ux; +} + +////////////////////////// +// FLOATV +////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return FLoad(0.0f); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE 
FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + return FloatV(-f.x); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) +{ + return FloatV(a.x + b.x); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + return FloatV(a.x - b.x); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + return FloatV(a.x * b.x); +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + VECMATHAOS_ASSERT(b.x != 0.0f); + return FloatV(a.x / b.x); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + VECMATHAOS_ASSERT(b.x != 0.0f); + return FloatV(a.x / b.x); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return 1.0f / a.x; +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return 1.0f / a.x; +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return PxRecipSqrt(a.x); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + return PxSqrt(a.x); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + VECMATHAOS_ASSERT(a.x != 0.0f); + return PxRecipSqrt(a.x); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + return FAdd(FMul(a, b), c); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + return FSub(c, FMul(a, b)); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + return FloatV(PxAbs(a.x)); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + return FloatV(c.ux ? a.x : b.x); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + return BLoad(a.x > b.x); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + return BLoad(a.x >= b.x); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + return BLoad(a.x == b.x); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + return (a.x > b.x ? FloatV(a.x) : FloatV(b.x)); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + return (a.x > b.x ? 
FloatV(b.x) : FloatV(a.x)); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + return FMax(FMin(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + return BOOL_TO_U32(a.x > b.x); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + return BOOL_TO_U32(a.x >= b.x); +} +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b) +{ + return BOOL_TO_U32(a.x == b.x); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + return floor(a.x + 0.5f); +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + return sinf(a.x); +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + return cosf(a.x); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + return BOOL_TO_U32(a.x > max.x || a.x < min.x); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + return BOOL_TO_U32(a.x >= min.x && a.x <= max.x); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + return FOutOfBounds(a, FNeg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + return FInBounds(a, FNeg(bounds), bounds); +} + +///////////////////// +// VEC3V +///////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + return Vec3V(f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + return Vec3V(x.x, y.x, z.x); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + return Vec3V(1.0f, 0.0f, 0.0f); +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + return Vec3V(0.0f, 1.0f, 0.0f); +} + +PX_FORCE_INLINE Vec3V V3UnitZ() +{ + return Vec3V(0.0f, 0.0f, 1.0f); +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + return FloatV(f.x); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + return FloatV(f.y); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + return FloatV(f.z); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + return Vec3V(f.x, v.y, v.z); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + return Vec3V(v.x, f.x, v.z); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + return Vec3V(v.x, v.y, f.x); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return Vec3V(a.x, b.x, c.x); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return Vec3V(a.y, b.y, c.y); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return Vec3V(a.z, b.z, c.z); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return V3Load(0.0f); +} + +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V c) +{ + return Vec3V(-c.x, -c.y, -c.z); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x + b.x, a.y + b.y, a.z + b.z); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x - b.x, a.y - b.y, a.z - b.z); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ + return Vec3V(a.x * b.x, a.y * b.x, a.z * b.x); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x * b.x, a.y * b.y, a.z * b.z); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec3V(a.x * bInv, a.y * bInv, a.z * bInv); +} + +PX_FORCE_INLINE Vec3V 
V3Div(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x / b.x, a.y / b.y, a.z / b.z); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec3V(a.x * bInv, a.y * bInv, a.z * bInv); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x / b.x, a.y / b.y, a.z / b.z); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + return Vec3V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); +} + +PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a) +{ + return Vec3V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + return Vec3V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z)); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + return Vec3V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z)); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + return V3Add(V3Scale(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + return V3Sub(c, V3Scale(a, b)); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return V3Add(V3Mul(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + return V3Sub(c, V3Mul(a, b)); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + return FloatV(a.x * b.x + a.y * b.y + a.z * b.z); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3VArg normal) +{ + return normal; +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + return FloatV(PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z)); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + return FloatV(a.x * a.x + a.y * a.y + a.z * a.z); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + VECMATHAOS_ASSERT(a.x != 0 || a.y != 0 || a.z != 0); + const PxF32 lengthInv = 1.0f / PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z); + return Vec3V(a.x * lengthInv, a.y * lengthInv, a.z * lengthInv); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + const PxF32 length = PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z); + if(PX_EPS_REAL >= length) + { + return unsafeReturnValue; + } + else + { + const PxF32 lengthInv = 1.0f / length; + return Vec3V(a.x * lengthInv, a.y * lengthInv, a.z * lengthInv); + } +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + VECMATHAOS_ASSERT(a.x != 0 || a.y != 0 || a.z != 0); + const PxF32 lengthInv = 1.0f / PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z); + return Vec3V(a.x * lengthInv, a.y * lengthInv, a.z * lengthInv); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + return Vec3V(c.ux ? a.x : b.x, c.uy ? a.y : b.y, c.uz ? a.z : b.z); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + return BoolV(BOOL_TO_U32(a.x > b.x), BOOL_TO_U32(a.y > b.y), BOOL_TO_U32(a.z > b.z), FALSE_TO_U32); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + return BoolV(BOOL_TO_U32(a.x >= b.x), BOOL_TO_U32(a.y >= b.y), BOOL_TO_U32(a.z >= b.z), TRUE_TO_U32); +} + +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b) +{ + return BoolV(BOOL_TO_U32(a.x == b.x), BOOL_TO_U32(a.y == b.y), BOOL_TO_U32(a.z == b.z), TRUE_TO_U32); +} + +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x > b.x ? 
a.x : b.x, a.y > b.y ? a.y : b.y, a.z > b.z ? a.z : b.z); +} + +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b) +{ + return Vec3V(a.x < b.x ? a.x : b.x, a.y < b.y ? a.y : b.y, a.z < b.z ? a.z : b.z); +} + +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a) +{ + const PxF32 t0 = (a.x >= a.y) ? a.x : a.y; + return t0 >= a.z ? t0 : a.z; +} + +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a) +{ + const PxF32 t0 = (a.x <= a.y) ? a.x : a.y; + return t0 <= a.z ? t0 : a.z; +} + +// return (a >= 0.0f) ? 1.0f : -1.0f; +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a) +{ + return Vec3V((a.x >= 0.f ? 1.f : -1.f), (a.y >= 0.f ? 1.f : -1.f), (a.z >= 0.f ? 1.f : -1.f)); +} + +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV) +{ + return V3Max(V3Min(a, maxV), minV); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + return V3Max(a, V3Neg(a)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b) +{ + return BOOL_TO_U32((a.x > b.x) & (a.y > b.y) & (a.z > b.z)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b) +{ + return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z)); +} + +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b) +{ + return BOOL_TO_U32((a.x == b.x) & (a.y == b.y) & (a.z == b.z)); +} + +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a) +{ + return Vec3V(floor(a.x + 0.5f), floor(a.y + 0.5f), floor(a.z + 0.5f)); +} + +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a) +{ + return Vec3V(sinf(a.x), sinf(a.y), sinf(a.z)); +} + +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a) +{ + return Vec3V(cosf(a.x), cosf(a.y), cosf(a.z)); +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + return Vec3V(a.y, a.z, a.z); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + return Vec3V(a.x, a.y, a.x); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + return Vec3V(a.y, a.z, a.x); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + return Vec3V(a.z, a.x, a.y); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + return Vec3V(a.z, a.z, a.y); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + return Vec3V(a.y, a.x, a.x); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + return Vec3V(0.0f, v1.z, v0.y); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + return Vec3V(v0.z, 0.0f, v1.x); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + return Vec3V(v1.y, v0.x, 0.0f); +} + +PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a) +{ + return FloatV(a.x + a.y + a.z); +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + return BOOL_TO_U32(a.x > max.x || a.y > max.y || a.z > max.z || a.x < min.x || a.y < min.y || a.z < min.z); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + return BOOL_TO_U32(a.x <= max.x && a.y <= max.y && a.z <= max.z && a.x >= min.x && a.y >= min.y && a.z >= min.z); +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds) +{ + return V3OutOfBounds(a, V3Neg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds) +{ + return V3InBounds(a, V3Neg(bounds), bounds); +} + +PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2) +{ + const PxF32 t01 = col0.y, t02 = col0.z, t12 = col1.z; + col0.y = col1.x; + col0.z = col2.x; + col1.z = col2.y; + col1.x = t01; + col2.x = t02; + col2.y = t12; +} + +///////////////////////// +// VEC4V +///////////////////////// 
+ +PX_FORCE_INLINE Vec4V V4Splat(const FloatV f) +{ + return Vec4V(f.x, f.x, f.x, f.x); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray) +{ + return Vec4V(floatVArray[0].x, floatVArray[1].x, floatVArray[2].x, floatVArray[3].x); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w) +{ + return Vec4V(x.x, y.x, z.x, w.x); +} + +PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.w, y.w, z.w, w.w); +} + +PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.z, y.z, z.z, w.z); +} + +PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.y, y.y, z.y, w.y); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + return Vec4V(x.x, y.x, z.x, w.x); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return Vec4V(a.x, b.x, a.y, b.y); +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return Vec4V(a.z, b.z, a.w, b.w); +} + +PX_FORCE_INLINE Vec4V V4UnitX() +{ + return Vec4V(1.0f, 0.0f, 0.0f, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + return Vec4V(0.0f, 1.0f, 0.0f, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + return Vec4V(0.0f, 0.0f, 1.0f, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + return Vec4V(0.0f, 0.0f, 0.0f, 1.0f); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + return FloatV(f.x); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + return FloatV(f.y); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + return FloatV(f.z); +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + return FloatV(f.w); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + return Vec4V(f.x, v.y, v.z, v.w); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + return Vec4V(v.x, f.x, v.z, v.w); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + return Vec4V(v.x, v.y, f.x, v.w); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + return Vec4V(v.x, v.y, v.z, f.x); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec3V v, const FloatV f) +{ + return Vec4V(v.x, v.y, v.z, f.x); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return Vec4V(v.x, v.y, v.z, 0.0f); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V v) +{ + return Vec4V(v.y, v.x, v.w, v.z); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V v) +{ + return Vec4V(v.x, v.z, v.x, v.z); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V v) +{ + return Vec4V(v.y, v.w, v.y, v.w); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V v) +{ + return Vec4V(v.y, v.z, v.x, v.w); +} + +template <PxU8 _x, PxU8 _y, PxU8 _z, PxU8 _w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V v) +{ + const PxF32 f[4] = { v.x, v.y, v.z, v.w }; + return Vec4V(f[_x], f[_y], f[_z], f[_w]); +} + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return V4Load(0.0f); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return V4Load(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V c) +{ + return Vec4V(-c.x, -c.y, -c.z, -c.w); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x - b.x, a.y - b.y, a.z - b.z, 
a.w - b.w); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return Vec4V(a.x * b.x, a.y * b.x, a.z * b.x, a.w * b.x); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} + +PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec4V(a.x * bInv, a.y * bInv, a.z * bInv, a.w * bInv); +} + +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b) +{ + VECMATHAOS_ASSERT(b.x != 0 && b.y != 0 && b.z != 0 && b.w != 0); + return Vec4V(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} + +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b) +{ + const PxF32 bInv = 1.0f / b.x; + return Vec4V(a.x * bInv, a.y * bInv, a.z * bInv, a.w * bInv); +} + +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} + +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a) +{ + return Vec4V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); +} + +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a) +{ + return Vec4V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w); +} + +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a) +{ + return Vec4V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z), PxRecipSqrt(a.w)); +} + +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a) +{ + return Vec4V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z), PxRecipSqrt(a.w)); +} + +PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a) +{ + return Vec4V(PxSqrt(a.x), PxSqrt(a.y), PxSqrt(a.z), PxSqrt(a.w)); +} + +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c) +{ + return V4Add(V4Scale(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c) +{ + return V4Sub(c, V4Scale(a, b)); +} + +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Add(V4Mul(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Sub(c, V4Mul(a, b)); +} + +PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a) +{ + return FloatV(a.x + a.y + a.z + a.w); +} + +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b) +{ + return FloatV(a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w); +} + +PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b) +{ + return FloatV(a.x * b.x + a.y * b.y + a.z * b.z); +} + +PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b) +{ + return Vec4V(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); +} + +PX_FORCE_INLINE FloatV V4Length(const Vec4V a) +{ + return FloatV(PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w)); +} + +PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a) +{ + return V4Dot(a, a); +} + +PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a) +{ + VECMATHAOS_ASSERT(0 != a.x || 0 != a.y || 0 != a.z || 0 != a.w); + const FloatV length = FloatV(V4Length(a)); + return V4ScaleInv(a, length); +} + +PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue) +{ + const FloatV length = FloatV(V4Length(a)); + if(PX_EPS_REAL >= length.x) + { + return unsafeReturnValue; + } + else + { + return V4ScaleInv(a, length); + } +} +PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a) +{ + VECMATHAOS_ASSERT(0 != a.x || 0 != a.y || 0 != a.z || 0 != a.w); + const FloatV length = FloatV(V4Length(a)); + return V4ScaleInv(a, length); +} + +PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b) +{ + return Vec4V(c.ux ? 
a.x : b.x, c.uy ? a.y : b.y, c.uz ? a.z : b.z, c.uw ? a.w : b.w);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+ return BoolV(BOOL_TO_U32(a.x > b.x), BOOL_TO_U32(a.y > b.y), BOOL_TO_U32(a.z > b.z), BOOL_TO_U32(a.w > b.w));
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return BoolV(BOOL_TO_U32(a.x >= b.x), BOOL_TO_U32(a.y >= b.y), BOOL_TO_U32(a.z >= b.z), BOOL_TO_U32(a.w >= b.w));
+}
+
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+ return BoolV(BOOL_TO_U32(a.x == b.x), BOOL_TO_U32(a.y == b.y), BOOL_TO_U32(a.z == b.z), BOOL_TO_U32(a.w == b.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+ return Vec4V(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y, a.z > b.z ? a.z : b.z, a.w > b.w ? a.w : b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+ return Vec4V(a.x < b.x ? a.x : b.x, a.y < b.y ? a.y : b.y, a.z < b.z ? a.z : b.z, a.w < b.w ? a.w : b.w);
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+ const PxF32 t0 = (a.x >= a.y) ? a.x : a.y;
+ const PxF32 t1 = (a.z >= a.w) ? a.z : a.w;
+ return t0 >= t1 ? t0 : t1;
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+ const PxF32 t0 = (a.x <= a.y) ? a.x : a.y;
+ const PxF32 t1 = (a.z <= a.w) ? a.z : a.w;
+ return t0 <= t1 ? t0 : t1;
+}
+
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+ return V4Max(V4Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+ return Vec4V(floor(a.x + 0.5f), floor(a.y + 0.5f), floor(a.z + 0.5f), floor(a.w + 0.5f));
+}
+
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+ return Vec4V(sinf(a.x), sinf(a.y), sinf(a.z), sinf(a.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a)
+{
+ return Vec4V(cosf(a.x), cosf(a.y), cosf(a.z), cosf(a.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x > b.x) & (a.y > b.y) & (a.z > b.z) & (a.w > b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z) & (a.w >= b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z));
+}
+
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x == b.x) & (a.y == b.y) & (a.z == b.z) & (a.w == b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+ return BOOL_TO_U32((a.x > b.x) | (a.y > b.y) | (a.z > b.z));
+}
+
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+ const PxF32 t01 = col0.y, t02 = col0.z, t03 = col0.w;
+ const PxF32 t12 = col1.z, t13 = col1.w;
+ const PxF32 t23 = col2.w;
+ col0.y = col1.x;
+ col0.z = col2.x;
+ col0.w = col3.x;
+ col1.z = col2.y;
+ col1.w = col3.y;
+ col2.w = col3.z;
+ col1.x = t01;
+ col2.x = t02;
+ col3.x = t03;
+ col2.y = t12;
+ col3.y = t13;
+ col3.z = t23;
+}
+
+PX_FORCE_INLINE BoolV BFFFF()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFFT()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFTF()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFTT()
+{
+ return BoolV(FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFTFF()
+{
+ return BoolV(FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE
BoolV BFTFT() +{ + return BoolV(FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BFTTF() +{ + return BoolV(FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BFTTT() +{ + return BoolV(FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFFF() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFFT() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFTF() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTFTT() +{ + return BoolV(TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTFF() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTFT() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTTF() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32); +} +PX_FORCE_INLINE BoolV BTTTT() +{ + return BoolV(TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + return BTFFF(); +} +PX_FORCE_INLINE BoolV BYMask() +{ + return BFTFF(); +} +PX_FORCE_INLINE BoolV BZMask() +{ + return BFFTF(); +} +PX_FORCE_INLINE BoolV BWMask() +{ + return BFFFT(); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV a) +{ + return BoolV(a.ux, a.ux, a.ux, a.ux); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV a) +{ + return BoolV(a.uy, a.uy, a.uy, a.uy); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV a) +{ + return BoolV(a.uz, a.uz, a.uz, a.uz); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV a) +{ + return BoolV(a.uw, a.uw, a.uw, a.uw); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return BoolV(f.ux, v.uy, v.uz, v.uw); +} + +PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return BoolV(v.ux, f.uy, v.uz, v.uw); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return BoolV(v.ux, v.uy, f.uz, v.uw); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return BoolV(v.ux, v.uy, v.uz, f.uw); +} + +template <int index> +BoolV BSplatElement(BoolV a) +{ + PxU32* b = (PxU32*)&a; + return BoolV(b[index], b[index], b[index], b[index]); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return BoolV(BOOL_TO_U32(a.ux && b.ux), BOOL_TO_U32(a.uy && b.uy), BOOL_TO_U32(a.uz && b.uz), BOOL_TO_U32(a.uw && b.uw)); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + return BoolV(a.ux & ~b.ux, a.uy & ~b.uy, a.uz & ~b.uz, a.uw & ~b.uw); +} + +PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + return BoolV(~a.ux, ~a.uy, ~a.uz, ~a.uw); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return BoolV(BOOL_TO_U32(a.ux || b.ux), BOOL_TO_U32(a.uy || b.uy), BOOL_TO_U32(a.uz || b.uz), BOOL_TO_U32(a.uw || b.uw)); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + return (a.ux == b.ux && a.uy == b.uy && a.uz == b.uz && a.uw == b.uw ? TRUE_TO_U32 : FALSE_TO_U32); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return BAllEq(a, BTTTT()); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return BAllEq(a, BFFFF()); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + return (a.ux & a.uy & a.uz & a.uw) ? BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + return (a.ux | a.uy | a.uz | a.uw) ? 
BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + return (a.ux & a.uy & a.uz) ? BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + return (a.ux | a.uy | a.uz) ? BTTTT() : BFFFF(); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + return (a.ux & 1) | (a.uy & 2) | (a.uz & 4) | (a.uw & 8); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z, a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z, a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z, + a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b) +{ + return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b)); +} + +PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a) +{ + return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a) +{ + return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d) +{ + const Vec3V x = V3Mul(V3UnitX(), d); + const Vec3V y = V3Mul(V3UnitY(), d); + const Vec3V z = V3Mul(V3UnitZ(), d); + return Mat33V(x, y, z); +} + +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a) +{ + const PxF32 det = a.col0.x * (a.col1.y * a.col2.z - a.col1.z * a.col2.y) - + a.col1.x * (a.col0.y * a.col2.z - a.col2.y * a.col0.z) + + a.col2.x * (a.col0.y * a.col1.z - a.col1.y * a.col0.z); + + const PxF32 invDet = 1.0f / det; + + Mat33V ret; + ret.col0.x = invDet * (a.col1.y * a.col2.z - a.col2.y * a.col1.z); + ret.col0.y = invDet * (a.col2.y * a.col0.z - a.col0.y * a.col2.z); + ret.col0.z = invDet * (a.col0.y * a.col1.z - a.col1.y * a.col0.z); + + ret.col1.x = invDet * (a.col2.x * a.col1.z - a.col1.x * a.col2.z); + ret.col1.y = invDet * (a.col0.x * a.col2.z - a.col2.x * a.col0.z); + ret.col1.z = invDet * (a.col1.x * a.col0.z - a.col0.x * a.col1.z); + + ret.col2.x = invDet * (a.col1.x * a.col2.y - a.col2.x * a.col1.y); + ret.col2.y = invDet * (a.col2.x * a.col0.y - a.col0.x * a.col2.y); + ret.col2.z = invDet * (a.col0.x * a.col1.y - a.col1.x * a.col0.y); + + return ret; +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + PX_ASSERT((size_t(&out) & 15) == 0); + V3StoreU(m.col0, out.column0); + 
V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a) +{ + return Mat33V(Vec3V(a.col0.x, a.col1.x, a.col2.x), Vec3V(a.col0.y, a.col1.y, a.col2.y), + Vec3V(a.col0.z, a.col1.z, a.col2.z)); +} + +PX_FORCE_INLINE Mat33V M33Identity() +{ + return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ()); +} + +////////////////////////////////// +// MAT34V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z + a.col3.x, + a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z + a.col3.y, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z + a.col3.z); +} + +PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z, a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) +{ + return Vec3V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z, a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z, + a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z); +} + +PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33V3(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + return Mat33V(Vec3V(a.col0.x, a.col1.x, a.col2.x), Vec3V(a.col0.y, a.col1.y, a.col2.y), + Vec3V(a.col0.z, a.col1.z, a.col2.z)); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + return Vec4V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z + a.col3.x * b.w, + a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z + a.col3.y * b.w, + a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z + a.col3.z * b.w, + a.col0.w * b.x + a.col1.w * b.y + a.col2.w * b.z + a.col3.w * b.w); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + return Vec4V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z + a.col0.w * b.w, + a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z + a.col1.w * b.w, + a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z + a.col2.w * b.w, + a.col3.x * b.x + a.col3.y * b.y + a.col3.z * b.z + a.col3.w * b.w); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + PxF32 tmp[12]; + PxF32 dst[16]; + PxF32 det; + + const PxF32 src[16] = { 
a.col0.x, a.col0.y, a.col0.z, a.col0.w, a.col1.x, a.col1.y, a.col1.z, a.col1.w, + a.col2.x, a.col2.y, a.col2.z, a.col2.w, a.col3.x, a.col3.y, a.col3.z, a.col3.w }; + + tmp[0] = src[10] * src[15]; + tmp[1] = src[11] * src[14]; + tmp[2] = src[9] * src[15]; + tmp[3] = src[11] * src[13]; + tmp[4] = src[9] * src[14]; + tmp[5] = src[10] * src[13]; + tmp[6] = src[8] * src[15]; + tmp[7] = src[11] * src[12]; + tmp[8] = src[8] * src[14]; + tmp[9] = src[10] * src[12]; + tmp[10] = src[8] * src[13]; + tmp[11] = src[9] * src[12]; + + dst[0] = tmp[0] * src[5] + tmp[3] * src[6] + tmp[4] * src[7]; + dst[0] -= tmp[1] * src[5] + tmp[2] * src[6] + tmp[5] * src[7]; + dst[1] = tmp[1] * src[4] + tmp[6] * src[6] + tmp[9] * src[7]; + dst[1] -= tmp[0] * src[4] + tmp[7] * src[6] + tmp[8] * src[7]; + dst[2] = tmp[2] * src[4] + tmp[7] * src[5] + tmp[10] * src[7]; + dst[2] -= tmp[3] * src[4] + tmp[6] * src[5] + tmp[11] * src[7]; + dst[3] = tmp[5] * src[4] + tmp[8] * src[5] + tmp[11] * src[6]; + dst[3] -= tmp[4] * src[4] + tmp[9] * src[5] + tmp[10] * src[6]; + dst[4] = tmp[1] * src[1] + tmp[2] * src[2] + tmp[5] * src[3]; + dst[4] -= tmp[0] * src[1] + tmp[3] * src[2] + tmp[4] * src[3]; + dst[5] = tmp[0] * src[0] + tmp[7] * src[2] + tmp[8] * src[3]; + dst[5] -= tmp[1] * src[0] + tmp[6] * src[2] + tmp[9] * src[3]; + dst[6] = tmp[3] * src[0] + tmp[6] * src[1] + tmp[11] * src[3]; + dst[6] -= tmp[2] * src[0] + tmp[7] * src[1] + tmp[10] * src[3]; + dst[7] = tmp[4] * src[0] + tmp[9] * src[1] + tmp[10] * src[2]; + dst[7] -= tmp[5] * src[0] + tmp[8] * src[1] + tmp[11] * src[2]; + + tmp[0] = src[2] * src[7]; + tmp[1] = src[3] * src[6]; + tmp[2] = src[1] * src[7]; + tmp[3] = src[3] * src[5]; + tmp[4] = src[1] * src[6]; + tmp[5] = src[2] * src[5]; + tmp[6] = src[0] * src[7]; + tmp[7] = src[3] * src[4]; + tmp[8] = src[0] * src[6]; + tmp[9] = src[2] * src[4]; + tmp[10] = src[0] * src[5]; + tmp[11] = src[1] * src[4]; + + dst[8] = tmp[0] * src[13] + tmp[3] * src[14] + tmp[4] * src[15]; + dst[8] -= tmp[1] * src[13] + tmp[2] * src[14] + tmp[5] * src[15]; + dst[9] = tmp[1] * src[12] + tmp[6] * src[14] + tmp[9] * src[15]; + dst[9] -= tmp[0] * src[12] + tmp[7] * src[14] + tmp[8] * src[15]; + dst[10] = tmp[2] * src[12] + tmp[7] * src[13] + tmp[10] * src[15]; + dst[10] -= tmp[3] * src[12] + tmp[6] * src[13] + tmp[11] * src[15]; + dst[11] = tmp[5] * src[12] + tmp[8] * src[13] + tmp[11] * src[14]; + dst[11] -= tmp[4] * src[12] + tmp[9] * src[13] + tmp[10] * src[14]; + dst[12] = tmp[2] * src[10] + tmp[5] * src[11] + tmp[1] * src[9]; + dst[12] -= tmp[4] * src[11] + tmp[0] * src[9] + tmp[3] * src[10]; + dst[13] = tmp[8] * src[11] + tmp[0] * src[8] + tmp[7] * src[10]; + dst[13] -= tmp[6] * src[10] + tmp[9] * src[11] + tmp[1] * src[8]; + dst[14] = tmp[6] * src[9] + tmp[11] * src[11] + tmp[3] * src[8]; + dst[14] -= tmp[10] * src[11] + tmp[2] * src[8] + tmp[7] * src[9]; + dst[15] = tmp[10] * src[10] + tmp[4] * src[8] + tmp[9] * src[9]; + dst[15] -= tmp[8] * src[9] + tmp[11] * src[10] + tmp[5] * src[8]; + + det = src[0] * dst[0] + src[1] * dst[1] + src[2] * dst[2] + src[3] * dst[3]; + + det = 1.0f / det; + for(PxU32 j = 0; j < 16; j++) + { + dst[j] *= det; + } + + return Mat44V(Vec4V(dst[0], dst[4], dst[8], dst[12]), Vec4V(dst[1], dst[5], dst[9], dst[13]), + Vec4V(dst[2], dst[6], dst[10], dst[14]), Vec4V(dst[3], dst[7], dst[11], dst[15])); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + return Mat44V(Vec4V(a.col0.x, a.col1.x, a.col2.x, a.col3.x), Vec4V(a.col0.y, a.col1.y, a.col2.y, a.col3.y), + Vec4V(a.col0.z, a.col1.z, a.col2.z, a.col3.z), 
Vec4V(a.col0.w, a.col1.w, a.col2.w, a.col3.w)); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) +{ + return Vec4V(x, y, z, w); +} + +/* +PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b) +{ + return VecU16V( + PxU16(PxClamp<PxU32>((a).u32[0], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((a).u32[1], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((a).u32[2], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((a).u32[3], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[0], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[1], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[2], 0, 0xFFFF)), + PxU16(PxClamp<PxU32>((b).u32[3], 0, 0xFFFF))); +} +*/ + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return VecU32V(c.ux ? a.u32[0] : b.u32[0], c.uy ? a.u32[1] : b.u32[1], c.uz ? a.u32[2] : b.u32[2], + c.uw ? a.u32[3] : b.u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] | (b).u32[0], (a).u32[1] | (b).u32[1], (a).u32[2] | (b).u32[2], (a).u32[3] | (b).u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] ^ (b).u32[0], (a).u32[1] ^ (b).u32[1], (a).u32[2] ^ (b).u32[2], (a).u32[3] ^ (b).u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] & (b).u32[0], (a).u32[1] & (b).u32[1], (a).u32[2] & (b).u32[2], (a).u32[3] & (b).u32[3]); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + return VecU32V((a).u32[0] & ~(b).u32[0], (a).u32[1] & ~(b).u32[1], (a).u32[2] & ~(b).u32[2], + (a).u32[3] & ~(b).u32[3]); +} + +/* +PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b) +{ + return VecU16V( + (a).u16[0]|(b).u16[0], (a).u16[1]|(b).u16[1], (a).u16[2]|(b).u16[2], (a).u16[3]|(b).u16[3], + (a).u16[4]|(b).u16[4], (a).u16[5]|(b).u16[5], (a).u16[6]|(b).u16[6], (a).u16[7]|(b).u16[7]); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b) +{ + return VecU16V( + (a).u16[0]&(b).u16[0], (a).u16[1]&(b).u16[1], (a).u16[2]&(b).u16[2], (a).u16[3]&(b).u16[3], + (a).u16[4]&(b).u16[4], (a).u16[5]&(b).u16[5], (a).u16[6]&(b).u16[6], (a).u16[7]&(b).u16[7]); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b) +{ + return VecU16V( + (a).u16[0]&~(b).u16[0], (a).u16[1]&~(b).u16[1], (a).u16[2]&~(b).u16[2], (a).u16[3]&~(b).u16[3], + (a).u16[4]&~(b).u16[4], (a).u16[5]&~(b).u16[5], (a).u16[6]&~(b).u16[6], (a).u16[7]&~(b).u16[7]); +} +*/ + +/* +template<int a> PX_FORCE_INLINE VecI32V V4ISplat() +{ + return VecI32V(a, a, a, a); +} + +template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat() +{ + return VecU32V(a, a, a, a); +} +*/ + +/* +PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address) +{ + *address = val; +} +*/ + +PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + *address = val; +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + VecU32V r = V4U32Andc(*reinterpret_cast<const VecU32V*>(&a), b); + return (*reinterpret_cast<const Vec4V*>(&r)); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return VecU32V(a.x > b.x ? 0xFFFFffff : 0, a.y > b.y ? 0xFFFFffff : 0, a.z > b.z ? 0xFFFFffff : 0, + a.w > b.w ? 
0xFFFFffff : 0); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + return VecU16V + ( + BOOL_TO_U16(a.u16[0] > b.u16[0]), BOOL_TO_U16(a.u16[1] > b.u16[1]), BOOL_TO_U16(a.u16[2] > b.u16[2]), BOOL_TO_U16(a.u16[3] > b.u16[3]), + BOOL_TO_U16(a.u16[4] > b.u16[4]), BOOL_TO_U16(a.u16[5] > b.u16[5]), BOOL_TO_U16(a.u16[6] > b.u16[6]), BOOL_TO_U16(a.u16[7] > b.u16[7]) + ); +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) +{ + return VecU16V + ( + BOOL_TO_U16(a.i16[0] > b.i16[0]), BOOL_TO_U16(a.i16[1] > b.i16[1]), BOOL_TO_U16(a.i16[2] > b.i16[2]), BOOL_TO_U16(a.i16[3] > b.i16[3]), + BOOL_TO_U16(a.i16[4] > b.i16[4]), BOOL_TO_U16(a.i16[5] > b.i16[5]), BOOL_TO_U16(a.i16[6] > b.i16[6]), BOOL_TO_U16(a.i16[7] > b.i16[7]) + ); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + return Vec4V(PxF32((a).u32[0]), PxF32((a).u32[1]), PxF32((a).u32[2]), PxF32((a).u32[3])); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) +{ + return Vec4V(PxF32((a).i32[0]), PxF32((a).i32[1]), PxF32((a).i32[2]), PxF32((a).i32[3])); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + float* data = (float*)&a; + return VecI32V(PxI32(data[0]), PxI32(data[1]), PxI32(data[2]), PxI32(data[3])); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + Vec4V b = *reinterpret_cast<Vec4V*>(&a); + return b; +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + Vec4V b = *reinterpret_cast<Vec4V*>(&a); + return b; +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + VecU32V b = *reinterpret_cast<VecU32V*>(&a); + return b; +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + VecI32V b = *reinterpret_cast<VecI32V*>(&a); + return b; +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + return VecU32V((a).u32[index], (a).u32[index], (a).u32[index], (a).u32[index]); +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(BoolV a) +{ + const PxU32 u = (&a.ux)[index]; + return VecU32V(u, u, u, u); +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + float* data = (float*)&a; + return Vec4V(data[index], data[index], data[index], data[index]); +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + return VecU32V(x, y, z, w); +} + +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a) +{ + return V4Max(a, V4Neg(a)); +} + +PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b) +{ + return BoolV(BOOL_TO_U32(a.u32[0] == b.u32[0]), BOOL_TO_U32(a.u32[1] == b.u32[1]), BOOL_TO_U32(a.u32[2] == b.u32[2]), BOOL_TO_U32(a.u32[3] == b.u32[3])); +} + +PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) +{ + return VecU32V(i, i, i, i); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return VecU32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + return VecU32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return VecI32V(i, i, i, i); +} + +PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) +{ + return VecI32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + return VecI32V(i[0], i[1], i[2], i[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] + b.i32[0], a.i32[1] + b.i32[1], a.i32[2] + b.i32[2], 
a.i32[3] + b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] - b.i32[0], a.i32[1] - b.i32[1], a.i32[2] - b.i32[2], a.i32[3] - b.i32[3]); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return BoolV(BOOL_TO_U32(a.i32[0] > b.i32[0]), BOOL_TO_U32(a.i32[1] > b.i32[1]), BOOL_TO_U32(a.i32[2] > b.i32[2]), BOOL_TO_U32(a.i32[3] > b.i32[3])); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return BoolV(BOOL_TO_U32(a.i32[0] == b.i32[0]), BOOL_TO_U32(a.i32[1] == b.i32[1]), BOOL_TO_U32(a.i32[2] == b.i32[2]), BOOL_TO_U32(a.i32[3] == b.i32[3])); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return VecI32V(c.ux ? a.i32[0] : b.i32[0], c.uy ? a.i32[1] : b.i32[1], c.uz ? a.i32[2] : b.i32[2], + c.uw ? a.i32[3] : b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return VecI32V(0, 0, 0, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return VecI32V(1, 1, 1, 1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return VecI32V(2, 2, 2, 2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return VecI32V(-1, -1, -1, -1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return VecU32V(0, 0, 0, 0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return VecU32V(1, 1, 1, 1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return VecU32V(2, 2, 2, 2); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + return shift; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return VecI32V(a.i32[0] << count.i32[0], a.i32[1] << count.i32[1], a.i32[2] << count.i32[2], a.i32[3] + << count.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return VecI32V(a.i32[0] >> count.i32[0], a.i32[1] >> count.i32[1], a.i32[2] >> count.i32[2], + a.i32[3] >> count.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] & b.i32[0], a.i32[1] & b.i32[1], a.i32[2] & b.i32[2], a.i32[3] & b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(a.i32[0] | b.i32[0], a.i32[1] | b.i32[1], a.i32[2] | b.i32[2], a.i32[3] | b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) +{ + return VecI32V(a.i32[0], a.i32[0], a.i32[0], a.i32[0]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) +{ + return VecI32V(a.i32[1], a.i32[1], a.i32[1], a.i32[1]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) +{ + return VecI32V(a.i32[2], a.i32[2], a.i32[2], a.i32[2]); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) +{ + return VecI32V(a.i32[3], a.i32[3], a.i32[3], a.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + return VecI32V(c.ux ? a.i32[0] : b.i32[0], c.uy ? a.i32[1] : b.i32[1], c.uz ? a.i32[2] : b.i32[2], + c.uw ? 
a.i32[3] : b.i32[3]); +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d) +{ + return VecI32V(a.i32[0], b.i32[0], c.i32[0], d.i32[0]); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + *i = a.i32[0]; +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg b) +{ + return VecI32V(PxI32(b.ux), PxI32(b.uy), PxI32(b.uz), PxI32(b.uw)); +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg b) +{ + return VecU32V(b.ux, b.uy, b.uz, b.uw); +} + +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2) +{ + const FloatV one = FOne(); + const FloatV x = V4GetX(q); + const FloatV y = V4GetY(q); + const FloatV z = V4GetZ(q); + const FloatV w = V4GetW(q); + + const FloatV x2 = FAdd(x, x); + const FloatV y2 = FAdd(y, y); + const FloatV z2 = FAdd(z, z); + + const FloatV xx = FMul(x2, x); + const FloatV yy = FMul(y2, y); + const FloatV zz = FMul(z2, z); + + const FloatV xy = FMul(x2, y); + const FloatV xz = FMul(x2, z); + const FloatV xw = FMul(x2, w); + + const FloatV yz = FMul(y2, z); + const FloatV yw = FMul(y2, w); + const FloatV zw = FMul(z2, w); + + const FloatV v = FSub(one, xx); + + column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw)); + column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw)); + column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy)); +} + + +// not used + +/* +PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) +{ + return *addr; +} +*/ + +/* +PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) +{ + return *addr; +} +*/ + +/* +PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V a) +{ + return Vec4V(PxCeil(a.x), PxCeil(a.y), PxCeil(a.z), PxCeil(a.w)); +} + +PX_FORCE_INLINE Vec4V V4Floor(const Vec4V a) +{ + return Vec4V(PxFloor(a.x), PxFloor(a.y), PxFloor(a.z), PxFloor(a.w)); +} +*/ + +/* +PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V a, PxU32 power) +{ + PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate"); + PX_UNUSED(power); // prevent warning in release builds + PxF32 ffffFFFFasFloat = PxF32(0xFFFF0000); + return VecU32V( + PxU32(PxClamp<PxF32>((a).x, 0.0f, ffffFFFFasFloat)), + PxU32(PxClamp<PxF32>((a).y, 0.0f, ffffFFFFasFloat)), + PxU32(PxClamp<PxF32>((a).z, 0.0f, ffffFFFFasFloat)), + PxU32(PxClamp<PxF32>((a).w, 0.0f, ffffFFFFasFloat))); +} +*/ + +#endif // PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H diff --git a/PxShared/src/foundation/include/PsVecMathSSE.h b/PxShared/src/foundation/include/PsVecMathSSE.h new file mode 100644 index 00000000..08027e73 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathSSE.h @@ -0,0 +1,56 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHSSE_H +#define PSFOUNDATION_PSVECMATHSSE_H + +namespace +{ + const PX_ALIGN(16, PxF32) minus1w[4] = { 0.0f, 0.0f, 0.0f, -1.0f }; +} + +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2) +{ + const __m128 q2 = V4Add(q, q); + const __m128 qw2 = V4MulAdd(q2, V4GetW(q), _mm_load_ps(minus1w)); // (2wx, 2wy, 2wz, 2ww-1) + const __m128 nw2 = Vec3V_From_Vec4V(V4Neg(qw2)); // (-2wx, -2wy, -2wz, 0) + const __m128 v = Vec3V_From_Vec4V(q); + + const __m128 a0 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 1, 2, 3)); // (2ww-1, 2wz, -2wy, 0) + column0 = V4MulAdd(v, V4GetX(q2), a0); + + const __m128 a1 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 2, 0, 3)); // (2ww-1, 2wx, -2wz, 0) + column1 = V4MulAdd(v, V4GetY(q2), _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 1, 0, 2))); + + const __m128 a2 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 0, 1, 3)); // (2ww-1, 2wy, -2wx, 0) + column2 = V4MulAdd(v, V4GetZ(q2), _mm_shuffle_ps(a2, a2, _MM_SHUFFLE(3, 0, 2, 1))); +} + +#endif // PSFOUNDATION_PSVECMATHSSE_H + diff --git a/PxShared/src/foundation/include/PsVecMathUtilities.h b/PxShared/src/foundation/include/PsVecMathUtilities.h new file mode 100644 index 00000000..7bdb4dab --- /dev/null +++ b/PxShared/src/foundation/include/PsVecMathUtilities.h @@ -0,0 +1,57 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. 
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECMATHUTILITIES_H +#define PSFOUNDATION_PSVECMATHUTILITIES_H + +#include "PsVecMath.h" + +namespace physx +{ +namespace shdfnd +{ +namespace aos +{ +/*! + Extend an edge along its length by a factor + */ +PX_FORCE_INLINE void makeFatEdge(Vec3V& p0, Vec3V& p1, const FloatVArg fatCoeff) +{ + const Vec3V delta = V3Sub(p1, p0); + const FloatV m = V3Length(delta); + const BoolV con = FIsGrtr(m, FZero()); + const Vec3V fatDelta = V3Scale(V3ScaleInv(delta, m), fatCoeff); + p0 = V3Sel(con, V3Sub(p0, fatDelta), p0); + p1 = V3Sel(con, V3Add(p1, fatDelta), p1); +} +} +} +} + +#endif diff --git a/PxShared/src/foundation/include/PsVecQuat.h b/PxShared/src/foundation/include/PsVecQuat.h new file mode 100644 index 00000000..73eddd1f --- /dev/null +++ b/PxShared/src/foundation/include/PsVecQuat.h @@ -0,0 +1,455 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
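+// Quaternion layout note: a QuatV is stored like a Vec4V, with x, y, z holding
+// the imaginary (axis) part and w the real part; QuatConjugate and
+// QuatGetImaginaryPart below rely on that convention.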
+
+#ifndef PSFOUNDATION_PSVECQUAT_H
+#define PSFOUNDATION_PSVECQUAT_H
+
+//#include "PsInlineAoS.h"
+
+#ifndef PX_PIDIV2
+#define PX_PIDIV2 1.570796327f
+#endif
+
+//////////////////////////////////
+// QuatV
+//////////////////////////////////
+PX_FORCE_INLINE QuatV QuatVLoadXYZW(const PxF32 x, const PxF32 y, const PxF32 z, const PxF32 w)
+{
+ return V4LoadXYZW(x, y, z, w);
+}
+
+PX_FORCE_INLINE QuatV QuatVLoadU(const PxF32* v)
+{
+ return V4LoadU(v);
+}
+
+PX_FORCE_INLINE QuatV QuatVLoadA(const PxF32* v)
+{
+ return V4LoadA(v);
+}
+
+PX_FORCE_INLINE QuatV QuatV_From_RotationAxisAngle(const Vec3V u, const FloatV a)
+{
+ // q = cos(a/2) + u*sin(a/2)
+ const FloatV half = FLoad(0.5f);
+ const FloatV hangle = FMul(a, half);
+ const FloatV piByTwo(FLoad(PX_PIDIV2));
+ const FloatV PiByTwoMinHangle(FSub(piByTwo, hangle));
+ const Vec4V hangle2(Vec4V_From_Vec3V(V3Merge(hangle, PiByTwoMinHangle, hangle)));
+
+ /*const FloatV sina = FSin(hangle);
+ const FloatV cosa = FCos(hangle);*/
+
+ // one V4Sin evaluates both terms: sin(a/2) in x and sin(pi/2 - a/2) == cos(a/2) in y
+ const Vec4V _sina = V4Sin(hangle2);
+ const FloatV sina = V4GetX(_sina);
+ const FloatV cosa = V4GetY(_sina);
+
+ const Vec3V v = V3Scale(u, sina);
+ // return V4Sel(BTTTF(), Vec4V_From_Vec3V(v), V4Splat(cosa));
+ return V4SetW(Vec4V_From_Vec3V(v), cosa);
+}
+
+// Normalize
+PX_FORCE_INLINE QuatV QuatNormalize(const QuatV q)
+{
+ return V4Normalize(q);
+}
+
+PX_FORCE_INLINE FloatV QuatLength(const QuatV q)
+{
+ return V4Length(q);
+}
+
+PX_FORCE_INLINE FloatV QuatLengthSq(const QuatV q)
+{
+ return V4LengthSq(q);
+}
+
+PX_FORCE_INLINE FloatV QuatDot(const QuatV a, const QuatV b) // 4D dot product of two quaternions
+{
+ return V4Dot(a, b);
+}
+
+PX_FORCE_INLINE QuatV QuatConjugate(const QuatV q)
+{
+ return V4SetW(V4Neg(q), V4GetW(q));
+}
+
+PX_FORCE_INLINE Vec3V QuatGetImaginaryPart(const QuatV q)
+{
+ return Vec3V_From_Vec4V(q);
+}
+
+/** \brief computes rotation of x-axis */
+PX_FORCE_INLINE Vec3V QuatGetBasisVector0(const QuatV q)
+{
+ /*const PxF32 x2 = x*2.0f;
+ const PxF32 w2 = w*2.0f;
+ return PxVec3( (w * w2) - 1.0f + x*x2,
+ (z * w2) + y*x2,
+ (-y * w2) + z*x2);*/
+
+ const FloatV two = FLoad(2.f);
+ const FloatV w = V4GetW(q);
+ const Vec3V u = Vec3V_From_Vec4V(q);
+
+ const FloatV x2 = FMul(V3GetX(u), two);
+ const FloatV w2 = FMul(w, two);
+
+ const Vec3V a = V3Scale(u, x2);
+ const Vec3V tmp = V3Merge(w, V3GetZ(u), FNeg(V3GetY(u)));
+ // const Vec3V b = V3Scale(tmp, w2);
+ // const Vec3V ab = V3Add(a, b);
+ const Vec3V ab = V3ScaleAdd(tmp, w2, a);
+ return V3SetX(ab, FSub(V3GetX(ab), FOne()));
+}
+
+/** \brief computes rotation of y-axis */
+PX_FORCE_INLINE Vec3V QuatGetBasisVector1(const QuatV q)
+{
+ /*const PxF32 y2 = y*2.0f;
+ const PxF32 w2 = w*2.0f;
+ return PxVec3( (-z * w2) + x*y2,
+ (w * w2) - 1.0f + y*y2,
+ (x * w2) + z*y2);*/
+
+ const FloatV two = FLoad(2.f);
+ const FloatV w = V4GetW(q);
+ const Vec3V u = Vec3V_From_Vec4V(q);
+
+ const FloatV y2 = FMul(V3GetY(u), two);
+ const FloatV w2 = FMul(w, two);
+
+ const Vec3V a = V3Scale(u, y2);
+ const Vec3V tmp = V3Merge(FNeg(V3GetZ(u)), w, V3GetX(u));
+ // const Vec3V b = V3Scale(tmp, w2);
+ // const Vec3V ab = V3Add(a, b);
+ const Vec3V ab = V3ScaleAdd(tmp, w2, a);
+ return V3SetY(ab, FSub(V3GetY(ab), FOne()));
+}
+
+/** \brief computes rotation of z-axis */
+PX_FORCE_INLINE Vec3V QuatGetBasisVector2(const QuatV q)
+{
+ /*const PxF32 z2 = z*2.0f;
+ const PxF32 w2 = w*2.0f;
+ return PxVec3( (y * w2) + x*z2,
+ (-x * w2) + y*z2,
+ (w * w2) - 1.0f + z*z2);*/
+
+ const FloatV two = FLoad(2.f);
+ const FloatV w = V4GetW(q);
+ const Vec3V u =
Vec3V_From_Vec4V(q); + + const FloatV z2 = FMul(V3GetZ(u), two); + const FloatV w2 = FMul(w, two); + + const Vec3V a = V3Scale(u, z2); + const Vec3V tmp = V3Merge(V3GetY(u), FNeg(V3GetX(u)), w); + /*const Vec3V b = V3Scale(tmp, w2); + const Vec3V ab = V3Add(a, b);*/ + const Vec3V ab = V3ScaleAdd(tmp, w2, a); + return V3SetZ(ab, FSub(V3GetZ(ab), FOne())); +} + +PX_FORCE_INLINE Vec3V QuatRotate(const QuatV q, const Vec3V v) +{ + /* + const PxVec3 qv(x,y,z); + return (v*(w*w-0.5f) + (qv.cross(v))*w + qv*(qv.dot(v)))*2; + */ + + const FloatV two = FLoad(2.f); + // const FloatV half = FloatV_From_F32(0.5f); + const FloatV nhalf = FLoad(-0.5f); + const Vec3V u = Vec3V_From_Vec4V(q); + const FloatV w = V4GetW(q); + // const FloatV w2 = FSub(FMul(w, w), half); + const FloatV w2 = FScaleAdd(w, w, nhalf); + const Vec3V a = V3Scale(v, w2); + // const Vec3V b = V3Scale(V3Cross(u, v), w); + // const Vec3V c = V3Scale(u, V3Dot(u, v)); + // return V3Scale(V3Add(V3Add(a, b), c), two); + const Vec3V temp = V3ScaleAdd(V3Cross(u, v), w, a); + return V3Scale(V3ScaleAdd(u, V3Dot(u, v), temp), two); +} + +PX_FORCE_INLINE Vec3V QuatTransform(const QuatV q, const Vec3V p, const Vec3V v) +{ + // p + q.rotate(v) + const FloatV two = FLoad(2.f); + // const FloatV half = FloatV_From_F32(0.5f); + const FloatV nhalf = FLoad(-0.5f); + const Vec3V u = Vec3V_From_Vec4V(q); + const FloatV w = V4GetW(q); + // const FloatV w2 = FSub(FMul(w, w), half); + const FloatV w2 = FScaleAdd(w, w, nhalf); + const Vec3V a = V3Scale(v, w2); + /*const Vec3V b = V3Scale(V3Cross(u, v), w); + const Vec3V c = V3Scale(u, V3Dot(u, v)); + return V3ScaleAdd(V3Add(V3Add(a, b), c), two, p);*/ + const Vec3V temp = V3ScaleAdd(V3Cross(u, v), w, a); + const Vec3V z = V3ScaleAdd(u, V3Dot(u, v), temp); + return V3ScaleAdd(z, two, p); +} + +PX_FORCE_INLINE Vec3V QuatRotateInv(const QuatV q, const Vec3V v) +{ + + // const PxVec3 qv(x,y,z); + // return (v*(w*w-0.5f) - (qv.cross(v))*w + qv*(qv.dot(v)))*2; + + const FloatV two = FLoad(2.f); + const FloatV nhalf = FLoad(-0.5f); + const Vec3V u = Vec3V_From_Vec4V(q); + const FloatV w = V4GetW(q); + const FloatV w2 = FScaleAdd(w, w, nhalf); + const Vec3V a = V3Scale(v, w2); + /*const Vec3V b = V3Scale(V3Cross(u, v), w); + const Vec3V c = V3Scale(u, V3Dot(u, v)); + return V3Scale(V3Add(V3Sub(a, b), c), two);*/ + const Vec3V temp = V3NegScaleSub(V3Cross(u, v), w, a); + return V3Scale(V3ScaleAdd(u, V3Dot(u, v), temp), two); +} + +PX_FORCE_INLINE QuatV QuatMul(const QuatV a, const QuatV b) +{ + const Vec3V imagA = Vec3V_From_Vec4V(a); + const Vec3V imagB = Vec3V_From_Vec4V(b); + const FloatV rA = V4GetW(a); + const FloatV rB = V4GetW(b); + + const FloatV real = FSub(FMul(rA, rB), V3Dot(imagA, imagB)); + const Vec3V v0 = V3Scale(imagA, rB); + const Vec3V v1 = V3Scale(imagB, rA); + const Vec3V v2 = V3Cross(imagA, imagB); + const Vec3V imag = V3Add(V3Add(v0, v1), v2); + + return V4SetW(Vec4V_From_Vec3V(imag), real); +} + +PX_FORCE_INLINE QuatV QuatAdd(const QuatV a, const QuatV b) +{ + return V4Add(a, b); +} + +PX_FORCE_INLINE QuatV QuatNeg(const QuatV q) +{ + return V4Neg(q); +} + +PX_FORCE_INLINE QuatV QuatSub(const QuatV a, const QuatV b) +{ + return V4Sub(a, b); +} + +PX_FORCE_INLINE QuatV QuatScale(const QuatV a, const FloatV b) +{ + return V4Scale(a, b); +} + +PX_FORCE_INLINE QuatV QuatMerge(const FloatV* const floatVArray) +{ + return V4Merge(floatVArray); +} + +PX_FORCE_INLINE QuatV QuatMerge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w) +{ + return V4Merge(x, y, z, w); +} + 
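+
+// Illustrative composition (all names as defined in this header): QuatMul(a, b)
+// concatenates rotations so that rotating by the product applies b first and
+// then a, i.e. QuatRotate(QuatMul(a, b), v) == QuatRotate(a, QuatRotate(b, v)).
+//   const QuatV qz = QuatV_From_RotationAxisAngle(V3UnitZ(), FLoad(PX_PIDIV2));
+//   const Vec3V r = QuatRotate(qz, V3UnitX());   // ~(0, 1, 0)
+//   const Vec3V back = QuatRotateInv(qz, r);     // ~(1, 0, 0)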
+PX_FORCE_INLINE QuatV QuatIdentity() +{ + return V4SetW(V4Zero(), FOne()); +} + +PX_FORCE_INLINE bool isFiniteQuatV(const QuatV q) +{ + return isFiniteVec4V(q); +} + +PX_FORCE_INLINE bool isValidQuatV(const QuatV q) +{ + const FloatV unitTolerance = FLoad(1e-4f); + const FloatV tmp = FAbs(FSub(QuatLength(q), FOne())); + const BoolV con = FIsGrtr(unitTolerance, tmp); + return isFiniteVec4V(q) & (BAllEqTTTT(con) == 1); +} + +PX_FORCE_INLINE bool isSaneQuatV(const QuatV q) +{ + const FloatV unitTolerance = FLoad(1e-2f); + const FloatV tmp = FAbs(FSub(QuatLength(q), FOne())); + const BoolV con = FIsGrtr(unitTolerance, tmp); + return isFiniteVec4V(q) & (BAllEqTTTT(con) == 1); +} + +PX_FORCE_INLINE Mat33V QuatGetMat33V(const QuatVArg q) +{ + // const FloatV two = FloatV_From_F32(2.f); + // const FloatV one = FOne(); + + // const FloatV x = V4GetX(q); + // const FloatV y = V4GetY(q); + // const FloatV z = V4GetZ(q); + // const Vec4V _q = V4Mul(q, two); + // + ////const FloatV w = V4GetW(q); + + // const Vec4V t0 = V4Mul(_q, x); // 2xx, 2xy, 2xz, 2xw + // const Vec4V t1 = V4Mul(_q, y); // 2xy, 2yy, 2yz, 2yw + // const Vec4V t2 = V4Mul(_q, z); // 2xz, 2yz, 2zz, 2zw + ////const Vec4V t3 = V4Mul(_q, w); // 2xw, 2yw, 2zw, 2ww + + // const FloatV xx2 = V4GetX(t0); + // const FloatV xy2 = V4GetY(t0); + // const FloatV xz2 = V4GetZ(t0); + // const FloatV xw2 = V4GetW(t0); + + // const FloatV yy2 = V4GetY(t1); + // const FloatV yz2 = V4GetZ(t1); + // const FloatV yw2 = V4GetW(t1); + + // const FloatV zz2 = V4GetZ(t2); + // const FloatV zw2 = V4GetW(t2); + + ////const FloatV ww2 = V4GetW(t3); + + // const FloatV c00 = FSub(one, FAdd(yy2, zz2)); + // const FloatV c01 = FSub(xy2, zw2); + // const FloatV c02 = FAdd(xz2, yw2); + + // const FloatV c10 = FAdd(xy2, zw2); + // const FloatV c11 = FSub(one, FAdd(xx2, zz2)); + // const FloatV c12 = FSub(yz2, xw2); + + // const FloatV c20 = FSub(xz2, yw2); + // const FloatV c21 = FAdd(yz2, xw2); + // const FloatV c22 = FSub(one, FAdd(xx2, yy2)); + + // const Vec3V c0 = V3Merge(c00, c10, c20); + // const Vec3V c1 = V3Merge(c01, c11, c21); + // const Vec3V c2 = V3Merge(c02, c12, c22); + + // return Mat33V(c0, c1, c2); + + const FloatV one = FOne(); + const FloatV x = V4GetX(q); + const FloatV y = V4GetY(q); + const FloatV z = V4GetZ(q); + const FloatV w = V4GetW(q); + + const FloatV x2 = FAdd(x, x); + const FloatV y2 = FAdd(y, y); + const FloatV z2 = FAdd(z, z); + + const FloatV xx = FMul(x2, x); + const FloatV yy = FMul(y2, y); + const FloatV zz = FMul(z2, z); + + const FloatV xy = FMul(x2, y); + const FloatV xz = FMul(x2, z); + const FloatV xw = FMul(x2, w); + + const FloatV yz = FMul(y2, z); + const FloatV yw = FMul(y2, w); + const FloatV zw = FMul(z2, w); + + const FloatV v = FSub(one, xx); + + const Vec3V column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw)); + const Vec3V column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw)); + const Vec3V column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy)); + return Mat33V(column0, column1, column2); +} + +PX_FORCE_INLINE QuatV Mat33GetQuatV(const Mat33V& a) +{ + const FloatV one = FOne(); + const FloatV zero = FZero(); + const FloatV half = FLoad(0.5f); + const FloatV two = FLoad(2.f); + const FloatV scale = FLoad(0.25f); + const FloatV a00 = V3GetX(a.col0); + const FloatV a11 = V3GetY(a.col1); + const FloatV a22 = V3GetZ(a.col2); + + const FloatV a21 = V3GetZ(a.col1); // row=2, col=1; + const FloatV a12 = V3GetY(a.col2); // row=1, col=2; + const FloatV a02 = V3GetX(a.col2); // row=0, col=2; + const 
FloatV a20 = V3GetZ(a.col0); // row=2, col=0; + const FloatV a10 = V3GetY(a.col0); // row=1, col=0; + const FloatV a01 = V3GetX(a.col1); // row=0, col=1; + + const Vec3V vec0 = V3Merge(a21, a02, a10); + const Vec3V vec1 = V3Merge(a12, a20, a01); + const Vec3V v = V3Sub(vec0, vec1); + const Vec3V g = V3Add(vec0, vec1); + + const FloatV trace = FAdd(a00, FAdd(a11, a22)); + + if(FAllGrtrOrEq(trace, zero)) + { + const FloatV h = FSqrt(FAdd(trace, one)); + const FloatV w = FMul(half, h); + const FloatV s = FMul(half, FRecip(h)); + const Vec3V u = V3Scale(v, s); + return V4SetW(Vec4V_From_Vec3V(u), w); + } + else + { + const FloatV ntrace = FNeg(trace); + const Vec3V d = V3Merge(a00, a11, a22); + const BoolV con0 = BAllTrue3(V3IsGrtrOrEq(V3Splat(a00), d)); + const BoolV con1 = BAllTrue3(V3IsGrtrOrEq(V3Splat(a11), d)); + + const FloatV t0 = FAdd(one, FScaleAdd(a00, two, ntrace)); + const FloatV t1 = FAdd(one, FScaleAdd(a11, two, ntrace)); + const FloatV t2 = FAdd(one, FScaleAdd(a22, two, ntrace)); + + const FloatV t = FSel(con0, t0, FSel(con1, t1, t2)); + + const FloatV h = FMul(two, FSqrt(t)); + const FloatV s = FRecip(h); + const FloatV g0 = FMul(scale, h); + const Vec3V vs = V3Scale(v, s); + const Vec3V gs = V3Scale(g, s); + const FloatV gsx = V3GetX(gs); + const FloatV gsy = V3GetY(gs); + const FloatV gsz = V3GetZ(gs); + // vs.x= (a21 - a12)*s; vs.y=(a02 - a20)*s; vs.z=(a10 - a01)*s; + // gs.x= (a21 + a12)*s; gs.y=(a02 + a20)*s; gs.z=(a10 + a01)*s; + const Vec4V v0 = V4Merge(g0, gsz, gsy, V3GetX(vs)); + const Vec4V v1 = V4Merge(gsz, g0, gsx, V3GetY(vs)); + const Vec4V v2 = V4Merge(gsy, gsx, g0, V3GetZ(vs)); + return V4Sel(con0, v0, V4Sel(con1, v1, v2)); + } +} + +#endif diff --git a/PxShared/src/foundation/include/PsVecTransform.h b/PxShared/src/foundation/include/PsVecTransform.h new file mode 100644 index 00000000..974f6fa2 --- /dev/null +++ b/PxShared/src/foundation/include/PsVecTransform.h @@ -0,0 +1,283 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. 
All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSVECTRANSFORM_H +#define PSFOUNDATION_PSVECTRANSFORM_H + +#include "PsVecMath.h" +#include "foundation/PxTransform.h" + +namespace physx +{ +namespace shdfnd +{ +namespace aos +{ + +class PsTransformV +{ + public: + QuatV q; + Vec3V p; + + PX_FORCE_INLINE PsTransformV(const PxTransform& orientation) + { + // const PxQuat oq = orientation.q; + // const PxF32 f[4] = {oq.x, oq.y, oq.z, oq.w}; + q = QuatVLoadXYZW(orientation.q.x, orientation.q.y, orientation.q.z, orientation.q.w); + // q = QuatV_From_F32Array(&oq.x); + p = V3LoadU(orientation.p); + } + + PX_FORCE_INLINE PsTransformV(const Vec3VArg p0 = V3Zero(), const QuatVArg q0 = QuatIdentity()) : q(q0), p(p0) + { + PX_ASSERT(isSaneQuatV(q0)); + } + + PX_FORCE_INLINE PsTransformV operator*(const PsTransformV& x) const + { + PX_ASSERT(x.isSane()); + return transform(x); + } + + PX_FORCE_INLINE PsTransformV getInverse() const + { + PX_ASSERT(isFinite()); + // return PxTransform(q.rotateInv(-p),q.getConjugate()); + return PsTransformV(QuatRotateInv(q, V3Neg(p)), QuatConjugate(q)); + } + + PX_FORCE_INLINE void normalize() + { + p = V3Zero(); + q = QuatIdentity(); + } + + PX_FORCE_INLINE void Invalidate() + { + p = V3Splat(FMax()); + q = QuatIdentity(); + } + + PX_FORCE_INLINE Vec3V transform(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotate(input) + p; + return QuatTransform(q, p, input); + } + + PX_FORCE_INLINE Vec3V transformInv(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotateInv(input-p); + return QuatRotateInv(q, V3Sub(input, p)); + } + + PX_FORCE_INLINE Vec3V rotate(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotate(input); + return QuatRotate(q, input); + } + + PX_FORCE_INLINE Vec3V rotateInv(const Vec3VArg input) const + { + PX_ASSERT(isFinite()); + // return q.rotateInv(input); + return QuatRotateInv(q, input); + } + + //! Transform transform to parent (returns compound transform: first src, then *this) + PX_FORCE_INLINE PsTransformV transform(const PsTransformV& src) const + { + PX_ASSERT(src.isSane()); + PX_ASSERT(isSane()); + // src = [srct, srcr] -> [r*srct + t, r*srcr] + // return PxTransform(q.rotate(src.p) + p, q*src.q); + return PsTransformV(V3Add(QuatRotate(q, src.p), p), QuatMul(q, src.q)); + } + + /** + \brief returns true if finite and q is a unit quaternion + */ + + PX_FORCE_INLINE bool isValid() const + { + // return p.isFinite() && q.isFinite() && q.isValid(); + return isFiniteVec3V(p) & isFiniteQuatV(q) & isValidQuatV(q); + } + + /** + \brief returns true if finite and quat magnitude is reasonably close to unit to allow for some accumulation of error + vs isValid + */ + + PX_FORCE_INLINE bool isSane() const + { + // return isFinite() && q.isSane(); + return isFinite() & isSaneQuatV(q); + } + + /** + \brief returns true if all elems are finite (not NAN or INF, etc.) + */ + PX_FORCE_INLINE bool isFinite() const + { + // return p.isFinite() && q.isFinite(); + return isFiniteVec3V(p) & isFiniteQuatV(q); + } + + //! 
Transform transform from parent (returns compound transform: first src, then this->inverse) + PX_FORCE_INLINE PsTransformV transformInv(const PsTransformV& src) const + { + PX_ASSERT(src.isSane()); + PX_ASSERT(isFinite()); + // src = [srct, srcr] -> [r^-1*(srct-t), r^-1*srcr] + /*PxQuat qinv = q.getConjugate(); + return PxTransform(qinv.rotate(src.p - p), qinv*src.q);*/ + const QuatV qinv = QuatConjugate(q); + const Vec3V v = QuatRotate(qinv, V3Sub(src.p, p)); + const QuatV rot = QuatMul(qinv, src.q); + return PsTransformV(v, rot); + } + + static PX_FORCE_INLINE PsTransformV createIdentity() + { + return PsTransformV(V3Zero()); + } +}; + +PX_FORCE_INLINE PsTransformV loadTransformA(const PxTransform& transform) +{ + const QuatV q0 = QuatVLoadA(&transform.q.x); + const Vec3V p0 = V3LoadA(&transform.p.x); + + return PsTransformV(p0, q0); +} + +PX_FORCE_INLINE PsTransformV loadTransformU(const PxTransform& transform) +{ + const QuatV q0 = QuatVLoadU(&transform.q.x); + const Vec3V p0 = V3LoadU(&transform.p.x); + + return PsTransformV(p0, q0); +} + +class PsMatTransformV +{ + public: + Mat33V rot; + Vec3V p; + + PX_FORCE_INLINE PsMatTransformV() + { + p = V3Zero(); + rot = M33Identity(); + } + PX_FORCE_INLINE PsMatTransformV(const Vec3VArg _p, const Mat33V& _rot) + { + p = _p; + rot = _rot; + } + + PX_FORCE_INLINE PsMatTransformV(const PsTransformV& other) + { + p = other.p; + QuatGetMat33V(other.q, rot.col0, rot.col1, rot.col2); + } + + PX_FORCE_INLINE PsMatTransformV(const Vec3VArg _p, const QuatV& quat) + { + p = _p; + QuatGetMat33V(quat, rot.col0, rot.col1, rot.col2); + } + + PX_FORCE_INLINE Vec3V getCol0() const + { + return rot.col0; + } + + PX_FORCE_INLINE Vec3V getCol1() const + { + return rot.col1; + } + + PX_FORCE_INLINE Vec3V getCol2() const + { + return rot.col2; + } + + PX_FORCE_INLINE void setCol0(const Vec3VArg col0) + { + rot.col0 = col0; + } + + PX_FORCE_INLINE void setCol1(const Vec3VArg col1) + { + rot.col1 = col1; + } + + PX_FORCE_INLINE void setCol2(const Vec3VArg col2) + { + rot.col2 = col2; + } + + PX_FORCE_INLINE Vec3V transform(const Vec3VArg input) const + { + return V3Add(p, M33MulV3(rot, input)); + } + + PX_FORCE_INLINE Vec3V transformInv(const Vec3VArg input) const + { + return M33TrnspsMulV3(rot, V3Sub(input, p)); // QuatRotateInv(q, V3Sub(input, p)); + } + + PX_FORCE_INLINE Vec3V rotate(const Vec3VArg input) const + { + return M33MulV3(rot, input); + } + + PX_FORCE_INLINE Vec3V rotateInv(const Vec3VArg input) const + { + return M33TrnspsMulV3(rot, input); + } + + PX_FORCE_INLINE PsMatTransformV transformInv(const PsMatTransformV& src) const + { + + const Vec3V v = M33TrnspsMulV3(rot, V3Sub(src.p, p)); + const Mat33V mat = M33MulM33(M33Trnsps(rot), src.rot); + return PsMatTransformV(v, mat); + } +}; +} +} +} + +#endif diff --git a/PxShared/src/foundation/include/unix/PsUnixAoS.h b/PxShared/src/foundation/include/unix/PsUnixAoS.h new file mode 100644 index 00000000..a40ddb73 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixAoS.h @@ -0,0 +1,47 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXAOS_H +#define PSFOUNDATION_PSUNIXAOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +#if PX_INTEL_FAMILY +#include "sse2/PsUnixSse2AoS.h" +#elif PX_NEON +#include "neon/PsUnixNeonAoS.h" +#else +#error No SIMD implementation for this unix platform. +#endif + +#endif // PSFOUNDATION_PSUNIXAOS_H diff --git a/PxShared/src/foundation/include/unix/PsUnixFPU.h b/PxShared/src/foundation/include/unix/PsUnixFPU.h new file mode 100644 index 00000000..db1acc6d --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixFPU.h @@ -0,0 +1,66 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXFPU_H +#define PSFOUNDATION_PSUNIXFPU_H + +#include "foundation/PxPreprocessor.h" + +#if PX_LINUX || PX_PS4 || PX_OSX + +#if PX_X86 || PX_X64 +#include <xmmintrin.h> +#elif PX_NEON +#include <arm_neon.h> +#endif + + +PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard() +{ +#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64) + mControlWord = _mm_getcsr(); + // set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6)) + _mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6)); +#endif +} + +PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard() +{ +#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64) + // restore control word and clear exception flags + // (setting exception state flags causes exceptions on the first following fp operation) + _mm_setcsr(mControlWord & ~_MM_EXCEPT_MASK); +#endif +} + +#else +#error No SIMD implementation for this unix platform. +#endif // PX_LINUX || PX_PS4 || PX_OSX + +#endif // #ifndef PSFOUNDATION_PSUNIXFPU_H
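The two inline definitions above are what give SIMDGuard its scoped (RAII) behavior on the x86 unix targets: the constructor snapshots the MXCSR control word and switches on flush-to-zero and denormals-are-zero, and the destructor writes the saved word back with the sticky exception flags cleared, so the restore itself cannot raise a pending exception. A minimal usage sketch, assuming the SIMDGuard declaration that lives in PsFPU.h in this same foundation layer; solverHotLoop is a hypothetical caller, not part of this commit:

void solverHotLoop()
{
    // constructor above runs here: MXCSR saved, FTZ/DAZ enabled
    physx::shdfnd::SIMDGuard simdGuard;

    // ... denormal-sensitive SIMD work ...

} // destructor above runs here: saved MXCSR restored, exception flags masked out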
diff --git a/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h b/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h new file mode 100644 index 00000000..1ba626e8 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h @@ -0,0 +1,48 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXINLINEAOS_H +#define PSFOUNDATION_PSUNIXINLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +#if PX_INTEL_FAMILY +#include "sse2/PsUnixSse2InlineAoS.h" +#elif PX_NEON +#include "neon/PsUnixNeonInlineAoS.h" +#else +#error No SIMD implementation for this unix platform. +#endif + +#endif // PSFOUNDATION_PSUNIXINLINEAOS_H diff --git a/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h b/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h new file mode 100644 index 00000000..1b738518 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h @@ -0,0 +1,153 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXINTRINSICS_H +#define PSFOUNDATION_PSUNIXINTRINSICS_H + +#include "Ps.h" +#include "foundation/PxAssert.h" +#include <math.h> + +#if PX_ANDROID +#include <signal.h> // for Ns::debugBreak() { raise(SIGTRAP); } +#endif + +#if 0 +#include <libkern/OSAtomic.h> +#endif + +// this file is for internal intrinsics - that is, intrinsics that are used in +// cross platform code but do not appear in the API + +#if !(PX_LINUX || PX_ANDROID || PX_PS4 || PX_APPLE_FAMILY) +#error "This file should only be included by unix builds!!" +#endif + +namespace physx +{ +namespace shdfnd +{ + +PX_FORCE_INLINE void memoryBarrier() +{ + __sync_synchronize(); +} + +/*! +Return the index of the highest set bit. Undefined for zero arg. +*/ +PX_INLINE uint32_t highestSetBitUnsafe(uint32_t v) +{ + return 31 - __builtin_clz(v); +}
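A quick worked example of the two bit-scan helpers on either side of this point (illustrative values only, not part of the header):

// For v = 0x50u (binary 0101'0000, bits 4 and 6 set):
//   __builtin_clz(v) == 25, so highestSetBitUnsafe(v) == 31 - 25 == 6
//   __builtin_ctz(v) == 4,  so lowestSetBitUnsafe(v)  == 4
// Both are undefined for v == 0 (hence the "Unsafe" suffix);
// countLeadingZeros() below adds the explicit zero check and returns 32.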
+ +/*! +Return the index of the lowest set bit. Undefined for zero arg. +*/ +PX_INLINE int32_t lowestSetBitUnsafe(uint32_t v) +{ + return __builtin_ctz(v); +} + +/*! +Returns the number of leading zeros in v. Returns 32 for v=0. +*/ +PX_INLINE uint32_t countLeadingZeros(uint32_t v) +{ + if(v) + return __builtin_clz(v); + else + return 32; +} + +/*! +Prefetch one cache line (64 bytes on x86, 32 bytes on 32-bit ARM) around \c ptr+offset. +*/ +PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) +{ + __builtin_prefetch(reinterpret_cast<const char* PX_RESTRICT>(ptr) + offset, 0, 3); +} + +/*! +Prefetch \c count bytes starting at \c ptr. +*/ +#if PX_ANDROID || PX_IOS +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = static_cast<const char*>(ptr); + size_t p = reinterpret_cast<size_t>(ptr); + uint32_t startLine = uint32_t(p >> 5), endLine = uint32_t((p + count - 1) >> 5); + uint32_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 32; + } while(--lines); +} +#else +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = reinterpret_cast<const char*>(ptr); + uint64_t p = size_t(ptr); + uint64_t startLine = p >> 6, endLine = (p + count - 1) >> 6; + uint64_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 64; + } while(--lines); +} +#endif + +//! \brief platform-specific reciprocal +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipFast(float a) +{ + return 1.0f / a; +} + +//! \brief platform-specific fast reciprocal square root +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipSqrtFast(float a) +{ + return 1.0f / ::sqrtf(a); +} + +//! \brief platform-specific floor +PX_CUDA_CALLABLE PX_FORCE_INLINE float floatFloor(float x) +{ + return ::floorf(x); +} + +#define NS_EXPECT_TRUE(x) x +#define NS_EXPECT_FALSE(x) x + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSUNIXINTRINSICS_H diff --git a/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h b/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h new file mode 100644 index 00000000..f742e293 --- /dev/null +++ b/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h @@ -0,0 +1,82 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXTRIGCONSTANTS_H +#define PSFOUNDATION_PSUNIXTRIGCONSTANTS_H + +//#define PX_GLOBALCONST extern const __declspec(selectany) +#define PX_GLOBALCONST extern const __attribute__((weak)) + +PX_ALIGN_PREFIX(16) +struct PX_VECTORF32 +{ + float f[4]; +} PX_ALIGN_SUFFIX(16); + +PX_GLOBALCONST PX_VECTORF32 g_PXSinCoefficients0 = { { 1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients1 = { { 2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients2 = { { 2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXCosCoefficients0 = { { 1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients1 = { { 2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients2 = { { 4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanCoefficients0 = { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients1 = { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients2 = { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients0 = { { -0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients1 = { { 0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients2 = { { -1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXATanCoefficients0 = { { 1.0f, 0.333333334f, 0.2f, 0.142857143f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients1 = { { 1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients2 = { { 5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinEstCoefficients = { { 1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosEstCoefficients = { { 1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanEstCoefficients = { { 2.484f, -1.954923183e-1f, 2.467401101f, PxInvPi } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanEstCoefficients = { { 7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, PxPiDivTwo } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinEstCoefficients = { { -1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXASinEstConstants = { { 1.00000011921f, PxPiDivTwo, 0.0f, 0.0f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXPiConstants0 = { { PxPi, PxTwoPi, PxInvPi, PxInvTwoPi } }; +PX_GLOBALCONST PX_VECTORF32 g_PXReciprocalTwoPi = { { PxInvTwoPi, PxInvTwoPi, PxInvTwoPi, PxInvTwoPi } }; 
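These tables feed the vectorized FSin/FCos further down in this commit: FSin, for example, loads g_PXSinCoefficients0..2 and folds them in term by term with FScaleAdd. The full-precision sin/cos entries are simply the reciprocal-factorial terms of the Taylor series, with alternating signs baked into the constants, which is easy to verify by hand:

// sin(x) = x - x^3/3! + x^5/5! - x^7/7! + x^9/9! - ...
//   1/3! = 0.166666667, 1/5! = 8.333333333e-3, 1/7! = 1.984126984e-4,
//   1/9! = 2.755731922e-6, ... matching g_PXSinCoefficients0/1/2 above.
// cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
//   1/2! = 0.5, 1/4! = 4.166666667e-2, 1/6! = 1.388888889e-3, ...
// The *Est* tables appear to hold shorter low-degree fits for the fast
// estimate paths rather than truncated Taylor terms.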
+PX_GLOBALCONST PX_VECTORF32 g_PXTwoPi = { { PxTwoPi, PxTwoPi, PxTwoPi, PxTwoPi } }; + +#endif diff --git a/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h b/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h new file mode 100644 index 00000000..3a3a02e1 --- /dev/null +++ b/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h @@ -0,0 +1,129 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXNEONAOS_H +#define PSFOUNDATION_PSUNIXNEONAOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. 
+#endif + +// only ARM NEON compatible platforms should reach this +#include <arm_neon.h> + +typedef float32x2_t FloatV; +typedef float32x4_t Vec3V; +typedef float32x4_t Vec4V; +typedef uint32x4_t BoolV; +typedef float32x4_t QuatV; + +typedef uint32x4_t VecU32V; +typedef int32x4_t VecI32V; +typedef uint16x8_t VecU16V; +typedef int16x8_t VecI16V; +typedef uint8x16_t VecU8V; + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define VecU8VArg VecU8V & +#define QuatVArg QuatV & + +// KS - TODO - make an actual VecCrossV type for NEON +#define VecCrossV Vec3V + +typedef VecI32V VecShiftV; +#define VecShiftVArg VecShiftV & + +PX_ALIGN_PREFIX(16) +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); + Vec3V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); + Vec4V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +#endif // PSFOUNDATION_PSUNIXNEONAOS_H diff --git a/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h b/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h new file mode 100644 index 00000000..a4f820ea --- /dev/null +++ b/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h @@ -0,0 +1,3577 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. 
No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXNEONINLINEAOS_H +#define PSFOUNDATION_PSUNIXNEONINLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// improved estimates +#define VRECIPEQ recipq_newton<1> +#define VRECIPE recip_newton<1> +#define VRECIPSQRTEQ rsqrtq_newton<1> +#define VRECIPSQRTE rsqrt_newton<1> + +// "exact" +#define VRECIPQ recipq_newton<4> +#define VRECIP recip_newton<4> +#define VRECIPSQRTQ rsqrtq_newton<4> +#define VRECIPSQRT rsqrt_newton<4> + +#define VECMATH_AOS_EPSILON (1e-3f) + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +////////////////////////////////////////////////////////////////////// +//Test that Vec3V and FloatV are legal +////////////////////////////////// + +#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f +PX_FORCE_INLINE bool isValidFloatV(const FloatV a) +{ + /* + PX_ALIGN(16, PxF32) data[4]; + vst1_f32(reinterpret_cast<float32_t*>(data), a); + return + PxU32* intData = reinterpret_cast<PxU32*>(data); + return intData[0] == intData[1]; + */ + PX_ALIGN(16, PxF32) data[4]; + vst1_f32(reinterpret_cast<float32_t*>(data), a); + const float32_t x = data[0]; + const float32_t y = data[1]; + + if (PxAbs(x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + { + return true; + } + + if (PxAbs((x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + { + return true; + } + + return false; +} + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + const float32_t w = vgetq_lane_f32(a, 3); + return (0.0f == w); + //const PxU32* intData = reinterpret_cast<const PxU32*>(&w); + //return *intData == 0; +} + +PX_FORCE_INLINE bool isAligned16(const void* a) +{ + return(0 == (size_t(a) & 0x0f)); +} + +#if PX_DEBUG +#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a)) +#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a)) +#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(static_cast<const void*>(a))) +#else +#define ASSERT_ISVALIDVEC3V(a) +#define ASSERT_ISVALIDFLOATV(a) +#define ASSERT_ISALIGNED16(a) +#endif + +namespace internalUnitNeonSimd +{ +PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a) +{ + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32(vget_lane_u32(finalReduce, 0) == 0xffffFFFF); +} + +PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) +{ + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32((vget_lane_u32(finalReduce, 0) & 0xffFFff) == 0xffFFff); +} + +PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a) +{ + const uint16x4_t dHigh = 
vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32(vget_lane_u32(finalReduce, 0) != 0x0); +} + +PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a) +{ + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + const uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + return PxU32((vget_lane_u32(finalReduce, 0) & 0xffFFff) != 0); +} +} + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + PX_ALIGN(16, PxF32) data[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vceq_f32(a, b), 0) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V3AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return V4AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return internalUnitNeonSimd::BAllTrue4_R(vceqq_u32(a, b)) != 0; +} + +PX_FORCE_INLINE PxU32 V4U32AllEq(const VecU32V a, const VecU32V b) +{ + return internalUnitNeonSimd::BAllTrue4_R(V4IsEqU32(a, b)); +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return V4U32AllEq(a, b) != 0; +} + +PX_FORCE_INLINE BoolV V4IsEqI32(const VecI32V a, const VecI32V b) +{ + return vceqq_s32(a, b); +} + +PX_FORCE_INLINE PxU32 V4I32AllEq(const VecI32V a, const VecI32V b) +{ + return internalUnitNeonSimd::BAllTrue4_R(V4IsEqI32(a, b)); +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + return V4I32AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + + const float32x2_t c = vsub_f32(a, b); + const float32x2_t error = vdup_n_f32(VECMATH_AOS_EPSILON); +// absolute compare abs(error) > abs(c) + const uint32x2_t greater = vcagt_f32(error, c); + const uint32x2_t min = vpmin_u32(greater, greater); + return vget_lane_u32(min, 0) != 0x0; +} + +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + const float32x4_t c = vsubq_f32(a, b); + const float32x4_t error = vdupq_n_f32(VECMATH_AOS_EPSILON); +// absolute compare abs(error) > abs(c) + const uint32x4_t greater = vcagtq_f32(error, c); + return internalUnitNeonSimd::BAllTrue3_R(greater) != 0; +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const float32x4_t c = vsubq_f32(a, b); + const float32x4_t error = vdupq_n_f32(VECMATH_AOS_EPSILON); +// absolute compare abs(error) > abs(c) + const uint32x4_t greater = vcagtq_f32(error, c); + return internalUnitNeonSimd::BAllTrue4_R(greater) != 0x0; +} +} + +#if 0 // debugging printfs +#include <stdio.h> +PX_FORCE_INLINE void printVec(const float32x4_t& v, const char* name) +{ + PX_ALIGN(16, float32_t) data[4]; + vst1q_f32(data, v); + printf("%s: (%f, %f, %f, %f)\n", name, data[0], data[1], data[2], data[3]); +} + 
+PX_FORCE_INLINE void printVec(const float32x2_t& v, const char* name) +{ + PX_ALIGN(16, float32_t) data[2]; + vst1_f32(data, v); + printf("%s: (%f, %f)\n", name, data[0], data[1]); +} + +PX_FORCE_INLINE void printVec(const uint32x4_t& v, const char* name) +{ + PX_ALIGN(16, uint32_t) data[4]; + vst1q_u32(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]); +} + +PX_FORCE_INLINE void printVec(const uint16x8_t& v, const char* name) +{ + PX_ALIGN(16, uint16_t) data[8]; + vst1q_u16(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); +} + +PX_FORCE_INLINE void printVec(const int32x4_t& v, const char* name) +{ + PX_ALIGN(16, int32_t) data[4]; + vst1q_s32(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]); +} + +PX_FORCE_INLINE void printVec(const int16x8_t& v, const char* name) +{ + PX_ALIGN(16, int16_t) data[8]; + vst1q_s16(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3], + data[4], data[5], data[6], data[7]); +} + +PX_FORCE_INLINE void printVec(const uint16x4_t& v, const char* name) +{ + PX_ALIGN(16, uint16_t) data[4]; + vst1_u16(data, v); + printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]); +} + +PX_FORCE_INLINE void printVec(const uint32x2_t& v, const char* name) +{ + PX_ALIGN(16, uint32_t) data[2]; + vst1_u32(data, v); + printf("%s: (0x%x, 0x%x)\n", name, data[0], data[1]); +} + +PX_FORCE_INLINE void printVar(const PxU32 v, const char* name) +{ + printf("%s: 0x%x\n", name, v); +} + +PX_FORCE_INLINE void printVar(const PxF32 v, const char* name) +{ + printf("%s: %f\n", name, v); +} + +#define PRINT_VAR(X) printVar((X), #X) +#define PRINT_VEC(X) printVec((X), #X) +#define PRINT_VEC_TITLE(TITLE, X) printVec((X), TITLE #X) +#endif // debugging printf + +///////////////////////////////////////////////////////////////////// +////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + PX_ALIGN(16, PxF32) data[4]; + vst1_f32(reinterpret_cast<float32_t*>(data), a); + return PxIsFinite(data[0]) && PxIsFinite(data[1]); +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + PX_ALIGN(16, PxF32) data[4]; + vst1q_f32(reinterpret_cast<float32_t*>(data), a); + return PxIsFinite(data[0]) && PxIsFinite(data[1]) && PxIsFinite(data[2]); +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + PX_ALIGN(16, PxF32) data[4]; + vst1q_f32(reinterpret_cast<float32_t*>(data), a); + return PxIsFinite(data[0]) && PxIsFinite(data[1]) && PxIsFinite(data[2]) && PxIsFinite(data[3]); +} + +PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return vget_lane_u32(vreinterpret_u32_f32(a), 0) == 0; +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + const uint32x2_t dLow = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t dMin = vpmin_u32(dLow, dLow); + + return vget_lane_u32(dMin, 0) == 0 || vgetq_lane_u32(vreinterpretq_u32_f32(a), 2) == 0; +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + const uint32x2_t dHigh = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t dLow = vget_low_u32(vreinterpretq_u32_f32(a)); + + const uint32x2_t dMin = vmin_u32(dHigh, dLow); + const uint32x2_t pairMin = 
vpmin_u32(dMin, dMin); + return vget_lane_u32(pairMin, 0) == 0; +} + +///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return vdup_n_f32(reinterpret_cast<const float32_t&>(f)); +} + +PX_FORCE_INLINE FloatV FLoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return vld1_f32(reinterpret_cast<const float32_t*>(f)); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + PX_ALIGN(16, PxF32) data[4] = { f, f, f, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return vdupq_n_f32(reinterpret_cast<const float32_t&>(f)); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ + const PxU32 i = static_cast<PxU32>(-(static_cast<PxI32>(f))); + return vdupq_n_u32(i); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* f) +{ + ASSERT_ISALIGNED16(f); + PX_ALIGN(16, PxF32) data[4] = { f[0], f[1], f[2], 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* f) +{ + PX_ALIGN(16, PxF32) data[4] = { f[0], f[1], f[2], 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) +{ + return vsetq_lane_f32(0.0f, v, 3); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(Vec4V v) +{ + return v; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + return f; // ok if it is implemented as the same type. 
+} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return vcombine_f32(f, f); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + return Vec3V_From_Vec4V(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + return Vec3V_From_Vec4V_WUndefined(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + PX_ALIGN(16, PxF32) data[4] = { f.x, f.y, f.z, 0.0f }; + return V4LoadA(data); +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + V3StoreU(m.col0, out.column0); + V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return vld1q_f32(reinterpret_cast<const float32_t*>(f)); +} + +PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f) +{ + ASSERT_ISALIGNED16(f); + vst1q_f32(reinterpret_cast<float32_t*>(f), a); +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + PX_ALIGN(16, PxF32) f2[4]; + vst1q_f32(reinterpret_cast<float32_t*>(f2), a); + f[0] = f2[0]; + f[1] = f2[1]; + f[2] = f2[2]; + f[3] = f2[3]; +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + vst1q_u32(reinterpret_cast<uint32_t*>(u), a); +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + vst1q_u32(reinterpret_cast<uint32_t*>(u), uv); +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + ASSERT_ISALIGNED16(i); + vst1q_s32(reinterpret_cast<int32_t*>(i), iv); +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return vld1q_f32(reinterpret_cast<const float32_t*>(f)); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + const PX_ALIGN(16, PxU32) b[4] = { static_cast<PxU32>(-static_cast<PxI32>(f[0])), + static_cast<PxU32>(-static_cast<PxI32>(f[1])), + static_cast<PxU32>(-static_cast<PxI32>(f[2])), + static_cast<PxU32>(-static_cast<PxI32>(f[3])) }; + return vld1q_u32(b); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + ASSERT_ISVALIDFLOATV(a); + // vst1q_lane_f32(f, a, 0); // causes vst1 alignment bug + *f = vget_lane_f32(a, 0); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV a, PxU32* PX_RESTRICT f) +{ + *f = vget_lane_u32(vget_low_u32(a), 0); +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) f2[4]; + vst1q_f32(reinterpret_cast<float32_t*>(f2), a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + PX_ALIGN(16, PxF32) f2[4]; + vst1q_f32(reinterpret_cast<float32_t*>(f2), a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +////////////////////////////////// +// FLOATV +////////////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return FLoad(0.0f); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV IZero() +{ + return vreinterpret_f32_u32(vdup_n_u32(0)); +} + +PX_FORCE_INLINE FloatV IOne() +{ + return 
vreinterpret_f32_u32(vdup_n_u32(1)); +} + +PX_FORCE_INLINE FloatV ITwo() +{ + return vreinterpret_f32_u32(vdup_n_u32(2)); +} + +PX_FORCE_INLINE FloatV IThree() +{ + return vreinterpret_f32_u32(vdup_n_u32(3)); +} + +PX_FORCE_INLINE FloatV IFour() +{ + return vreinterpret_f32_u32(vdup_n_u32(4)); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return vneg_f32(f); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vadd_f32(a, b); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vsub_f32(a, b); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vmul_f32(a, b); +} + +template <int n> +PX_FORCE_INLINE float32x2_t recip_newton(const float32x2_t& in) +{ + float32x2_t recip = vrecpe_f32(in); + for(int i = 0; i < n; ++i) + recip = vmul_f32(recip, vrecps_f32(in, recip)); + return recip; +} + +template <int n> +PX_FORCE_INLINE float32x4_t recipq_newton(const float32x4_t& in) +{ + float32x4_t recip = vrecpeq_f32(in); + for(int i = 0; i < n; ++i) + recip = vmulq_f32(recip, vrecpsq_f32(recip, in)); + return recip; +} + +template <int n> +PX_FORCE_INLINE float32x2_t rsqrt_newton(const float32x2_t& in) +{ + float32x2_t rsqrt = vrsqrte_f32(in); + for(int i = 0; i < n; ++i) + rsqrt = vmul_f32(rsqrt, vrsqrts_f32(vmul_f32(rsqrt, rsqrt), in)); + return rsqrt; +} + +template <int n> +PX_FORCE_INLINE float32x4_t rsqrtq_newton(const float32x4_t& in) +{ + float32x4_t rsqrt = vrsqrteq_f32(in); + for(int i = 0; i < n; ++i) + rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(rsqrt, rsqrt), in)); + return rsqrt; +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vmul_f32(a, VRECIP(b)); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vmul_f32(a, VRECIPE(b)); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIP(a); +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIPE(a); +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIPSQRT(a); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return FSel(FIsEq(a, FZero()), a, vmul_f32(a, VRECIPSQRT(a))); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return VRECIPSQRTE(a); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return vmla_f32(c, a, b); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return vmls_f32(c, a, b); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return vabs_f32(a); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + PX_ASSERT( _VecMathTests::allElementsEqualBoolV(c, BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c, BFFFF())); + ASSERT_ISVALIDFLOATV(vbsl_f32(vget_low_u32(c), a, b)); + return vbsl_f32(vget_low_u32(c), a, b); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + 
ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vdupq_lane_u32(vcgt_f32(a, b), 0); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vdupq_lane_u32(vcge_f32(a, b), 0); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vdupq_lane_u32(vceq_f32(a, b), 0); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + //ASSERT_ISVALIDFLOATV(a); + //ASSERT_ISVALIDFLOATV(b); + return vmax_f32(a, b); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + //ASSERT_ISVALIDFLOATV(a); + //ASSERT_ISVALIDFLOATV(b); + return vmin_f32(a, b); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + ASSERT_ISVALIDFLOATV(minV); + ASSERT_ISVALIDFLOATV(maxV); + return vmax_f32(vmin_f32(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vcgt_f32(a, b), 0); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vcge_f32(a, b), 0); +} + +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return vget_lane_u32(vceq_f32(a, b), 0); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // truncate(a + (0.5f - sign(a))) + const float32x2_t half = vdup_n_f32(0.5f); + const float32x2_t sign = vcvt_f32_u32((vshr_n_u32(vreinterpret_u32_f32(a), 31))); + const float32x2_t aPlusHalf = vadd_f32(a, half); + const float32x2_t aRound = vsub_f32(aPlusHalf, sign); + int32x2_t tmp = vcvt_s32_f32(aRound); + return vcvt_f32_s32(tmp); +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = FLoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = FLoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V3 = FMul(V2, V1); + const FloatV V5 = FMul(V3, V2); + const FloatV V7 = FMul(V5, V2); + const FloatV V9 = FMul(V7, V2); + const FloatV V11 = FMul(V9, V2); + const FloatV V13 = FMul(V11, V2); + const FloatV V15 = FMul(V13, V2); + const FloatV V17 = FMul(V15, V2); + const FloatV V19 = FMul(V17, V2); + const FloatV V21 = FMul(V19, V2); + const FloatV V23 = FMul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + FloatV Result; + Result = FScaleAdd(S1, V3, V1); + Result = FScaleAdd(S2, V5, Result); + Result = FScaleAdd(S3, V7, Result); + Result = FScaleAdd(S4, V9, Result); + Result = FScaleAdd(S5, V11, Result); + Result = FScaleAdd(S6, V13, Result); + Result = FScaleAdd(S7, V15, Result); + Result = FScaleAdd(S8, V17, Result); + Result = FScaleAdd(S9, V19, Result); + Result = FScaleAdd(S10, V21, Result); + Result = FScaleAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = FLoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = FLoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V4 = FMul(V2, V2); + const FloatV V6 = FMul(V4, V2); + const FloatV V8 = FMul(V4, V4); + const FloatV V10 = FMul(V6, V4); + const FloatV V12 = FMul(V6, V6); + const FloatV V14 = FMul(V8, V6); + const FloatV V16 = FMul(V8, V8); + const FloatV V18 = FMul(V10, V8); + const FloatV V20 = FMul(V10, V10); + const FloatV V22 = FMul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + FloatV Result; + Result = FScaleAdd(C1, V2, FOne()); + Result = FScaleAdd(C2, V4, Result); + Result = FScaleAdd(C3, V6, Result); + Result = FScaleAdd(C4, V8, Result); + Result = FScaleAdd(C5, V10, Result); + Result = FScaleAdd(C6, V12, Result); + Result = FScaleAdd(C7, V14, Result); + Result = FScaleAdd(C8, V16, Result); + Result = FScaleAdd(C9, V18, Result); + Result = FScaleAdd(C10, V20, Result); + Result = FScaleAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + + const BoolV c = BOr(FIsGrtr(a, max), FIsGrtr(min, a)); + return PxU32(!BAllEqFFFF(c)); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + + const BoolV c = BAnd(FIsGrtrOrEq(a, min), FIsGrtrOrEq(max, a)); + return PxU32(BAllEqTTTT(c)); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + const uint32x2_t greater = vcagt_f32(a, bounds); + return vget_lane_u32(greater, 0); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + const uint32x2_t geq = vcage_f32(bounds, a); + return vget_lane_u32(geq, 0); +} + +////////////////////////////////// +// VEC3V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t uHigh = vreinterpret_u32_f32(f); + const float32x2_t dHigh = vreinterpret_f32_u32(vand_u32(uHigh, mask)); + + return vcombine_f32(f, dHigh); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t dHigh = vand_u32(vreinterpret_u32_f32(z), mask); + const uint32x2_t dLow = vext_u32(vreinterpret_u32_f32(x), vreinterpret_u32_f32(y), 1); + return vreinterpretq_f32_u32(vcombine_u32(dLow, dHigh)); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + const float32x4_t x = { 1.0f, 0.0f, 0.0f, 0.0f }; + return x; +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + const 
float32x4_t y = { 0, 1.0f, 0, 0 }; + return y; +} + +PX_FORCE_INLINE Vec3V V3UnitZ() +{ + const float32x4_t z = { 0, 0, 1.0f, 0 }; + return z; +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 0); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 1); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x2_t fhigh = vget_high_f32(f); + return vdup_lane_f32(fhigh, 0); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + + const float32x2_t aLow = vget_low_f32(a); + const float32x2_t bLow = vget_low_f32(b); + const float32x2_t cLow = vget_low_f32(c); + const float32x2_t zero = vdup_n_f32(0.0f); + + const float32x2x2_t zipL = vzip_f32(aLow, bLow); + const float32x2x2_t zipH = vzip_f32(cLow, zero); + + return vcombine_f32(zipL.val[0], zipH.val[0]); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + + const float32x2_t aLow = vget_low_f32(a); + const float32x2_t bLow = vget_low_f32(b); + const float32x2_t cLow = vget_low_f32(c); + const float32x2_t zero = vdup_n_f32(0.0f); + + const float32x2x2_t zipL = vzip_f32(aLow, bLow); + const float32x2x2_t zipH = vzip_f32(cLow, zero); + + return vcombine_f32(zipL.val[1], zipH.val[1]); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + + const float32x2_t aHi = vget_high_f32(a); + const float32x2_t bHi = vget_high_f32(b); + const float32x2_t cHi = vget_high_f32(c); + + const float32x2x2_t zipL = vzip_f32(aHi, bHi); + + return vcombine_f32(zipL.val[0], cHi); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return vdupq_n_f32(0.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + const float32x4_t tmp = vnegq_f32(f); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vaddq_f32(a, b); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return vaddq_f32(a, Vec3V_From_FloatV(b)); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vsubq_f32(a, b); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return vsubq_f32(a, Vec3V_From_FloatV(b)); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ 
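+ // scale all four lanes by lane 0 of b, then re-zero the W lane so that the
+ // Vec3V invariant (W == 0) holds for the result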
+ ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + const float32x4_t tmp = vmulq_lane_f32(a, b, 0); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vmulq_f32(a, b); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIP(b); + const float32x4_t tmp = vmulq_lane_f32(a, invB, 0); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + float32x4_t invB = VRECIPQ(b); + invB = vsetq_lane_f32(0.0f, invB, 3); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIPE(b); + const float32x4_t tmp = vmulq_lane_f32(a, invB, 0); + return vsetq_lane_f32(0.0f, tmp, 3); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + float32x4_t invB = VRECIPEQ(b); + invB = vsetq_lane_f32(0.0f, invB, 3); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t recipA = VRECIPQ(a); + return vsetq_lane_f32(0.0f, recipA, 3); +} + +PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t recipA = VRECIPEQ(a); + return vsetq_lane_f32(0.0f, recipA, 3); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t rSqrA = VRECIPSQRTQ(a); + return vsetq_lane_f32(0.0f, rSqrA, 3); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x4_t rSqrA = VRECIPSQRTEQ(a); + return vsetq_lane_f32(0.0f, rSqrA, 3); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + + float32x4_t tmp = vmlaq_lane_f32(c, a, b, 0); + // using vsetq_lane_f32 resulted in failures, + // probably related to a compiler bug on + // ndk r9d-win32, gcc 4.8, cardhu/shield + + // code with issue + // return vsetq_lane_f32(0.0f, tmp, 3); + + // workaround + float32x2_t w_z = vget_high_f32(tmp); + float32x2_t y_x = vget_low_f32(tmp); + w_z = vset_lane_f32(0.0f, w_z, 1); + return vcombine_f32(y_x, w_z); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + + float32x4_t tmp = vmlsq_lane_f32(c, a, b, 0); + // using vsetq_lane_f32 resulted in failures, + // probably related to a compiler bug on + // ndk r9d-win32, gcc 4.8, cardhu/shield + + // code with issue + // return vsetq_lane_f32(0.0f, tmp, 3); + + // workaround + float32x2_t w_z = vget_high_f32(tmp); + float32x2_t y_x = vget_low_f32(tmp); + w_z = vset_lane_f32(0.0f, w_z, 1); + return vcombine_f32(y_x, w_z); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return vmlaq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return vmlsq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); 
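+ // |0| == 0, so vabsq preserves the zeroed W lane and no explicit clear is needed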
+ return vabsq_f32(a); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + + // const uint32x2_t mask = {0xffffFFFF, 0x0}; + const float32x4_t tmp = vmulq_f32(a, b); + + const float32x2_t low = vget_low_f32(tmp); + const float32x2_t high = vget_high_f32(tmp); + // const float32x2_t high = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(high_), mask)); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y} + const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z} + + return sum0ZYX; +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + + const uint32x2_t TF = { 0xffffFFFF, 0x0 }; + const float32x2_t ay_ax = vget_low_f32(a); // d2 + const float32x2_t aw_az = vget_high_f32(a); // d3 + const float32x2_t by_bx = vget_low_f32(b); // d4 + const float32x2_t bw_bz = vget_high_f32(b); // d5 + // Hi, Lo + const float32x2_t bz_by = vext_f32(by_bx, bw_bz, 1); // bz, by + const float32x2_t az_ay = vext_f32(ay_ax, aw_az, 1); // az, ay + + const float32x2_t azbx = vmul_f32(aw_az, by_bx); // 0, az*bx + const float32x2_t aybz_axby = vmul_f32(ay_ax, bz_by); // ay*bz, ax*by + + const float32x2_t azbxSUBaxbz = vmls_f32(azbx, bw_bz, ay_ax); // 0, az*bx-ax*bz + const float32x2_t aybzSUBazby_axbySUBaybx = vmls_f32(aybz_axby, by_bx, az_ay); // ay*bz-az*by, ax*by-ay*bx + + const float32x2_t retLow = vext_f32(aybzSUBazby_axbySUBaybx, azbxSUBaxbz, 1); // az*bx-ax*bz, ay*bz-az*by + const uint32x2_t retHigh = vand_u32(TF, vreinterpret_u32_f32(aybzSUBazby_axbySUBaybx)); // 0, ax*by-ay*bx + + return vcombine_f32(retLow, vreinterpret_f32_u32(retHigh)); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return a; +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // const uint32x2_t mask = {0xffffFFFF, 0x0}; + + const float32x4_t tmp = vmulq_f32(a, a); + const float32x2_t low = vget_low_f32(tmp); + const float32x2_t high = vget_high_f32(tmp); + // const float32x2_t high = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(high_), mask)); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y} + const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z} + + return FSqrt(sum0ZYX); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Dot(a, a); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero())); + return V3ScaleInv(a, V3Length(a)); +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero())); + return V3Scale(a, VRECIPSQRTE(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + ASSERT_ISVALIDVEC3V(a); + const FloatV zero = vdup_n_f32(0.0f); + const FloatV length = V3Length(a); + const uint32x4_t isGreaterThanZero = FIsGrtr(length, zero); + return V3Sel(isGreaterThanZero, V3ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V( vbslq_f32(c, a, b)); + return vbslq_f32(c, a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return vcgtq_f32(a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + 
ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vcgeq_f32(a, b);
+}
+
+PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vceqq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vmaxq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return vminq_f32(a, b);
+}
+
+PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t zz = vdup_lane_f32(high, 0);
+ const float32x2_t max0 = vpmax_f32(zz, low);
+ const float32x2_t max1 = vpmax_f32(max0, max0);
+
+ return max1;
+}
+
+PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t zz = vdup_lane_f32(high, 0);
+ const float32x2_t min0 = vpmin_f32(zz, low);
+ const float32x2_t min1 = vpmin_f32(min0, min0);
+
+ return min1;
+}
+
+// return (a >= 0.0f) ? 1.0f : -1.0f;
+PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ const Vec3V zero = V3Zero();
+ const Vec3V one = V3One();
+ const Vec3V none = V3Neg(one);
+ return V3Sel(V3IsGrtrOrEq(a, zero), one, none);
+}
+
+PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(minV);
+ ASSERT_ISVALIDVEC3V(maxV);
+ return V3Max(V3Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(b);
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsEq(a, b));
+}
+
+PX_FORCE_INLINE Vec3V V3Round(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ // round to nearest: truncate(a + (0.5f - sign(a))), where sign(a) is 1 for negative a and 0 otherwise
+ const Vec3V half = V3Load(0.5f);
+ const float32x4_t sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31)));
+ const Vec3V aPlusHalf = V3Add(a, half);
+ const Vec3V aRound = V3Sub(aPlusHalf, sign);
+ return vcvtq_f32_s32(vcvtq_s32_f32(aRound));
+}
+
+PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ // Map each angle into [-PI, PI] by subtracting the nearest multiple of 2*PI
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec3V tmp = V4Mul(a, recipTwoPi);
+ const Vec3V b = V3Round(tmp);
+ const Vec3V V1 = V4NegMulSub(twoPi, b, a);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+ // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23!
(for -PI <= V < PI)
+ const Vec3V V2 = V3Mul(V1, V1);
+ const Vec3V V3 = V3Mul(V2, V1);
+ const Vec3V V5 = V3Mul(V3, V2);
+ const Vec3V V7 = V3Mul(V5, V2);
+ const Vec3V V9 = V3Mul(V7, V2);
+ const Vec3V V11 = V3Mul(V9, V2);
+ const Vec3V V13 = V3Mul(V11, V2);
+ const Vec3V V15 = V3Mul(V13, V2);
+ const Vec3V V17 = V3Mul(V15, V2);
+ const Vec3V V19 = V3Mul(V17, V2);
+ const Vec3V V21 = V3Mul(V19, V2);
+ const Vec3V V23 = V3Mul(V21, V2);
+
+ const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f);
+ const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f);
+ const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f);
+
+ const FloatV S1 = V4GetY(sinCoefficients0);
+ const FloatV S2 = V4GetZ(sinCoefficients0);
+ const FloatV S3 = V4GetW(sinCoefficients0);
+ const FloatV S4 = V4GetX(sinCoefficients1);
+ const FloatV S5 = V4GetY(sinCoefficients1);
+ const FloatV S6 = V4GetZ(sinCoefficients1);
+ const FloatV S7 = V4GetW(sinCoefficients1);
+ const FloatV S8 = V4GetX(sinCoefficients2);
+ const FloatV S9 = V4GetY(sinCoefficients2);
+ const FloatV S10 = V4GetZ(sinCoefficients2);
+ const FloatV S11 = V4GetW(sinCoefficients2);
+
+ Vec3V Result;
+ Result = V4ScaleAdd(V3, S1, V1);
+ Result = V4ScaleAdd(V5, S2, Result);
+ Result = V4ScaleAdd(V7, S3, Result);
+ Result = V4ScaleAdd(V9, S4, Result);
+ Result = V4ScaleAdd(V11, S5, Result);
+ Result = V4ScaleAdd(V13, S6, Result);
+ Result = V4ScaleAdd(V15, S7, Result);
+ Result = V4ScaleAdd(V17, S8, Result);
+ Result = V4ScaleAdd(V19, S9, Result);
+ Result = V4ScaleAdd(V21, S10, Result);
+ Result = V4ScaleAdd(V23, S11, Result);
+
+ return Result;
+}
+
+PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+
+ // Map each angle into [-PI, PI] by subtracting the nearest multiple of 2*PI
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec3V tmp = V4Mul(a, recipTwoPi);
+ const Vec3V b = V3Round(tmp);
+ const Vec3V V1 = V4NegMulSub(twoPi, b, a);
+
+ // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+ // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22!
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V4 = V3Mul(V2, V2); + const Vec3V V6 = V3Mul(V4, V2); + const Vec3V V8 = V3Mul(V4, V4); + const Vec3V V10 = V3Mul(V6, V4); + const Vec3V V12 = V3Mul(V6, V6); + const Vec3V V14 = V3Mul(V8, V6); + const Vec3V V16 = V3Mul(V8, V8); + const Vec3V V18 = V3Mul(V10, V8); + const Vec3V V20 = V3Mul(V10, V10); + const Vec3V V22 = V3Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec3V Result; + Result = V4ScaleAdd(V2, C1, V4One()); + Result = V4ScaleAdd(V4, C2, Result); + Result = V4ScaleAdd(V6, C3, Result); + Result = V4ScaleAdd(V8, C4, Result); + Result = V4ScaleAdd(V10, C5, Result); + Result = V4ScaleAdd(V12, C6, Result); + Result = V4ScaleAdd(V14, C7, Result); + Result = V4ScaleAdd(V16, C8, Result); + Result = V4ScaleAdd(V18, C9, Result); + Result = V4ScaleAdd(V20, C10, Result); + Result = V4ScaleAdd(V22, C11, Result); + + return V4ClearW(Result); +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2_t yz = vext_f32(xy, zw, 1); + return vcombine_f32(yz, zw); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t xw = vand_u32(xy, mask); + return vreinterpretq_f32_u32(vcombine_u32(xy, xw)); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t yz = vext_u32(xy, zw, 1); + const uint32x2_t xw = vand_u32(xy, mask); + return vreinterpretq_f32_u32(vcombine_u32(yz, xw)); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t wz = vrev64_u32(zw); + + const uint32x2_t zx = vext_u32(wz, xy, 1); + const uint32x2_t yw = vext_u32(xy, wz, 1); + + return vreinterpretq_f32_u32(vcombine_u32(zx, yw)); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + + const uint32x2_t wz = vrev64_u32(zw); + const uint32x2_t yw = vext_u32(xy, wz, 1); + const uint32x2_t zz = vdup_lane_u32(wz, 1); + + return vreinterpretq_f32_u32(vcombine_u32(zz, yw)); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const 
uint32x2_t yx = vrev64_u32(xy); + const uint32x2_t xw = vand_u32(xy, mask); + return vreinterpretq_f32_u32(vcombine_u32(yx, xw)); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(v0)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(v1)); + const uint32x2_t wz = vrev64_u32(zw); + const uint32x2_t yw = vext_u32(xy, wz, 1); + + return vreinterpretq_f32_u32(vcombine_u32(wz, yw)); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + + const uint32x2_t mask = { 0xffffFFFF, 0x0 }; + + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(v0)); + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(v1)); + const uint32x2_t xw = vand_u32(xy, mask); + + return vreinterpretq_f32_u32(vcombine_u32(zw, xw)); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + + const uint32x2_t axy = vget_low_u32(vreinterpretq_u32_f32(v0)); + const uint32x2_t bxy = vget_low_u32(vreinterpretq_u32_f32(v1)); + const uint32x2_t byax = vext_u32(bxy, axy, 1); + const uint32x2_t ww = vdup_n_u32(0); + + return vreinterpretq_f32_u32(vcombine_u32(byax, ww)); +} + +PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // const uint32x2_t mask = {0xffffFFFF, 0x0}; + + const float32x2_t low = vget_low_f32(a); + const float32x2_t high = vget_high_f32(a); + // const float32x2_t high = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(high_), mask)); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y} + const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z} + + return sum0ZYX; +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(min); + ASSERT_ISVALIDVEC3V(max); + + const BoolV c = BOr(V3IsGrtr(a, max), V3IsGrtr(min, a)); + return internalUnitNeonSimd::BAnyTrue3_R(c); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(min); + ASSERT_ISVALIDVEC3V(max); + + const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a)); + return internalUnitNeonSimd::BAllTrue4_R(c); +} + +PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(bounds); + + const BoolV greater = V3IsGrtr(V3Abs(a), bounds); + return internalUnitNeonSimd::BAnyTrue3_R(greater); +} + +PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(bounds); + + const BoolV greaterOrEq = V3IsGrtrOrEq(bounds, V3Abs(a)); + return internalUnitNeonSimd::BAllTrue4_R(greaterOrEq); +} + +PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2) +{ + ASSERT_ISVALIDVEC3V(col0); + ASSERT_ISVALIDVEC3V(col1); + ASSERT_ISVALIDVEC3V(col2); + + Vec3V col3 = V3Zero(); + const float32x4x2_t v0v1 = vzipq_f32(col0, col2); + const float32x4x2_t v2v3 = vzipq_f32(col1, col3); + const float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]); + const float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]); + col0 = zip0.val[0]; + col1 = zip0.val[1]; + col2 = zip1.val[0]; + // col3 = zip1.val[1]; +} + +////////////////////////////////// +// VEC4V +////////////////////////////////// + 
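+// Unlike the Vec3V operations above, the Vec4V operations use all four lanes,
+// so they need neither the W-lane clears nor the ASSERT_ISVALIDVEC3V checks.
+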
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return vcombine_f32(f, f); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray) +{ + ASSERT_ISVALIDFLOATV(floatVArray[0]); + ASSERT_ISVALIDFLOATV(floatVArray[1]); + ASSERT_ISVALIDFLOATV(floatVArray[2]); + ASSERT_ISVALIDFLOATV(floatVArray[3]); + + const uint32x2_t xLow = vreinterpret_u32_f32(floatVArray[0]); + const uint32x2_t yLow = vreinterpret_u32_f32(floatVArray[1]); + const uint32x2_t zLow = vreinterpret_u32_f32(floatVArray[2]); + const uint32x2_t wLow = vreinterpret_u32_f32(floatVArray[3]); + + const uint32x2_t dLow = vext_u32(xLow, yLow, 1); + const uint32x2_t dHigh = vext_u32(zLow, wLow, 1); + + return vreinterpretq_f32_u32(vcombine_u32(dLow, dHigh)); +} + +PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + ASSERT_ISVALIDFLOATV(w); + + const uint32x2_t xLow = vreinterpret_u32_f32(x); + const uint32x2_t yLow = vreinterpret_u32_f32(y); + const uint32x2_t zLow = vreinterpret_u32_f32(z); + const uint32x2_t wLow = vreinterpret_u32_f32(w); + + const uint32x2_t dLow = vext_u32(xLow, yLow, 1); + const uint32x2_t dHigh = vext_u32(zLow, wLow, 1); + + return vreinterpretq_f32_u32(vcombine_u32(dLow, dHigh)); +} + +PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_high_f32(x); + const float32x2_t yy = vget_high_f32(y); + const float32x2_t zz = vget_high_f32(z); + const float32x2_t ww = vget_high_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[1], zipH.val[1]); +} + +PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_high_f32(x); + const float32x2_t yy = vget_high_f32(y); + const float32x2_t zz = vget_high_f32(z); + const float32x2_t ww = vget_high_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[0], zipH.val[0]); +} + +PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_low_f32(x); + const float32x2_t yy = vget_low_f32(y); + const float32x2_t zz = vget_low_f32(z); + const float32x2_t ww = vget_low_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[1], zipH.val[1]); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const float32x2_t xx = vget_low_f32(x); + const float32x2_t yy = vget_low_f32(y); + const float32x2_t zz = vget_low_f32(z); + const float32x2_t ww = vget_low_f32(w); + + const float32x2x2_t zipL = vzip_f32(xx, yy); + const float32x2x2_t zipH = vzip_f32(zz, ww); + + return vcombine_f32(zipL.val[0], zipH.val[0]); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return vzipq_f32(a, b).val[0]; +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return vzipq_f32(a, b).val[1]; +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t zo = vext_f32(zeros, ones, 1); + return vcombine_f32(zeros, zo); +} + +PX_FORCE_INLINE 
Vec4V V4UnitX() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t oz = vext_f32(ones, zeros, 1); + return vcombine_f32(oz, zeros); +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t zo = vext_f32(zeros, ones, 1); + return vcombine_f32(zo, zeros); +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0)); + const float32x2_t ones = vmov_n_f32(1.0f); + const float32x2_t oz = vext_f32(ones, zeros, 1); + return vcombine_f32(zeros, oz); +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + const float32x2_t fhigh = vget_high_f32(f); + return vdup_lane_f32(fhigh, 1); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 0); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + const float32x2_t fLow = vget_low_f32(f); + return vdup_lane_f32(fLow, 1); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + const float32x2_t fhigh = vget_high_f32(f); + return vdup_lane_f32(fhigh, 0); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTTF(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, vcombine_f32(f, f)); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return V4Sel(BTTTF(), v, V4Zero()); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a) +{ + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2_t yx = vext_f32(xy, xy, 1); + const float32x2_t wz = vext_f32(zw, zw, 1); + return vcombine_f32(yx, wz); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a) +{ + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2x2_t xzyw = vzip_f32(xy, zw); + return vcombine_f32(xzyw.val[0], xzyw.val[0]); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a) +{ + const float32x2_t xy = vget_low_f32(a); + const float32x2_t zw = vget_high_f32(a); + const float32x2x2_t xzyw = vzip_f32(xy, zw); + return vcombine_f32(xzyw.val[1], xzyw.val[1]); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + const uint32x2_t xy = vget_low_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t zw = vget_high_u32(vreinterpretq_u32_f32(a)); + const uint32x2_t yz = vext_u32(xy, zw, 1); + const uint32x2_t xw = vrev64_u32(vext_u32(zw, xy, 1)); + return vreinterpretq_f32_u32(vcombine_u32(yz, xw)); +} + +template <PxU8 E0, PxU8 E1, PxU8 E2, PxU8 E3> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V V) +{ + static const uint32_t ControlElement[4] = + { +#if 1 + 0x03020100, // XM_SWIZZLE_X + 0x07060504, // XM_SWIZZLE_Y + 0x0B0A0908, // XM_SWIZZLE_Z + 0x0F0E0D0C, // XM_SWIZZLE_W +#else + 0x00010203, // XM_SWIZZLE_X + 0x04050607, // XM_SWIZZLE_Y + 0x08090A0B, // XM_SWIZZLE_Z + 0x0C0D0E0F, // XM_SWIZZLE_W +#endif + }; + + uint8x8x2_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V)); + + uint8x8_t idx = + 
vcreate_u8(static_cast<uint64_t>(ControlElement[E0]) | (static_cast<uint64_t>(ControlElement[E1]) << 32)); + const uint8x8_t rL = vtbl2_u8(tbl, idx); + idx = vcreate_u8(static_cast<uint64_t>(ControlElement[E2]) | (static_cast<uint64_t>(ControlElement[E3]) << 32)); + const uint8x8_t rH = vtbl2_u8(tbl, idx); + return vreinterpretq_f32_u8(vcombine_u8(rL, rH)); +} + +// PT: this seems measurably slower than the hardcoded version +/*PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + return V4Perm<1, 2, 0, 3>(a); +}*/ + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return vreinterpretq_f32_u32(vmovq_n_u32(0)); + // return vmovq_n_f32(0.0f); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return vmovq_n_f32(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + // return vmovq_n_f32(PX_EPS_REAL); + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f) +{ + return vnegq_f32(f); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return vaddq_f32(a, b); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return vsubq_f32(a, b); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return vmulq_lane_f32(a, b, 0); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b) +{ + return vmulq_f32(a, b); +} + +PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIP(b); + return vmulq_lane_f32(a, invB, 0); +} + +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b) +{ + const float32x4_t invB = VRECIPQ(b); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + const float32x2_t invB = VRECIPE(b); + return vmulq_lane_f32(a, invB, 0); +} + +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b) +{ + const float32x4_t invB = VRECIPEQ(b); + return vmulq_f32(a, invB); +} + +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a) +{ + return VRECIPQ(a); +} + +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a) +{ + return VRECIPEQ(a); +} + +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a) +{ + return VRECIPSQRTQ(a); +} + +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a) +{ + return VRECIPSQRTEQ(a); +} + +PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a) +{ + return V4Sel(V4IsEq(a, V4Zero()), a, V4Mul(a, VRECIPSQRTQ(a))); +} + +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return vmlaq_lane_f32(c, a, b, 0); +} + +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return vmlsq_lane_f32(c, a, b, 0); +} + +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return vmlaq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return vmlsq_f32(c, a, b); +} + +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a) +{ + return vabsq_f32(a); +} + +PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a) +{ + const Vec4V xy = V4UnpackXY(a, a); // x,x,y,y + const Vec4V zw = V4UnpackZW(a, a); // z,z,w,w + const Vec4V xz_yw = V4Add(xy, zw); // x+z,x+z,y+w,y+w + const FloatV xz = V4GetX(xz_yw); // x+z + const FloatV yw = V4GetZ(xz_yw); // y+w + return FAdd(xz, yw); // sum +} + +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b) +{ + const float32x4_t tmp = vmulq_f32(a, b); + const float32x2_t low = vget_low_f32(tmp); + const float32x2_t high = vget_high_f32(tmp); + + const float32x2_t sumTmp = vpadd_f32(low, high); // = {z+w, x+y} + const 
float32x2_t sumWZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z+w, x+y+z+w}
+ return sumWZYX;
+}
+
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V aa, const Vec4V bb)
+{
+ // PT: the V3Dot code relies on the fact that W=0 so we can't reuse it as-is, we need to clear W first.
+ // TODO: find a better implementation that does not need to clear W.
+ const Vec4V a = V4ClearW(aa);
+ const Vec4V b = V4ClearW(bb);
+
+ const float32x4_t tmp = vmulq_f32(a, b);
+ const float32x2_t low = vget_low_f32(tmp);
+ const float32x2_t high = vget_high_f32(tmp);
+
+ const float32x2_t sumTmp = vpadd_f32(low, high); // = {0+z, x+y}
+ const float32x2_t sum0ZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z, x+y+z}
+ return sum0ZYX;
+}
+
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+ const uint32x2_t TF = { 0xffffFFFF, 0x0 };
+ const float32x2_t ay_ax = vget_low_f32(a); // d2
+ const float32x2_t aw_az = vget_high_f32(a); // d3
+ const float32x2_t by_bx = vget_low_f32(b); // d4
+ const float32x2_t bw_bz = vget_high_f32(b); // d5
+ // Hi, Lo
+ const float32x2_t bz_by = vext_f32(by_bx, bw_bz, 1); // bz, by
+ const float32x2_t az_ay = vext_f32(ay_ax, aw_az, 1); // az, ay
+
+ const float32x2_t azbx = vmul_f32(aw_az, by_bx); // 0, az*bx
+ const float32x2_t aybz_axby = vmul_f32(ay_ax, bz_by); // ay*bz, ax*by
+
+ const float32x2_t azbxSUBaxbz = vmls_f32(azbx, bw_bz, ay_ax); // 0, az*bx-ax*bz
+ const float32x2_t aybzSUBazby_axbySUBaybx = vmls_f32(aybz_axby, by_bx, az_ay); // ay*bz-az*by, ax*by-ay*bx
+
+ const float32x2_t retLow = vext_f32(aybzSUBazby_axbySUBaybx, azbxSUBaxbz, 1); // az*bx-ax*bz, ay*bz-az*by
+ const uint32x2_t retHigh = vand_u32(TF, vreinterpret_u32_f32(aybzSUBazby_axbySUBaybx)); // 0, ax*by-ay*bx
+
+ return vcombine_f32(retLow, vreinterpret_f32_u32(retHigh));
+}
+
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+ const float32x4_t tmp = vmulq_f32(a, a);
+ const float32x2_t low = vget_low_f32(tmp);
+ const float32x2_t high = vget_high_f32(tmp);
+
+ const float32x2_t sumTmp = vpadd_f32(low, high); // = {z+w, x+y}
+ const float32x2_t sumWZYX = vpadd_f32(sumTmp, sumTmp); // = {x+y+z+w, x+y+z+w}
+ return FSqrt(sumWZYX);
+}
+
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+ return V4Dot(a, a);
+}
+
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+ //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero()));
+ return V4ScaleInv(a, V4Length(a));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+ //PX_ASSERT(!FAllEq(V4LengthSq(a), FZero()));
+ return V4Scale(a, FRsqrtFast(V4Dot(a, a)));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue)
+{
+ const FloatV zero = FZero();
+ const FloatV length = V4Length(a);
+ const uint32x4_t isGreaterThanZero = FIsGrtr(length, zero);
+ return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue);
+}
+
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+ return vceqq_u32(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+ return vbslq_f32(c, a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+ return vcgtq_f32(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return vcgeq_f32(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+ return vceqq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+ return vmaxq_f32(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+ return vminq_f32(a, b);
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMax(const
Vec4V a)
+{
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t max0 = vpmax_f32(high, low);
+ const float32x2_t max1 = vpmax_f32(max0, max0);
+
+ return max1;
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+ const float32x2_t low = vget_low_f32(a);
+ const float32x2_t high = vget_high_f32(a);
+
+ const float32x2_t min0 = vpmin_f32(high, low);
+ const float32x2_t min1 = vpmin_f32(min0, min0);
+
+ return min1;
+}
+
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+ return V4Max(V4Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue4_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue4_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAllTrue4_R(V4IsEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitNeonSimd::BAnyTrue3_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+ // round to nearest: truncate(a + (0.5f - sign(a))), where sign(a) is 1 for negative a and 0 otherwise
+ const Vec4V half = V4Load(0.5f);
+ const float32x4_t sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(a), 31)));
+ const Vec4V aPlusHalf = V4Add(a, half);
+ const Vec4V aRound = V4Sub(aPlusHalf, sign);
+ return vcvtq_f32_s32(vcvtq_s32_f32(aRound));
+}
+
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec4V tmp = V4Mul(a, recipTwoPi);
+ const Vec4V b = V4Round(tmp);
+ const Vec4V V1 = V4NegMulSub(twoPi, b, a);
+
+ // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+ // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23!
(for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V3 = V4Mul(V2, V1); + const Vec4V V5 = V4Mul(V3, V2); + const Vec4V V7 = V4Mul(V5, V2); + const Vec4V V9 = V4Mul(V7, V2); + const Vec4V V11 = V4Mul(V9, V2); + const Vec4V V13 = V4Mul(V11, V2); + const Vec4V V15 = V4Mul(V13, V2); + const Vec4V V17 = V4Mul(V15, V2); + const Vec4V V19 = V4Mul(V17, V2); + const Vec4V V21 = V4Mul(V19, V2); + const Vec4V V23 = V4Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec4V Result; + Result = V4ScaleAdd(V3, S1, V1); + Result = V4ScaleAdd(V5, S2, Result); + Result = V4ScaleAdd(V7, S3, Result); + Result = V4ScaleAdd(V9, S4, Result); + Result = V4ScaleAdd(V11, S5, Result); + Result = V4ScaleAdd(V13, S6, Result); + Result = V4ScaleAdd(V15, S7, Result); + Result = V4ScaleAdd(V17, S8, Result); + Result = V4ScaleAdd(V19, S9, Result); + Result = V4ScaleAdd(V21, S10, Result); + Result = V4ScaleAdd(V23, S11, Result); + + return Result; +} + +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI)
+ const Vec4V V2 = V4Mul(V1, V1);
+ const Vec4V V4 = V4Mul(V2, V2);
+ const Vec4V V6 = V4Mul(V4, V2);
+ const Vec4V V8 = V4Mul(V4, V4);
+ const Vec4V V10 = V4Mul(V6, V4);
+ const Vec4V V12 = V4Mul(V6, V6);
+ const Vec4V V14 = V4Mul(V8, V6);
+ const Vec4V V16 = V4Mul(V8, V8);
+ const Vec4V V18 = V4Mul(V10, V8);
+ const Vec4V V20 = V4Mul(V10, V10);
+ const Vec4V V22 = V4Mul(V12, V10);
+
+ const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+ const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+ const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+ const FloatV C1 = V4GetY(cosCoefficients0);
+ const FloatV C2 = V4GetZ(cosCoefficients0);
+ const FloatV C3 = V4GetW(cosCoefficients0);
+ const FloatV C4 = V4GetX(cosCoefficients1);
+ const FloatV C5 = V4GetY(cosCoefficients1);
+ const FloatV C6 = V4GetZ(cosCoefficients1);
+ const FloatV C7 = V4GetW(cosCoefficients1);
+ const FloatV C8 = V4GetX(cosCoefficients2);
+ const FloatV C9 = V4GetY(cosCoefficients2);
+ const FloatV C10 = V4GetZ(cosCoefficients2);
+ const FloatV C11 = V4GetW(cosCoefficients2);
+
+ Vec4V Result;
+ Result = V4ScaleAdd(V2, C1, V4One());
+ Result = V4ScaleAdd(V4, C2, Result);
+ Result = V4ScaleAdd(V6, C3, Result);
+ Result = V4ScaleAdd(V8, C4, Result);
+ Result = V4ScaleAdd(V10, C5, Result);
+ Result = V4ScaleAdd(V12, C6, Result);
+ Result = V4ScaleAdd(V14, C7, Result);
+ Result = V4ScaleAdd(V16, C8, Result);
+ Result = V4ScaleAdd(V18, C9, Result);
+ Result = V4ScaleAdd(V20, C10, Result);
+ Result = V4ScaleAdd(V22, C11, Result);
+
+ return Result;
+}
+
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+ const float32x4x2_t v0v1 = vzipq_f32(col0, col2);
+ const float32x4x2_t v2v3 = vzipq_f32(col1, col3);
+ const float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]);
+ const float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]);
+ col0 = zip0.val[0];
+ col1 = zip0.val[1];
+ col2 = zip1.val[0];
+ col3 = zip1.val[1];
+}
+
+//////////////////////////////////
+// BOOLV
+//////////////////////////////////
+
+PX_FORCE_INLINE BoolV BFFFF()
+{
+ return vmovq_n_u32(0);
+}
+
+PX_FORCE_INLINE BoolV BFFFT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ return vcombine_u32(zeros, zo);
+}
+
+PX_FORCE_INLINE BoolV BFFTF()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t oz = vext_u32(ones, zeros, 1);
+ return vcombine_u32(zeros, oz);
+}
+
+PX_FORCE_INLINE BoolV BFFTT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ return vcombine_u32(zeros, ones);
+}
+
+PX_FORCE_INLINE BoolV BFTFF()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ return vcombine_u32(zo, zeros);
+}
+
+PX_FORCE_INLINE BoolV BFTFT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ return vcombine_u32(zo, zo);
+}
+
+PX_FORCE_INLINE BoolV BFTTF()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones = vmov_n_u32(0xffffFFFF);
+ const uint32x2_t zo = vext_u32(zeros, ones, 1);
+ const uint32x2_t oz = vext_u32(ones, zeros, 1);
+ return vcombine_u32(zo, oz);
+}
+
+PX_FORCE_INLINE BoolV BFTTT()
+{
+ const uint32x2_t zeros = vmov_n_u32(0);
+ const uint32x2_t ones =
vmov_n_u32(0xffffFFFF); + const uint32x2_t zo = vext_u32(zeros, ones, 1); + return vcombine_u32(zo, ones); +} + +PX_FORCE_INLINE BoolV BTFFF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + // const uint32x2_t zo = vext_u32(zeros, ones, 1); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, zeros); +} + +PX_FORCE_INLINE BoolV BTFFT() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t zo = vext_u32(zeros, ones, 1); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, zo); +} + +PX_FORCE_INLINE BoolV BTFTF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, oz); +} + +PX_FORCE_INLINE BoolV BTFTT() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(oz, ones); +} + +PX_FORCE_INLINE BoolV BTTFF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + return vcombine_u32(ones, zeros); +} + +PX_FORCE_INLINE BoolV BTTFT() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t zo = vext_u32(zeros, ones, 1); + return vcombine_u32(ones, zo); +} + +PX_FORCE_INLINE BoolV BTTTF() +{ + const uint32x2_t zeros = vmov_n_u32(0); + const uint32x2_t ones = vmov_n_u32(0xffffFFFF); + const uint32x2_t oz = vext_u32(ones, zeros, 1); + return vcombine_u32(ones, oz); +} + +PX_FORCE_INLINE BoolV BTTTT() +{ + return vmovq_n_u32(0xffffFFFF); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + return BTFFF(); +} + +PX_FORCE_INLINE BoolV BYMask() +{ + return BFTFF(); +} + +PX_FORCE_INLINE BoolV BZMask() +{ + return BFFTF(); +} + +PX_FORCE_INLINE BoolV BWMask() +{ + return BFFFT(); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV f) +{ + const uint32x2_t fLow = vget_low_u32(f); + return vdupq_lane_u32(fLow, 0); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV f) +{ + const uint32x2_t fLow = vget_low_u32(f); + return vdupq_lane_u32(fLow, 1); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV f) +{ + const uint32x2_t fHigh = vget_high_u32(f); + return vdupq_lane_u32(fHigh, 0); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV f) +{ + const uint32x2_t fHigh = vget_high_u32(f); + return vdupq_lane_u32(fHigh, 1); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return vbslq_u32(BFTTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return vbslq_u32(BTFTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return vbslq_u32(BTTFT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return vbslq_u32(BTTTF(), v, f); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return vandq_u32(a, b); +} + +PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + return vmvnq_u32(a); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + // return vbicq_u32(a, b); + return vandq_u32(a, vmvnq_u32(b)); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return vorrq_u32(a, b); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + const uint32x2_t allTrue = vmov_n_u32(0xffffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + 
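+ // each BoolV lane is either all zeros or all ones, so narrowing twice packs
+ // one byte per lane into a single 32-bit word; comparing that word against
+ // 0xffffFFFF then tests all four lanes at once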
const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vceq_u32(finalReduce, allTrue); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + const uint32x2_t allTrue = vmov_n_u32(0xffffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vtst_u32(finalReduce, allTrue); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + const uint32x2_t allTrue3 = vmov_n_u32(0x00ffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vceq_u32(vand_u32(finalReduce, allTrue3), allTrue3); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + const uint32x2_t allTrue3 = vmov_n_u32(0x00ffFFFF); + const uint16x4_t dHigh = vget_high_u16(vreinterpretq_u16_u32(a)); + const uint16x4_t dLow = vmovn_u32(a); + uint16x8_t combined = vcombine_u16(dLow, dHigh); + const uint32x2_t finalReduce = vreinterpret_u32_u8(vmovn_u16(combined)); + const uint32x2_t result = vtst_u32(vand_u32(finalReduce, allTrue3), allTrue3); + return vdupq_lane_u32(result, 0); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + const BoolV bTest = vceqq_u32(a, b); + return internalUnitNeonSimd::BAllTrue4_R(bTest); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return BAllEq(a, BTTTT()); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return BAllEq(a, BFFFF()); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + static PX_ALIGN(16, const PxU32) bitMaskData[4] = { 1, 2, 4, 8 }; + const uint32x4_t bitMask = *(reinterpret_cast<const uint32x4_t*>(bitMaskData)); + const uint32x4_t t0 = vandq_u32(a, bitMask); + const uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0)); // Pairwise add (0 + 1), (2 + 3) + return PxU32(vget_lane_u32(vpadd_u32(t1, t1), 0)); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3Dot(a.col0, b); + const FloatV y = V3Dot(a.col1, b); + const FloatV z = V3Dot(a.col2, b); + return V3Merge(x, y, z); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + 
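+// Mat33V stores its columns as Vec3V: M33MulV3 above forms the linear
+// combination x*col0 + y*col1 + z*col2, while M33TrnspsMulV3 dots b against
+// each column, i.e. multiplies by the transpose without materializing it.
+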
+PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b)
+{
+ return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b));
+}
+
+PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a)
+{
+ const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0));
+ const BoolV btttf = BTTTF();
+
+ const Vec3V cross01 = V3Cross(a.col0, a.col1);
+ const Vec3V cross12 = V3Cross(a.col1, a.col2);
+ const Vec3V cross20 = V3Cross(a.col2, a.col0);
+ const FloatV dot = V3Dot(cross01, a.col2);
+ const FloatV invDet = FRecipFast(dot);
+
+ const float32x4x2_t merge = vzipq_f32(cross12, cross01);
+ const float32x4_t mergeh = merge.val[0];
+ const float32x4_t mergel = merge.val[1];
+
+ // const Vec3V colInv0 = XMVectorPermute(mergeh,cross20,PxPermuteControl(0,4,1,7));
+ const float32x4_t colInv0_xxyy = vzipq_f32(mergeh, cross20).val[0];
+ const float32x4_t colInv0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(colInv0_xxyy), btttf));
+
+ // const Vec3V colInv1 = XMVectorPermute(mergeh,cross20,PxPermuteControl(2,5,3,7));
+ const float32x2_t zw0 = vget_high_f32(mergeh);
+ const float32x2_t xy1 = vget_low_f32(cross20);
+ const float32x2_t yzero1 = vext_f32(xy1, zeros, 1);
+ const float32x2x2_t merge1 = vzip_f32(zw0, yzero1);
+ const float32x4_t colInv1 = vcombine_f32(merge1.val[0], merge1.val[1]);
+
+ // const Vec3V colInv2 = XMVectorPermute(mergel,cross20,PxPermuteControl(0,6,1,7));
+ const float32x2_t x0y0 = vget_low_f32(mergel);
+ const float32x2_t z1w1 = vget_high_f32(cross20);
+ const float32x2x2_t merge2 = vzip_f32(x0y0, z1w1);
+ const float32x4_t colInv2 = vcombine_f32(merge2.val[0], merge2.val[1]);
+
+ return Mat33V(vmulq_lane_f32(colInv0, invDet, 0), vmulq_lane_f32(colInv1, invDet, 0),
+ vmulq_lane_f32(colInv2, invDet, 0));
+}
+
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+ return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)),
+ V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)),
+ V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2)));
+}
+
+PX_FORCE_INLINE Mat33V M33Identity()
+{
+ return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ());
+}
+
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+ return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+ return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+ return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v)
+{
+ const BoolV bTFFF = BTFFF();
+ const BoolV bFTFF = BFTFF();
+ const BoolV bFFTF = BFFTF();
+
+ const Vec3V zero = V3Zero();
+
+ return Mat33V(V3Sel(bTFFF, v, zero), V3Sel(bFTFF, v, zero), V3Sel(bFFTF, v, zero));
+}
+
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+ const Vec3V x = V3Mul(V3UnitX(), d);
+ const Vec3V y = V3Mul(V3UnitY(), d);
+ const Vec3V z = V3Mul(V3UnitZ(), d);
+ return Mat33V(x, y, z);
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3GetX(b);
+ const FloatV y = V3GetY(b);
+ const FloatV z = V3GetZ(b);
+ const Vec3V v0 = V3Scale(a.col0, x);
+ const Vec3V v1 = V3Scale(a.col1, y);
+ const Vec3V v2 = V3Scale(a.col2, z);
+ const Vec3V v0PlusV1 = V3Add(v0, v1);
+ const Vec3V v0PlusV1Plusv2 = V3Add(v0PlusV1, v2);
+ return V3Add(v0PlusV1Plusv2, a.col3);
+}
+
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
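+ // rotate/scale b by the upper 3x3 block only; unlike M34MulV3 above,
+ // the translation column a.col3 is not added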
+ const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) +{ + const FloatV x = V3Dot(a.col0, b); + const FloatV y = V3Dot(a.col1, b); + const FloatV z = V3Dot(a.col2, b); + return V3Merge(x, y, z); +} + +PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)), + V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)), + V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2))); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + const FloatV x = V4GetX(b); + const FloatV y = V4GetY(b); + const FloatV z = V4GetZ(b); + const FloatV w = V4GetW(b); + + const Vec4V v0 = V4Scale(a.col0, x); + const Vec4V v1 = V4Scale(a.col1, y); + const Vec4V v2 = V4Scale(a.col2, z); + const Vec4V v3 = V4Scale(a.col3, w); + const Vec4V v0PlusV1 = V4Add(v0, v1); + const Vec4V v0PlusV1Plusv2 = V4Add(v0PlusV1, v2); + return V4Add(v0PlusV1Plusv2, v3); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + return V4Merge(V4Dot(a.col0, b), V4Dot(a.col1, b), V4Dot(a.col2, b), V4Dot(a.col3, b)); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + // asm volatile( + // "vzip.f32 %q0, %q2 \n\t" + // "vzip.f32 %q1, %q3 \n\t" + // "vzip.f32 %q0, %q1 \n\t" + // "vzip.f32 %q2, %q3 \n\t" + // : "+w" (a.col0), "+w" (a.col1), "+w" (a.col2), "+w" a.col3)); + + const float32x4x2_t v0v1 = vzipq_f32(a.col0, a.col2); + const float32x4x2_t v2v3 = vzipq_f32(a.col1, a.col3); + const float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]); + const float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]); + + return Mat44V(zip0.val[0], zip0.val[1], zip1.val[0], zip1.val[1]); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + float32x4_t minor0, minor1, minor2, minor3; + float32x4_t row0, row1, row2, row3; + float32x4_t det, tmp1; + + tmp1 = vmovq_n_f32(0.0f); + row1 = vmovq_n_f32(0.0f); + row3 = vmovq_n_f32(0.0f); + + row0 = a.col0; + row1 = vextq_f32(a.col1, a.col1, 2); + row2 = a.col2; + row3 = vextq_f32(a.col3, a.col3, 2); + + tmp1 = vmulq_f32(row2, row3); + tmp1 = vrev64q_f32(tmp1); + minor0 = 
vmulq_f32(row1, tmp1); + minor1 = vmulq_f32(row0, tmp1); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0); + minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1); + minor1 = vextq_f32(minor1, minor1, 2); + + tmp1 = vmulq_f32(row1, row2); + tmp1 = vrev64q_f32(tmp1); + minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0); + minor3 = vmulq_f32(row0, tmp1); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1)); + minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3); + minor3 = vextq_f32(minor3, minor3, 2); + + tmp1 = vmulq_f32(vextq_f32(row1, row1, 2), row3); + tmp1 = vrev64q_f32(tmp1); + row2 = vextq_f32(row2, row2, 2); + minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0); + minor2 = vmulq_f32(row0, tmp1); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1)); + minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2); + minor2 = vextq_f32(minor2, minor2, 2); + + tmp1 = vmulq_f32(row0, row1); + tmp1 = vrev64q_f32(tmp1); + minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2); + minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2); + minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1)); + + tmp1 = vmulq_f32(row0, row3); + tmp1 = vrev64q_f32(tmp1); + minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1)); + minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1); + minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1)); + + tmp1 = vmulq_f32(row0, row2); + tmp1 = vrev64q_f32(tmp1); + minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1); + minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1)); + tmp1 = vextq_f32(tmp1, tmp1, 2); + minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1)); + minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3); + + det = vmulq_f32(row0, minor0); + det = vaddq_f32(vextq_f32(det, det, 2), det); + det = vaddq_f32(vrev64q_f32(det), det); + det = vdupq_lane_f32(VRECIPE(vget_low_f32(det)), 0); + + minor0 = vmulq_f32(det, minor0); + minor1 = vmulq_f32(det, minor1); + minor2 = vmulq_f32(det, minor2); + minor3 = vmulq_f32(det, minor3); + Mat44V invTrans(minor0, minor1, minor2, minor3); + return M44Trnsps(invTrans); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) +{ + const float32x4_t ret = { x, y, z, w }; + return ret; +} + +/* +PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b) +{ + return vcombine_u16(vqmovn_u32(a), vqmovn_u32(b)); +} +*/ + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return vbslq_u32(c, a, b); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return vorrq_u32(a, b); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return veorq_u32(a, b); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return vandq_u32(a, b); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + // return vbicq_u32(a, b); // creates gcc compiler bug in RTreeQueries.cpp + return vandq_u32(a, vmvnq_u32(b)); +} + +/* +PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b) +{ + return vorrq_u16(a, b); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b) +{ + return vandq_u16(a, b); +} +*/ +/* +PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b) +{ + return vbicq_u16(a, b); +} +*/ + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return vdupq_n_s32(i); +} + +PX_FORCE_INLINE VecI32V 
I4LoadU(const PxI32* i) +{ + return vld1q_s32(i); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + return vld1q_s32(i); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return vaddq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return vsubq_s32(a, b); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return vcgtq_s32(a, b); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return vceqq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return vbslq_s32(c, a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return vdupq_n_s32(0); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return vdupq_n_s32(1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return vdupq_n_s32(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return vdupq_n_s32(-1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return U4Load(0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return U4Load(1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return U4Load(2); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + return shift; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return vshlq_s32(a, count); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return vshlq_s32(a, VecI32V_Sub(I4Load(0), count)); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return vandq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return vorrq_s32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg f) +{ + const int32x2_t fLow = vget_low_s32(f); + return vdupq_lane_s32(fLow, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg f) +{ + const int32x2_t fLow = vget_low_s32(f); + return vdupq_lane_s32(fLow, 1); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg f) +{ + const int32x2_t fHigh = vget_high_s32(f); + return vdupq_lane_s32(fHigh, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg f) +{ + const int32x2_t fHigh = vget_high_s32(f); + return vdupq_lane_s32(fHigh, 1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + return vbslq_s32(c, a, b); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + *i = vgetq_lane_s32(a, 0); +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d) +{ + const int32x2_t aLow = vget_low_s32(a); + const int32x2_t bLow = vget_low_s32(b); + const int32x2_t cLow = vget_low_s32(c); + const int32x2_t dLow = vget_low_s32(d); + + const int32x2_t low = vext_s32(aLow, bLow, 1); + const int32x2_t high = vext_s32(cLow, dLow, 1); + + return vcombine_s32(low, high); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) +{ + return vreinterpretq_s32_u32(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +/* +template<int a> PX_FORCE_INLINE VecI32V V4ISplat() +{ + return vdupq_n_s32(a); +} + +template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat() +{ + return vdupq_n_u32(a); +} +*/ + +/* +PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address) +{ + vst1q_u16((uint16_t*)address, val); +} +*/ + +PX_FORCE_INLINE void 
V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + vst1q_u32(reinterpret_cast<uint32_t*>(address), val); +} + +PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) +{ + return vld1q_f32(reinterpret_cast<float32_t*>(addr)); +} + +PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) +{ + return vld1q_f32(reinterpret_cast<float32_t*>(addr)); +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + return vreinterpretq_f32_u32(V4U32Andc(vreinterpretq_u32_f32(a), b)); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return V4IsGrtr(a, b); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return vld1q_u16(reinterpret_cast<uint16_t*>(addr)); +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return vld1q_u16(reinterpret_cast<uint16_t*>(addr)); +} + +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + return vcgtq_u16(a, b); +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecI16V a, VecI16V b) +{ + return vcgtq_s16(a, b); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + return vcvtq_f32_u32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) +{ + return vcvtq_f32_s32(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + return vcvtq_s32_f32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + return vreinterpretq_f32_u32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + return vreinterpretq_f32_s32(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return vreinterpretq_u32_f32(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return vreinterpretq_s32_f32(a); +} + +template <int index> +PX_FORCE_INLINE BoolV BSplatElement(BoolV a) +{ + if(index < 2) + { + return vdupq_lane_u32(vget_low_u32(a), index); + } + else if(index == 2) + { + return vdupq_lane_u32(vget_high_u32(a), 0); + } + else if(index == 3) + { + return vdupq_lane_u32(vget_high_u32(a), 1); + } +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + if(index < 2) + { + return vdupq_lane_u32(vget_low_u32(a), index); + } + else if(index == 2) + { + return vdupq_lane_u32(vget_high_u32(a), 0); + } + else if(index == 3) + { + return vdupq_lane_u32(vget_high_u32(a), 1); + } +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + if(index < 2) + { + return vdupq_lane_f32(vget_low_f32(a), index); + } + else if(index == 2) + { + return vdupq_lane_f32(vget_high_f32(a), 0); + } + else if(index == 3) + { + return vdupq_lane_f32(vget_high_f32(a), 1); + } +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + const uint32x4_t ret = { x, y, z, w }; + return ret; +} + +PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) +{ + return vdupq_n_u32(i); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return vld1q_u32(i); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + return vld1q_u32(i); +} + +PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in) +{ + const float32x4_t ones = vdupq_n_f32(1.0f); + const float32x4_t rdToZero = vcvtq_f32_s32(vcvtq_s32_f32(in)); + const float32x4_t rdToZeroPlusOne = vaddq_f32(rdToZero, ones); + const uint32x4_t gt = vcgtq_f32(in, rdToZero); + return vbslq_f32(gt, rdToZeroPlusOne, rdToZero); +} + +PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in) +{ + const float32x4_t ones = vdupq_n_f32(1.0f); + const float32x4_t rdToZero = vcvtq_f32_s32(vcvtq_s32_f32(in)); + const float32x4_t rdToZeroMinusOne = vsubq_f32(rdToZero, 
ones); + const uint32x4_t lt = vcltq_f32(in, rdToZero); + return vbslq_f32(lt, rdToZeroMinusOne, rdToZero); +} + +PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power) +{ + PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate"); + PX_UNUSED(power); // prevent warning in release builds + + return vcvtq_u32_f32(in); +} + +PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2) +{ + const FloatV one = FOne(); + const FloatV x = V4GetX(q); + const FloatV y = V4GetY(q); + const FloatV z = V4GetZ(q); + const FloatV w = V4GetW(q); + + const FloatV x2 = FAdd(x, x); + const FloatV y2 = FAdd(y, y); + const FloatV z2 = FAdd(z, z); + + const FloatV xx = FMul(x2, x); + const FloatV yy = FMul(y2, y); + const FloatV zz = FMul(z2, z); + + const FloatV xy = FMul(x2, y); + const FloatV xz = FMul(x2, z); + const FloatV xw = FMul(x2, w); + + const FloatV yz = FMul(y2, z); + const FloatV yw = FMul(y2, w); + const FloatV zw = FMul(z2, w); + + const FloatV v = FSub(one, xx); + + column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw)); + column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw)); + column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy)); +} + +#endif // PSFOUNDATION_PSUNIXNEONINLINEAOS_H diff --git a/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h new file mode 100644 index 00000000..54c622f6 --- /dev/null +++ b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h @@ -0,0 +1,179 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXSSE2AOS_H +#define PSFOUNDATION_PSUNIXSSE2AOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! 
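+// (For orientation: this header only sets up the SSE2 register types for the AoS vector math layer.
+// FloatV/Vec3V/Vec4V/BoolV/QuatV all alias __m128 and VecI32V aliases __m128i, while the remaining
+// integer types go through UnionM128 so their lane views can alias the same 128-bit value.)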
+ +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif +#ifdef __EMSCRIPTEN__ +typedef int8_t __int8_t; +typedef int16_t __int16_t; +typedef int32_t __int32_t; +typedef int64_t __int64_t; +typedef uint16_t __uint16_t; +typedef uint32_t __uint32_t; +typedef uint64_t __uint64_t; +#endif + +typedef union UnionM128 +{ + UnionM128() + { + } + UnionM128(__m128 in) + { + m128 = in; + } + + UnionM128(__m128i in) + { + m128i = in; + } + + operator __m128() + { + return m128; + } + + operator const __m128() const + { + return m128; + } + + float m128_f32[4]; + __int8_t m128_i8[16]; + __int16_t m128_i16[8]; + __int32_t m128_i32[4]; + __int64_t m128_i64[2]; + __uint16_t m128_u16[8]; + __uint32_t m128_u32[4]; + __uint64_t m128_u64[2]; + __m128 m128; + __m128i m128i; +} UnionM128; + +typedef __m128 FloatV; +typedef __m128 Vec3V; +typedef __m128 Vec4V; +typedef __m128 BoolV; +typedef __m128 QuatV; +typedef __m128i VecI32V; +typedef UnionM128 VecU32V; +typedef UnionM128 VecU16V; +typedef UnionM128 VecI16V; +typedef UnionM128 VecU8V; + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define VecU8VArg VecU8V & +#define QuatVArg QuatV & + +// Optimization for situations in which you cross product multiple vectors with the same vector. +// Avoids 2X shuffles per product +struct VecCrossV +{ + Vec3V mL1; + Vec3V mR1; +}; + +struct VecShiftV +{ + VecI32V shift; +}; +#define VecShiftVArg VecShiftV & + +PX_ALIGN_PREFIX(16) +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); + Vec3V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); + Vec4V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +#endif // PSFOUNDATION_PSUNIXSSE2AOS_H diff --git a/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h new file mode 100644 index 00000000..4503c80e --- /dev/null +++ b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h @@ -0,0 +1,3208 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. 
Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSUNIXSSE2INLINEAOS_H +#define PSFOUNDATION_PSUNIXSSE2INLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +#ifdef __SSE4_2__ +#include "smmintrin.h" +#endif + +#include "../../PsVecMathSSE.h" + +#define PX_FPCLASS_SNAN 0x0001 /* signaling NaN */ +#define PX_FPCLASS_QNAN 0x0002 /* quiet NaN */ +#define PX_FPCLASS_NINF 0x0004 /* negative infinity */ +#define PX_FPCLASS_PINF 0x0200 /* positive infinity */ + +PX_FORCE_INLINE __m128 m128_I2F(__m128i n) +{ + return _mm_castsi128_ps(n); +} +PX_FORCE_INLINE __m128i m128_F2I(__m128 n) +{ + return _mm_castps_si128(n); +} + +////////////////////////////////////////////////////////////////////// +//Test that Vec3V and FloatV are legal +////////////////////////////////////////////////////////////////////// + +#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f +PX_FORCE_INLINE static bool isValidFloatV(const FloatV a) +{ + const PxF32 x = V4ReadX(a); + const PxF32 y = V4ReadY(a); + const PxF32 z = V4ReadZ(a); + const PxF32 w = V4ReadW(a); + + if ( + (PxAbs(x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - z) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - w) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + + if ( + (PxAbs((x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - z) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - w) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + + return false; +} + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + PX_ALIGN(16, PxF32 f[4]); + V4StoreA(a, f); + return (f[3] == 0.0f); +} + +PX_FORCE_INLINE bool isFiniteLength(const Vec3V a) +{ + return !FAllEq(V4LengthSq(a), FZero()); +} + +PX_FORCE_INLINE bool isAligned16(void* a) +{ + return(0 == (size_t(a) & 0x0f)); +} + +//ASSERT_FINITELENGTH is deactivated because there is a lot of code that calls a simd normalisation function with zero length but then ignores the result. 
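+// The checks that remain encode the representation invariants assumed throughout this file: a
+// FloatV carries (approximately) the same value in all four lanes, a Vec3V keeps 0.0f in its w
+// lane, and the *A load/store variants require 16-byte-aligned pointers. Illustrative (PX_DEBUG):
+// const Vec3V v = V3LoadU(PxVec3(1.0f, 2.0f, 3.0f)); // w lane cleared => ASSERT_ISVALIDVEC3V(v) holds
+// const FloatV f = FLoad(4.0f); // all lanes 4.0f => ASSERT_ISVALIDFLOATV(f) holds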
+ +#if PX_DEBUG +#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a)) +#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a)) +#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(reinterpret_cast<void*>(a))) +#define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a)) +#else +#define ASSERT_ISVALIDVEC3V(a) +#define ASSERT_ISVALIDFLOATV(a) +#define ASSERT_ISALIGNED16(a) +#define ASSERT_ISFINITELENGTH(a) +#endif + + +namespace internalUnitSSE2Simd +{ +PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask == 0xf); +} + +PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32((moveMask & 0x7) == 0x7); +} + +PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask != 0x0); +} + +PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32((moveMask & 0x7) != 0x0); +} + +PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b) +{ + // This is a bit of a bodge. + //_mm_comieq_ss returns 1 if either value is nan so we need to re-cast a and b with true encoded as a non-nan + // number. + // There must be a better way of doing this in sse. + const BoolV one = FOne(); + const BoolV zero = FZero(); + const BoolV a1 = V4Sel(a, one, zero); + const BoolV b1 = V4Sel(b, one, zero); + return ( + _mm_comieq_ss(a1, b1) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(1, 1, 1, 1))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(2, 2, 2, 2))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 3, 3, 3)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(3, 3, 3, 3)))); +} + +const PX_ALIGN(16, PxF32 gMaskXYZ[4]) = { physx::PxUnionCast<PxF32>(0xffffffff), physx::PxUnionCast<PxF32>(0xffffffff), + physx::PxUnionCast<PxF32>(0xffffffff), 0 }; +} + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + const float f = 1.0f; + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comieq_ss(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + return V3AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return V4AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return internalUnitSSE2Simd::BAllTrue4_R(VecI32V_IsEq(m128_F2I(a), m128_F2I(b))) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return internalUnitSSE2Simd::BAllTrue4_R(V4IsEqU32(a, b)) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + BoolV c = m128_I2F(_mm_cmpeq_epi32(a, b)); + return internalUnitSSE2Simd::BAllTrue4_R(c) != 0; +} + +#define VECMATH_AOS_EPSILON (1e-3f) + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + const FloatV c = FSub(a, b); + const FloatV minError = FLoad(-VECMATH_AOS_EPSILON); + const FloatV maxError = FLoad(VECMATH_AOS_EPSILON); + return _mm_comigt_ss(c, minError) && _mm_comilt_ss(c, maxError); +} + +PX_FORCE_INLINE bool 
allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + const Vec3V c = V3Sub(a, b); + const Vec3V minError = V3Load(-VECMATH_AOS_EPSILON); + const Vec3V maxError = V3Load(VECMATH_AOS_EPSILON); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxError)); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const Vec4V c = V4Sub(a, b); + const Vec4V minError = V4Load(-VECMATH_AOS_EPSILON); + const Vec4V maxError = V4Load(VECMATH_AOS_EPSILON); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxError) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), minError) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), maxError)); +} +} + +///////////////////////////////////////////////////////////////////// +////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + PxF32 badNumber = + physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); + const FloatV vBadNum = FLoad(badNumber); + const BoolV vMask = BAnd(vBadNum, a); + return internalUnitSSE2Simd::FiniteTestEq(vMask, BFFFF()) == 1; +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + PxF32 badNumber = + physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); + const Vec3V vBadNum = V3Load(badNumber); + const BoolV vMask = BAnd(BAnd(vBadNum, a), BTTTF()); + return internalUnitSSE2Simd::FiniteTestEq(vMask, BFFFF()) == 1; +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + /*Vec4V a; + PX_ALIGN(16, PxF32 f[4]); + F32Array_Aligned_From_Vec4V(a, f); + return PxIsFinite(f[0]) + && PxIsFinite(f[1]) + && PxIsFinite(f[2]) + && PxIsFinite(f[3]);*/ + + PxF32 badNumber = + physx::PxUnionCast<PxF32, PxU32>(PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF); + const Vec4V vBadNum = V4Load(badNumber); + const BoolV vMask = BAnd(vBadNum, a); + + return internalUnitSSE2Simd::FiniteTestEq(vMask, BFFFF()) == 1; +} + +PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) ? 
true : false; +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero())); +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), FZero())); +} + +///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + return _mm_set_ps(0.0f, f, f, f); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ + const PxU32 i = -PxI32(f); + return _mm_load1_ps(reinterpret_cast<const float*>(&i)); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f)); + return _mm_and_ps(reinterpret_cast<const Vec3V&>(f), V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f)); + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(const_cast<PxF32*>(f)); + return _mm_and_ps(V4LoadA(f), V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i) +{ + return _mm_set_ps(0.0f, i[2], i[1], i[0]); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) +{ + return V4ClearW(v); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) +{ + return v; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return f; // ok if it is implemented as the same type. 
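+ // (Vec3V and Vec4V are both __m128 on this platform, and a valid Vec3V already stores 0.0f in
+ // its w lane, so no conversion work is needed; the assert above is what enforces that invariant.)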
+} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return f; +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return Vec3V_From_Vec4V(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return Vec3V_From_Vec4V_WUndefined(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + V3StoreU(m.col0, out.column0); + V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(const_cast<PxF32*>(f)); + return _mm_load_ps(f); +} + +PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps(f, a); +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + _mm_storeu_ps(f, a); +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps(reinterpret_cast<PxF32*>(f), a); +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + _mm_store_ps(reinterpret_cast<float*>(u), uv); +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + ASSERT_ISALIGNED16(i); + _mm_store_ps(reinterpret_cast<float*>(i), m128_I2F(iv)); +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return _mm_loadu_ps(f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + const PX_ALIGN(16, PxI32) b[4] = { -PxI32(f[0]), -PxI32(f[1]), -PxI32(f[2]), -PxI32(f[3]) }; + return _mm_load_ps(reinterpret_cast<const float*>(&b)); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + ASSERT_ISVALIDFLOATV(a); + _mm_store_ss(f, a); +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32) f2[4]; + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + PX_ALIGN(16, PxF32) f2[4]; + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) +{ + _mm_store_ss(reinterpret_cast<PxF32*>(b2), b); +} + +PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) +{ + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&i)); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return _mm_loadu_ps(reinterpret_cast<const PxF32*>(i)); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + ASSERT_ISALIGNED16(const_cast<PxU32*>(i)); + return _mm_load_ps(reinterpret_cast<const PxF32*>(i)); +} + +////////////////////////////////// +// FLOATV +////////////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return FLoad(0.0f); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV IZero() +{ + const PxU32 zero = 0; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&zero)); +} + +PX_FORCE_INLINE FloatV IOne() +{ + const PxU32 one = 1;
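+ // Note: the I*() constants (IZero() through IFour()) place raw integer bit patterns in the
+ // float lanes, e.g. IOne() yields 0x00000001 per lane rather than 1.0f; they are reinterpreted,
+ // not converted.
+ 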
return _mm_load1_ps(reinterpret_cast<const PxF32*>(&one)); +} + +PX_FORCE_INLINE FloatV ITwo() +{ + const PxU32 two = 2; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&two)); +} + +PX_FORCE_INLINE FloatV IThree() +{ + const PxU32 three = 3; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&three)); +} + +PX_FORCE_INLINE FloatV IFour() +{ + PxU32 four = 4; + return _mm_load1_ps(reinterpret_cast<const PxF32*>(&four)); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); +/* + if(!isValidFloatV(a)) + { +assert(false); + } + if(!isValidFloatV(b)) + { +assert(false); + } +*/ + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), a); +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_rcp_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), _mm_sqrt_ps(a)); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_sqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_rsqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FAdd(FMul(a, b), c); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FSub(c, FMul(a, b)); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + PX_ALIGN(16, const PxU32) absMask[4] = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF }; + return _mm_and_ps(a, _mm_load_ps(reinterpret_cast<const PxF32*>(absMask))); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c,BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c,BFFFF())); + ASSERT_ISVALIDFLOATV(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + 
ASSERT_ISVALIDFLOATV(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + ASSERT_ISVALIDFLOATV(minV); + ASSERT_ISVALIDFLOATV(maxV); + return _mm_max_ps(_mm_min_ps(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comigt_ss(a, b); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comige_ss(a, b); +} + +PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comieq_ss(a, b); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); +#ifdef __SSE4_2__ + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + // return _mm_round_ps(a, 0x0); + const FloatV half = FLoad(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const FloatV aRound = FSub(FAdd(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +#endif +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V3 = FMul(V2, V1); + const FloatV V5 = FMul(V3, V2); + const FloatV V7 = FMul(V5, V2); + const FloatV V9 = FMul(V7, V2); + const FloatV V11 = FMul(V9, V2); + const FloatV V13 = FMul(V11, V2); + const FloatV V15 = FMul(V13, V2); + const FloatV V17 = FMul(V15, V2); + const FloatV V19 = FMul(V17, V2); + const FloatV V21 = FMul(V19, V2); + const FloatV V23 = FMul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + FloatV Result; + Result = FScaleAdd(S1, V3, V1); + Result = FScaleAdd(S2, V5, Result); + Result = FScaleAdd(S3, V7, Result); + Result = FScaleAdd(S4, V9, Result); + Result = FScaleAdd(S5, V11, Result); + Result = FScaleAdd(S6, V13, Result); + Result = FScaleAdd(S7, V15, Result); + Result = FScaleAdd(S8, V17, Result); + Result = FScaleAdd(S9, V19, Result); + Result = FScaleAdd(S10, V21, Result); + Result = FScaleAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V4 = FMul(V2, V2); + const FloatV V6 = FMul(V4, V2); + const FloatV V8 = FMul(V4, V4); + const FloatV V10 = FMul(V6, V4); + const FloatV V12 = FMul(V6, V6); + const FloatV V14 = FMul(V8, V6); + const FloatV V16 = FMul(V8, V8); + const FloatV V18 = FMul(V10, V8); + const FloatV V20 = FMul(V10, V10); + const FloatV V22 = FMul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + FloatV Result; + Result = FScaleAdd(C1, V2, V4One()); + Result = FScaleAdd(C2, V4, Result); + Result = FScaleAdd(C3, V6, Result); + Result = FScaleAdd(C4, V8, Result); + Result = FScaleAdd(C5, V10, Result); + Result = FScaleAdd(C6, V12, Result); + Result = FScaleAdd(C7, V14, Result); + Result = FScaleAdd(C8, V16, Result); + Result = FScaleAdd(C9, V18, Result); + Result = FScaleAdd(C10, V20, Result); + Result = FScaleAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + const BoolV c = BOr(FIsGrtr(a, max), FIsGrtr(min, a)); + return !BAllEqFFFF(c); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max) + const BoolV c = BAnd(FIsGrtrOrEq(a, min), FIsGrtrOrEq(max, a)); + return BAllEqTTTT(c); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FOutOfBounds(a, FNeg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FInBounds(a, FNeg(bounds), bounds); +} + +////////////////////////////////// +// VEC3V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + const __m128 zero = FZero(); + const __m128 fff0 = _mm_move_ss(f, zero); + return _mm_shuffle_ps(fff0, fff0, _MM_SHUFFLE(0, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + // static on zero causes compiler crash on x64 debug_opt + const __m128 zero = FZero(); + const __m128 xy = _mm_move_ss(x, y); + const __m128 z0 = _mm_move_ss(zero, z); + + return _mm_shuffle_ps(xy, z0, _MM_SHUFFLE(1, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec3V V3UnitZ() +{ + 
const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f) + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0)); + return V3SetY(r, V3GetX(b)); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c) + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1)); + return V3SetY(r, V3GetY(b)); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2)); + return V3SetY(r, V3GetZ(b)); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return V3Load(0.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_div_ps(a, b)); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_mul_ps(a, _mm_rcp_ps(b))); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3RecipFast(const 
Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rcp_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), _mm_sqrt_ps(a)); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rsqrt_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Scale(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Scale(a, b)); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Mul(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Mul(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Max(a, V3Neg(a)); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); +#ifdef __SSE4_2__ + return _mm_dp_ps(a, b, 0x7f); +#else + const __m128 t0 = _mm_mul_ps(a, b); // aw*bw | az*bz | ay*by | ax*bx + const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); // ay*by | ax*bx | aw*bw | az*bz + const __m128 t2 = _mm_add_ps(t0, t1); // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx + const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by + return _mm_add_ps(t3, t2); // ax*bx + az*bz + ay*by + aw*bw + // ay*by + aw*bw + ax*bx + az*bz + // az*bz + ax*bx + aw*bw + ay*by + // aw*bw + ay*by + az*bz + ax*bx +#endif +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2)); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + VecCrossV v; + v.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + v.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + return v; +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(b); + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(a.mL1, l2), _mm_mul_ps(a.mR1, r2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 r2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + 
const __m128 l2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(b.mR1, r2), _mm_mul_ps(b.mL1, l2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b) +{ + return _mm_sub_ps(_mm_mul_ps(a.mL1, b.mR1), _mm_mul_ps(a.mR1, b.mL1)); +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_sqrt_ps(V3Dot(a, a)); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Dot(a, a); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3ScaleInv(a, _mm_sqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3Scale(a, _mm_rsqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 eps = V3Eps(); + const __m128 length = V3Length(a); + const __m128 isGreaterThanZero = FIsGrtr(length, eps); + return V3Sel(isGreaterThanZero, V3ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + + return _mm_max_ps(_mm_max_ps(shuf1, shuf2), shuf3); +} + +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + + return _mm_min_ps(_mm_min_ps(shuf1, shuf2), shuf3); +} + +// return (a >= 0.0f) ? 
1.0f : -1.0f; +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 one = V3One(); + const __m128 none = V3Neg(one); + return V3Sel(V3IsGrtrOrEq(a, zero), one, none); +} + +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV) +{ + ASSERT_ISVALIDVEC3V(maxV); + ASSERT_ISVALIDVEC3V(minV); + return V3Max(V3Min(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalUnitSSE2Simd::BAllTrue3_R(V4IsEq(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); +#ifdef __SSE4_2__ + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + // return _mm_round_ps(a, 0x0); + const Vec3V half = V3Load(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const Vec3V aRound = V3Sub(V3Add(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +#endif +} + +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V3 = V3Mul(V2, V1); + const Vec3V V5 = V3Mul(V3, V2); + const Vec3V V7 = V3Mul(V5, V2); + const Vec3V V9 = V3Mul(V7, V2); + const Vec3V V11 = V3Mul(V9, V2); + const Vec3V V13 = V3Mul(V11, V2); + const Vec3V V15 = V3Mul(V13, V2); + const Vec3V V17 = V3Mul(V15, V2); + const Vec3V V19 = V3Mul(V17, V2); + const Vec3V V21 = V3Mul(V19, V2); + const Vec3V V23 = V3Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V3, S1, V1); + Result = V3ScaleAdd(V5, S2, Result); + Result = V3ScaleAdd(V7, S3, Result); + Result = V3ScaleAdd(V9, S4, Result); + Result = V3ScaleAdd(V11, S5, Result); + Result = V3ScaleAdd(V13, S6, Result); + Result = V3ScaleAdd(V15, S7, Result); + Result = V3ScaleAdd(V17, S8, Result); + Result = V3ScaleAdd(V19, S9, Result); + Result = V3ScaleAdd(V21, S10, Result); + Result = V3ScaleAdd(V23, S11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V4 = V3Mul(V2, V2); + const Vec3V V6 = V3Mul(V4, V2); + const Vec3V V8 = V3Mul(V4, V4); + const Vec3V V10 = V3Mul(V6, V4); + const Vec3V V12 = V3Mul(V6, V6); + const Vec3V V14 = V3Mul(V8, V6); + const Vec3V V16 = V3Mul(V8, V8); + const Vec3V V18 = V3Mul(V10, V8); + const Vec3V V20 = V3Mul(V10, V10); + const Vec3V V22 = V3Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V2, C1, V3One()); + Result = V3ScaleAdd(V4, C2, Result); + Result = V3ScaleAdd(V6, C3, Result); + Result = V3ScaleAdd(V8, C4, Result); + Result = V3ScaleAdd(V10, C5, Result); + Result = V3ScaleAdd(V12, C6, Result); + Result = V3ScaleAdd(V14, C7, Result); + Result = V3ScaleAdd(V16, C8, Result); + Result = V3ScaleAdd(V18, C9, Result); + Result = V3ScaleAdd(V20, C10, Result); + Result = V3ScaleAdd(V22, C11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0)); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2)); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + // There must be a better way to do this. 
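+ // A possible two-instruction alternative (illustrative sketch, not from the
+ // original source): interleave the x/y lanes of v0 and v1, then shuffle with
+ // zero to build (v1.y, v0.x, 0, 0):
+ // const __m128 lo = _mm_unpacklo_ps(v0, v1); // v0.x, v1.x, v0.y, v1.y
+ // return _mm_shuffle_ps(lo, V3Zero(), _MM_SHUFFLE(0, 0, 0, 3)); // v1.y, v0.x, 0, 0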
+ Vec3V v2 = V3Zero();
+ FloatV y1 = V3GetY(v1);
+ FloatV x0 = V3GetX(v0);
+ v2 = V3SetX(v2, y1);
+ return V3SetY(v2, x0);
+}
+
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+ ASSERT_ISVALIDVEC3V(a);
+#ifdef __SSE4_2__
+ FloatV r = _mm_hadd_ps(a, a);
+ r = _mm_hadd_ps(r, r);
+ return r;
+#else
+ __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+ __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+ __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+ return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); // x+y+z in every lane
+#endif
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(min);
+ ASSERT_ISVALIDVEC3V(max);
+ const BoolV c = BOr(V3IsGrtr(a, max), V3IsGrtr(min, a));
+ return !BAllEqFFFF(c);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(min);
+ ASSERT_ISVALIDVEC3V(max);
+ const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a));
+ return BAllEqTTTT(c);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(bounds);
+ return V3OutOfBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+ ASSERT_ISVALIDVEC3V(a);
+ ASSERT_ISVALIDVEC3V(bounds);
+ return V3InBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+ ASSERT_ISVALIDVEC3V(col0);
+ ASSERT_ISVALIDVEC3V(col1);
+ ASSERT_ISVALIDVEC3V(col2);
+
+ const Vec3V col3 = _mm_setzero_ps();
+ Vec3V tmp0 = _mm_unpacklo_ps(col0, col1);
+ Vec3V tmp2 = _mm_unpacklo_ps(col2, col3);
+ Vec3V tmp1 = _mm_unpackhi_ps(col0, col1);
+ Vec3V tmp3 = _mm_unpackhi_ps(col2, col3);
+ col0 = _mm_movelh_ps(tmp0, tmp2);
+ col1 = _mm_movehl_ps(tmp2, tmp0);
+ col2 = _mm_movelh_ps(tmp1, tmp3);
+}
+
+//////////////////////////////////
+// VEC4V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+ ASSERT_ISVALIDFLOATV(f);
+ // return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0));
+ return f;
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+ ASSERT_ISVALIDFLOATV(floatVArray[0]);
+ ASSERT_ISVALIDFLOATV(floatVArray[1]);
+ ASSERT_ISVALIDFLOATV(floatVArray[2]);
+ ASSERT_ISVALIDFLOATV(floatVArray[3]);
+ const __m128 xw = _mm_move_ss(floatVArray[1], floatVArray[0]); // y, y, y, x
+ const __m128 yz = _mm_move_ss(floatVArray[2], floatVArray[3]); // z, z, z, w
+ return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+ ASSERT_ISVALIDFLOATV(x);
+ ASSERT_ISVALIDFLOATV(y);
+ ASSERT_ISVALIDFLOATV(z);
+ ASSERT_ISVALIDFLOATV(w);
+ const __m128 xw = _mm_move_ss(y, x); // y, y, y, x
+ const __m128 yz = _mm_move_ss(z, w); // z, z, z, w
+ return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+ const Vec4V xz = _mm_unpackhi_ps(x, z);
+ const Vec4V yw = _mm_unpackhi_ps(y, w);
+ return _mm_unpackhi_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+ const Vec4V xz = _mm_unpackhi_ps(x, z);
+ const Vec4V yw = _mm_unpackhi_ps(y, w);
+ return _mm_unpacklo_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg 
x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const Vec4V xz = _mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpackhi_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const Vec4V xz = _mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpacklo_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpacklo_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpackhi_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + const PX_ALIGN(16, PxF32) w[4] = { 0.0f, 0.0f, 0.0f, 1.0f }; + const __m128 w128 = _mm_load_ps(w); + return w128; +} + +PX_FORCE_INLINE Vec4V V4UnitX() +{ + const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTTF(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return _mm_and_ps(v, V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0)); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +template <PxU8 x, PxU8 y, PxU8 z, PxU8 w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x)); +} + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return V4Load(0.0f); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return V4Load(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f) +{ + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, 
const Vec4V b)
+{
+ return _mm_mul_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return _mm_div_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b)
+{
+ return _mm_div_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return _mm_mul_ps(a, _mm_rcp_ps(b));
+}
+
+PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b)
+{
+ return _mm_mul_ps(a, _mm_rcp_ps(b));
+}
+
+PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a)
+{
+ return _mm_div_ps(V4One(), a);
+}
+
+PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a)
+{
+ return _mm_rcp_ps(a);
+}
+
+PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a)
+{
+ return _mm_div_ps(V4One(), _mm_sqrt_ps(a));
+}
+
+PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a)
+{
+ return _mm_rsqrt_ps(a);
+}
+
+PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a)
+{
+ return _mm_sqrt_ps(a);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return V4Add(V4Scale(a, b), c);
+}
+
+PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c)
+{
+ ASSERT_ISVALIDFLOATV(b);
+ return V4Sub(c, V4Scale(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+ return V4Add(V4Mul(a, b), c);
+}
+
+PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+ return V4Sub(c, V4Mul(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a)
+{
+ return V4Max(a, V4Neg(a));
+}
+
+PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a)
+{
+#ifdef __SSE4_2__
+ FloatV r = _mm_hadd_ps(a, a);
+ r = _mm_hadd_ps(r, r);
+ return r;
+#else
+ const Vec4V xy = V4UnpackXY(a, a); // x,x,y,y
+ const Vec4V zw = V4UnpackZW(a, a); // z,z,w,w
+ const Vec4V xz_yw = V4Add(xy, zw); // x+z,x+z,y+w,y+w
+ const FloatV xz = V4GetX(xz_yw); // x+z
+ const FloatV yw = V4GetZ(xz_yw); // y+w
+ return FAdd(xz, yw); // sum
+#endif
+}
+
+PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b)
+{
+#ifdef __SSE4_2__
+ return _mm_dp_ps(a, b, 0xff);
+#else
+ const __m128 dot1 = _mm_mul_ps(a, b); // x,y,z,w
+ const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+ const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+ const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+ return _mm_add_ps(_mm_add_ps(shuf2, shuf3), _mm_add_ps(dot1, shuf1));
+#endif
+}
+
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b)
+{
+#ifdef __SSE4_2__
+ return _mm_dp_ps(a, b, 0x7f);
+#else
+ const __m128 dot1 = _mm_mul_ps(a, b); // x,y,z,w
+ const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+ const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+ const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+ return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); // x+y+z in every lane
+#endif
+}
+
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+ const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+ const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+ const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+ const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+ return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2));
+}
+
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+ return _mm_sqrt_ps(V4Dot(a, a));
+}
+
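+// Illustrative usage sketch (not part of the original header): the dot and
+// length helpers above compose lane-wise; every lane of a FloatV holds the
+// same scalar, so FMul/FDiv (assumed from the FloatV API declared alongside
+// these helpers) act as scalar operations. The name V4CosAngle is
+// hypothetical.
+PX_FORCE_INLINE FloatV V4CosAngle(const Vec4V a, const Vec4V b)
+{
+ ASSERT_ISFINITELENGTH(a);
+ ASSERT_ISFINITELENGTH(b);
+ // cos(theta) = (a . b) / (|a| * |b|)
+ return FDiv(V4Dot(a, b), FMul(V4Length(a), V4Length(b)));
+}
+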
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+ return V4Dot(a, a);
+}
+
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+ ASSERT_ISFINITELENGTH(a);
+ return V4ScaleInv(a, _mm_sqrt_ps(V4Dot(a, a)));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+ ASSERT_ISFINITELENGTH(a);
+ return V4ScaleInvFast(a, _mm_sqrt_ps(V4Dot(a, a)));
+}
+
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue)
+{
+ const __m128 eps = V4Eps();
+ const __m128 length = V4Length(a);
+ const __m128 isGreaterThanZero = V4IsGrtr(length, eps);
+ return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue);
+}
+
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+ return m128_I2F(_mm_cmpeq_epi32(m128_F2I(a), m128_F2I(b)));
+}
+
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+ return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a));
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+ return _mm_cmpgt_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return _mm_cmpge_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+ return _mm_cmpeq_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+ return _mm_max_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+ return _mm_min_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+ const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3));
+ const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
+ const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
+
+ return _mm_max_ps(_mm_max_ps(a, shuf1), _mm_max_ps(shuf2, shuf3));
+}
+
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+ const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3));
+ const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2));
+ const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1));
+
+ return _mm_min_ps(_mm_min_ps(a, shuf1), _mm_min_ps(shuf2, shuf3));
+}
+
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+ return V4Max(V4Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue4_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue4_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAllTrue4_R(V4IsEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+ return internalUnitSSE2Simd::BAnyTrue3_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+#ifdef __SSE4_2__
+ return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#else
+ // return _mm_round_ps(a, 0x0);
+ const Vec4V half = V4Load(0.5f);
+ const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31));
+ const Vec4V aRound = V4Sub(V4Add(a, half), signBit);
+ __m128i tmp = _mm_cvttps_epi32(aRound);
+ return _mm_cvtepi32_ps(tmp);
+#endif
+}
+
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+ const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+ const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+ const Vec4V tmp = V4Mul(a, recipTwoPi);
+ const Vec4V b = 
V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V3 = V4Mul(V2, V1); + const Vec4V V5 = V4Mul(V3, V2); + const Vec4V V7 = V4Mul(V5, V2); + const Vec4V V9 = V4Mul(V7, V2); + const Vec4V V11 = V4Mul(V9, V2); + const Vec4V V13 = V4Mul(V11, V2); + const Vec4V V15 = V4Mul(V13, V2); + const Vec4V V17 = V4Mul(V15, V2); + const Vec4V V19 = V4Mul(V17, V2); + const Vec4V V21 = V4Mul(V19, V2); + const Vec4V V23 = V4Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec4V Result; + Result = V4MulAdd(S1, V3, V1); + Result = V4MulAdd(S2, V5, Result); + Result = V4MulAdd(S3, V7, Result); + Result = V4MulAdd(S4, V9, Result); + Result = V4MulAdd(S5, V11, Result); + Result = V4MulAdd(S6, V13, Result); + Result = V4MulAdd(S7, V15, Result); + Result = V4MulAdd(S8, V17, Result); + Result = V4MulAdd(S9, V19, Result); + Result = V4MulAdd(S10, V21, Result); + Result = V4MulAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V4 = V4Mul(V2, V2); + const Vec4V V6 = V4Mul(V4, V2); + const Vec4V V8 = V4Mul(V4, V4); + const Vec4V V10 = V4Mul(V6, V4); + const Vec4V V12 = V4Mul(V6, V6); + const Vec4V V14 = V4Mul(V8, V6); + const Vec4V V16 = V4Mul(V8, V8); + const Vec4V V18 = V4Mul(V10, V8); + const Vec4V V20 = V4Mul(V10, V10); + const Vec4V V22 = V4Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec4V Result; + Result = V4MulAdd(C1, V2, V4One()); + Result = V4MulAdd(C2, V4, Result); + Result = V4MulAdd(C3, V6, Result); + Result = V4MulAdd(C4, V8, Result); + Result = V4MulAdd(C5, V10, Result); + Result = V4MulAdd(C6, V12, Result); + Result = V4MulAdd(C7, V14, Result); + Result = V4MulAdd(C8, V16, Result); + Result = V4MulAdd(C9, V18, Result); + Result = V4MulAdd(C10, V20, Result); + Result = V4MulAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3) +{ + Vec4V tmp0 = _mm_unpacklo_ps(col0, col1); + Vec4V tmp2 = _mm_unpacklo_ps(col2, col3); + Vec4V tmp1 = _mm_unpackhi_ps(col0, col1); + Vec4V tmp3 = _mm_unpackhi_ps(col2, col3); + col0 = _mm_movelh_ps(tmp0, tmp2); + col1 = _mm_movehl_ps(tmp2, tmp0); + col2 = _mm_movelh_ps(tmp1, tmp3); + col3 = _mm_movehl_ps(tmp3, tmp1); +} + +////////////////////////////////// +// BoolV +////////////////////////////////// + +PX_FORCE_INLINE BoolV BFFFF() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE BoolV BFFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fftt=_mm_load_ps((float*)&f); + return fftt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ftft=_mm_load_ps((float*)&f); + return ftft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 fttf=_mm_load_ps((float*)&f); + return fttf;*/ + return m128_I2F(_mm_set_epi32(0, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fttt=_mm_load_ps((float*)&f); + return 
fttt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BTFFF() +{ + // const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + // const __m128 tfff=_mm_load_ps((float*)&f); + // return tfff; + return m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0xFFFFFFFF}; + const __m128 tfft=_mm_load_ps((float*)&f); + return tfft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0}; + const __m128 tftf=_mm_load_ps((float*)&f); + return tftf;*/ + return m128_I2F(_mm_set_epi32(0, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tftt=_mm_load_ps((float*)&f); + return tftt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0}; + const __m128 ttff=_mm_load_ps((float*)&f); + return ttff;*/ + return m128_I2F(_mm_set_epi32(0, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ttft=_mm_load_ps((float*)&f); + return ttft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 tttf=_mm_load_ps((float*)&f); + return tttf;*/ + return m128_I2F(_mm_set_epi32(0, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tttt=_mm_load_ps((float*)&f); + return tttt;*/ + return m128_I2F(_mm_set_epi32(-1, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + const __m128 tfff=_mm_load_ps((float*)&f); + return tfff;*/ + return m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BYMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BZMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BWMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return V4Sel(BTTTF(), v, f); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return _mm_and_ps(a, b); +} + 
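+// Illustrative sketch (not part of the original header): because the BoolV
+// constants above are full-width lane masks, they compose directly with
+// V4Sel to splice two vectors per component. The helper name below is
+// hypothetical.
+PX_FORCE_INLINE Vec4V V4SelectXYFromA_ZWFromB(const Vec4V a, const Vec4V b)
+{
+ return V4Sel(BTTFF(), a, b); // x,y lanes true -> from a; z,w lanes false -> from b
+}
+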
+PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + const BoolV bAllTrue(BTTTT()); + return _mm_xor_ps(a, bAllTrue); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + return _mm_andnot_ps(b, a); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return _mm_or_ps(a, b); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + const BoolV bTest = m128_I2F(_mm_cmpeq_epi32(m128_F2I(a), m128_F2I(b))); + return internalUnitSSE2Simd::BAllTrue4_R(bTest); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==15); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==0); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3Dot(a.col0, b); + const FloatV y = V3Dot(a.col1, b); + const FloatV z = V3Dot(a.col2, b); + return V3Merge(x, y, z); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b) +{ + return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b)); +} + +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a) +{ + const BoolV tfft = BTFFT(); + 
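+ // Adjugate (Cramer) form of the 3x3 inverse: for columns (c0, c1, c2), the
+ // rows of the inverse are (c1 x c2), (c2 x c0) and (c0 x c1), each divided
+ // by det = (c0 x c1) . c2. The unpacks/shuffles below transpose those three
+ // cross products into the output columns; note that _mm_rcp_ps yields only
+ // an approximate (~12-bit) reciprocal of the determinant, traded for speed.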
const BoolV tttf = BTTTF();
+ const FloatV zero = FZero();
+ const Vec3V cross01 = V3Cross(a.col0, a.col1);
+ const Vec3V cross12 = V3Cross(a.col1, a.col2);
+ const Vec3V cross20 = V3Cross(a.col2, a.col0);
+ const FloatV dot = V3Dot(cross01, a.col2);
+ const FloatV invDet = _mm_rcp_ps(dot);
+ const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01);
+ const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01);
+ Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20);
+ colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0));
+ const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2));
+ const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0));
+ const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(tfft, pbwp), _mm_and_ps(tfft, zppd));
+ const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0));
+ const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0));
+ const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd));
+
+ return Mat33V(_mm_mul_ps(colInv0, invDet), _mm_mul_ps(colInv1, invDet), _mm_mul_ps(colInv2, invDet));
+}
+
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+ return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)),
+ V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)),
+ V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2)));
+}
+
+PX_FORCE_INLINE Mat33V M33Identity()
+{
+ return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ());
+}
+
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+ return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+ return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+ return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v)
+{
+ const BoolV bTFFF = BTFFF();
+ const BoolV bFTFF = BFTFF();
+ const BoolV bFFTF = BFFTF();
+
+ const Vec3V zero = V3Zero();
+
+ return Mat33V(V3Sel(bTFFF, v, zero), V3Sel(bFTFF, v, zero), V3Sel(bFFTF, v, zero));
+}
+
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+ const Vec3V x = V3Mul(V3UnitX(), d);
+ const Vec3V y = V3Mul(V3UnitY(), d);
+ const Vec3V z = V3Mul(V3UnitZ(), d);
+ return Mat33V(x, y, z);
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3GetX(b);
+ const FloatV y = V3GetY(b);
+ const FloatV z = V3GetZ(b);
+ const Vec3V v0 = V3Scale(a.col0, x);
+ const Vec3V v1 = V3Scale(a.col1, y);
+ const Vec3V v2 = V3Scale(a.col2, z);
+ const Vec3V v0PlusV1 = V3Add(v0, v1);
+ const Vec3V v0PlusV1Plusv2 = V3Add(v0PlusV1, v2);
+ return V3Add(v0PlusV1Plusv2, a.col3);
+}
+
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3GetX(b);
+ const FloatV y = V3GetY(b);
+ const FloatV z = V3GetZ(b);
+ const Vec3V v0 = V3Scale(a.col0, x);
+ const Vec3V v1 = V3Scale(a.col1, y);
+ const Vec3V v2 = V3Scale(a.col2, z);
+ const Vec3V v0PlusV1 = V3Add(v0, v1);
+ return V3Add(v0PlusV1, v2);
+}
+
+PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b)
+{
+ const FloatV x = V3Dot(a.col0, b);
+ const FloatV y = V3Dot(a.col1, b);
+ const FloatV z = V3Dot(a.col2, b);
+ return V3Merge(x, y, z);
+}
+
+PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b)
+{
+ return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, 
b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + return Mat33V(V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2)), + V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2)), + V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2))); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + const FloatV x = V4GetX(b); + const FloatV y = V4GetY(b); + const FloatV z = V4GetZ(b); + const FloatV w = V4GetW(b); + + const Vec4V v0 = V4Scale(a.col0, x); + const Vec4V v1 = V4Scale(a.col1, y); + const Vec4V v2 = V4Scale(a.col2, z); + const Vec4V v3 = V4Scale(a.col3, w); + const Vec4V v0PlusV1 = V4Add(v0, v1); + const Vec4V v0PlusV1Plusv2 = V4Add(v0PlusV1, v2); + return V4Add(v0PlusV1Plusv2, v3); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + PX_ALIGN(16, FloatV) dotProdArray[4] = { V4Dot(a.col0, b), V4Dot(a.col1, b), V4Dot(a.col2, b), V4Dot(a.col3, b) }; + return V4Merge(dotProdArray); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + const Vec4V v0 = _mm_unpacklo_ps(a.col0, a.col2); + const Vec4V v1 = _mm_unpackhi_ps(a.col0, a.col2); + const Vec4V v2 = _mm_unpacklo_ps(a.col1, a.col3); + const Vec4V v3 = _mm_unpackhi_ps(a.col1, a.col3); + return Mat44V(_mm_unpacklo_ps(v0, v2), _mm_unpackhi_ps(v0, v2), _mm_unpacklo_ps(v1, v3), _mm_unpackhi_ps(v1, v3)); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + __m128 minor0, minor1, minor2, minor3; + __m128 row0, row1, row2, row3; + __m128 det, tmp1; + + tmp1 = V4Zero(); + row1 = V4Zero(); + row3 = V4Zero(); + + row0 = a.col0; + row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2)); + row2 = a.col2; + row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2)); + + tmp1 = _mm_mul_ps(row2, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_mul_ps(row1, tmp1); + minor1 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); + minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); + minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); + + tmp1 = _mm_mul_ps(row1, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); + minor3 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); + minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); + + tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 
0xB1); + row2 = _mm_shuffle_ps(row2, row2, 0x4E); + minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); + minor2 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); + minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); + + tmp1 = _mm_mul_ps(row0, row1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); + + tmp1 = _mm_mul_ps(row0, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); + minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); + + tmp1 = _mm_mul_ps(row0, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); + + det = _mm_mul_ps(row0, minor0); + det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); + det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); + tmp1 = _mm_rcp_ss(det); +#if 0 + det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); + det = _mm_shuffle_ps(det, det, 0x00); +#else + det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0)); +#endif + + minor0 = _mm_mul_ps(det, minor0); + minor1 = _mm_mul_ps(det, minor1); + minor2 = _mm_mul_ps(det, minor2); + minor3 = _mm_mul_ps(det, minor3); + Mat44V invTrans(minor0, minor1, minor2, minor3); + return M44Trnsps(invTrans); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) +{ + return _mm_set_ps(w, z, y, x); +} + +/* +// AP: work in progress - use proper SSE intrinsics where possible +PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b) +{ + VecU16V result; + result.m128_u16[0] = PxU16(PxClamp<PxU32>((a).m128_u32[0], 0, 0xFFFF)); + result.m128_u16[1] = PxU16(PxClamp<PxU32>((a).m128_u32[1], 0, 0xFFFF)); + result.m128_u16[2] = PxU16(PxClamp<PxU32>((a).m128_u32[2], 0, 0xFFFF)); + result.m128_u16[3] = PxU16(PxClamp<PxU32>((a).m128_u32[3], 0, 0xFFFF)); + result.m128_u16[4] = PxU16(PxClamp<PxU32>((b).m128_u32[0], 0, 0xFFFF)); + result.m128_u16[5] = PxU16(PxClamp<PxU32>((b).m128_u32[1], 0, 0xFFFF)); + result.m128_u16[6] = PxU16(PxClamp<PxU32>((b).m128_u32[2], 0, 0xFFFF)); + result.m128_u16[7] = PxU16(PxClamp<PxU32>((b).m128_u32[3], 0, 0xFFFF)); + return result; +} +*/ + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return m128_I2F(_mm_or_si128(_mm_andnot_si128(m128_F2I(c), m128_F2I(b)), _mm_and_si128(m128_F2I(c), m128_F2I(a)))); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return m128_I2F(_mm_or_si128(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return m128_I2F(_mm_xor_si128(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return m128_I2F(_mm_and_si128(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + return 
m128_I2F(_mm_andnot_si128(m128_F2I(b), m128_F2I(a))); +} + +/* +PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_or_si128(m128_F2I(a), m128_F2I(b))); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_and_si128(m128_F2I(a), m128_F2I(b))); +} +*/ + +/* +PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_andnot_si128(m128_F2I(b), m128_F2I(a))); +} +*/ + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return m128_F2I(_mm_load1_ps(reinterpret_cast<const PxF32*>(&i))); +} + +PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) +{ + return m128_F2I(_mm_loadu_ps(reinterpret_cast<const PxF32*>(i))); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + return m128_F2I(_mm_load_ps(reinterpret_cast<const PxF32*>(i))); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_add_epi32(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_sub_epi32(a, b); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return m128_I2F(_mm_cmpgt_epi32(a, b)); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return m128_I2F(_mm_cmpeq_epi32(a, b)); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return _mm_or_si128(_mm_andnot_si128(m128_F2I(c), b), _mm_and_si128(m128_F2I(c), a)); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return _mm_setzero_si128(); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return I4Load(1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return I4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return I4Load(-1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return U4Load(0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return U4Load(1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return U4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + return _mm_or_si128(_mm_andnot_si128(m128_F2I(c), b), _mm_and_si128(m128_F2I(c), a)); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + VecShiftV s; + s.shift = VecI32V_Sel(BTFFF(), shift, VecI32V_Zero()); + return s; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return _mm_sll_epi32(a, count.shift); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return _mm_srl_epi32(a, count.shift); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_and_si128(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_or_si128(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(0, 0, 0, 0))); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(2, 2, 2, 2))); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) +{ + return m128_F2I(_mm_shuffle_ps(m128_I2F(a), m128_I2F(a), _MM_SHUFFLE(3, 3, 3, 3))); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + _mm_store_ss(reinterpret_cast<PxF32*>(i), 
m128_I2F(a)); +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg x, const VecI32VArg y, const VecI32VArg z, const VecI32VArg w) +{ + const __m128 xw = _mm_move_ss(m128_I2F(y), m128_I2F(x)); // y, y, y, x + const __m128 yz = _mm_move_ss(m128_I2F(z), m128_I2F(w)); // z, z, z, w + return m128_F2I(_mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0))); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) +{ + return m128_F2I(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +/* +template<int a> PX_FORCE_INLINE VecI32V V4ISplat() +{ + VecI32V result; + result.m128_i32[0] = a; + result.m128_i32[1] = a; + result.m128_i32[2] = a; + result.m128_i32[3] = a; + return result; +} + +template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat() +{ + VecU32V result; + result.m128_u32[0] = a; + result.m128_u32[1] = a; + result.m128_u32[2] = a; + result.m128_u32[3] = a; + return result; +} +*/ + +/* +PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address) +{ + *address = val; +} +*/ + +PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + *address = val; +} + +PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) +{ + return V4LoadU(reinterpret_cast<float*>(addr)); +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + VecU32V result32(a); + result32 = V4U32Andc(result32, b); + return Vec4V(result32); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return V4IsGrtr(a, b); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + // _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately + // return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b))); + VecU16V result; + result.m128_u16[0] = (a).m128_u16[0] > (b).m128_u16[0]; + result.m128_u16[1] = (a).m128_u16[1] > (b).m128_u16[1]; + result.m128_u16[2] = (a).m128_u16[2] > (b).m128_u16[2]; + result.m128_u16[3] = (a).m128_u16[3] > (b).m128_u16[3]; + result.m128_u16[4] = (a).m128_u16[4] > (b).m128_u16[4]; + result.m128_u16[5] = (a).m128_u16[5] > (b).m128_u16[5]; + result.m128_u16[6] = (a).m128_u16[6] > (b).m128_u16[6]; + result.m128_u16[7] = (a).m128_u16[7] > (b).m128_u16[7]; + return result; +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) +{ + return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b))); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + Vec4V result = V4LoadXYZW(PxF32(a.m128_u32[0]), PxF32(a.m128_u32[1]), PxF32(a.m128_u32[2]), PxF32(a.m128_u32[3])); + return result; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V in) +{ + return _mm_cvtepi32_ps(in); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + return _mm_cvttps_epi32(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + return Vec4V(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + return m128_I2F(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return VecU32V(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return m128_F2I(a); +} + +/* +template<int index> PX_FORCE_INLINE BoolV BSplatElement(BoolV a) +{ + BoolV result; + result[0] = result[1] = result[2] = result[3] = a[index]; + return result; +} +*/ + +template <int index> +BoolV 
BSplatElement(BoolV a) +{ + float* data = reinterpret_cast<float*>(&a); + return V4Load(data[index]); +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + VecU32V result; + result.m128_u32[0] = result.m128_u32[1] = result.m128_u32[2] = result.m128_u32[3] = a.m128_u32[index]; + return result; +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + float* data = reinterpret_cast<float*>(&a); + return V4Load(data[index]); +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + VecU32V result; + result.m128_u32[0] = x; + result.m128_u32[1] = y; + result.m128_u32[2] = z; + result.m128_u32[3] = w; + return result; +} + +PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in) +{ + UnionM128 a(in); + return V4LoadXYZW(PxCeil(a.m128_f32[0]), PxCeil(a.m128_f32[1]), PxCeil(a.m128_f32[2]), PxCeil(a.m128_f32[3])); +} + +PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in) +{ + UnionM128 a(in); + return V4LoadXYZW(PxFloor(a.m128_f32[0]), PxFloor(a.m128_f32[1]), PxFloor(a.m128_f32[2]), PxFloor(a.m128_f32[3])); +} + +PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power) +{ + PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate"); + PX_UNUSED(power); // prevent warning in release builds + PxF32 ffffFFFFasFloat = PxF32(0xFFFF0000); + UnionM128 a(in); + VecU32V result; + result.m128_u32[0] = PxU32(PxClamp<PxF32>((a).m128_f32[0], 0.0f, ffffFFFFasFloat)); + result.m128_u32[1] = PxU32(PxClamp<PxF32>((a).m128_f32[1], 0.0f, ffffFFFFasFloat)); + result.m128_u32[2] = PxU32(PxClamp<PxF32>((a).m128_f32[2], 0.0f, ffffFFFFasFloat)); + result.m128_u32[3] = PxU32(PxClamp<PxF32>((a).m128_f32[3], 0.0f, ffffFFFFasFloat)); + return result; +} + +#endif // PSFOUNDATION_PSUNIXSSE2INLINEAOS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsAoS.h b/PxShared/src/foundation/include/windows/PsWindowsAoS.h new file mode 100644 index 00000000..dd4288d5 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsAoS.h @@ -0,0 +1,131 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSAOS_H +#define PSFOUNDATION_PSWINDOWSAOS_H + +// no includes here! this file should be included from PxcVecMath.h only!!! + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +typedef __m128 FloatV; +typedef __m128 Vec3V; +typedef __m128 Vec4V; +typedef __m128 BoolV; +typedef __m128 VecU32V; +typedef __m128 VecI32V; +typedef __m128 VecU16V; +typedef __m128 VecI16V; +typedef __m128 QuatV; + +#define FloatVArg FloatV & +#define Vec3VArg Vec3V & +#define Vec4VArg Vec4V & +#define BoolVArg BoolV & +#define VecU32VArg VecU32V & +#define VecI32VArg VecI32V & +#define VecU16VArg VecU16V & +#define VecI16VArg VecI16V & +#define QuatVArg QuatV & + +// Optimization for situations in which you cross product multiple vectors with the same vector. +// Avoids 2X shuffles per product +struct VecCrossV +{ + Vec3V mL1; + Vec3V mR1; +}; + +struct VecShiftV +{ + VecI32V shift; +}; +#define VecShiftVArg VecShiftV & + +PX_ALIGN_PREFIX(16) +struct Mat33V +{ + Mat33V() + { + } + Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat34V +{ + Mat34V() + { + } + Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec3V PX_ALIGN(16, col0); + Vec3V PX_ALIGN(16, col1); + Vec3V PX_ALIGN(16, col2); + Vec3V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat43V +{ + Mat43V() + { + } + Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); +} PX_ALIGN_SUFFIX(16); + +PX_ALIGN_PREFIX(16) +struct Mat44V +{ + Mat44V() + { + } + Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) + { + } + Vec4V PX_ALIGN(16, col0); + Vec4V PX_ALIGN(16, col1); + Vec4V PX_ALIGN(16, col2); + Vec4V PX_ALIGN(16, col3); +} PX_ALIGN_SUFFIX(16); + +#endif // PSFOUNDATION_PSWINDOWSAOS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsFPU.h b/PxShared/src/foundation/include/windows/PsWindowsFPU.h new file mode 100644 index 00000000..28694885 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsFPU.h @@ -0,0 +1,51 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". 
NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSFPU_H +#define PSFOUNDATION_PSWINDOWSFPU_H + +PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard() +{ +#if !PX_ARM + mControlWord = _mm_getcsr(); + // set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6)) + _mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6)); +#endif +} + +PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard() +{ +#if !PX_ARM + // restore control word and clear any exception flags + // (setting exception state flags cause exceptions on the first following fp operation) + _mm_setcsr(mControlWord & ~_MM_EXCEPT_MASK); +#endif +} + +#endif // #ifndef PSFOUNDATION_PSWINDOWSFPU_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsInclude.h b/PxShared/src/foundation/include/windows/PsWindowsInclude.h new file mode 100644 index 00000000..4b4fd2f9 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsInclude.h @@ -0,0 +1,96 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
+#define PSFOUNDATION_PSWINDOWSINCLUDE_H
+
+#include "Ps.h"
+
+#ifndef _WIN32
+#error "This file should only be included by Windows builds!!"
+#endif
+
+#ifdef _WINDOWS_ // windows already included
+#error "Only include windows.h through this file!!"
+#endif
+
+// We only support >= Windows XP, and we need this for critical sections and the Interlocked* functions.
+#define _WIN32_WINNT 0x0501
+
+// Turn off as much as we can for windows. All we really need are the thread functions (critical sections/Interlocked*,
+// etc.)
+#define NOGDICAPMASKS
+#define NOVIRTUALKEYCODES
+#define NOWINMESSAGES
+#define NOWINSTYLES
+#define NOSYSMETRICS
+#define NOMENUS
+#define NOICONS
+#define NOKEYSTATES
+#define NOSYSCOMMANDS
+#define NORASTEROPS
+#define NOSHOWWINDOW
+#define NOATOM
+#define NOCLIPBOARD
+#define NOCOLOR
+#define NOCTLMGR
+#define NODRAWTEXT
+#define NOGDI
+#define NOMB
+#define NOMEMMGR
+#define NOMETAFILE
+#define NOMINMAX
+#define NOOPENFILE
+#define NOSCROLL
+#define NOSERVICE
+#define NOSOUND
+#define NOTEXTMETRIC
+#define NOWH
+#define NOWINOFFSETS
+#define NOCOMM
+#define NOKANJI
+#define NOHELP
+#define NOPROFILER
+#define NODEFERWINDOWPOS
+#define NOMCX
+#define WIN32_LEAN_AND_MEAN
+#define NOUSER
+#define NONLS
+#define NOMSG
+
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#include <windows.h>
+#pragma warning(pop)
+
+#if PX_SSE2
+#include <xmmintrin.h>
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h b/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h
new file mode 100644
index 00000000..5bfb62f7
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h
@@ -0,0 +1,3119 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSINLINEAOS_H +#define PSFOUNDATION_PSWINDOWSINLINEAOS_H + +#if !COMPILE_VECTOR_INTRINSICS +#error Vector intrinsics should not be included when using scalar implementation. +#endif + +// Remove this define when all platforms use simd solver. +#define PX_SUPPORT_SIMD + +#include "../PsVecMathSSE.h" + +////////////////////////////////////////////////////////////////////// +//Test that Vec3V and FloatV are legal +////////////////////////////////////////////////////////////////////// + +#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f +PX_FORCE_INLINE bool isValidFloatV(const FloatV a) +{ + const PxF32 x = V4ReadX(a); + const PxF32 y = V4ReadY(a); + const PxF32 z = V4ReadZ(a); + const PxF32 w = V4ReadW(a); + + if ( + (PxAbs(x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - z) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs(x - w) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + + if ( + (PxAbs((x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - z) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) && + (PxAbs((x - w) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) + ) + { + return true; + } + return false; +} + +PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) +{ + //using _mm_comieq_ss to do the comparison doesn't work for NaN. + PX_ALIGN(16, PxF32 f[4]); + V4StoreA((const Vec4V&)a, f); + return f[3] == 0.0f; +} + +PX_FORCE_INLINE bool isFiniteLength(const Vec3V a) +{ + return !FAllEq(V4LengthSq(a), FZero()); +} + +PX_FORCE_INLINE bool isAligned16(void* a) +{ + return(0 == ((size_t)a & 0x0f)); +} + +//ASSERT_FINITELENGTH is deactivated because there is a lot of code that calls a simd normalisation function with zero length but then ignores the result. 
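+
+// A minimal usage sketch of the validity helpers above, assuming the load and
+// length functions defined later in this file; these checks are what the
+// ASSERT_* macros below reduce to in PX_DEBUG builds:
+//
+//     const Vec3V v = V3LoadU(PxVec3(1.0f, 2.0f, 3.0f)); // w lane cleared to zero
+//     PX_ASSERT(isValidVec3V(v));                        // passes: w == 0.0f
+//     const FloatV len = V3Length(v);                    // length splatted to all four lanes
+//     PX_ASSERT(isValidFloatV(len));                     // passes: all lanes (nearly) equal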
+ +#if PX_DEBUG +#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a)) +#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a)) +#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16((void*)a)) +#define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a)) +#else +#define ASSERT_ISVALIDVEC3V(a) +#define ASSERT_ISVALIDFLOATV(a) +#define ASSERT_ISALIGNED16(a) +#define ASSERT_ISFINITELENGTH(a) +#endif +///////////////////////////////////////////////////////////////////// +////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////// +// USED ONLY INTERNALLY +////////////////////////////////////////////////////////////////////// + +namespace internalWindowsSimd +{ +PX_FORCE_INLINE __m128 m128_I2F(__m128i n) +{ + return _mm_castsi128_ps(n); +} + +PX_FORCE_INLINE __m128i m128_F2I(__m128 n) +{ + return _mm_castps_si128(n); +} + +PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask == 0xf); +} + +PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32((moveMask & 0x7) == 0x7); +} + +PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(moveMask != 0x0); +} + +PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a) +{ + const PxI32 moveMask = _mm_movemask_ps(a); + return PxU32(((moveMask & 0x7) != 0x0)); +} + +PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b) +{ + // This is a bit of a bodge. + //_mm_comieq_ss returns 1 if either value is nan so we need to re-cast a and b with true encoded as a non-nan + // number. + // There must be a better way of doing this in sse. + const BoolV one = FOne(); + const BoolV zero = FZero(); + const BoolV a1 = V4Sel(a, one, zero); + const BoolV b1 = V4Sel(b, one, zero); + return (PxU32( + _mm_comieq_ss(a1, b1) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(1, 1, 1, 1))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(2, 2, 2, 2))) && + _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 3, 3, 3)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(3, 3, 3, 3))))); +} + +PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) ? 
true : false; +} + +PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero())); +} + +PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) +{ + return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero()) || + _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), FZero())); +} + +const PX_ALIGN(16, PxU32 gMaskXYZ[4]) = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; +} //internalWindowsSimd + +namespace _VecMathTests +{ +// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V' +PX_FORCE_INLINE Vec3V getInvalidVec3V() +{ + const float f = 1.0f; + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_comieq_ss(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) +{ + return V3AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) +{ + return V4AllEq(a, b) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) +{ + return internalWindowsSimd::BAllTrue4_R(VecI32V_IsEq(a, b)) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsEqU32(a, b)) != 0; +} + +PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) +{ + BoolV c = internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); + return internalWindowsSimd::BAllTrue4_R(c) != 0; +} + +#define VECMATH_AOS_EPSILON (1e-3f) +static const FloatV minFError = FLoad(-VECMATH_AOS_EPSILON); +static const FloatV maxFError = FLoad(VECMATH_AOS_EPSILON); +static const Vec3V minV3Error = V3Load(-VECMATH_AOS_EPSILON); +static const Vec3V maxV3Error = V3Load(VECMATH_AOS_EPSILON); +static const Vec4V minV4Error = V4Load(-VECMATH_AOS_EPSILON); +static const Vec4V maxV4Error = V4Load(VECMATH_AOS_EPSILON); + +PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + const FloatV c = FSub(a, b); + return _mm_comigt_ss(c, minFError) && _mm_comilt_ss(c, maxFError); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b) +{ + const Vec3V c = V3Sub(a, b); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minV3Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxV3Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), minV3Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxV3Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minV3Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxV3Error)); +} + +PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b) +{ + const Vec4V c = V4Sub(a, b); + return (_mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), maxV4Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, 
_MM_SHUFFLE(1, 1, 1, 1)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), maxV4Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), maxV4Error) && + _mm_comigt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), minV4Error) && + _mm_comilt_ss(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), maxV4Error)); +} +} //_VecMathTests + +PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) +{ + PxF32 f; + FStore(a, &f); + return PxIsFinite(f); + /* + const PxU32 badNumber = (_FPCLASS_SNAN | _FPCLASS_QNAN | _FPCLASS_NINF | _FPCLASS_PINF); + const FloatV vBadNum = FloatV_From_F32((PxF32&)badNumber); + const BoolV vMask = BAnd(vBadNum, a); + return FiniteTestEq(vMask, BFFFF()) == 1; + */ +} + +PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) +{ + PX_ALIGN(16, PxF32 f[4]); + V4StoreA((Vec4V&)a, f); + return PxIsFinite(f[0]) && PxIsFinite(f[1]) && PxIsFinite(f[2]); + + /* + const PxU32 badNumber = (_FPCLASS_SNAN | _FPCLASS_QNAN | _FPCLASS_NINF | _FPCLASS_PINF); + const Vec3V vBadNum = Vec3V_From_F32((PxF32&)badNumber); + const BoolV vMask = BAnd(BAnd(vBadNum, a), BTTTF()); + return FiniteTestEq(vMask, BFFFF()) == 1; + */ +} + +PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a) +{ + PX_ALIGN(16, PxF32 f[4]); + V4StoreA(a, f); + return PxIsFinite(f[0]) && PxIsFinite(f[1]) && PxIsFinite(f[2]) && PxIsFinite(f[3]); + + /* + const PxU32 badNumber = (_FPCLASS_SNAN | _FPCLASS_QNAN | _FPCLASS_NINF | _FPCLASS_PINF); + const Vec4V vBadNum = Vec4V_From_U32((PxF32&)badNumber); + const BoolV vMask = BAnd(vBadNum, a); + + return FiniteTestEq(vMask, BFFFF()) == 1; + */ +} + +///////////////////////////////////////////////////////////////////// +////VECTORISED FUNCTION IMPLEMENTATIONS +///////////////////////////////////////////////////////////////////// + +PX_FORCE_INLINE FloatV FLoad(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) +{ + return _mm_set_ps(0.0f, f, f, f); +} + +PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) +{ + return _mm_load1_ps(&f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool f) +{ + const PxU32 i = PxU32(-(PxI32)f); + return _mm_load1_ps((float*)&i); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + return _mm_and_ps(_mm_load_ps(&f.x), reinterpret_cast<const Vec4V&>(internalWindowsSimd::gMaskXYZ)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +// w component of result is undefined +PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + return _mm_load_ps(&f.x); +} + +PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return V4ClearW(_mm_load_ps(f)); +} + +PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i) +{ + return _mm_set_ps(0.0f, i[2], i[1], i[0]); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) +{ + return V4ClearW(v); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) +{ + return v; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) +{ + return f; // ok if it is implemented as the same type. 
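+	// (FloatV, Vec3V and Vec4V are all typedefs of __m128 on this platform,
+	// see PsWindowsAoS.h above, so these conversions compile away; only the
+	// convention for the unused w lane differs.)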
+} + +PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) +{ + return f; +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) +{ + return Vec3V_From_Vec4V(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) +{ + return Vec3V_From_Vec4V_WUndefined(Vec4V_From_FloatV(f)); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) +{ + return _mm_set_ps(0.0f, f.z, f.y, f.x); +} + +PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) +{ + ASSERT_ISALIGNED16(f); + return _mm_load_ps(f); +} + +PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps(f, a); +} + +PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) +{ + _mm_storeu_ps(f, a); +} + +PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) +{ + ASSERT_ISALIGNED16(f); + _mm_store_ps((PxF32*)f, a); +} + +PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) +{ + ASSERT_ISALIGNED16(u); + _mm_store_ps((PxF32*)u, uv); +} + +PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) +{ + ASSERT_ISALIGNED16(i); + _mm_store_ps((PxF32*)i, iv); +} + +PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) +{ + return _mm_loadu_ps(f); +} + +PX_FORCE_INLINE BoolV BLoad(const bool* const f) +{ + const PX_ALIGN(16, PxU32 b[4]) = { PxU32(-(PxI32)f[0]), PxU32(-(PxI32)f[1]), + PxU32(-(PxI32)f[2]), PxU32(-(PxI32)f[3]) }; + return _mm_load_ps((float*)&b); +} + +PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) +{ + ASSERT_ISVALIDFLOATV(a); + _mm_store_ss(f, a); +} + +PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) +{ + ASSERT_ISALIGNED16(&f); + PX_ALIGN(16, PxF32 f2[4]); + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) +{ + _mm_store_ss((PxF32*)b2, b); +} + +PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) +{ + PX_ALIGN(16, PxF32 f2[4]); + _mm_store_ps(f2, a); + f = PxVec3(f2[0], f2[1], f2[2]); +} + +PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) +{ + return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2)); +} + +PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) +{ + ASSERT_ISALIGNED16(&out); + V3StoreU(m.col0, out.column0); + V3StoreU(m.col1, out.column1); + V3StoreU(m.col2, out.column2); +} + +////////////////////////////////// +// FLOATV +////////////////////////////////// + +PX_FORCE_INLINE FloatV FZero() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE FloatV FOne() +{ + return FLoad(1.0f); +} + +PX_FORCE_INLINE FloatV FHalf() +{ + return FLoad(0.5f); +} + +PX_FORCE_INLINE FloatV FEps() +{ + return FLoad(PX_EPS_REAL); +} + +PX_FORCE_INLINE FloatV FEps6() +{ + return FLoad(1e-6f); +} + +PX_FORCE_INLINE FloatV FMax() +{ + return FLoad(PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV FNegMax() +{ + return FLoad(-PX_MAX_REAL); +} + +PX_FORCE_INLINE FloatV IZero() +{ + const PxU32 zero = 0; + return _mm_load1_ps((PxF32*)&zero); +} + +PX_FORCE_INLINE FloatV IOne() +{ + const PxU32 one = 1; + return _mm_load1_ps((PxF32*)&one); +} + +PX_FORCE_INLINE FloatV ITwo() +{ + const PxU32 two = 2; + return _mm_load1_ps((PxF32*)&two); +} + +PX_FORCE_INLINE FloatV IThree() +{ + const PxU32 three = 3; + return _mm_load1_ps((PxF32*)&three); +} + +PX_FORCE_INLINE FloatV IFour() +{ + const PxU32 four = 4; + return _mm_load1_ps((PxF32*)&four); +} + +PX_FORCE_INLINE FloatV FNeg(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE FloatV FAdd(const FloatV a, const 
FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE FloatV FRecip(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), a); +} + +PX_FORCE_INLINE FloatV FRecipFast(const FloatV a) +{ + return _mm_rcp_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_div_ps(FOne(), _mm_sqrt_ps(a)); +} + +PX_FORCE_INLINE FloatV FSqrt(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_sqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + return _mm_rsqrt_ps(a); +} + +PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FAdd(FMul(a, b), c); +} + +PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDFLOATV(c); + return FSub(c, FMul(a, b)); +} + +PX_FORCE_INLINE FloatV FAbs(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + PX_ALIGN(16, const static PxU32 absMask[4]) = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF }; + return _mm_and_ps(a, _mm_load_ps((PxF32*)absMask)); +} + +PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b) +{ + PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c, BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c, BFFFF())); + ASSERT_ISVALIDFLOATV(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV) +{ + ASSERT_ISVALIDFLOATV(minV); + ASSERT_ISVALIDFLOATV(maxV); + return _mm_max_ps(_mm_min_ps(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return PxU32(_mm_comigt_ss(a, b)); +} + +PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return PxU32(_mm_comige_ss(a, b)); +} + +PX_FORCE_INLINE PxU32 
FAllEq(const FloatV a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(b); + return PxU32(_mm_comieq_ss(a, b)); +} + +PX_FORCE_INLINE FloatV FRound(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + // return _mm_round_ps(a, 0x0); + const FloatV half = FLoad(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const FloatV aRound = FSub(FAdd(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +} + +PX_FORCE_INLINE FloatV FSin(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V3 = FMul(V2, V1); + const FloatV V5 = FMul(V3, V2); + const FloatV V7 = FMul(V5, V2); + const FloatV V9 = FMul(V7, V2); + const FloatV V11 = FMul(V9, V2); + const FloatV V13 = FMul(V11, V2); + const FloatV V15 = FMul(V13, V2); + const FloatV V17 = FMul(V15, V2); + const FloatV V19 = FMul(V17, V2); + const FloatV V21 = FMul(V19, V2); + const FloatV V23 = FMul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + FloatV Result; + Result = FScaleAdd(S1, V3, V1); + Result = FScaleAdd(S2, V5, Result); + Result = FScaleAdd(S3, V7, Result); + Result = FScaleAdd(S4, V9, Result); + Result = FScaleAdd(S5, V11, Result); + Result = FScaleAdd(S6, V13, Result); + Result = FScaleAdd(S7, V15, Result); + Result = FScaleAdd(S8, V17, Result); + Result = FScaleAdd(S9, V19, Result); + Result = FScaleAdd(S10, V21, Result); + Result = FScaleAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE FloatV FCos(const FloatV a) +{ + ASSERT_ISVALIDFLOATV(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const FloatV tmp = FMul(a, recipTwoPi); + const FloatV b = FRound(tmp); + const FloatV V1 = FNegScaleSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const FloatV V2 = FMul(V1, V1); + const FloatV V4 = FMul(V2, V2); + const FloatV V6 = FMul(V4, V2); + const FloatV V8 = FMul(V4, V4); + const FloatV V10 = FMul(V6, V4); + const FloatV V12 = FMul(V6, V6); + const FloatV V14 = FMul(V8, V6); + const FloatV V16 = FMul(V8, V8); + const FloatV V18 = FMul(V10, V8); + const FloatV V20 = FMul(V10, V10); + const FloatV V22 = FMul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + FloatV Result; + Result = FScaleAdd(C1, V2, V4One()); + Result = FScaleAdd(C2, V4, Result); + Result = FScaleAdd(C3, V6, Result); + Result = FScaleAdd(C4, V8, Result); + Result = FScaleAdd(C5, V10, Result); + Result = FScaleAdd(C6, V12, Result); + Result = FScaleAdd(C7, V14, Result); + Result = FScaleAdd(C8, V16, Result); + Result = FScaleAdd(C9, V18, Result); + Result = FScaleAdd(C10, V20, Result); + Result = FScaleAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + const BoolV c = BOr(FIsGrtr(a, max), FIsGrtr(min, a)); + return PxU32(!BAllEqFFFF(c)); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(min); + ASSERT_ISVALIDFLOATV(max); + const BoolV c = BAnd(FIsGrtrOrEq(a, min), FIsGrtrOrEq(max, a)); + return BAllEqTTTT(c); +} + +PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FOutOfBounds(a, FNeg(bounds), bounds); +} + +PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds) +{ + ASSERT_ISVALIDFLOATV(a); + ASSERT_ISVALIDFLOATV(bounds); + return FInBounds(a, FNeg(bounds), bounds); +} + +////////////////////////////////// +// VEC3V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V V3Splat(const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + const __m128 zero = V3Zero(); + const __m128 fff0 = _mm_move_ss(f, zero); + return _mm_shuffle_ps(fff0, fff0, _MM_SHUFFLE(0, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z) +{ + ASSERT_ISVALIDFLOATV(x); + ASSERT_ISVALIDFLOATV(y); + ASSERT_ISVALIDFLOATV(z); + // static on zero causes compiler crash on x64 debug_opt + const __m128 zero = V3Zero(); + const __m128 xy = _mm_move_ss(x, y); + const __m128 z0 = _mm_move_ss(zero, z); + + return _mm_shuffle_ps(xy, z0, _MM_SHUFFLE(1, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3UnitX() +{ + const PX_ALIGN(16, PxF32 x[4]) = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec3V V3UnitY() +{ + const PX_ALIGN(16, PxF32 y[4]) = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec3V V3UnitZ() 
+{ + const PX_ALIGN(16, PxF32 z[4]) = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V3GetX(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V3GetY(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f) +{ + ASSERT_ISVALIDVEC3V(v); + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0)); + return V3SetY(r, V3GetX(b)); +} + +PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1)); + return V3SetY(r, V3GetY(b)); +} + +PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2)); + return V3SetY(r, V3GetZ(b)); +} + +PX_FORCE_INLINE Vec3V V3Zero() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE Vec3V V3One() +{ + return V3Load(1.0f); +} + +PX_FORCE_INLINE Vec3V V3Eps() +{ + return V3Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f) +{ + ASSERT_ISVALIDVEC3V(f); + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_div_ps(a, b)); +} + +PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return V4ClearW(_mm_mul_ps(a, _mm_rcp_ps(b))); +} + +PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V 
V3RecipFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rcp_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_div_ps(V3One(), _mm_sqrt_ps(a)); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 tttf = BTTTF(); + const __m128 recipA = _mm_rsqrt_ps(a); + return V4Sel(tttf, recipA, zero); +} + +PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Scale(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDFLOATV(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Scale(a, b)); +} + +PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Add(V3Mul(a, b), c); +} + +PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + ASSERT_ISVALIDVEC3V(c); + return V3Sub(c, V3Mul(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Max(a, V3Neg(a)); +} + +PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + + const __m128 t0 = _mm_mul_ps(a, b); // aw*bw | az*bz | ay*by | ax*bx + const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); // ay*by | ax*bx | aw*bw | az*bz + const __m128 t2 = _mm_add_ps(t0, t1); // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx + const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by + return _mm_add_ps(t3, t2); // ax*bx + az*bz + ay*by + aw*bw + // ay*by + aw*bw + ax*bx + az*bz + // az*bz + ax*bx + aw*bw + ay*by + // aw*bw + ay*by + az*bz + ax*bx +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2)); +} + +PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + VecCrossV v; + v.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + v.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + return v; +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(b); + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(a.mL1, l2), _mm_mul_ps(a.mR1, r2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 r2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(a, a, 
_MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(b.mR1, r2), _mm_mul_ps(b.mL1, l2)); +} + +PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b) +{ + return _mm_sub_ps(_mm_mul_ps(a.mL1, b.mR1), _mm_mul_ps(a.mR1, b.mL1)); +} + +PX_FORCE_INLINE FloatV V3Length(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_sqrt_ps(V3Dot(a, a)); +} + +PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return V3Dot(a, a); +} + +PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3ScaleInv(a, _mm_sqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISFINITELENGTH(a); + return V3Scale(a, _mm_rsqrt_ps(V3Dot(a, a))); +} + +PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 eps = FEps(); + const __m128 length = V3Length(a); + const __m128 isGreaterThanZero = FIsGrtr(length, eps); + return V3Sel(isGreaterThanZero, V3ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a))); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + return _mm_max_ps(_mm_max_ps(shuf1, shuf2), shuf3); +} + +PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); + return _mm_min_ps(_mm_min_ps(shuf1, shuf2), shuf3); +} + +//// if(a > 0.0f) return 1.0f; else if a == 0.f return 0.f, else return -1.f; +// PX_FORCE_INLINE Vec3V V3MathSign(const Vec3V a) +//{ +// VECMATHAOS_ASSERT(isValidVec3V(a)); +// +// const __m128i ai = _mm_cvtps_epi32(a); +// const __m128i bi = _mm_cvtps_epi32(V3Neg(a)); +// const __m128 aa = _mm_cvtepi32_ps(_mm_srai_epi32(ai, 31)); +// const __m128 bb = _mm_cvtepi32_ps(_mm_srai_epi32(bi, 31)); +// return _mm_or_ps(aa, bb); +//} + +// return (a >= 0.0f) ? 
1.0f : -1.0f; +PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + const __m128 zero = V3Zero(); + const __m128 one = V3One(); + const __m128 none = V3Neg(one); + return V3Sel(V3IsGrtrOrEq(a, zero), one, none); +} + +PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(minV); + ASSERT_ISVALIDVEC3V(maxV); + return V3Max(V3Min(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalWindowsSimd::BAllTrue3_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalWindowsSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b) +{ + ASSERT_ISVALIDVEC3V(a); + ASSERT_ISVALIDVEC3V(b); + return internalWindowsSimd::BAllTrue3_R(V4IsEq(a, b)); +} + +PX_FORCE_INLINE Vec3V V3Round(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // return _mm_round_ps(a, 0x0); + const Vec3V half = V3Load(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const Vec3V aRound = V3Sub(V3Add(a, half), signBit); + __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +} + +PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V3 = V3Mul(V2, V1); + const Vec3V V5 = V3Mul(V3, V2); + const Vec3V V7 = V3Mul(V5, V2); + const Vec3V V9 = V3Mul(V7, V2); + const Vec3V V11 = V3Mul(V9, V2); + const Vec3V V13 = V3Mul(V11, V2); + const Vec3V V15 = V3Mul(V13, V2); + const Vec3V V17 = V3Mul(V15, V2); + const Vec3V V19 = V3Mul(V17, V2); + const Vec3V V21 = V3Mul(V19, V2); + const Vec3V V23 = V3Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V3, S1, V1); + Result = V3ScaleAdd(V5, S2, Result); + Result = V3ScaleAdd(V7, S3, Result); + Result = V3ScaleAdd(V9, S4, Result); + Result = V3ScaleAdd(V11, S5, Result); + Result = V3ScaleAdd(V13, S6, Result); + Result = V3ScaleAdd(V15, S7, Result); + Result = V3ScaleAdd(V17, S8, Result); + Result = V3ScaleAdd(V19, S9, Result); + Result = V3ScaleAdd(V21, S10, Result); + Result = V3ScaleAdd(V23, S11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + + // Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec3V tmp = V3Scale(a, recipTwoPi); + const Vec3V b = V3Round(tmp); + const Vec3V V1 = V3NegScaleSub(b, twoPi, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec3V V2 = V3Mul(V1, V1); + const Vec3V V4 = V3Mul(V2, V2); + const Vec3V V6 = V3Mul(V4, V2); + const Vec3V V8 = V3Mul(V4, V4); + const Vec3V V10 = V3Mul(V6, V4); + const Vec3V V12 = V3Mul(V6, V6); + const Vec3V V14 = V3Mul(V8, V6); + const Vec3V V16 = V3Mul(V8, V8); + const Vec3V V18 = V3Mul(V10, V8); + const Vec3V V20 = V3Mul(V10, V10); + const Vec3V V22 = V3Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec3V Result; + Result = V3ScaleAdd(V2, C1, V3One()); + Result = V3ScaleAdd(V4, C2, Result); + Result = V3ScaleAdd(V6, C3, Result); + Result = V3ScaleAdd(V8, C4, Result); + Result = V3ScaleAdd(V10, C5, Result); + Result = V3ScaleAdd(V12, C6, Result); + Result = V3ScaleAdd(V14, C7, Result); + Result = V3ScaleAdd(V16, C8, Result); + Result = V3ScaleAdd(V18, C9, Result); + Result = V3ScaleAdd(V20, C10, Result); + Result = V3ScaleAdd(V22, C11, Result); + + ASSERT_ISVALIDVEC3V(Result); + return Result; +} + +PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0)); +} + +PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2)); +} + +PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a) +{ + ASSERT_ISVALIDVEC3V(a); + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1)); +} + +PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3)); +} + +PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + return _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2)); +} + +PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1) +{ + ASSERT_ISVALIDVEC3V(v0); + ASSERT_ISVALIDVEC3V(v1); + // There must be a better way to do this. 
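+	// The result is (v1.y, v0.x, 0, 0): two scalar extracts and two lane
+	// selects rather than a single cross-register shuffle.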
+	Vec3V v2 = V3Zero();
+	FloatV y1 = V3GetY(v1);
+	FloatV x0 = V3GetX(v0);
+	v2 = V3SetX(v2, y1);
+	return V3SetY(v2, x0);
+}
+
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+	const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+	const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+	return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV c = BOr(V3IsGrtr(a, max), V3IsGrtr(min, a));
+	return PxU32(!BAllEqFFFF(c));
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a));
+	return BAllEqTTTT(c);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+	return V3OutOfBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+	return V3InBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+	ASSERT_ISVALIDVEC3V(col0);
+	ASSERT_ISVALIDVEC3V(col1);
+	ASSERT_ISVALIDVEC3V(col2);
+	const Vec3V col3 = _mm_setzero_ps();
+	Vec3V tmp0 = _mm_unpacklo_ps(col0, col1);
+	Vec3V tmp2 = _mm_unpacklo_ps(col2, col3);
+	Vec3V tmp1 = _mm_unpackhi_ps(col0, col1);
+	Vec3V tmp3 = _mm_unpackhi_ps(col2, col3);
+	col0 = _mm_movelh_ps(tmp0, tmp2);
+	col1 = _mm_movehl_ps(tmp2, tmp0);
+	col2 = _mm_movelh_ps(tmp1, tmp3);
+}
+
+//////////////////////////////////
+// VEC4V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0));
+	return f;
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+	ASSERT_ISVALIDFLOATV(floatVArray[0]);
+	ASSERT_ISVALIDFLOATV(floatVArray[1]);
+	ASSERT_ISVALIDFLOATV(floatVArray[2]);
+	ASSERT_ISVALIDFLOATV(floatVArray[3]);
+	const __m128 xw = _mm_move_ss(floatVArray[1], floatVArray[0]); // y, y, y, x
+	const __m128 yz = _mm_move_ss(floatVArray[2], floatVArray[3]); // z, z, z, w
+	return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+	ASSERT_ISVALIDFLOATV(w);
+	const __m128 xw = _mm_move_ss(y, x); // y, y, y, x
+	const __m128 yz = _mm_move_ss(z, w); // z, z, z, w
+	return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz = _mm_unpackhi_ps(x, z);
+	const Vec4V yw = _mm_unpackhi_ps(y, w);
+	return _mm_unpackhi_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz = _mm_unpackhi_ps(x, z);
+	const Vec4V yw = _mm_unpackhi_ps(y, w);
+	return _mm_unpacklo_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz =
_mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpackhi_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w) +{ + const Vec4V xz = _mm_unpacklo_ps(x, z); + const Vec4V yw = _mm_unpacklo_ps(y, w); + return _mm_unpacklo_ps(xz, yw); +} + +PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpacklo_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b) +{ + return _mm_unpackhi_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0)); +} + +PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1)); +} + +PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); +} + +template <PxU8 x, PxU8 y, PxU8 z, PxU8 w> +PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x)); +} + +PX_FORCE_INLINE Vec4V V4UnitW() +{ + const PX_ALIGN(16, PxF32 w[4]) = { 0.0f, 0.0f, 0.0f, 1.0f }; + const __m128 w128 = _mm_load_ps(w); + return w128; +} + +PX_FORCE_INLINE Vec4V V4UnitX() +{ + const PX_ALIGN(16, PxF32 x[4]) = { 1.0f, 0.0f, 0.0f, 0.0f }; + const __m128 x128 = _mm_load_ps(x); + return x128; +} + +PX_FORCE_INLINE Vec4V V4UnitY() +{ + const PX_ALIGN(16, PxF32 y[4]) = { 0.0f, 1.0f, 0.0f, 0.0f }; + const __m128 y128 = _mm_load_ps(y); + return y128; +} + +PX_FORCE_INLINE Vec4V V4UnitZ() +{ + const PX_ALIGN(16, PxF32 z[4]) = { 0.0f, 0.0f, 1.0f, 0.0f }; + const __m128 z128 = _mm_load_ps(z); + return z128; +} + +PX_FORCE_INLINE FloatV V4GetW(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE FloatV V4GetX(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE FloatV V4GetY(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTTF(), v, f); +} + +PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v) +{ + return _mm_and_ps(v, (VecI32V&)internalWindowsSimd::gMaskXYZ); +} + +PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BFTTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f) +{ + ASSERT_ISVALIDFLOATV(f); + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE Vec4V V4Zero() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE Vec4V V4One() +{ + return V4Load(1.0f); +} + +PX_FORCE_INLINE Vec4V V4Eps() +{ + return V4Load(PX_EPS_REAL); +} + +PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f) +{ + return _mm_sub_ps(_mm_setzero_ps(), f); +} + +PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b) +{ + return _mm_add_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b) +{ + return _mm_sub_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b) +{ + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b) +{ + return _mm_mul_ps(a, b); +} + +PX_FORCE_INLINE Vec4V 
V4ScaleInv(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b) +{ + return _mm_div_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b) +{ + ASSERT_ISVALIDFLOATV(b); + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b) +{ + return _mm_mul_ps(a, _mm_rcp_ps(b)); +} + +PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a) +{ + return _mm_div_ps(V4One(), a); +} + +PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a) +{ + return _mm_rcp_ps(a); +} + +PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a) +{ + return _mm_div_ps(V4One(), _mm_sqrt_ps(a)); +} + +PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a) +{ + return _mm_rsqrt_ps(a); +} + +PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a) +{ + return _mm_sqrt_ps(a); +} + +PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return V4Add(V4Scale(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c) +{ + ASSERT_ISVALIDFLOATV(b); + return V4Sub(c, V4Scale(a, b)); +} + +PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Add(V4Mul(a, b), c); +} + +PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c) +{ + return V4Sub(c, V4Mul(a, b)); +} + +PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a) +{ + return V4Max(a, V4Neg(a)); +} + +PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a) +{ + const Vec4V xy = V4UnpackXY(a, a); // x,x,y,y + const Vec4V zw = V4UnpackZW(a, a); // z,z,w,w + const Vec4V xz_yw = V4Add(xy, zw); // x+z,x+z,y+w,y+w + const FloatV xz = V4GetX(xz_yw); // x+z + const FloatV yw = V4GetZ(xz_yw); // y+w + return FAdd(xz, yw); // sum +} + +PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b) +{ + const __m128 dot1 = _mm_mul_ps(a, b); // x,y,z,w + const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z + const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y + const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x + return _mm_add_ps(_mm_add_ps(shuf2, shuf3), _mm_add_ps(dot1, shuf1)); + + // PT: this version has two less instructions but we should check its accuracy + // aw*bw | az*bz | ay*by | ax*bx + // const __m128 t0 = _mm_mul_ps(a, b); + // ay*by | ax*bx | aw*bw | az*bz + // const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2)); + // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx + // const __m128 t2 = _mm_add_ps(t0, t1); + // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by + // const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)); + // ax*bx + az*bz + ay*by + aw*bw + // return _mm_add_ps(t3, t2); + // ay*by + aw*bw + ax*bx + az*bz + // az*bz + ax*bx + aw*bw + ay*by + // aw*bw + ay*by + az*bz + ax*bx +} + +PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b) +{ + const __m128 dot1 = _mm_mul_ps(a, b); // aw*bw | az*bz | ay*by | ax*bx + const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 0, 0, 0)); // ax*bx | ax*bx | ax*bx | ax*bx + const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 1, 1, 1)); // ay*by | ay*by | ay*by | ay*by + const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 2, 2, 2)); // az*bz | az*bz | az*bz | az*bz + return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3); // ax*bx + ay*by + az*bz in each component +} + +PX_FORCE_INLINE Vec4V 
V4Cross(const Vec4V a, const Vec4V b) +{ + const __m128 r1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w + const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w + return _mm_sub_ps(_mm_mul_ps(l1, l2), _mm_mul_ps(r1, r2)); +} + +PX_FORCE_INLINE FloatV V4Length(const Vec4V a) +{ + return _mm_sqrt_ps(V4Dot(a, a)); +} + +PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a) +{ + return V4Dot(a, a); +} + +PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a) +{ + ASSERT_ISFINITELENGTH(a); + return V4ScaleInv(a, _mm_sqrt_ps(V4Dot(a, a))); +} + +PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a) +{ + ASSERT_ISFINITELENGTH(a); + return V4ScaleInvFast(a, _mm_sqrt_ps(V4Dot(a, a))); +} + +PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue) +{ + const __m128 eps = V3Eps(); + const __m128 length = V4Length(a); + const __m128 isGreaterThanZero = V4IsGrtr(length, eps); + return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue); +} + +PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b) +{ + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b) +{ + return _mm_cmpgt_ps(a, b); +} + +PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b) +{ + return _mm_cmpge_ps(a, b); +} + +PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b) +{ + return _mm_cmpeq_ps(a, b); +} + +PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b) +{ + return _mm_max_ps(a, b); +} + +PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b) +{ + return _mm_min_ps(a, b); +} + +PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a) +{ + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); + + return _mm_max_ps(_mm_max_ps(a, shuf1), _mm_max_ps(shuf2, shuf3)); +} + +PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a) +{ + const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); + const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); + + return _mm_min_ps(_mm_min_ps(a, shuf1), _mm_min_ps(shuf2, shuf3)); +} + +PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV) +{ + return V4Max(V4Min(a, maxV), minV); +} + +PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue3_R(V4IsGrtrOrEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAllTrue4_R(V4IsEq(a, b)); +} + +PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b) +{ + return internalWindowsSimd::BAnyTrue3_R(V4IsGrtr(a, b)); +} + +PX_FORCE_INLINE Vec4V V4Round(const Vec4V a) +{ + // return _mm_round_ps(a, 0x0); + 
const Vec4V half = V4Load(0.5f); + const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31)); + const Vec4V aRound = V4Sub(V4Add(a, half), signBit); + const __m128i tmp = _mm_cvttps_epi32(aRound); + return _mm_cvtepi32_ps(tmp); +} + +PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const Vec4V twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! - + // V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V3 = V4Mul(V2, V1); + const Vec4V V5 = V4Mul(V3, V2); + const Vec4V V7 = V4Mul(V5, V2); + const Vec4V V9 = V4Mul(V7, V2); + const Vec4V V11 = V4Mul(V9, V2); + const Vec4V V13 = V4Mul(V11, V2); + const Vec4V V15 = V4Mul(V13, V2); + const Vec4V V17 = V4Mul(V15, V2); + const Vec4V V19 = V4Mul(V17, V2); + const Vec4V V21 = V4Mul(V19, V2); + const Vec4V V23 = V4Mul(V21, V2); + + const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f); + const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f); + const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f); + + const FloatV S1 = V4GetY(sinCoefficients0); + const FloatV S2 = V4GetZ(sinCoefficients0); + const FloatV S3 = V4GetW(sinCoefficients0); + const FloatV S4 = V4GetX(sinCoefficients1); + const FloatV S5 = V4GetY(sinCoefficients1); + const FloatV S6 = V4GetZ(sinCoefficients1); + const FloatV S7 = V4GetW(sinCoefficients1); + const FloatV S8 = V4GetX(sinCoefficients2); + const FloatV S9 = V4GetY(sinCoefficients2); + const FloatV S10 = V4GetZ(sinCoefficients2); + const FloatV S11 = V4GetW(sinCoefficients2); + + Vec4V Result; + Result = V4MulAdd(S1, V3, V1); + Result = V4MulAdd(S2, V5, Result); + Result = V4MulAdd(S3, V7, Result); + Result = V4MulAdd(S4, V9, Result); + Result = V4MulAdd(S5, V11, Result); + Result = V4MulAdd(S6, V13, Result); + Result = V4MulAdd(S7, V15, Result); + Result = V4MulAdd(S8, V17, Result); + Result = V4MulAdd(S9, V19, Result); + Result = V4MulAdd(S10, V21, Result); + Result = V4MulAdd(S11, V23, Result); + + return Result; +} + +PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a) +{ + const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f); + const FloatV twoPi = V4LoadA(g_PXTwoPi.f); + const Vec4V tmp = V4Mul(a, recipTwoPi); + const Vec4V b = V4Round(tmp); + const Vec4V V1 = V4NegMulSub(twoPi, b, a); + + // cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! - + // V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! 
(for -PI <= V < PI) + const Vec4V V2 = V4Mul(V1, V1); + const Vec4V V4 = V4Mul(V2, V2); + const Vec4V V6 = V4Mul(V4, V2); + const Vec4V V8 = V4Mul(V4, V4); + const Vec4V V10 = V4Mul(V6, V4); + const Vec4V V12 = V4Mul(V6, V6); + const Vec4V V14 = V4Mul(V8, V6); + const Vec4V V16 = V4Mul(V8, V8); + const Vec4V V18 = V4Mul(V10, V8); + const Vec4V V20 = V4Mul(V10, V10); + const Vec4V V22 = V4Mul(V12, V10); + + const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f); + const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f); + const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f); + + const FloatV C1 = V4GetY(cosCoefficients0); + const FloatV C2 = V4GetZ(cosCoefficients0); + const FloatV C3 = V4GetW(cosCoefficients0); + const FloatV C4 = V4GetX(cosCoefficients1); + const FloatV C5 = V4GetY(cosCoefficients1); + const FloatV C6 = V4GetZ(cosCoefficients1); + const FloatV C7 = V4GetW(cosCoefficients1); + const FloatV C8 = V4GetX(cosCoefficients2); + const FloatV C9 = V4GetY(cosCoefficients2); + const FloatV C10 = V4GetZ(cosCoefficients2); + const FloatV C11 = V4GetW(cosCoefficients2); + + Vec4V Result; + Result = V4MulAdd(C1, V2, V4One()); + Result = V4MulAdd(C2, V4, Result); + Result = V4MulAdd(C3, V6, Result); + Result = V4MulAdd(C4, V8, Result); + Result = V4MulAdd(C5, V10, Result); + Result = V4MulAdd(C6, V12, Result); + Result = V4MulAdd(C7, V14, Result); + Result = V4MulAdd(C8, V16, Result); + Result = V4MulAdd(C9, V18, Result); + Result = V4MulAdd(C10, V20, Result); + Result = V4MulAdd(C11, V22, Result); + + return Result; +} + +PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3) +{ + Vec4V tmp0 = _mm_unpacklo_ps(col0, col1); + Vec4V tmp2 = _mm_unpacklo_ps(col2, col3); + Vec4V tmp1 = _mm_unpackhi_ps(col0, col1); + Vec4V tmp3 = _mm_unpackhi_ps(col2, col3); + col0 = _mm_movelh_ps(tmp0, tmp2); + col1 = _mm_movehl_ps(tmp2, tmp0); + col2 = _mm_movelh_ps(tmp1, tmp3); + col3 = _mm_movehl_ps(tmp3, tmp1); +} + +////////////////////////////////// +// BoolV +////////////////////////////////// + +PX_FORCE_INLINE BoolV BFFFF() +{ + return _mm_setzero_ps(); +} + +PX_FORCE_INLINE BoolV BFFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fftt=_mm_load_ps((float*)&f); + return fftt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BFTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ftft=_mm_load_ps((float*)&f); + return ftft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 fttf=_mm_load_ps((float*)&f); + return fttf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BFTTT() +{ + 
/*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 fttt=_mm_load_ps((float*)&f); + return fttt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, -1, 0)); +} + +PX_FORCE_INLINE BoolV BTFFF() +{ + // const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + // const __m128 tfff=_mm_load_ps((float*)&f); + // return tfff; + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0xFFFFFFFF}; + const __m128 tfft=_mm_load_ps((float*)&f); + return tfft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0}; + const __m128 tftf=_mm_load_ps((float*)&f); + return tftf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTFTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tftt=_mm_load_ps((float*)&f); + return tftt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, 0, -1)); +} + +PX_FORCE_INLINE BoolV BTTFF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0}; + const __m128 ttff=_mm_load_ps((float*)&f); + return ttff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTFT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0xFFFFFFFF}; + const __m128 ttft=_mm_load_ps((float*)&f); + return ttft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTF() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0}; + const __m128 tttf=_mm_load_ps((float*)&f); + return tttf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BTTTT() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}; + const __m128 tttt=_mm_load_ps((float*)&f); + return tttt;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, -1, -1, -1)); +} + +PX_FORCE_INLINE BoolV BXMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0}; + const __m128 tfff=_mm_load_ps((float*)&f); + return tfff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, 0, -1)); +} + +PX_FORCE_INLINE BoolV BYMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0}; + const __m128 ftff=_mm_load_ps((float*)&f); + return ftff;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, 0, -1, 0)); +} + +PX_FORCE_INLINE BoolV BZMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0}; + const __m128 fftf=_mm_load_ps((float*)&f); + return fftf;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(0, -1, 0, 0)); +} + +PX_FORCE_INLINE BoolV BWMask() +{ + /*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF}; + const __m128 ffft=_mm_load_ps((float*)&f); + return ffft;*/ + return internalWindowsSimd::m128_I2F(_mm_set_epi32(-1, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetX(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE BoolV BGetY(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE BoolV BGetZ(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE BoolV BGetW(const BoolV f) +{ + return _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f) +{ + return V4Sel(BFTTT(), v, f); +} + 
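The BoolV constants above (BFFFF through BTTTT, plus the BXMask family) are nothing more than per-lane all-ones/all-zero bit masks, and V4Sel combines them with the classic branchless select, (c & a) | (~c & b) — that is also how V4SetW(v, f) = V4Sel(BTTTF(), v, f) works earlier in this file. A minimal standalone sketch of the same pattern in plain SSE2 (hypothetical names, using _mm_castsi128_ps directly instead of the internalWindowsSimd::m128_I2F wrapper):

    #include <emmintrin.h> // SSE2
    #include <cstdio>

    // Branchless select, same bit trick as V4Sel: lanes where 'mask' is all-ones
    // come from 'a', the rest from 'b'.
    static __m128 sel(__m128 mask, __m128 a, __m128 b)
    {
        return _mm_or_ps(_mm_andnot_ps(mask, b), _mm_and_ps(mask, a));
    }

    int main()
    {
        // BTTTF-style mask: true in x,y,z and false in w. _mm_set_epi32 takes
        // its arguments high-to-low (w,z,y,x), hence (0,-1,-1,-1).
        const __m128 tttf = _mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1));
        const __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // w=4,z=3,y=2,x=1
        const __m128 f = _mm_set1_ps(9.0f);
        float out[4];
        _mm_storeu_ps(out, sel(tttf, v, f)); // like V4SetW: keep x,y,z, replace w
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 3 9
        return 0;
    }

The same high-to-low argument order is why _MM_SHUFFLE(w, z, y, x) appears throughout the permutes and splat helpers above.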
+PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f) +{ + return V4Sel(BTFTT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f) +{ + return V4Sel(BTTFT(), v, f); +} + +PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f) +{ + return V4Sel(BTTTF(), v, f); +} + +template <int index> +BoolV BSplatElement(BoolV a) +{ + return internalWindowsSimd::m128_I2F( + _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index))); +} + +PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b) +{ + return _mm_and_ps(a, b); +} + +PX_FORCE_INLINE BoolV BNot(const BoolV a) +{ + const BoolV bAllTrue(BTTTT()); + return _mm_xor_ps(a, bAllTrue); +} + +PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b) +{ + return _mm_andnot_ps(b, a); +} + +PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b) +{ + return _mm_or_ps(a, b); +} + +PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_and_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_and_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a) +{ + const BoolV bTmp = + _mm_or_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1)), _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2))); + return _mm_or_ps(_mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shuffle_ps(bTmp, bTmp, _MM_SHUFFLE(1, 1, 1, 1))); +} + +PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b) +{ + const BoolV bTest = internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); + return internalWindowsSimd::BAllTrue4_R(bTest); +} + +PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==15); +} + +PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)==0); +} + +PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a) +{ + return PxU32(_mm_movemask_ps(a)); +} + +////////////////////////////////// +// MAT33V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b) +{ + Vec3V v0 = V3Mul(a.col0, b); + Vec3V v1 = V3Mul(a.col1, b); + Vec3V v2 = V3Mul(a.col2, b); + V3Transpose(v0, v1, v2); + return V3Add(V3Add(v0, v1), v2); +} + +PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + Vec3V result = 
V3ScaleAdd(A.col0, x, c); + result = V3ScaleAdd(A.col1, y, result); + return V3ScaleAdd(A.col2, z, result); +} + +PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b) +{ + return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b)); +} + +PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b) +{ + return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2)); +} + +PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a) +{ + return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a) +{ + return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2)); +} + +PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a) +{ + const BoolV tfft = BTFFT(); + const BoolV tttf = BTTTF(); + const FloatV zero = V3Zero(); + const Vec3V cross01 = V3Cross(a.col0, a.col1); + const Vec3V cross12 = V3Cross(a.col1, a.col2); + const Vec3V cross20 = V3Cross(a.col2, a.col0); + const FloatV dot = V3Dot(cross01, a.col2); + const FloatV invDet = _mm_rcp_ps(dot); + const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01); + const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01); + Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20); + colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0)); + const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2)); + const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0)); + const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(BTFFT(), pbwp), _mm_and_ps(BTFFT(), zppd)); + const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0)); + const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0)); + const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd)); + + return Mat33V(_mm_mul_ps(colInv0, invDet), _mm_mul_ps(colInv1, invDet), _mm_mul_ps(colInv2, invDet)); +} + +PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a) +{ + Vec3V col0 = a.col0, col1 = a.col1, col2 = a.col2; + V3Transpose(col0, col1, col2); + return Mat33V(col0, col1, col2); +} + +PX_FORCE_INLINE Mat33V M33Identity() +{ + return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ()); +} + +PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d) +{ + const FloatV x = V3Mul(V3UnitX(), d); + const FloatV y = V3Mul(V3UnitY(), d); + const FloatV z = V3Mul(V3UnitZ(), d); + return Mat33V(x, y, z); +} + +////////////////////////////////// +// MAT34V +////////////////////////////////// + +PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + const Vec3V v0PlusV1Plusv2 = V3Add(v0PlusV1, v2); + return V3Add(v0PlusV1Plusv2, a.col3); +} + +PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b) +{ + const FloatV x = V3GetX(b); + const FloatV y = V3GetY(b); + const FloatV z = V3GetZ(b); + const Vec3V v0 = V3Scale(a.col0, x); + const Vec3V v1 = V3Scale(a.col1, y); + const Vec3V v2 = V3Scale(a.col2, z); + const Vec3V v0PlusV1 = V3Add(v0, v1); + return V3Add(v0PlusV1, v2); +} + +PX_FORCE_INLINE 
Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) +{ + Vec3V v0 = V3Mul(a.col0, b); + Vec3V v1 = V3Mul(a.col1, b); + Vec3V v2 = V3Mul(a.col2, b); + V3Transpose(v0, v1, v2); + return V3Add(V3Add(v0, v1), v2); +} + +PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); +} + +PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) +{ + return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); +} + +PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) +{ + return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat34V M34Inverse(const Mat34V& a) +{ + Mat34V aInv; + const BoolV tfft = BTFFT(); + const BoolV tttf = BTTTF(); + const FloatV zero = V3Zero(); + const Vec3V cross01 = V3Cross(a.col0, a.col1); + const Vec3V cross12 = V3Cross(a.col1, a.col2); + const Vec3V cross20 = V3Cross(a.col2, a.col0); + const FloatV dot = V3Dot(cross01, a.col2); + const FloatV invDet = _mm_rcp_ps(dot); + const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01); + const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01); + Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20); + colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0)); + const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2)); + const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0)); + const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(BTFFT(), pbwp), _mm_and_ps(BTFFT(), zppd)); + const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0)); + const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0)); + const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd)); + aInv.col0 = _mm_mul_ps(colInv0, invDet); + aInv.col1 = _mm_mul_ps(colInv1, invDet); + aInv.col2 = _mm_mul_ps(colInv2, invDet); + aInv.col3 = M34Mul33V3(aInv, V3Neg(a.col3)); + return aInv; +} + +PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) +{ + Vec3V col0 = a.col0, col1 = a.col1, col2 = a.col2; + V3Transpose(col0, col1, col2); + return Mat33V(col0, col1, col2); +} + +////////////////////////////////// +// MAT44V +////////////////////////////////// + +PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b) +{ + const FloatV x = V4GetX(b); + const FloatV y = V4GetY(b); + const FloatV z = V4GetZ(b); + const FloatV w = V4GetW(b); + + const Vec4V v0 = V4Scale(a.col0, x); + const Vec4V v1 = V4Scale(a.col1, y); + const Vec4V v2 = V4Scale(a.col2, z); + const Vec4V v3 = V4Scale(a.col3, w); + const Vec4V v0PlusV1 = V4Add(v0, v1); + const Vec4V v0PlusV1Plusv2 = V4Add(v0PlusV1, v2); + return V4Add(v0PlusV1Plusv2, v3); +} + +PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) +{ + Vec4V v0 = V4Mul(a.col0, b); + Vec4V v1 = V4Mul(a.col1, b); + Vec4V v2 = V4Mul(a.col2, b); + Vec4V v3 = V4Mul(a.col3, b); + V4Transpose(v0, v1, v2, v3); + return V4Add(V4Add(v0, v1), V4Add(v2, v3)); +} + +PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) +{ + return Mat44V(V4Add(a.col0, b.col0), 
V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); +} + +PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) +{ + Vec4V col0 = a.col0, col1 = a.col1, col2 = a.col2, col3 = a.col3; + V4Transpose(col0, col1, col2, col3); + return Mat44V(col0, col1, col2, col3); +} + +PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) +{ + __m128 minor0, minor1, minor2, minor3; + __m128 row0, row1, row2, row3; + __m128 det, tmp1; + + tmp1 = V4Zero(); + row1 = V4Zero(); + row3 = V4Zero(); + + row0 = a.col0; + row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2)); + row2 = a.col2; + row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2)); + + tmp1 = _mm_mul_ps(row2, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_mul_ps(row1, tmp1); + minor1 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); + minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); + minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); + + tmp1 = _mm_mul_ps(row1, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); + minor3 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); + minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); + + tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + row2 = _mm_shuffle_ps(row2, row2, 0x4E); + minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); + minor2 = _mm_mul_ps(row0, tmp1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); + minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); + + tmp1 = _mm_mul_ps(row0, row1); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); + + tmp1 = _mm_mul_ps(row0, row3); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); + minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); + minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); + + tmp1 = _mm_mul_ps(row0, row2); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); + minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); + minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); + tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); + minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); + minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); + + det = _mm_mul_ps(row0, minor0); + det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); + det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); + tmp1 = _mm_rcp_ss(det); +#if 0 + det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); + det = _mm_shuffle_ps(det, det, 0x00); +#else + det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0)); +#endif + + minor0 = _mm_mul_ps(det, minor0); + minor1 = _mm_mul_ps(det, minor1); + minor2 = _mm_mul_ps(det, minor2); + minor3 = _mm_mul_ps(det, minor3); + Mat44V invTrans(minor0, minor1, minor2, minor3); + return M44Trnsps(invTrans); +} + +PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& 
w) +{ + return _mm_set_ps(w, z, y, x); +} + +PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_or_si128(_mm_andnot_si128(internalWindowsSimd::m128_F2I(c), internalWindowsSimd::m128_F2I(b)), + _mm_and_si128(internalWindowsSimd::m128_F2I(c), internalWindowsSimd::m128_F2I(a)))); +} + +PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F(_mm_or_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_xor_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_and_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_andnot_si128(internalWindowsSimd::m128_F2I(b), internalWindowsSimd::m128_F2I(a))); +} + +PX_FORCE_INLINE VecI32V U4Load(const PxU32 i) +{ + return _mm_load1_ps((PxF32*)&i); +} + +PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) +{ + return _mm_loadu_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) +{ + ASSERT_ISALIGNED16(i); + return _mm_load_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) +{ + return _mm_load1_ps((PxF32*)&i); +} + +PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) +{ + return _mm_loadu_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) +{ + ASSERT_ISALIGNED16(i); + return _mm_load_ps((PxF32*)i); +} + +PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_add_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_sub_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpgt_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) +{ + return V4U32Sel(c, a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Zero() +{ + return V4Zero(); +} + +PX_FORCE_INLINE VecI32V VecI32V_One() +{ + return I4Load(1); +} + +PX_FORCE_INLINE VecI32V VecI32V_Two() +{ + return I4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_MinusOne() +{ + return I4Load(-1); +} + +PX_FORCE_INLINE VecU32V U4Zero() +{ + return U4Load(0); +} + +PX_FORCE_INLINE VecU32V U4One() +{ + return U4Load(1); +} + +PX_FORCE_INLINE VecU32V U4Two() +{ + return U4Load(2); +} + +PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) +{ + PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c, BTTTT()) || + _VecMathTests::allElementsEqualBoolV(c, BFFFF())); + return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); +} + +PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) +{ + VecShiftV preparedShift; + preparedShift.shift = 
_mm_or_ps(_mm_andnot_ps(BTFFF(), VecI32V_Zero()), _mm_and_ps(BTFFF(), shift)); + return preparedShift; +} + +PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) +{ + return internalWindowsSimd::m128_I2F( + _mm_sll_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(count.shift))); +} + +PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) +{ + return internalWindowsSimd::m128_I2F( + _mm_srl_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(count.shift))); +} + +PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_and_ps(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) +{ + return _mm_or_ps(a, b); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); +} + +PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) +{ + return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)); +} + +PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) +{ + _mm_store_ss((PxF32*)i, a); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) +{ + return a; +} + +PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d) +{ + const __m128 xw = _mm_move_ss(b, a); // y, y, y, x + const __m128 yz = _mm_move_ss(c, d); // z, z, z, w + return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0)); +} + +PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) +{ + *address = val; +} + +PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) +{ + VecU32V result32(a); + result32 = V4U32Andc(result32, b); + return Vec4V(result32); +} + +PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) +{ + return V4IsGrtr(a, b); +} + +PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) +{ + return *addr; +} + +PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) +{ + return *addr; +} + +// unsigned compares are not supported on x86 +PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) +{ + // _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately + // return m128_I2F(_mm_cmpgt_epi16(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); + VecU16V result; + result.m128_u16[0] = PxU16((a).m128_u16[0] > (b).m128_u16[0]); + result.m128_u16[1] = PxU16((a).m128_u16[1] > (b).m128_u16[1]); + result.m128_u16[2] = PxU16((a).m128_u16[2] > (b).m128_u16[2]); + result.m128_u16[3] = PxU16((a).m128_u16[3] > (b).m128_u16[3]); + result.m128_u16[4] = PxU16((a).m128_u16[4] > (b).m128_u16[4]); + result.m128_u16[5] = PxU16((a).m128_u16[5] > (b).m128_u16[5]); + result.m128_u16[6] = PxU16((a).m128_u16[6] > (b).m128_u16[6]); + result.m128_u16[7] = PxU16((a).m128_u16[7] > (b).m128_u16[7]); + return result; +} + +PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) +{ + return internalWindowsSimd::m128_I2F( + _mm_cmpgt_epi16(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) +{ + Vec4V result = V4LoadXYZW(PxF32(a.m128_u32[0]), PxF32(a.m128_u32[1]), 
PxF32(a.m128_u32[2]), PxF32(a.m128_u32[3])); + return result; +} + +PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) +{ + return _mm_cvtepi32_ps(internalWindowsSimd::m128_F2I(a)); +} + +PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) +{ + return internalWindowsSimd::m128_I2F(_mm_cvttps_epi32(a)); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) +{ + return Vec4V(a); +} + +PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) +{ + return Vec4V(a); +} + +PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return VecU32V(a); +} + +PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) +{ + return VecI32V(a); +} + +template <int index> +PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) +{ + return internalWindowsSimd::m128_I2F( + _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index))); +} + +template <int index> +PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) +{ + return internalWindowsSimd::m128_I2F( + _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index))); +} + +PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) +{ + VecU32V result; + result.m128_u32[0] = x; + result.m128_u32[1] = y; + result.m128_u32[2] = z; + result.m128_u32[3] = w; + return result; +} + +PX_FORCE_INLINE Vec4V V4ConvertFromI32V(const VecI32V in) +{ + return _mm_cvtepi32_ps(internalWindowsSimd::m128_F2I(in)); +} + +#endif // PSFOUNDATION_PSWINDOWSINLINEAOS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h b/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h new file mode 100644 index 00000000..ccf6d620 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h @@ -0,0 +1,190 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. 
All rights reserved. + +#ifndef PSFOUNDATION_PSWINDOWSINTRINSICS_H +#define PSFOUNDATION_PSWINDOWSINTRINSICS_H + +#include "Ps.h" +#include "foundation/PxAssert.h" + +// this file is for internal intrinsics - that is, intrinsics that are used in +// cross platform code but do not appear in the API + +#if !PX_WINDOWS_FAMILY +#error "This file should only be included by Windows builds!!" +#endif + +#pragma warning(push) +//'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4668) +#if PX_VC == 10 +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#endif +#include <intrin.h> +#pragma warning(pop) + +#pragma warning(push) +#pragma warning(disable : 4985) // 'symbol name': attributes not present on previous declaration +#include <math.h> +#pragma warning(pop) + +#include <float.h> +#include <mmintrin.h> + +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse) + +namespace physx +{ +namespace shdfnd +{ + +/* +* Implements a memory barrier +*/ +PX_FORCE_INLINE void memoryBarrier() +{ + _ReadWriteBarrier(); + /* long Barrier; + __asm { + xchg Barrier, eax + }*/ +} + +/*! +Returns the index of the highest set bit. Not valid for zero arg. +*/ +PX_FORCE_INLINE uint32_t highestSetBitUnsafe(uint32_t v) +{ + unsigned long retval; + _BitScanReverse(&retval, v); + return retval; +} + +/*! +Returns the index of the lowest set bit. Not valid for zero arg. +*/ +PX_FORCE_INLINE uint32_t lowestSetBitUnsafe(uint32_t v) +{ + unsigned long retval; + _BitScanForward(&retval, v); + return retval; +} + +/*! +Returns the number of leading zeros in v. Returns 32 for v=0. +*/ +PX_FORCE_INLINE uint32_t countLeadingZeros(uint32_t v) +{ + if(v) + { + unsigned long bsr = (unsigned long)-1; + _BitScanReverse(&bsr, v); + return 31 - bsr; + } + else + return 32; +} + +/*! +Prefetch one cache line around \c ptr+offset. +*/ +#if !PX_ARM +PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) +{ + // the cache line on X86/X64 is 64 bytes, so a 128-byte prefetch would require 2 prefetches. + // However, we can only dispatch a limited number of prefetch instructions, so we opt to prefetch just 1 cache line + /*_mm_prefetch(((const char*)ptr + offset), _MM_HINT_T0);*/ + // We get slightly better performance prefetching to non-temporal addresses instead of all cache levels + _mm_prefetch(((const char*)ptr + offset), _MM_HINT_NTA); +} +#else +PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) +{ + // ARM has a 32-byte cache line + __prefetch(((const char*)ptr + offset)); +} +#endif + +/*! +Prefetch \c count bytes starting at \c ptr. +*/ +#if !PX_ARM +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = (char*)ptr; + uint64_t p = size_t(ptr); + uint64_t startLine = p >> 6, endLine = (p + count - 1) >> 6; + uint64_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 64; + } while(--lines); +} +#else +PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) +{ + const char* cp = (char*)ptr; + uint32_t p = size_t(ptr); + uint32_t startLine = p >> 5, endLine = (p + count - 1) >> 5; + uint32_t lines = endLine - startLine + 1; + do + { + prefetchLine(cp); + cp += 32; + } while(--lines); +} +#endif + +//! \brief platform-specific reciprocal +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipFast(float a) +{ + return 1.0f / a; +} + +//! 
\brief platform-specific fast reciprocal square root +PX_CUDA_CALLABLE PX_FORCE_INLINE float recipSqrtFast(float a) +{ + return 1.0f / ::sqrtf(a); +} + +//! \brief platform-specific floor +PX_CUDA_CALLABLE PX_FORCE_INLINE float floatFloor(float x) +{ + return ::floorf(x); +} + +#define NS_EXPECT_TRUE(x) x +#define NS_EXPECT_FALSE(x) x + +} // namespace shdfnd +} // namespace physx + +#endif // #ifndef PSFOUNDATION_PSWINDOWSINTRINSICS_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h b/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h new file mode 100644 index 00000000..882f0b70 --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h @@ -0,0 +1,72 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
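The bit-scan helpers in PsWindowsIntrinsics.h above are thin wrappers over _BitScanForward/_BitScanReverse (so countLeadingZeros(v) equals 31 - highestSetBitUnsafe(v) for nonzero v), and the prefetch() loop's startLine/endLine arithmetic simply counts how many 64-byte (32-byte on ARM) cache lines the range [ptr, ptr + count) touches. A portable sketch of both calculations, with no MSVC intrinsics (helper names are hypothetical):

    #include <cassert>
    #include <cstdint>

    // Portable equivalent of countLeadingZeros: returns 32 for v == 0.
    static uint32_t clz32(uint32_t v)
    {
        if (v == 0) return 32;
        uint32_t n = 0;
        while (!(v & 0x80000000u)) { v <<= 1; ++n; }
        return n;
    }

    // Number of 64-byte cache lines spanned by [addr, addr + count), count >= 1 --
    // the same start/end arithmetic as the x86 branch of prefetch() above.
    static uint64_t linesSpanned(uint64_t addr, uint64_t count)
    {
        const uint64_t startLine = addr >> 6, endLine = (addr + count - 1) >> 6;
        return endLine - startLine + 1;
    }

    int main()
    {
        assert(clz32(1u) == 31 && clz32(0x80000000u) == 0); // 31 - highest set bit
        assert(linesSpanned(0, 64) == 1);  // exactly one line
        assert(linesSpanned(63, 2) == 2);  // straddles a line boundary
        assert(linesSpanned(64, 1) == 1);  // first byte of the next line
        return 0;
    }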
+ + +#ifndef PS_WINDOWS_FOUNDATION_LOADLIBRARY_H +#define PS_WINDOWS_FOUNDATION_LOADLIBRARY_H + +#include "foundation/PxPreprocessor.h" +#include "windows/PsWindowsInclude.h" +#include "foundation/windows/PxWindowsFoundationDelayLoadHook.h" + +namespace physx +{ +namespace shdfnd +{ + EXTERN_C IMAGE_DOS_HEADER __ImageBase; + + PX_INLINE FARPROC WINAPI foundationDliNotePreLoadLibrary(const char* libraryName, const physx::PxFoundationDelayLoadHook* delayLoadHook) + { + if(!delayLoadHook) + { + return (FARPROC)::LoadLibraryA(libraryName); + } + else + { + if(strstr(libraryName, "PxFoundation")) + { + if(strstr(libraryName, "DEBUG")) + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationDEBUGDllName()); + + if(strstr(libraryName, "CHECKED")) + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationCHECKEDDllName()); + + if(strstr(libraryName, "PROFILE")) + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationPROFILEDllName()); + + return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationDllName()); + } + } + return NULL; + } +} // namespace shdfnd +} // namespace physx + + +#endif // PS_WINDOWS_FOUNDATION_LOADLIBRARY_H diff --git a/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h b/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h new file mode 100644 index 00000000..6693fc2a --- /dev/null +++ b/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h @@ -0,0 +1,87 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
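foundationDliNotePreLoadLibrary above is intended to be driven from an MSVC delay-load notification hook: it picks the DEBUG/CHECKED/PROFILE variant of the PxFoundation DLL by substring match on the requested name, and falls back to a plain LoadLibraryA when no PxFoundationDelayLoadHook is installed, returning the loaded module as a FARPROC, which is what the delay-load helper expects from a dliNotePreLoadLibrary notification. A hedged sketch of the host-side wiring using the documented delayimp.h hook points; getHook() is a hypothetical application-side accessor, and the build is assumed to link delayimp.lib with /DELAYLOAD for the PxFoundation DLL:

    #include <windows.h>
    #include <delayimp.h>
    #include "windows/PsWindowsLoadLibrary.h"

    // Hypothetical: however the application stores its installed hook (may be NULL).
    static const physx::PxFoundationDelayLoadHook* getHook();

    static FARPROC WINAPI delayHook(unsigned dliNotify, PDelayLoadInfo pdli)
    {
        // Only handle the pre-LoadLibrary notification; pdli->szDll is the DLL name
        // recorded by the linker. Returning non-NULL substitutes our own module.
        if (dliNotify == dliNotePreLoadLibrary)
            return physx::shdfnd::foundationDliNotePreLoadLibrary(pdli->szDll, getHook());
        return NULL;
    }

    // delayimp.h's notification hook pointer (declared non-const on older toolsets).
    ExternC const PfnDliHook __pfnDliNotifyHook2 = delayHook;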
+ +#ifndef PSFOUNDATION_PSWINDOWSTRIGCONSTANTS_H +#define PSFOUNDATION_PSWINDOWSTRIGCONSTANTS_H + +#define PX_GLOBALCONST extern const __declspec(selectany) + +__declspec(align(16)) struct PX_VECTORF32 +{ + float f[4]; +}; + +//#define PX_PI 3.141592654f +//#define PX_2PI 6.283185307f +//#define PX_1DIVPI 0.318309886f +//#define PX_1DIV2PI 0.159154943f +//#define PX_PIDIV2 1.570796327f +//#define PX_PIDIV4 0.785398163f + +PX_GLOBALCONST PX_VECTORF32 g_PXSinCoefficients0 = { { 1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients1 = { { 2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinCoefficients2 = { { 2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXCosCoefficients0 = { { 1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients1 = { { 2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosCoefficients2 = { { 4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanCoefficients0 = { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients1 = { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXTanCoefficients2 = { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients0 = { { -0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients1 = { { 0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinCoefficients2 = { { -1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXATanCoefficients0 = { { 1.0f, 0.333333334f, 0.2f, 0.142857143f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients1 = { { 1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanCoefficients2 = { { 5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXSinEstCoefficients = { { 1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXCosEstCoefficients = { { 1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTanEstCoefficients = { { 2.484f, -1.954923183e-1f, 2.467401101f, PxInvPi } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXATanEstCoefficients = { { 7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, PxPiDivTwo } }; +PX_GLOBALCONST PX_VECTORF32 +g_PXASinEstCoefficients = { { -1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXASinEstConstants = { { 1.00000011921f, PxPiDivTwo, 0.0f, 0.0f } }; +PX_GLOBALCONST PX_VECTORF32 g_PXPiConstants0 = { { PxPi, PxTwoPi, PxInvPi, PxInvTwoPi } }; +PX_GLOBALCONST PX_VECTORF32 g_PXReciprocalTwoPi = { { PxInvTwoPi, PxInvTwoPi, PxInvTwoPi, PxInvTwoPi } }; +PX_GLOBALCONST PX_VECTORF32 g_PXTwoPi = { { PxTwoPi, PxTwoPi, PxTwoPi, PxTwoPi } }; + +#endif
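The g_PXSinCoefficients0..2 tables above are just the reciprocal-factorial Taylor terms (-1/3!, 1/5!, -1/7!, ...; compare -0.166666667f with -1/6) that V4Sin accumulates after range-reducing its argument to [-PI, PI) via V4Round, and the g_PXCosCoefficients tables hold the matching even-power terms for V4Cos. A scalar sketch of the same evaluation, truncated to the four terms of g_PXSinCoefficients0 (accurate to a few digits only; the vector version carries the series through x^23):

    #include <cmath>
    #include <cstdio>

    // Scalar mirror of V4Sin: round-to-nearest range reduction by 2*PI, then
    // odd-power accumulation with sin x ~= x - x^3/3! + x^5/5! - x^7/7!.
    static float sinApprox(float a)
    {
        const float twoPi = 6.283185307f;
        const float v = a - twoPi * std::floor(a / twoPi + 0.5f); // reduce to [-PI, PI)
        const float v2 = v * v, v3 = v2 * v, v5 = v3 * v2, v7 = v5 * v2;
        return v + (-0.166666667f) * v3 + 8.333333333e-3f * v5 + (-1.984126984e-4f) * v7;
    }

    int main()
    {
        for (float x = -3.0f; x <= 3.0f; x += 1.5f)
            std::printf("x=%4.1f approx=% .6f libm=% .6f\n", x, sinApprox(x), std::sin(x));
        return 0;
    }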