NvCloth library v1.0.0

author: mtamis <[email protected]> 2017-02-15 16:06:25 +0100
committer: mtamis <[email protected]> 2017-02-15 16:06:25 +0100
commit: 85305930aeeb1d513e23522bd91f29ba81aa6d14 (patch)
tree: 45f1bb20a45a300d1fef107e436cac95602a0e57 /PxShared/src
download: nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.tar.xz
nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.zip
255 files changed, 59040 insertions, 0 deletions
diff --git a/PxShared/src/compiler/cmake/Android/CMakeLists.txt b/PxShared/src/compiler/cmake/Android/CMakeLists.txt
new file mode 100644
index 0000000..0499c29
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Android/CMakeLists.txt
@@ -0,0 +1,43 @@
+cmake_minimum_required(VERSION 3.3)
+include(../common/CMakeLists.txt)
+
+STRING(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWERCASE)
+
+IF(NOT DEFINED TARGET_BUILD_PLATFORM) # Not defined, default to Android
+	SET(TARGET_BUILD_PLATFORM "Android")
+ENDIF()
+
+SET(PLATFORM_LIST Android)
+# The value is quoted: unquoted, an empty value makes the if() malformed and a value naming a variable is dereferenced.
+IF (NOT "${TARGET_BUILD_PLATFORM}" IN_LIST PLATFORM_LIST)
+	MESSAGE(FATAL_ERROR "Invalid platform:" ${TARGET_BUILD_PLATFORM})
+ENDIF()
+
+# Per-ABI compiler flags. ANDROID_ABI is quoted so that an unset or empty value
+# matches no branch instead of producing a malformed if() expression.
+if("${ANDROID_ABI}" STREQUAL "armeabi-v7a" OR "${ANDROID_ABI}" STREQUAL "arm64-v8a")
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -std=c++11 -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -D__STDC_LIMIT_MACROS -Wno-invalid-offsetof ")
+elseif("${ANDROID_ABI}" STREQUAL "x86")
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -std=c++11 -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -D__STDC_LIMIT_MACROS -Wno-invalid-offsetof -fpack-struct=8 -malign-double ")
+elseif("${ANDROID_ABI}" STREQUAL "x86_64")
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -std=c++11 -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -D__STDC_LIMIT_MACROS -Wno-invalid-offsetof -mstackrealign -msse3  ")
+endif()
+	
+SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
+SET(CMAKE_CXX_FLAGS_CHECKED "-O2")
+SET(CMAKE_CXX_FLAGS_PROFILE "-O2")
+SET(CMAKE_CXX_FLAGS_RELEASE "-O2")
+
+SET(PXSHARED_ANDROID_COMPILE_DEFS _LIB;)
+SET(PXSHARED_ANDROID_DEBUG_COMPILE_DEFS _DEBUG;PX_DEBUG=1;PX_CHECKED=1)
+SET(PXSHARED_ANDROID_CHECKED_COMPILE_DEFS NDEBUG;PX_CHECKED=1)
+SET(PXSHARED_ANDROID_PROFILE_COMPILE_DEFS NDEBUG;PX_PROFILE=1)
+SET(PXSHARED_ANDROID_RELEASE_COMPILE_DEFS NDEBUG)
+
+# NOTE: PxCudaContextManager excluded on this platform
+
+# Include project cmake files here
+INCLUDE(PxFoundation.cmake)
+INCLUDE(PsFastXml.cmake)
+INCLUDE(PxPvdSDK.cmake)
+INCLUDE(PxTask.cmake)
diff --git a/PxShared/src/compiler/cmake/Android/PsFastXml.cmake b/PxShared/src/compiler/cmake/Android/PsFastXml.cmake
new file mode 100644
index 0000000..81d356e
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Android/PsFastXml.cmake
@@ -0,0 +1,40 @@
+#
+# Build PsFastXml
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/fastxml)
+
+# Preprocessor definitions; the config specific ones are appended below according to the lower-cased CMAKE_BUILD_TYPE
+SET(PSFASTXML_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_ANDROID_COMPILE_DEFS};PX_FOUNDATION_DLL=0;PxShared_STATIC_LIB;
+)
+
+# Append the definitions of the selected build type. The value is quoted so that
+# an empty CMAKE_BUILD_TYPE falls through to the FATAL_ERROR branch instead of
+# turning the condition into a malformed if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_ANDROID_DEBUG_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_ANDROID_CHECKED_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_ANDROID_PROFILE_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_ANDROID_RELEASE_COMPILE_DEFS})
+
+else()
+	# Only debug, checked, profile and release are handled above
+	MESSAGE(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+
+# include PsFastXml common
+INCLUDE(../common/PsFastXml.cmake)
+
+# enable -fPIC so we can link static libs with the editor
+SET_TARGET_PROPERTIES(PsFastXml PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/Android/PxFoundation.cmake b/PxShared/src/compiler/cmake/Android/PxFoundation.cmake
new file mode 100644
index 0000000..c23a327
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Android/PxFoundation.cmake
@@ -0,0 +1,62 @@
+#
+# Build PxFoundation
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/foundation)
+
+SET(PXFOUNDATION_LIBTYPE STATIC)
+
+SET(PXFOUNDATION_PLATFORM_FILES
+	${LL_SOURCE_DIR}/src/unix/PsUnixAtomic.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixCpu.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixFPU.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixMutex.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixPrintString.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSList.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSocket.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSync.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixThread.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixTime.cpp
+	${ANDROID_NDK}/sources/android/cpufeatures/cpu-features.c 
+)
+
+SET(PXFOUNDATION_PLATFORM_INCLUDES
+	${LL_SOURCE_DIR}/include/linux
+	${ANDROID_NDK}/sources/android/cpufeatures
+)
+
+SET(PXFOUNDATION_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_ANDROID_COMPILE_DEFS};PxShared_STATIC_LIB;
+)
+
+# Append the definitions of the selected build type. The value is quoted so that
+# an empty CMAKE_BUILD_TYPE falls through to the FATAL_ERROR branch instead of
+# turning the condition into a malformed if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_ANDROID_DEBUG_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_ANDROID_CHECKED_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_ANDROID_PROFILE_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_ANDROID_RELEASE_COMPILE_DEFS})
+
+else()
+	# Only debug, checked, profile and release are handled above
+	MESSAGE(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+
+# include PxFoundation common
+INCLUDE(../common/PxFoundation.cmake)
+
+TARGET_LINK_LIBRARIES(PxFoundation PUBLIC log)
+
+# enable -fPIC so we can link static libs with the editor
+SET_TARGET_PROPERTIES(PxFoundation PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/Android/PxPvdSDK.cmake b/PxShared/src/compiler/cmake/Android/PxPvdSDK.cmake
new file mode 100644
index 0000000..f517efc
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Android/PxPvdSDK.cmake
@@ -0,0 +1,44 @@
+#
+# Build PxPvdSDK
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/pvd)
+
+SET(PXPVDSDK_LIBTYPE STATIC)
+
+# Preprocessor definitions; the config specific ones are appended below according to the lower-cased CMAKE_BUILD_TYPE
+SET(PXPVDSDK_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_ANDROID_COMPILE_DEFS};PxShared_STATIC_LIB;
+)
+
+# Append the definitions of the selected build type. The value is quoted so that
+# an empty CMAKE_BUILD_TYPE falls through to the FATAL_ERROR branch instead of
+# turning the condition into a malformed if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_ANDROID_DEBUG_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_ANDROID_CHECKED_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_ANDROID_PROFILE_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_ANDROID_RELEASE_COMPILE_DEFS})
+
+else()
+	# Only debug, checked, profile and release are handled above
+	MESSAGE(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+# include PxPvdSDK common
+INCLUDE(../common/PxPvdSDK.cmake)
+
+# Add linked libraries
+TARGET_LINK_LIBRARIES(PxPvdSDK PRIVATE PxFoundation)
+
+# enable -fPIC so we can link static libs with the editor
+SET_TARGET_PROPERTIES(PxPvdSDK PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/Android/PxTask.cmake b/PxShared/src/compiler/cmake/Android/PxTask.cmake
new file mode 100644
index 0000000..e02aefd
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Android/PxTask.cmake
@@ -0,0 +1,39 @@
+#
+# Build PxTask
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/task)
+
+SET(PXTASK_COMPILE_DEFS
+	${PXSHARED_ANDROID_COMPILE_DEFS};PxShared_STATIC_LIB;
+)
+
+# Append the definitions of the selected build type. The value is quoted so that
+# an empty CMAKE_BUILD_TYPE falls through to the FATAL_ERROR branch instead of
+# turning the condition into a malformed if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	LIST(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_ANDROID_DEBUG_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	LIST(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_ANDROID_CHECKED_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	LIST(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_ANDROID_PROFILE_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	LIST(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_ANDROID_RELEASE_COMPILE_DEFS})
+
+else()
+	# Only debug, checked, profile and release are handled above
+	MESSAGE(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+SET(PXTASK_LIBTYPE OBJECT)
+
+# include PxTask common
+INCLUDE(../common/PxTask.cmake)
+
+# enable -fPIC so we can link static libs with the editor
+SET_TARGET_PROPERTIES(PxTask PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/IOS/CMakeLists.txt b/PxShared/src/compiler/cmake/IOS/CMakeLists.txt
new file mode 100644
index 0000000..d281e32
--- /dev/null
+++ b/PxShared/src/compiler/cmake/IOS/CMakeLists.txt
@@ -0,0 +1,73 @@
+cmake_minimum_required(VERSION 3.3)
+include(../common/CMakeLists.txt)
+
+
+IF(NOT DEFINED TARGET_BUILD_PLATFORM) # Not defined, default to IOS
+	SET(TARGET_BUILD_PLATFORM "IOS")
+ENDIF()
+
+SET(PLATFORM_LIST IOS)
+# The value is quoted: unquoted, "IOS" is dereferenced if a variable of that name exists (CMake defines one when CMAKE_SYSTEM_NAME is iOS).
+IF (NOT "${TARGET_BUILD_PLATFORM}" IN_LIST PLATFORM_LIST)
+	MESSAGE(FATAL_ERROR "Invalid platform:" ${TARGET_BUILD_PLATFORM})
+ENDIF()
+
+SET(CMAKE_CXX_FLAGS "-std=c++11 -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -Werror -ferror-limit=0 -Wall -Wextra -fstrict-aliasing -Wstrict-aliasing=2 -Weverything -Wno-documentation-deprecated-sync -Wno-documentation-unknown-command -Wno-float-equal -Wno-padded -Wno-weak-vtables -Wno-cast-align -Wno-conversion -Wno-missing-noreturn -Wno-missing-variable-declarations -Wno-shift-sign-overflow -Wno-covered-switch-default -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-prototypes -Wno-unreachable-code -Wno-unused-macros -Wno-unused-member-function -Wno-used-but-marked-unused -Wno-weak-template-vtables -Wno-deprecated -Wno-non-virtual-dtor -Wno-invalid-noreturn -Wno-return-type-c-linkage -Wno-reserved-id-macro -Wno-c++98-compat-pedantic -Wno-unused-local-typedef -Wno-old-style-cast -Wno-newline-eof -Wno-unused-private-field -Wno-undefined-reinterpret-cast -Wno-invalid-offsetof -gdwarf-2")
+
+SET(CMAKE_SHARED_LINKER_FLAGS "")
+
+SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
+SET(CMAKE_CXX_FLAGS_CHECKED "-O3 -g")
+SET(CMAKE_CXX_FLAGS_PROFILE "-O3 -g")
+SET(CMAKE_CXX_FLAGS_RELEASE "-O3 -g")
+
+SET(CMAKE_OSX_DEPLOYMENT_TARGET "")
+# CMAKE_OSX_ARCHITECTURES is a ;-separated list; a space-separated string would be a single list element.
+SET(CMAKE_OSX_ARCHITECTURES "armv7;armv7s;arm64")
+
+SET(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
+
+# Setup iOS developer location unless specified manually with CMAKE_IOS_DEVELOPER_ROOT.
+# xcode-select is run through execute_process because exec_program is deprecated;
+# OUTPUT_STRIP_TRAILING_WHITESPACE keeps the trailing newline out of the path.
+EXECUTE_PROCESS(COMMAND /usr/bin/xcode-select -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR OUTPUT_STRIP_TRAILING_WHITESPACE)
+SET(XCODE_ROOT "${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+IF(NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT AND EXISTS "${XCODE_ROOT}")
+	SET(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_ROOT})
+ENDIF()
+SET(CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
+
+# Use the last iOS SDK directory in string sort order (intended to be the most recent; note that "9.3" sorts after "10.0") unless specified manually with CMAKE_IOS_SDK_ROOT
+if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
+	file(GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
+	if(NOT _CMAKE_IOS_SDKS)
+		message(FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
+	endif()
+	# Order the candidates descending and keep the first one
+	list(SORT _CMAKE_IOS_SDKS)
+	list(REVERSE _CMAKE_IOS_SDKS)
+	list(GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)
+	message(STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
+endif()
+SET(CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
+
+# Set the sysroot default to the most recent SDK
+SET(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+SET(CMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS "iphoneos")
+SET(CMAKE_XCODE_ATTRIBUTE_SDKROOT ${CMAKE_IOS_SDK_ROOT})
+SET(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "7.0")
+
+SET(PXSHARED_IOS_COMPILE_DEFS _LIB;DISABLE_CUDA_PHYSX;DISABLE_COMPUTE_PHYSX)
+SET(PXSHARED_IOS_DEBUG_COMPILE_DEFS _DEBUG;PX_DEBUG=1;PX_CHECKED=1)
+SET(PXSHARED_IOS_CHECKED_COMPILE_DEFS NDEBUG;PX_CHECKED=1)
+SET(PXSHARED_IOS_PROFILE_COMPILE_DEFS NDEBUG;PX_PROFILE=1)
+SET(PXSHARED_IOS_RELEASE_COMPILE_DEFS NDEBUG)
+
+# NOTE: PxCudaContextManager excluded on this platform
+
+# Include project cmake files here
+INCLUDE(PxFoundation.cmake)
+INCLUDE(PsFastXml.cmake)
+INCLUDE(PxPvdSDK.cmake)
+INCLUDE(PxTask.cmake)
diff --git a/PxShared/src/compiler/cmake/IOS/PsFastXml.cmake b/PxShared/src/compiler/cmake/IOS/PsFastXml.cmake
new file mode 100644
index 0000000..28b2a1b
--- /dev/null
+++ b/PxShared/src/compiler/cmake/IOS/PsFastXml.cmake
@@ -0,0 +1,22 @@
+#
+# Build PsFastXml
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/fastxml)
+
+# Use generator expressions to set config specific preprocessor definitions
+SET(PSFASTXML_COMPILE_DEFS 
+
+	# Common to all configurations
+	${PXSHARED_IOS_COMPILE_DEFS};PX_FOUNDATION_DLL=0;
+
+	$<$<CONFIG:debug>:${PXSHARED_IOS_DEBUG_COMPILE_DEFS};>
+	$<$<CONFIG:checked>:${PXSHARED_IOS_CHECKED_COMPILE_DEFS};>
+	$<$<CONFIG:profile>:${PXSHARED_IOS_PROFILE_COMPILE_DEFS};>
+	$<$<CONFIG:release>:${PXSHARED_IOS_RELEASE_COMPILE_DEFS};>
+)
+
+# include PsFastXml common
+INCLUDE(../common/PsFastXml.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/IOS/PxFoundation.cmake b/PxShared/src/compiler/cmake/IOS/PxFoundation.cmake
new file mode 100644
index 0000000..7d022cf
--- /dev/null
+++ b/PxShared/src/compiler/cmake/IOS/PxFoundation.cmake
@@ -0,0 +1,40 @@
+#
+# Build PxFoundation
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/foundation)
+
+SET(PXFOUNDATION_LIBTYPE STATIC)
+
+SET(PXFOUNDATION_PLATFORM_FILES
+	${LL_SOURCE_DIR}/src/unix/PsUnixAtomic.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixCpu.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixFPU.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixMutex.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixPrintString.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSList.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSocket.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSync.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixThread.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixTime.cpp
+)
+
+SET(PXFOUNDATION_PLATFORM_INCLUDES
+	${LL_SOURCE_DIR}/include/ios
+)
+
+SET(PXFOUNDATION_COMPILE_DEFS
+
+	# Common to all configurations
+	${PXSHARED_IOS_COMPILE_DEFS}
+
+	$<$<CONFIG:debug>:${PXSHARED_IOS_DEBUG_COMPILE_DEFS}>
+	$<$<CONFIG:checked>:${PXSHARED_IOS_CHECKED_COMPILE_DEFS}>
+	$<$<CONFIG:profile>:${PXSHARED_IOS_PROFILE_COMPILE_DEFS}>
+	$<$<CONFIG:release>:${PXSHARED_IOS_RELEASE_COMPILE_DEFS}>
+)
+
+# include PxFoundation common
+INCLUDE(../common/PxFoundation.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/IOS/PxPvdSDK.cmake b/PxShared/src/compiler/cmake/IOS/PxPvdSDK.cmake
new file mode 100644
index 0000000..85e7e1f
--- /dev/null
+++ b/PxShared/src/compiler/cmake/IOS/PxPvdSDK.cmake
@@ -0,0 +1,24 @@
+#
+# Build PxPvdSDK
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/pvd)
+
+SET(PXPVDSDK_LIBTYPE STATIC)
+
+# Use generator expressions to set config specific preprocessor definitions
+SET(PXPVDSDK_COMPILE_DEFS 
+
+	# Common to all configurations
+	${PXSHARED_IOS_COMPILE_DEFS}
+
+	$<$<CONFIG:debug>:${PXSHARED_IOS_DEBUG_COMPILE_DEFS}>
+	$<$<CONFIG:checked>:${PXSHARED_IOS_CHECKED_COMPILE_DEFS}>
+	$<$<CONFIG:profile>:${PXSHARED_IOS_PROFILE_COMPILE_DEFS}>
+	$<$<CONFIG:release>:${PXSHARED_IOS_RELEASE_COMPILE_DEFS}>
+)
+
+# include PxPvdSDK common
+INCLUDE(../common/PxPvdSDK.cmake)
diff --git a/PxShared/src/compiler/cmake/IOS/PxTask.cmake b/PxShared/src/compiler/cmake/IOS/PxTask.cmake
new file mode 100644
index 0000000..197e241
--- /dev/null
+++ b/PxShared/src/compiler/cmake/IOS/PxTask.cmake
@@ -0,0 +1,18 @@
+#
+# Build PxTask
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/task)
+
+SET(PXTASK_COMPILE_DEFS 
+	${PXSHARED_IOS_COMPILE_DEFS};PX_FOUNDATION_DLL=0;
+	$<$<CONFIG:debug>:${PXSHARED_IOS_DEBUG_COMPILE_DEFS};>
+	$<$<CONFIG:checked>:${PXSHARED_IOS_CHECKED_COMPILE_DEFS};>
+	$<$<CONFIG:profile>:${PXSHARED_IOS_PROFILE_COMPILE_DEFS};>
+	$<$<CONFIG:release>:${PXSHARED_IOS_RELEASE_COMPILE_DEFS};>
+)
+
+# include PxTask common
+INCLUDE(../common/PxTask.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/Linux/CMakeLists.txt b/PxShared/src/compiler/cmake/Linux/CMakeLists.txt
new file mode 100644
index 0000000..2fa592e
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Linux/CMakeLists.txt
@@ -0,0 +1,87 @@
+cmake_minimum_required(VERSION 3.3)
+include(../common/CMakeLists.txt)
+
+STRING(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWERCASE)
+
+IF(NOT DEFINED TARGET_BUILD_PLATFORM) # Not defined, default to Linux
+	SET(TARGET_BUILD_PLATFORM "Linux")
+ENDIF()
+
+SET(PLATFORM_LIST Linux)
+# The value is quoted: unquoted, an empty value makes the if() malformed and a value naming a variable is dereferenced.
+IF (NOT "${TARGET_BUILD_PLATFORM}" IN_LIST PLATFORM_LIST)
+	MESSAGE(FATAL_ERROR "Invalid platform:" ${TARGET_BUILD_PLATFORM})
+ENDIF()
+
+# Compiler flags per target architecture. The architecture is quoted so an empty value reaches the FATAL_ERROR branch instead of breaking the if() expression.
+IF ("${CMAKE_LIBRARY_ARCHITECTURE}" STREQUAL "x86_64-unknown-linux-gnu" OR "${CMAKE_LIBRARY_ARCHITECTURE}" STREQUAL "x86_64-linux-gnu")
+	IF ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+	  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -Werror -ferror-limit=0 -Wall -Wextra -fstrict-aliasing -Wstrict-aliasing=2 -Weverything -Wno-documentation-deprecated-sync -Wno-documentation-unknown-command -Wno-float-equal -Wno-padded -Wno-weak-vtables -Wno-cast-align -Wno-conversion -Wno-missing-noreturn -Wno-missing-variable-declarations -Wno-shift-sign-overflow -Wno-covered-switch-default -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-prototypes -Wno-unreachable-code -Wno-unused-macros -Wno-unused-member-function -Wno-used-but-marked-unused -Wno-weak-template-vtables -Wno-deprecated -Wno-non-virtual-dtor -Wno-invalid-noreturn -Wno-return-type-c-linkage -Wno-reserved-id-macro -Wno-c++98-compat-pedantic -Wno-unused-local-typedef -Wno-old-style-cast -Wno-newline-eof -Wno-unused-private-field -Wno-undefined-func-template -Wno-format-nonliteral -Wno-implicit-fallthrough -Wno-undefined-reinterpret-cast")
+	ELSEIF ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+	  # GCC: these flags replace (rather than extend) CMAKE_CXX_FLAGS, and LIBPATH_SUFFIX is only set on this path
+	  SET(LIBPATH_SUFFIX "x64")
+	  SET(CMAKE_CXX_FLAGS "-Werror -m64 -fPIC -msse2 -mfpmath=sse -ffast-math -fno-exceptions -fno-rtti -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wextra -fno-strict-aliasing -fdiagnostics-show-option -Wno-invalid-offsetof -Wno-uninitialized -Wno-missing-field-initializers")
+	ENDIF()
+ELSEIF("${CMAKE_LIBRARY_ARCHITECTURE}" STREQUAL "arm-unknown-linux-gnueabihf")
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -mfpu=neon -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -Werror -ferror-limit=0 -Wall -Wextra -fstrict-aliasing -Wstrict-aliasing=2 -Weverything -Wno-documentation-deprecated-sync -Wno-documentation-unknown-command -Wno-float-equal -Wno-padded -Wno-weak-vtables -Wno-cast-align -Wno-conversion -Wno-missing-noreturn -Wno-missing-variable-declarations -Wno-shift-sign-overflow -Wno-covered-switch-default -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-prototypes -Wno-unreachable-code -Wno-unused-macros -Wno-unused-member-function -Wno-used-but-marked-unused -Wno-weak-template-vtables -Wno-deprecated -Wno-non-virtual-dtor -Wno-old-style-cast -Wno-return-type-c-linkage -Wno-format-nonliteral -Wno-implicit-fallthrough")
+ELSEIF("${CMAKE_LIBRARY_ARCHITECTURE}" STREQUAL "aarch64-unknown-linux-gnueabi")
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -Werror -ferror-limit=0 -Wall -Wextra -fstrict-aliasing -Wstrict-aliasing=2 -Weverything -Wno-documentation-deprecated-sync -Wno-documentation-unknown-command -Wno-float-equal -Wno-padded -Wno-weak-vtables -Wno-cast-align -Wno-conversion -Wno-missing-noreturn -Wno-missing-variable-declarations -Wno-shift-sign-overflow -Wno-covered-switch-default -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-prototypes -Wno-unreachable-code -Wno-unused-macros -Wno-unused-member-function -Wno-used-but-marked-unused -Wno-weak-template-vtables -Wno-deprecated -Wno-non-virtual-dtor -Wno-old-style-cast -Wno-return-type-c-linkage -Wno-format-nonliteral -Wno-unused-local-typedef -Wno-implicit-fallthrough")
+ELSE()
+	MESSAGE(FATAL_ERROR "Unknown CMAKE_LIBRARY_ARCHITECTURE ${CMAKE_LIBRARY_ARCHITECTURE}")
+ENDIF()
+
+
+SET(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -gdwarf-3")
+SET(CMAKE_CXX_FLAGS_CHECKED "-O3 -g -gdwarf-3")
+SET(CMAKE_CXX_FLAGS_PROFILE "-O3 -g -gdwarf-3")
+SET(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -gdwarf-3")
+
+IF(DEFINED PX_GENERATE_GPU_PROJECTS)
+SET(PXSHARED_LINUX_COMPILE_DEFS _LIB)
+ELSE()
+# GPU projects are not being generated: disable CUDA for all Linux projects
+SET(PXSHARED_LINUX_COMPILE_DEFS _LIB;DISABLE_CUDA_PHYSX;)
+ENDIF()
+SET(PXSHARED_LINUX_DEBUG_COMPILE_DEFS _DEBUG;PX_DEBUG=1;PX_CHECKED=1)
+SET(PXSHARED_LINUX_CHECKED_COMPILE_DEFS NDEBUG;PX_CHECKED=1)
+SET(PXSHARED_LINUX_PROFILE_COMPILE_DEFS NDEBUG;PX_PROFILE=1)
+SET(PXSHARED_LINUX_RELEASE_COMPILE_DEFS NDEBUG)
+
+IF(DEFINED LIBPATH_SUFFIX)
+	SET(CMAKE_DEBUG_POSTFIX "${CMAKE_DEBUG_POSTFIX}_${LIBPATH_SUFFIX}")
+	SET(CMAKE_PROFILE_POSTFIX "${CMAKE_PROFILE_POSTFIX}_${LIBPATH_SUFFIX}")
+	SET(CMAKE_CHECKED_POSTFIX "${CMAKE_CHECKED_POSTFIX}_${LIBPATH_SUFFIX}")
+	SET(CMAKE_RELEASE_POSTFIX "${CMAKE_RELEASE_POSTFIX}_${LIBPATH_SUFFIX}")
+ENDIF()
+
+# NOTE: PxCudaContextManager excluded on this platform
+
+# Include project cmake files here
+# Without PX_SELECT_COMPONENTS every component script is included; otherwise only the listed ones.
+# PxCudaContextManager is additionally gated on PX_GENERATE_GPU_PROJECTS being defined.
+if(NOT DEFINED PX_SELECT_COMPONENTS)
+	include(PxFoundation.cmake)
+	include(PsFastXml.cmake)
+	include(PxPvdSDK.cmake)
+	include(PxTask.cmake)
+	if(DEFINED PX_GENERATE_GPU_PROJECTS)
+		include(PxCudaContextManager.cmake)
+	endif()
+else()
+	if("PxFoundation" IN_LIST PX_SELECT_COMPONENTS)
+		include(PxFoundation.cmake)
+	endif()
+	if("PsFastXml" IN_LIST PX_SELECT_COMPONENTS)
+		include(PsFastXml.cmake)
+	endif()
+	if("PxPvdSDK" IN_LIST PX_SELECT_COMPONENTS)
+		include(PxPvdSDK.cmake)
+	endif()
+	if("PxTask" IN_LIST PX_SELECT_COMPONENTS)
+		include(PxTask.cmake)
+	endif()
+	if("PxCudaContextManager" IN_LIST PX_SELECT_COMPONENTS AND DEFINED PX_GENERATE_GPU_PROJECTS)
+		include(PxCudaContextManager.cmake)
+	endif()
+endif()
+
diff --git a/PxShared/src/compiler/cmake/Linux/PsFastXml.cmake b/PxShared/src/compiler/cmake/Linux/PsFastXml.cmake
new file mode 100644
index 0000000..3c90c49
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Linux/PsFastXml.cmake
@@ -0,0 +1,40 @@
+#
+# Build PsFastXml
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/fastxml)
+
+# Preprocessor definitions; the config specific ones are appended below according to the lower-cased CMAKE_BUILD_TYPE
+SET(PSFASTXML_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_LINUX_COMPILE_DEFS};PX_FOUNDATION_DLL=0;
+)
+
+# Append the definitions of the selected build type. The value is quoted so that
+# an empty CMAKE_BUILD_TYPE falls through to the FATAL_ERROR branch instead of
+# turning the condition into a malformed if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_LINUX_DEBUG_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_LINUX_CHECKED_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_LINUX_PROFILE_COMPILE_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	LIST(APPEND PSFASTXML_COMPILE_DEFS ${PXSHARED_LINUX_RELEASE_COMPILE_DEFS})
+
+else()
+	# Only debug, checked, profile and release are handled above
+	MESSAGE(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+
+# include PsFastXml common
+INCLUDE(../common/PsFastXml.cmake)
+
+# enable -fPIC so we can link static libs with the editor
+SET_TARGET_PROPERTIES(PsFastXml PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/Linux/PxCudaContextManager.cmake b/PxShared/src/compiler/cmake/Linux/PxCudaContextManager.cmake
new file mode 100644
index 0000000..3454323
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Linux/PxCudaContextManager.cmake
@@ -0,0 +1,29 @@
+#
+# Build PxCudaContextManager
+#
+
+FIND_PACKAGE(CUDA REQUIRED)
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/cudamanager)
+
+SET(CUDA_COMPILER_OPTION_DEBUG "--compiler-options=-Wall,-O3,-fPIC,-msse2,-mfpmath=sse,-malign-double,-m64,-fvisibility=hidden")
+SET(CUDA_COMPILER_OPTION_CHECKED "--compiler-options=-Wall,-O3,-fPIC,-msse2,-mfpmath=sse,-malign-double,-m64,-fvisibility=hidden")
+SET(CUDA_COMPILER_OPTION_PROFILE "--compiler-options=-Wall,-O3,-fPIC,-msse2,-mfpmath=sse,-malign-double,-m64,-fvisibility=hidden")
+SET(CUDA_COMPILER_OPTION_RELEASE "--compiler-options=-Wall,-O3,-fPIC,-msse2,-mfpmath=sse,-malign-double,-m64,-fvisibility=hidden")
+
+# include PxCudaContextManager common
+INCLUDE(../common/PxCudaContextManager.cmake)
+
+# Use generator expressions to set config specific preprocessor definitions
+target_compile_definitions(PxCudaContextManager
+	PRIVATE
+		# Definitions shared by every configuration
+		${PXSHARED_LINUX_COMPILE_DEFS};
+		# Definitions selected per configuration through generator expressions
+		$<$<CONFIG:debug>:${PXSHARED_LINUX_DEBUG_COMPILE_DEFS};>
+		$<$<CONFIG:checked>:${PXSHARED_LINUX_CHECKED_COMPILE_DEFS};>
+		$<$<CONFIG:profile>:${PXSHARED_LINUX_PROFILE_COMPILE_DEFS};>
+		$<$<CONFIG:release>:${PXSHARED_LINUX_RELEASE_COMPILE_DEFS};>
+)
diff --git a/PxShared/src/compiler/cmake/Linux/PxFoundation.cmake b/PxShared/src/compiler/cmake/Linux/PxFoundation.cmake
new file mode 100644
index 0000000..f074805
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Linux/PxFoundation.cmake
@@ -0,0 +1,67 @@
+#
+# Build PxFoundation
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/foundation)
+
+IF(DEFINED PX_STATIC_LIBRARIES)
+	SET(PXFOUNDATION_LIBTYPE STATIC)
+ELSE()
+	SET(PXFOUNDATION_LIBTYPE SHARED)
+	SET(PXFOUNDATION_SHARED_LIBRARY_DEFS PX_PVDSDK_DLL=1;PX_FOUNDATION_DLL=1;)
+ENDIF()
+
+SET(PXFOUNDATION_PLATFORM_FILES
+	${LL_SOURCE_DIR}/src/unix/PsUnixAtomic.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixCpu.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixFPU.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixMutex.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixPrintString.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSList.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSocket.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSync.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixThread.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixTime.cpp
+)
+
+SET(PXFOUNDATION_PLATFORM_INCLUDES
+	${LL_SOURCE_DIR}/include/linux
+)
+
+SET(PXFOUNDATION_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_LINUX_COMPILE_DEFS}
+)
+
+# Append the definitions of the selected build type. The value is quoted so that
+# an empty CMAKE_BUILD_TYPE falls through to the FATAL_ERROR branch instead of
+# turning the condition into a malformed if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_LINUX_DEBUG_COMPILE_DEFS};${PXFOUNDATION_SHARED_LIBRARY_DEFS};)
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_LINUX_CHECKED_COMPILE_DEFS};${PXFOUNDATION_SHARED_LIBRARY_DEFS};)
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_LINUX_PROFILE_COMPILE_DEFS};${PXFOUNDATION_SHARED_LIBRARY_DEFS};)
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	LIST(APPEND PXFOUNDATION_COMPILE_DEFS ${PXSHARED_LINUX_RELEASE_COMPILE_DEFS};${PXFOUNDATION_SHARED_LIBRARY_DEFS};)
+
+else()
+	# Only debug, checked, profile and release are handled above
+	MESSAGE(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+
+# include PxFoundation common
+INCLUDE(../common/PxFoundation.cmake)
+
+IF(NOT DEFINED PX_STATIC_LIBRARIES)
+TARGET_LINK_LIBRARIES(PxFoundation PUBLIC rt)
+ENDIF()
+
+# enable -fPIC so we can link static libs with the editor
+SET_TARGET_PROPERTIES(PxFoundation PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/Linux/PxPvdSDK.cmake b/PxShared/src/compiler/cmake/Linux/PxPvdSDK.cmake
new file mode 100644
index 0000000..81692a0
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Linux/PxPvdSDK.cmake
@@ -0,0 +1,53 @@
+#
+# Build PxPvdSDK
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/pvd)
+
+IF(DEFINED PX_STATIC_LIBRARIES)
+	SET(PXPVDSDK_LIBTYPE STATIC)
+ELSE()
+	SET(PXPVDSDK_LIBTYPE SHARED)
+	SET(PXPVDSDK_SHARED_LIBRARY_DEFS PX_PVDSDK_DLL=1;PX_FOUNDATION_DLL=1;)
+ENDIF()
+
+# Preprocessor definitions; the config specific ones are appended below according to the lower-cased CMAKE_BUILD_TYPE
+SET(PXPVDSDK_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_LINUX_COMPILE_DEFS}
+)
+
+# Append the definitions of the selected build type. The value is quoted so that
+# an empty CMAKE_BUILD_TYPE falls through to the FATAL_ERROR branch instead of
+# turning the condition into a malformed if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_LINUX_DEBUG_COMPILE_DEFS};${PXPVDSDK_SHARED_LIBRARY_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_LINUX_CHECKED_COMPILE_DEFS};${PXPVDSDK_SHARED_LIBRARY_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_LINUX_PROFILE_COMPILE_DEFS};${PXPVDSDK_SHARED_LIBRARY_DEFS})
+
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	LIST(APPEND PXPVDSDK_COMPILE_DEFS ${PXSHARED_LINUX_RELEASE_COMPILE_DEFS};${PXPVDSDK_SHARED_LIBRARY_DEFS})
+
+else()
+	# Only debug, checked, profile and release are handled above
+	MESSAGE(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+# include PxPvdSDK common
+INCLUDE(../common/PxPvdSDK.cmake)
+
+# Add linked libraries
+IF(DEFINED PX_STATIC_LIBRARIES)
+TARGET_LINK_LIBRARIES(PxPvdSDK PRIVATE PxFoundation )
+ELSE()
+TARGET_LINK_LIBRARIES(PxPvdSDK PRIVATE PxFoundation rt)
+ENDIF()
+
+# enable -fPIC so we can link static libs with the editor
+SET_TARGET_PROPERTIES(PxPvdSDK PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/Linux/PxTask.cmake b/PxShared/src/compiler/cmake/Linux/PxTask.cmake
new file mode 100644
index 0000000..86689c4
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Linux/PxTask.cmake
@@ -0,0 +1,43 @@
+#
+# Build PxTask (Linux)
+#
+# Chooses the compile definitions and library type, then includes the shared
+# target definition from ../common/PxTask.cmake. CMAKE_BUILD_TYPE_LOWERCASE and
+# the PXSHARED_LINUX_* lists must be provided by the including script.
+#
+
+set(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+set(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/task)
+
+# Definitions common to all configurations
+set(PXTASK_COMPILE_DEFS ${PXSHARED_LINUX_COMPILE_DEFS})
+
+# Append the definitions of the single active configuration (debug, checked,
+# profile or release). The variable is quoted so that an empty or unset build
+# type reaches the FATAL_ERROR branch instead of collapsing into a malformed
+# if() expression.
+if("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "debug")
+	list(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_LINUX_DEBUG_COMPILE_DEFS})
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "checked")
+	list(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_LINUX_CHECKED_COMPILE_DEFS})
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "profile")
+	list(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_LINUX_PROFILE_COMPILE_DEFS})
+elseif("${CMAKE_BUILD_TYPE_LOWERCASE}" STREQUAL "release")
+	list(APPEND PXTASK_COMPILE_DEFS ${PXSHARED_LINUX_RELEASE_COMPILE_DEFS})
+else()
+	message(FATAL_ERROR "Unknown configuration ${CMAKE_BUILD_TYPE}")
+endif()
+
+# PxTask becomes an OBJECT library when PX_STATIC_LIBRARIES is defined (its
+# value is not inspected) and a STATIC library otherwise.
+if(DEFINED PX_STATIC_LIBRARIES)
+	set(PXTASK_LIBTYPE OBJECT)
+else()
+	set(PXTASK_LIBTYPE STATIC)
+endif()
+
+# Create the PxTask target from the shared definition
+include(../common/PxTask.cmake)
+
+# enable -fPIC so we can link static libs with the editor
+set_target_properties(PxTask PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
diff --git a/PxShared/src/compiler/cmake/Mac/CMakeLists.txt b/PxShared/src/compiler/cmake/Mac/CMakeLists.txt
new file mode 100644
index 0000000..beb06bc
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Mac/CMakeLists.txt
@@ -0,0 +1,48 @@
+cmake_minimum_required(VERSION 3.3)
+include(../common/CMakeLists.txt)
+
+# Mac is both the default and the only platform this script accepts
+if(NOT DEFINED TARGET_BUILD_PLATFORM)
+	set(TARGET_BUILD_PLATFORM "Mac")
+endif()
+
+set(PLATFORM_LIST Mac)
+
+if(NOT ${TARGET_BUILD_PLATFORM} IN_LIST PLATFORM_LIST)
+	message(FATAL_ERROR "Invalid platform:" ${TARGET_BUILD_PLATFORM})
+endif()
+
+set(CMAKE_CXX_FLAGS "-msse2 -std=c++11 -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -Werror -ferror-limit=0 -Wall -Wextra -fstrict-aliasing -Wstrict-aliasing=2 -Weverything -Wno-documentation-deprecated-sync -Wno-documentation-unknown-command -Wno-float-equal -Wno-padded -Wno-weak-vtables -Wno-cast-align -Wno-conversion -Wno-missing-noreturn -Wno-missing-variable-declarations -Wno-shift-sign-overflow -Wno-covered-switch-default -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-prototypes -Wno-unreachable-code -Wno-unused-macros -Wno-unused-member-function -Wno-used-but-marked-unused -Wno-weak-template-vtables -Wno-deprecated -Wno-non-virtual-dtor -Wno-invalid-noreturn -Wno-return-type-c-linkage -Wno-reserved-id-macro -Wno-c++98-compat-pedantic -Wno-unused-local-typedef -Wno-old-style-cast -Wno-newline-eof -Wno-unused-private-field -Wno-undefined-reinterpret-cast -Wno-invalid-offsetof -gdwarf-2")
+# The flags above replace any earlier value; the -arch switch is prepended when requested
+if(DEFINED PX_32BIT)
+	set(CMAKE_CXX_FLAGS "-arch i386 ${CMAKE_CXX_FLAGS}")
+endif()
+if(DEFINED PX_64BIT)
+	set(CMAKE_CXX_FLAGS "-arch x86_64 ${CMAKE_CXX_FLAGS}")
+endif()
+
+set(CMAKE_SHARED_LINKER_FLAGS "")
+
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g")
+set(CMAKE_CXX_FLAGS_CHECKED "-O3 -g")
+set(CMAKE_CXX_FLAGS_PROFILE "-O3 -g")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g")
+
+# Build libs compatible with OS X 10.9
+set(CMAKE_OSX_DEPLOYMENT_TARGET "10.9")
+
+#set(CMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT "dwarf-with-dsym")
+# Preprocessor definitions picked up by the per-library scripts included below
+set(PXSHARED_MAC_COMPILE_DEFS _LIB;DISABLE_CUDA_PHYSX;DISABLE_COMPUTE_PHYSX)
+set(PXSHARED_MAC_DEBUG_COMPILE_DEFS _DEBUG;PX_DEBUG=1;PX_CHECKED=1)
+set(PXSHARED_MAC_CHECKED_COMPILE_DEFS NDEBUG;PX_CHECKED=1)
+set(PXSHARED_MAC_PROFILE_COMPILE_DEFS NDEBUG;PX_PROFILE=1)
+set(PXSHARED_MAC_RELEASE_COMPILE_DEFS NDEBUG)
+
+# NOTE: PxCudaContextManager excluded on this platform
+
+# Per-library scripts, resolved relative to this directory
+include(PxFoundation.cmake)
+include(PsFastXml.cmake)
+include(PxPvdSDK.cmake)
+include(PxTask.cmake)
diff --git a/PxShared/src/compiler/cmake/Mac/PsFastXml.cmake b/PxShared/src/compiler/cmake/Mac/PsFastXml.cmake
new file mode 100644
index 0000000..7f140e0
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Mac/PsFastXml.cmake
@@ -0,0 +1,22 @@
+#
+# Build PsFastXml (Mac)
+#
+
+set(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+set(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/fastxml)
+
+# Generator expressions pick the definitions of the configuration being built
+set(PSFASTXML_COMPILE_DEFS
+
+	# Applied to every configuration
+	${PXSHARED_MAC_COMPILE_DEFS};PX_FOUNDATION_DLL=0;
+
+	$<$<CONFIG:debug>:${PXSHARED_MAC_DEBUG_COMPILE_DEFS};>
+	$<$<CONFIG:checked>:${PXSHARED_MAC_CHECKED_COMPILE_DEFS};>
+	$<$<CONFIG:profile>:${PXSHARED_MAC_PROFILE_COMPILE_DEFS};>
+	$<$<CONFIG:release>:${PXSHARED_MAC_RELEASE_COMPILE_DEFS};>
+)
+
+# Create the PsFastXml target from the shared definition
+include(../common/PsFastXml.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/Mac/PxFoundation.cmake b/PxShared/src/compiler/cmake/Mac/PxFoundation.cmake
new file mode 100644
index 0000000..2a21910
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Mac/PxFoundation.cmake
@@ -0,0 +1,40 @@
+#
+# Build PxFoundation (Mac)
+#
+
+set(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+set(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/foundation)
+
+set(PXFOUNDATION_LIBTYPE SHARED)
+
+# Mac builds use the unix implementation files
+set(PXFOUNDATION_PLATFORM_FILES
+	${LL_SOURCE_DIR}/src/unix/PsUnixAtomic.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixCpu.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixFPU.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixMutex.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixPrintString.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSList.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSocket.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSync.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixThread.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixTime.cpp
+)
+set(PXFOUNDATION_PLATFORM_INCLUDES
+	${LL_SOURCE_DIR}/include/mac
+)
+
+# Generator expressions pick the definitions of the configuration being built
+set(PXFOUNDATION_COMPILE_DEFS
+
+	# Applied to every configuration
+	${PXSHARED_MAC_COMPILE_DEFS}
+
+	$<$<CONFIG:debug>:${PXSHARED_MAC_DEBUG_COMPILE_DEFS}>
+	$<$<CONFIG:checked>:${PXSHARED_MAC_CHECKED_COMPILE_DEFS}>
+	$<$<CONFIG:profile>:${PXSHARED_MAC_PROFILE_COMPILE_DEFS}>
+	$<$<CONFIG:release>:${PXSHARED_MAC_RELEASE_COMPILE_DEFS}>
+)
+# Create the PxFoundation target from the shared definition
+include(../common/PxFoundation.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/Mac/PxPvdSDK.cmake b/PxShared/src/compiler/cmake/Mac/PxPvdSDK.cmake
new file mode 100644
index 0000000..c236882
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Mac/PxPvdSDK.cmake
@@ -0,0 +1,28 @@
+#
+# Build PxPvdSDK (Mac)
+#
+
+set(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+set(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/pvd)
+
+set(PXPVDSDK_LIBTYPE SHARED)
+
+# Generator expressions pick the definitions of the configuration being built
+set(PXPVDSDK_COMPILE_DEFS
+
+	# Applied to every configuration
+	${PXSHARED_MAC_COMPILE_DEFS}
+
+	$<$<CONFIG:debug>:${PXSHARED_MAC_DEBUG_COMPILE_DEFS}>
+	$<$<CONFIG:checked>:${PXSHARED_MAC_CHECKED_COMPILE_DEFS}>
+	$<$<CONFIG:profile>:${PXSHARED_MAC_PROFILE_COMPILE_DEFS}>
+	$<$<CONFIG:release>:${PXSHARED_MAC_RELEASE_COMPILE_DEFS}>
+)
+
+# Create the PxPvdSDK target from the shared definition
+include(../common/PxPvdSDK.cmake)
+
+# PxPvdSDK links against PxFoundation
+target_link_libraries(PxPvdSDK PRIVATE PxFoundation)
+
diff --git a/PxShared/src/compiler/cmake/Mac/PxTask.cmake b/PxShared/src/compiler/cmake/Mac/PxTask.cmake
new file mode 100644
index 0000000..2326a1f
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Mac/PxTask.cmake
@@ -0,0 +1,18 @@
+#
+# Build PxTask (Mac)
+# NOTE(review): PXTASK_LIBTYPE is not set here, unlike in Linux/PxTask.cmake - confirm that is intended
+
+set(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+set(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/task)
+
+set(PXTASK_COMPILE_DEFS
+	${PXSHARED_MAC_COMPILE_DEFS};PX_FOUNDATION_DLL=0;
+	$<$<CONFIG:debug>:${PXSHARED_MAC_DEBUG_COMPILE_DEFS};>
+	$<$<CONFIG:checked>:${PXSHARED_MAC_CHECKED_COMPILE_DEFS};>
+	$<$<CONFIG:profile>:${PXSHARED_MAC_PROFILE_COMPILE_DEFS};>
+	$<$<CONFIG:release>:${PXSHARED_MAC_RELEASE_COMPILE_DEFS};>
+)
+
+# Create the PxTask target from the shared definition
+include(../common/PxTask.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/Modules/FindPxShared.cmake b/PxShared/src/compiler/cmake/Modules/FindPxShared.cmake
new file mode 100644
index 0000000..ea90ab6
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Modules/FindPxShared.cmake
@@ -0,0 +1,18 @@
+# - Try to find PxShared
+# Once done this will define
+#  PXSHARED_FOUND - System has PxShared
+#  PXSHARED_INCLUDE_DIRS - The directory under which include/cudamanager/PxGpuCopyDesc.h was found
+
+# NOTE: the second hint carries a version (1.0/trunk); the first hint is unversioned and is the one that should be used.
+find_path(PXSHARED_INCLUDE_DIRS include/cudamanager/PxGpuCopyDesc.h
+	HINTS
+	${GW_DEPS_ROOT}/PxShared
+	${GW_DEPS_ROOT}/sw/physx/PxShared/1.0/trunk/
+)
+
+message(${PXSHARED_INCLUDE_DIRS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PxShared DEFAULT_MSG PXSHARED_INCLUDE_DIRS)
+
+mark_as_advanced(PXSHARED_INCLUDE_DIRS)
diff --git a/PxShared/src/compiler/cmake/Modules/FindnvToolsExt.cmake b/PxShared/src/compiler/cmake/Modules/FindnvToolsExt.cmake
new file mode 100644
index 0000000..c1675ae
--- /dev/null
+++ b/PxShared/src/compiler/cmake/Modules/FindnvToolsExt.cmake
@@ -0,0 +1,38 @@
+# - Try to find nvToolsExt
+# Once done this will define
+#  NVTOOLSEXT_FOUND - System has nvToolsExt
+#  NVTOOLSEXT_INCLUDE_DIRS - The nvToolsExt include directories
+#  NVTOOLSEXT_LIBRARIES - The libraries needed to use nvToolsExt (only searched for when TARGET_BUILD_PLATFORM is Windows)
+# NOTE: no NVTOOLSEXT_DEFINITIONS variable is set by this module.
+
+find_path(NVTOOLSEXT_INCLUDE_DIRS nvToolsExt.h
+	HINTS
+	${GW_DEPS_ROOT}/PhysX_3.4/externals/nvToolsExt
+	${GW_DEPS_ROOT}/sw/physx/externals/nvToolsExt/1
+	PATH_SUFFIXES include)
+
+include(FindPackageHandleStandardArgs)
+
+if(TARGET_BUILD_PLATFORM STREQUAL "Windows")
+	# NOTE: Doesn't make sense for all platforms - ARM
+	if(CMAKE_CL_64)
+		set(NVTOOLSEXT_LIBNAME "nvToolsExt64_1")
+		set(NVTOOLSEXT_LIBPATH_SUFFIX "x64")
+	else()
+		set(NVTOOLSEXT_LIBNAME "nvToolsExt32_1")
+		set(NVTOOLSEXT_LIBPATH_SUFFIX "Win32")
+	endif()
+
+	# The library is looked up under the same two roots as the header
+	find_library(NVTOOLSEXT_LIBRARIES ${NVTOOLSEXT_LIBNAME}
+		${GW_DEPS_ROOT}/PhysX_3.4/externals/nvToolsExt/lib/${NVTOOLSEXT_LIBPATH_SUFFIX}
+		${GW_DEPS_ROOT}/sw/physx/externals/nvToolsExt/1/lib/${NVTOOLSEXT_LIBPATH_SUFFIX}
+	)
+
+	find_package_handle_standard_args(nvToolsExt DEFAULT_MSG NVTOOLSEXT_LIBRARIES NVTOOLSEXT_INCLUDE_DIRS)
+else()
+	# Exclude the libraries for non-windows platforms
+	find_package_handle_standard_args(nvToolsExt DEFAULT_MSG NVTOOLSEXT_INCLUDE_DIRS)
+endif()
+
+mark_as_advanced(NVTOOLSEXT_INCLUDE_DIRS NVTOOLSEXT_LIBRARIES)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/RegenProjects.bat b/PxShared/src/compiler/cmake/RegenProjects.bat
new file mode 100644
index 0000000..bdde57f
--- /dev/null
+++ b/PxShared/src/compiler/cmake/RegenProjects.bat
@@ -0,0 +1,4 @@
+REM Remove cached CMake state from the current directory, then regenerate for x64.
+if exist CMakeFiles rmdir CMakeFiles /s /q
+if exist CMakeCache.txt del CMakeCache.txt
+cmake ../../.. -A x64
diff --git a/PxShared/src/compiler/cmake/common/CMakeLists.txt b/PxShared/src/compiler/cmake/common/CMakeLists.txt
new file mode 100644
index 0000000..22d2097
--- /dev/null
+++ b/PxShared/src/compiler/cmake/common/CMakeLists.txt
@@ -0,0 +1,84 @@
+cmake_minimum_required(VERSION 3.3)
+
+project(PxShared CXX)
+
+cmake_policy(SET CMP0057 NEW) # Enable IN_LIST
+
+# When GW_DEPS_ROOT is set in the environment, take the find modules from there,
+# preferring Externals/CMakeModules if that directory exists.
+if(DEFINED ENV{GW_DEPS_ROOT})
+	set(GW_DEPS_ROOT $ENV{GW_DEPS_ROOT})
+	set(CMAKE_MODULE_PATH $ENV{GW_DEPS_ROOT}/sw/physx/tools/CMakeModules)
+	if(EXISTS $ENV{GW_DEPS_ROOT}/Externals/CMakeModules)
+		set(CMAKE_MODULE_PATH $ENV{GW_DEPS_ROOT}/Externals/CMakeModules)
+	endif()
+endif()
+
+# Verify the find-module directory; quoted so that an unset path is tested as "" instead of vanishing from the condition.
+if(NOT EXISTS "${CMAKE_MODULE_PATH}")
+	message(FATAL_ERROR "Could not find CMakeModules at ${CMAKE_MODULE_PATH}")
+endif()
+
+# NOTE: Mac/CMakeLists.txt includes this file before it defaults TARGET_BUILD_PLATFORM, so the value printed here may be empty.
+message("PxShared Build Platform: " ${TARGET_BUILD_PLATFORM})
+message("Using CXX Compiler: " ${CMAKE_CXX_COMPILER})
+
+# TODO: Fail if we didn't find deps
+# Tell MSVC to stop doing MBCS
+#ADD_DEFINITIONS(-D_UNICODE -DUNICODE)
+
+IF(CMAKE_CONFIGURATION_TYPES) # taken only when a configuration list is already set (NOTE(review): presumably multi-config generators - confirm)
+	SET(CMAKE_CONFIGURATION_TYPES debug checked profile release)
+	SET(CMAKE_CONFIGURATION_TYPES "${CMAKE_CONFIGURATION_TYPES}" CACHE STRING
+		"Reset config to what we need" 
+		FORCE)
+	# The two custom configurations start with empty shared-linker flags
+	SET(CMAKE_SHARED_LINKER_FLAGS_CHECKED "")
+	SET(CMAKE_SHARED_LINKER_FLAGS_PROFILE "")
+
+	# Build PDBs for all configurations. NOTE(review): /DEBUG looks like MSVC linker syntax but this branch is not limited to MSVC; Mac/CMakeLists.txt clears the variable again after including this file - confirm for other platforms.
+	SET(CMAKE_SHARED_LINKER_FLAGS "/DEBUG")
+
+ENDIF()
+
+# Artifact names get a per-configuration suffix ("DEBUG", "PROFILE", ...) unless
+# the caller has already defined APPEND_CONFIG_NAME.
+if(NOT DEFINED APPEND_CONFIG_NAME)
+	set(APPEND_CONFIG_NAME ON)
+endif()
+
+if(APPEND_CONFIG_NAME)
+	message("Appending config to output names")
+	set(CMAKE_DEBUG_POSTFIX "DEBUG")
+	set(CMAKE_CHECKED_POSTFIX "CHECKED")
+	set(CMAKE_PROFILE_POSTFIX "PROFILE")
+	set(CMAKE_RELEASE_POSTFIX "")
+endif()
+
+set(PROJECT_ROOT_DIR ${PROJECT_SOURCE_DIR}/../../../../)
+
+# NOTE(review): presumably provides the Set*OutputPath commands called below - confirm in the CMakeModules directory
+include(SetOutputPaths)
+if(DEFINED PX_OUTPUT_EXE_DIR)
+	SetExeOutputPath(${PX_OUTPUT_EXE_DIR})
+endif()
+if(DEFINED PX_OUTPUT_DLL_DIR)
+	SetDllOutputPath(${PX_OUTPUT_DLL_DIR})
+endif()
+if(DEFINED PX_OUTPUT_LIB_DIR)
+	SetLibOutputPath(${PX_OUTPUT_LIB_DIR})
+endif()
+# PX_OUTPUT_ALL_DIR is applied last, so it overrides the EXE/DLL/LIB settings above
+if(DEFINED PX_OUTPUT_ALL_DIR)
+	SetSingleOutputPath(${PX_OUTPUT_ALL_DIR})
+endif()
+
+# Opt-in (USE_RESPONSE_FILES): use response files for C and C++ to avoid command line length limits
+if(USE_RESPONSE_FILES)
+	set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+	set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+	set(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+	set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+	set(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+	set(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+endif()
diff --git a/PxShared/src/compiler/cmake/common/PsFastXml.cmake b/PxShared/src/compiler/cmake/common/PsFastXml.cmake
new file mode 100644
index 0000000..06c1282
--- /dev/null
+++ b/PxShared/src/compiler/cmake/common/PsFastXml.cmake
@@ -0,0 +1,37 @@
+#
+# Build PsFastXml common: the PsFastXml target shared by the platform scripts
+#
+
+set(PSFASTXML_HEADERS
+	${LL_SOURCE_DIR}/include/PsFastXml.h
+)
+source_group(include FILES ${PSFASTXML_HEADERS})
+
+set(PSFASTXML_SOURCE
+	${LL_SOURCE_DIR}/src/PsFastXml.cpp
+)
+source_group(src FILES ${PSFASTXML_SOURCE})
+
+# PsFastXml is always built as a static library
+add_library(PsFastXml STATIC
+	${PSFASTXML_HEADERS}
+	${PSFASTXML_SOURCE}
+)
+
+# NOTE(review): PLATFORM_INCLUDES has no PSFASTXML_ prefix, unlike the other libraries' *_PLATFORM_INCLUDES - confirm it is set somewhere
+target_include_directories(PsFastXml PRIVATE
+	${PXSHARED_SOURCE_DIR}/../include
+	${PXSHARED_SOURCE_DIR}/foundation/include
+	${LL_SOURCE_DIR}/include
+	${PLATFORM_INCLUDES}
+)
+
+target_compile_definitions(PsFastXml PRIVATE ${PSFASTXML_COMPILE_DEFS})
+
+# Per-configuration PDB names reuse the configured output postfixes
+set_target_properties(PsFastXml PROPERTIES
+	COMPILE_PDB_NAME_DEBUG "PsFastXml${CMAKE_DEBUG_POSTFIX}"
+	COMPILE_PDB_NAME_CHECKED "PsFastXml${CMAKE_CHECKED_POSTFIX}"
+	COMPILE_PDB_NAME_PROFILE "PsFastXml${CMAKE_PROFILE_POSTFIX}"
+	COMPILE_PDB_NAME_RELEASE "PsFastXml${CMAKE_RELEASE_POSTFIX}"
+)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/common/PxCudaContextManager.cmake b/PxShared/src/compiler/cmake/common/PxCudaContextManager.cmake
new file mode 100644
index 0000000..ab76997
--- /dev/null
+++ b/PxShared/src/compiler/cmake/common/PxCudaContextManager.cmake
@@ -0,0 +1,77 @@
+#
+# Build PxCudaContextManager common
+#
+
+# NOTE(review): the CUDA_* variables and macros below look like the FindCUDA module's interface - confirm which module provides them.
+# NVCC flags that do not depend on the configuration
+SET(CUDA_NVCC_FLAGS "-lineinfo -use_fast_math -ftz=true -prec-div=false -prec-sqrt=false  -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_50,code=compute_50 -D_CONSOLE -D_WIN32_WINNT=0x0501")
+
+CUDA_INCLUDE_DIRECTORIES(
+	${PXSHARED_SOURCE_DIR}/../include
+	${PXSHARED_SOURCE_DIR}/foundation/include
+	${PXSHARED_SOURCE_DIR}/cudamanager/include
+)
+
+SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+# Per-configuration NVCC flags. NOTE(review): the earlier comment here referred to CuKernelSolver, which is not among the kernels listed in this file - presumably copied from another project.
+SET(CUDA_NVCC_FLAGS_DEBUG   "-D_DEBUG -D_CONSOLE -D_WIN32_WINNT=0x0501 ${CUDA_COMPILER_OPTION_DEBUG}")
+SET(CUDA_NVCC_FLAGS_CHECKED "-DNDEBUG -D_CONSOLE -D_WIN32_WINNT=0x0501 ${CUDA_COMPILER_OPTION_CHECKED}")
+SET(CUDA_NVCC_FLAGS_PROFILE "-DNDEBUG -D_CONSOLE -D_WIN32_WINNT=0x0501 ${CUDA_COMPILER_OPTION_PROFILE}")
+SET(CUDA_NVCC_FLAGS_RELEASE "-DNDEBUG -D_CONSOLE -D_WIN32_WINNT=0x0501 ${CUDA_COMPILER_OPTION_RELEASE}")
+
+set(CUDACONTEXTMANAGER_HEADERS
+	${PXSHARED_SOURCE_DIR}/../include/cudamanager/PxCudaContextManager.h
+	${PXSHARED_SOURCE_DIR}/../include/cudamanager/PxCudaMemoryManager.h
+	${PXSHARED_SOURCE_DIR}/../include/cudamanager/PxGpuCopyDesc.h
+	${PXSHARED_SOURCE_DIR}/../include/cudamanager/PxGpuCopyDescQueue.h
+)
+source_group(include FILES ${CUDACONTEXTMANAGER_HEADERS})
+
+set(CUDACONTEXTMANAGER_KERNELS
+	${LL_SOURCE_DIR}/src/CUDA/UtilKernels.cu
+)
+source_group("src kernels" FILES ${CUDACONTEXTMANAGER_KERNELS})
+
+set(CUDACONTEXTMANAGER_SOURCE
+	${LL_SOURCE_DIR}/src/CudaContextManager.cpp
+	${LL_SOURCE_DIR}/src/CudaKernelWrangler.cpp
+	${LL_SOURCE_DIR}/src/CudaMemoryManager.cpp
+	${LL_SOURCE_DIR}/src/HeapManagerRef.cpp
+	${LL_SOURCE_DIR}/src/GpuDispatcher.cpp
+	${LL_SOURCE_DIR}/src/BlockingWait.cpp
+	${LL_SOURCE_DIR}/src/PhysXDeviceSettings.cpp
+)
+source_group(src\\src FILES ${CUDACONTEXTMANAGER_SOURCE})
+
+set(CUDACONTEXTMANAGER_SOURCE_HEADERS
+	${LL_SOURCE_DIR}/include/CudaContextManager.h
+	${LL_SOURCE_DIR}/include/CudaKernelWrangler.h
+	${LL_SOURCE_DIR}/include/GpuDispatcher.h
+	${LL_SOURCE_DIR}/include/PhysXDeviceSettings.h
+)
+source_group(src\\src FILES ${CUDACONTEXTMANAGER_SOURCE_HEADERS})
+
+# The .cu kernels are passed to the library together with the C++ sources
+cuda_add_library(PxCudaContextManager STATIC
+	${CUDACONTEXTMANAGER_HEADERS}
+	${CUDACONTEXTMANAGER_SOURCE}
+	${CUDACONTEXTMANAGER_SOURCE_HEADERS}
+	${CUDACONTEXTMANAGER_KERNELS}
+)
+
+# Target specific settings: only include directories are configured here,
+# no target-specific compile options are set.
+
+target_include_directories(PxCudaContextManager PRIVATE
+	${PXSHARED_SOURCE_DIR}/../include
+	${PXSHARED_SOURCE_DIR}/foundation/include
+	${PXSHARED_SOURCE_DIR}/task/include
+	${PXSHARED_SOURCE_DIR}/cudamanager/include
+	${LL_SOURCE_DIR}/include
+	${CUDA_INCLUDE_DIRS}
+)
+
+
+
+
diff --git a/PxShared/src/compiler/cmake/common/PxFoundation.cmake b/PxShared/src/compiler/cmake/common/PxFoundation.cmake
new file mode 100644
index 0000000..99905e2
--- /dev/null
+++ b/PxShared/src/compiler/cmake/common/PxFoundation.cmake
@@ -0,0 +1,118 @@
+#
+# Build PxFoundation common: file lists and target definition shared by the platform scripts
+#
+
+SET(PXFOUNDATION_HEADERS	# public headers from the top-level include/foundation directory
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/Px.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxAllocatorCallback.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxAssert.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxBitAndData.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxBounds3.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxErrorCallback.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxErrors.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxFlags.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxFoundation.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxFoundationVersion.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxIntrinsics.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxIO.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxMat33.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxMat44.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxMath.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxMathUtils.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxMemory.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxPlane.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxPreprocessor.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxProfiler.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxQuat.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxSimpleTypes.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxStrideIterator.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxTransform.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxUnionCast.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxVec2.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxVec3.h
+	${PROJECT_SOURCE_DIR}/../../../../include/foundation/PxVec4.h
+)
+SOURCE_GROUP(include FILES ${PXFOUNDATION_HEADERS})
+
+SET(PXFOUNDATION_SOURCE # sources listed for every platform; platform files arrive separately via PXFOUNDATION_PLATFORM_FILES
+	${LL_SOURCE_DIR}/src/PsAllocator.cpp
+	${LL_SOURCE_DIR}/src/PsAssert.cpp
+	${LL_SOURCE_DIR}/src/PsFoundation.cpp
+	${LL_SOURCE_DIR}/src/PsMathUtils.cpp
+	${LL_SOURCE_DIR}/src/PsString.cpp
+	${LL_SOURCE_DIR}/src/PsTempAllocator.cpp
+	${LL_SOURCE_DIR}/src/PsUtilities.cpp
+)
+SOURCE_GROUP(src\\src FILES ${PXFOUNDATION_SOURCE})
+
+SET(PXFOUNDATION_SOURCE_HEADERS # headers located under ${LL_SOURCE_DIR}/include
+	${LL_SOURCE_DIR}/include/Ps.h
+	${LL_SOURCE_DIR}/include/PsAlignedMalloc.h
+	${LL_SOURCE_DIR}/include/PsAlloca.h
+	${LL_SOURCE_DIR}/include/PsAllocator.h
+	${LL_SOURCE_DIR}/include/PsAoS.h
+	${LL_SOURCE_DIR}/include/PsArray.h
+	${LL_SOURCE_DIR}/include/PsAtomic.h
+	${LL_SOURCE_DIR}/include/PsBasicTemplates.h
+	${LL_SOURCE_DIR}/include/PsBitUtils.h
+	${LL_SOURCE_DIR}/include/PsBroadcast.h
+	${LL_SOURCE_DIR}/include/PsCpu.h
+	${LL_SOURCE_DIR}/include/PsFoundation.h
+	${LL_SOURCE_DIR}/include/PsFPU.h
+	${LL_SOURCE_DIR}/include/PsHash.h
+	${LL_SOURCE_DIR}/include/PsHashInternals.h
+	${LL_SOURCE_DIR}/include/PsHashMap.h
+	${LL_SOURCE_DIR}/include/PsHashSet.h
+	${LL_SOURCE_DIR}/include/PsInlineAllocator.h
+	${LL_SOURCE_DIR}/include/PsInlineAoS.h
+	${LL_SOURCE_DIR}/include/PsInlineArray.h
+	${LL_SOURCE_DIR}/include/PsIntrinsics.h
+	${LL_SOURCE_DIR}/include/PsMathUtils.h
+	${LL_SOURCE_DIR}/include/PsMutex.h
+	${LL_SOURCE_DIR}/include/PsPool.h
+	${LL_SOURCE_DIR}/include/PsSList.h
+	${LL_SOURCE_DIR}/include/PsSocket.h
+	${LL_SOURCE_DIR}/include/PsSort.h
+	${LL_SOURCE_DIR}/include/PsSortInternals.h
+	${LL_SOURCE_DIR}/include/PsString.h
+	${LL_SOURCE_DIR}/include/PsSync.h
+	${LL_SOURCE_DIR}/include/PsTempAllocator.h
+	${LL_SOURCE_DIR}/include/PsThread.h
+	${LL_SOURCE_DIR}/include/PsTime.h
+	${LL_SOURCE_DIR}/include/PsUserAllocated.h
+	${LL_SOURCE_DIR}/include/PsUtilities.h
+	${LL_SOURCE_DIR}/include/PsVecMath.h
+	${LL_SOURCE_DIR}/include/PsVecMathAoSScalar.h
+	${LL_SOURCE_DIR}/include/PsVecMathAoSScalarInline.h
+	${LL_SOURCE_DIR}/include/PsVecMathSSE.h
+	${LL_SOURCE_DIR}/include/PsVecMathUtilities.h
+	${LL_SOURCE_DIR}/include/PsVecQuat.h
+	${LL_SOURCE_DIR}/include/PsVecTransform.h
+)
+SOURCE_GROUP(src\\include FILES ${PXFOUNDATION_SOURCE_HEADERS})
+
+# Library type, platform files/includes and definitions come from the including platform script
+add_library(PxFoundation ${PXFOUNDATION_LIBTYPE}
+	${PXFOUNDATION_SOURCE}
+	${PXFOUNDATION_SOURCE_HEADERS}
+	${PXFOUNDATION_HEADERS}
+	${PXFOUNDATION_PLATFORM_FILES}
+)
+
+target_include_directories(PxFoundation PRIVATE
+	${PXSHARED_SOURCE_DIR}/../include
+	${LL_SOURCE_DIR}/include
+	${PXFOUNDATION_PLATFORM_INCLUDES}
+)
+
+target_compile_definitions(PxFoundation PRIVATE ${PXFOUNDATION_COMPILE_DEFS})
+
+# Per-configuration PDB names reuse the configured output postfixes
+# (CMAKE_<CONFIG>_POSTFIX), which may be empty.
+set_target_properties(PxFoundation PROPERTIES
+	COMPILE_PDB_NAME_DEBUG "PxFoundation${CMAKE_DEBUG_POSTFIX}"
+	COMPILE_PDB_NAME_CHECKED "PxFoundation${CMAKE_CHECKED_POSTFIX}"
+	COMPILE_PDB_NAME_PROFILE "PxFoundation${CMAKE_PROFILE_POSTFIX}"
+	COMPILE_PDB_NAME_RELEASE "PxFoundation${CMAKE_RELEASE_POSTFIX}"
+)
+
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/common/PxPvdSDK.cmake b/PxShared/src/compiler/cmake/common/PxPvdSDK.cmake
new file mode 100644
index 0000000..7014209
--- /dev/null
+++ b/PxShared/src/compiler/cmake/common/PxPvdSDK.cmake
@@ -0,0 +1,123 @@
+#
+# Build PxPvdSDK common: file lists and target definition shared by the platform scripts
+#
+
+SET(PXPVDSDK_HEADERS # public headers from the top-level include/pvd directory
+	${PROJECT_SOURCE_DIR}/../../../../include/pvd/PxPvd.h
+	${PROJECT_SOURCE_DIR}/../../../../include/pvd/PxPvdTransport.h
+)
+SOURCE_GROUP(include FILES ${PXPVDSDK_HEADERS})
+
+SET(PXPVDSDK_SOURCE # everything under pvd/src: this list mixes .h and .cpp files
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileBase.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileCompileTimeEventFilter.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileContextProvider.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileContextProviderImpl.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileDataBuffer.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileDataParsing.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventBuffer.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventBufferAtomic.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventBufferClient.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventBufferClientManager.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventFilter.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventHandler.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventId.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventImpl.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventMutex.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventNames.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventParser.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEvents.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventSender.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventSerialization.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileEventSystem.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemory.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryBuffer.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryEventBuffer.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryEventParser.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryEventRecorder.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryEventReflexiveWriter.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryEvents.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryEventSummarizer.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileMemoryEventTypes.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileScopedEvent.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileScopedMutexLock.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileZone.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileZoneImpl.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileZoneManager.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxProfileZoneManagerImpl.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvd.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdBits.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdByteStreams.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdCommStreamEvents.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdCommStreamEventSink.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdCommStreamSDKEventTypes.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdCommStreamTypes.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdDataStream.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdDefaultFileTransport.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdDefaultFileTransport.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdDefaultSocketTransport.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdDefaultSocketTransport.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdFoundation.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdImpl.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdImpl.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdInternalByteStreams.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdMarshalling.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdMemClient.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdMemClient.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdObjectModel.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdObjectModelInternalTypeDefs.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdObjectModelInternalTypes.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdObjectModelMetaData.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdObjectModelMetaData.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdObjectRegistrar.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdObjectRegistrar.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdProfileZoneClient.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdProfileZoneClient.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdUserRenderer.cpp
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdUserRenderImpl.h
+	${PXSHARED_SOURCE_DIR}/pvd/src/PxPvdUserRenderTypes.h
+)
+SOURCE_GROUP(src\\src FILES ${PXPVDSDK_SOURCE})
+
+SET(PXPVDSDK_INTERNAL_HEADERS # headers located under pvd/include
+	${PXSHARED_SOURCE_DIR}/pvd/include/PsPvd.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxProfileAllocatorWrapper.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxPvdClient.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxPvdDataStream.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxPvdDataStreamHelpers.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxPvdErrorCodes.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxPvdObjectModelBaseTypes.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxPvdRenderBuffer.h
+	${PXSHARED_SOURCE_DIR}/pvd/include/PxPvdUserRenderer.h
+)
+SOURCE_GROUP(src\\include FILES ${PXPVDSDK_INTERNAL_HEADERS})
+
+# Library type, platform files/includes and definitions come from the including platform script
+add_library(PxPvdSDK ${PXPVDSDK_LIBTYPE}
+	${PXPVDSDK_HEADERS}
+	${PXPVDSDK_INTERNAL_HEADERS}
+	${PXPVDSDK_SOURCE}
+	${PXPVDSDK_PLATFORM_FILES}
+)
+
+# All include directories are private to the target
+target_include_directories(PxPvdSDK PRIVATE
+	${PXSHARED_SOURCE_DIR}/../include
+	${PXSHARED_SOURCE_DIR}/foundation/include
+	${PXSHARED_SOURCE_DIR}/pvd/include
+	${PXSHARED_SOURCE_DIR}/filebuf/include
+	${PXPVDSDK_PLATFORM_INCLUDES}
+)
+
+target_compile_definitions(PxPvdSDK
+	PRIVATE ${PXPVDSDK_COMPILE_DEFS}
+)
+
+# Per-configuration PDB names reuse the configured output postfixes
+# (CMAKE_<CONFIG>_POSTFIX), which may be empty.
+set_target_properties(PxPvdSDK PROPERTIES
+	COMPILE_PDB_NAME_DEBUG "PxPvdSDK${CMAKE_DEBUG_POSTFIX}"
+	COMPILE_PDB_NAME_CHECKED "PxPvdSDK${CMAKE_CHECKED_POSTFIX}"
+	COMPILE_PDB_NAME_PROFILE "PxPvdSDK${CMAKE_PROFILE_POSTFIX}"
+	COMPILE_PDB_NAME_RELEASE "PxPvdSDK${CMAKE_RELEASE_POSTFIX}"
+)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/common/PxTask.cmake b/PxShared/src/compiler/cmake/common/PxTask.cmake
new file mode 100644
index 0000000..0d1cd1f
--- /dev/null
+++ b/PxShared/src/compiler/cmake/common/PxTask.cmake
@@ -0,0 +1,44 @@
+#
+# Build PxTask common: the PxTask target shared by the platform scripts.
+# PXTASK_LIBTYPE, PXTASK_COMPILE_DEFS and PXTASK_PLATFORM_INCLUDES come from the including script.
+
+set(PXTASK_HEADERS
+	${PROJECT_SOURCE_DIR}/../../../../include/task/PxCpuDispatcher.h
+	${PROJECT_SOURCE_DIR}/../../../../include/task/PxGpuDispatcher.h
+	${PROJECT_SOURCE_DIR}/../../../../include/task/PxGpuTask.h
+	${PROJECT_SOURCE_DIR}/../../../../include/task/PxTask.h
+	${PROJECT_SOURCE_DIR}/../../../../include/task/PxTaskDefine.h
+	${PROJECT_SOURCE_DIR}/../../../../include/task/PxTaskManager.h
+)
+source_group(include FILES ${PXTASK_HEADERS})
+
+set(PXTASK_SOURCE
+	${PXSHARED_SOURCE_DIR}/task/src/TaskManager.cpp
+)
+source_group(src FILES ${PXTASK_SOURCE})
+
+add_library(PxTask ${PXTASK_LIBTYPE}
+	${PXTASK_HEADERS}
+	${PXTASK_SOURCE}
+)
+
+target_include_directories(PxTask PRIVATE
+	${PXSHARED_SOURCE_DIR}/../include
+	${PXSHARED_SOURCE_DIR}/cudamanager/include
+	${PXSHARED_SOURCE_DIR}/foundation/include
+	${PXTASK_PLATFORM_INCLUDES}
+)
+
+target_compile_definitions(PxTask PRIVATE ${PXTASK_COMPILE_DEFS})
+
+# PDB names are skipped for OBJECT libraries. The variable is quoted because
+# Mac/PxTask.cmake does not set PXTASK_LIBTYPE; unquoted, the empty expansion
+# would drop an operand and the condition would no longer test the library type.
+if(NOT "${PXTASK_LIBTYPE}" STREQUAL "OBJECT")
+	set_target_properties(PxTask PROPERTIES
+		COMPILE_PDB_NAME_DEBUG "PxTask${CMAKE_DEBUG_POSTFIX}"
+		COMPILE_PDB_NAME_CHECKED "PxTask${CMAKE_CHECKED_POSTFIX}"
+		COMPILE_PDB_NAME_PROFILE "PxTask${CMAKE_PROFILE_POSTFIX}"
+		COMPILE_PDB_NAME_RELEASE "PxTask${CMAKE_RELEASE_POSTFIX}"
+	)
+endif()
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/findfileswithspec.py b/PxShared/src/compiler/cmake/findfileswithspec.py
new file mode 100644
index 0000000..484ff4f
--- /dev/null
+++ b/PxShared/src/compiler/cmake/findfileswithspec.py
@@ -0,0 +1,23 @@
+import os,argparse,sys,string
+
+#
+# Simple helper program - give it a path and it will list all of the files of the specified extension in relative format, using the 
+# pathroot variable as a substitution. This greatly simplifies one part of the process of creating a CMake file for a project
+#
+parser = argparse.ArgumentParser()
+parser.add_argument("dir", help="Path to find files in")
+parser.add_argument("extension", help="Spec to find (ie .cpp)")
+parser.add_argument("--pathroot", help="Path variable to prepend to each line, example: ${PX_ROOT}", default="${DUDER}")
+
+args = parser.parse_args()
+
+if not os.path.exists(args.dir):
+	print("Unable to find path {}".format(args.dir))
+	exit(1)
+
+for root, dirs, files in os.walk(args.dir):
+	for file in files:
+		if file.endswith(args.extension):
+			result = os.path.join(root, file)
+			
+			print(result.replace(args.dir, args.pathroot))
diff --git a/PxShared/src/compiler/cmake/html5/CMakeLists.txt b/PxShared/src/compiler/cmake/html5/CMakeLists.txt
new file mode 100644
index 0000000..8b9587a
--- /dev/null
+++ b/PxShared/src/compiler/cmake/html5/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.3)
+#set(CMAKE_VERBOSE_MAKEFILE ON)
+include(../common/CMakeLists.txt)
+
+
+IF(NOT DEFINED TARGET_BUILD_PLATFORM) # Not defined, default to HTML5
+	SET(TARGET_BUILD_PLATFORM "HTML5")
+ENDIF()
+
+SET(PLATFORM_LIST HTML5) # Platforms this CMakeLists.txt accepts
+
+IF (NOT "${TARGET_BUILD_PLATFORM}" IN_LIST PLATFORM_LIST) # Quoted so a defined-but-empty value is still tested and rejected
+	MESSAGE(FATAL_ERROR "Invalid platform: ${TARGET_BUILD_PLATFORM}")
+ENDIF()
+
+SET(CMAKE_CXX_FLAGS "${EPIC_BUILD_FLAGS} -fdiagnostics-show-option -fno-rtti -fno-exceptions -ffast-math -ffunction-sections -fdata-sections -Werror -ferror-limit=0 -Wall -Wextra -fstrict-aliasing -Wstrict-aliasing=2 -pedantic -Weverything -Wno-c++11-long-long -Wno-padded -Wno-reserved-id-macro -Wno-float-equal -Wno-sign-conversion -Wno-covered-switch-default -Wno-documentation-unknown-command -Wno-weak-vtables -Wno-missing-prototypes -Wno-unused-local-typedef -Wno-float-conversion -Wno-global-constructors -Wno-missing-variable-declarations -Wno-exit-time-destructors -Wno-unused-macros -Wno-undef -Wno-c++11-extra-semi -Wno-c++11-extensions -Wno-non-virtual-dtor -Wno-unknown-pragmas -Wno-old-style-cast -Wno-extra-semi -Wno-cast-align -Wno-documentation -Wno-shadow -Wno-conversion -Wno-newline-eof -Wno-header-hygiene -Wno-switch-enum -Wno-undefined-reinterpret-cast -Wno-variadic-macros -Wno-gnu-zero-variadic-macro-arguments -Wno-overloaded-virtual -Wno-dynamic-class-memaccess -Wno-nested-anon-types -Wno-invalid-offsetof -Wno-reorder -Wno-local-type-template-args -Wno-unreachable-code -Wno-unreachable-code-return -Wno-format-pedantic -Wno-unused-private-field -Wno-unused-parameter -Wno-unused-member-function -Wno-used-but-marked-unused -Wno-unused-variable -Wno-format-nonliteral -Wno-shift-sign-overflow -Wno-comma -Wno-expansion-to-defined -Wno-undefined-func-template -Wno-weak-template-vtables -Wno-double-promotion -Wno-nonportable-include-path -Wno-disabled-macro-expansion -Wno-missing-noreturn")
+SET(CMAKE_STATIC_LIBRARY_PREFIX "")
+
+SET(PXSHARED_HTML5_COMPILE_DEFS _LIB;_CRT_SECURE_NO_DEPRECATE;_CRT_NONSTDC_NO_DEPRECATE) # Definitions the project files use for every configuration
+SET(PXSHARED_HTML5_DEBUG_COMPILE_DEFS _DEBUG;PX_DEBUG=1;PX_CHECKED=1) # Selected by $<CONFIG:debug> in the project files
+SET(PXSHARED_HTML5_CHECKED_COMPILE_DEFS NDEBUG;PX_CHECKED=1) # Selected by $<CONFIG:checked>
+SET(PXSHARED_HTML5_PROFILE_COMPILE_DEFS NDEBUG;PX_PROFILE=1) # Selected by $<CONFIG:profile>
+SET(PXSHARED_HTML5_RELEASE_COMPILE_DEFS NDEBUG) # Selected by $<CONFIG:release>
+
+SET(CMAKE_DEBUG_POSTFIX $ENV{LIB_SUFFIX}) # All four postfixes come from the LIB_SUFFIX environment variable, read when CMake configures
+SET(CMAKE_PROFILE_POSTFIX $ENV{LIB_SUFFIX})
+SET(CMAKE_CHECKED_POSTFIX $ENV{LIB_SUFFIX})
+SET(CMAKE_RELEASE_POSTFIX $ENV{LIB_SUFFIX})
+
+# Include project cmake files here
+INCLUDE(PxFoundation.cmake)
+INCLUDE(PsFastXml.cmake)
+INCLUDE(PxPvdSDK.cmake)
+INCLUDE(PxTask.cmake)
+#	INCLUDE(PxCudaContextManager.cmake)
+
diff --git a/PxShared/src/compiler/cmake/html5/PsFastXml.cmake b/PxShared/src/compiler/cmake/html5/PsFastXml.cmake
new file mode 100644
index 0000000..3279134
--- /dev/null
+++ b/PxShared/src/compiler/cmake/html5/PsFastXml.cmake
@@ -0,0 +1,26 @@
+#
+# Build PsFastXml
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/fastxml)
+
+SET(PLATFORM_INCLUDES
+	$ENV{EMSCRIPTEN}/system/include # EMSCRIPTEN is read from the environment when CMake configures
+)
+
+# Use generator expressions to set config specific preprocessor definitions
+SET(PSFASTXML_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_HTML5_COMPILE_DEFS} PX_FOUNDATION_DLL=0
+
+	# Quoted so that each expression, ';'-separated definitions included, stays a single argument
+	"$<$<CONFIG:debug>:${PXSHARED_HTML5_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_HTML5_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_HTML5_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_HTML5_RELEASE_COMPILE_DEFS}>"
+)
+
+# include PsFastXml common
+INCLUDE(../common/PsFastXml.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/html5/PxFoundation.cmake b/PxShared/src/compiler/cmake/html5/PxFoundation.cmake
new file mode 100644
index 0000000..a78e4e6
--- /dev/null
+++ b/PxShared/src/compiler/cmake/html5/PxFoundation.cmake
@@ -0,0 +1,41 @@
+#
+# PxFoundation settings for the HTML5 build
+#
+
+set(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+set(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/foundation)
+
+set(PXFOUNDATION_LIBTYPE STATIC)
+
+set(PXFOUNDATION_PLATFORM_FILES # This platform lists the unix implementation files
+	${LL_SOURCE_DIR}/src/unix/PsUnixAtomic.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixCpu.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixFPU.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixMutex.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixPrintString.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSList.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSocket.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixSync.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixThread.cpp
+	${LL_SOURCE_DIR}/src/unix/PsUnixTime.cpp
+)
+
+set(PXFOUNDATION_PLATFORM_INCLUDES
+	$ENV{EMSCRIPTEN}/system/include
+	${LL_SOURCE_DIR}/include/unix
+)
+
+set(PXFOUNDATION_COMPILE_DEFS
+
+	# Definitions shared by every configuration
+	${PXSHARED_HTML5_COMPILE_DEFS}
+
+	"$<$<CONFIG:debug>:${PXSHARED_HTML5_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_HTML5_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_HTML5_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_HTML5_RELEASE_COMPILE_DEFS}>"
+)
+
+# Hand over to the shared PxFoundation script
+include(../common/PxFoundation.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/html5/PxPvdSDK.cmake b/PxShared/src/compiler/cmake/html5/PxPvdSDK.cmake
new file mode 100644
index 0000000..36a465f
--- /dev/null
+++ b/PxShared/src/compiler/cmake/html5/PxPvdSDK.cmake
@@ -0,0 +1,31 @@
+#
+# PxPvdSDK settings for the HTML5 build
+#
+
+set(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+set(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/pvd)
+
+set(PXPVDSDK_LIBTYPE STATIC)
+
+set(PXPVDSDK_PLATFORM_INCLUDES
+	$ENV{EMSCRIPTEN}/system/include
+)
+
+# Config specific preprocessor definitions are chosen with generator expressions
+set(PXPVDSDK_COMPILE_DEFS
+	${PXSHARED_HTML5_COMPILE_DEFS}
+
+	"$<$<CONFIG:debug>:${PXSHARED_HTML5_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_HTML5_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_HTML5_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_HTML5_RELEASE_COMPILE_DEFS}>"
+)
+
+# Hand over to the shared PxPvdSDK script
+include(../common/PxPvdSDK.cmake)
+
+# PxPvdSDK links against PxFoundation
+target_link_libraries(PxPvdSDK PRIVATE PxFoundation)
+
+
diff --git a/PxShared/src/compiler/cmake/html5/PxTask.cmake b/PxShared/src/compiler/cmake/html5/PxTask.cmake
new file mode 100644
index 0000000..5c00c13
--- /dev/null
+++ b/PxShared/src/compiler/cmake/html5/PxTask.cmake
@@ -0,0 +1,22 @@
+#
+# Build PxTask
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/task)
+
+SET(PXTASK_PLATFORM_INCLUDES
+	$ENV{EMSCRIPTEN}/system/include # EMSCRIPTEN is read from the environment when CMake configures
+)
+
+SET(PXTASK_COMPILE_DEFS # Expressions are quoted so each one, ';'-separated definitions included, stays a single argument
+	${PXSHARED_HTML5_COMPILE_DEFS} PX_FOUNDATION_DLL=0
+	"$<$<CONFIG:debug>:${PXSHARED_HTML5_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_HTML5_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_HTML5_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_HTML5_RELEASE_COMPILE_DEFS}>"
+)
+
+# include PxTask common
+INCLUDE(../common/PxTask.cmake)
diff --git a/PxShared/src/compiler/cmake/windows/CMakeLists.txt b/PxShared/src/compiler/cmake/windows/CMakeLists.txt
new file mode 100644
index 0000000..39b7dfc
--- /dev/null
+++ b/PxShared/src/compiler/cmake/windows/CMakeLists.txt
@@ -0,0 +1,90 @@
+cmake_minimum_required(VERSION 3.3)
+include(../common/CMakeLists.txt)
+
+
+IF(NOT DEFINED TARGET_BUILD_PLATFORM) # Not defined, default to Windows
+	SET(TARGET_BUILD_PLATFORM "Windows")
+ENDIF()
+
+SET(PLATFORM_LIST Windows) # Platforms this CMakeLists.txt accepts
+
+IF (NOT "${TARGET_BUILD_PLATFORM}" IN_LIST PLATFORM_LIST) # Quoted so a defined-but-empty value is still tested and rejected
+	MESSAGE(FATAL_ERROR "Invalid platform: ${TARGET_BUILD_PLATFORM}")
+ENDIF()
+
+SET(CMAKE_CXX_FLAGS "/Wall /wd4514 /wd4820 /wd4127 /wd4710 /wd4711 /wd4577 /d2Zi+ /WX /W4 /GF /GS- /GR- /Gd /fp:fast") # Assigned outright; the previous CMAKE_CXX_FLAGS value is not carried over
+
+IF(DEFINED STATIC_WINCRT) # Tests definedness only, so any value of STATIC_WINCRT selects the /MT runtime flags
+    SET(WINCRT_NDEBUG "/MT")
+    SET(WINCRT_DEBUG "/MTd")
+ELSE()
+    SET(WINCRT_NDEBUG "/MD")
+    SET(WINCRT_DEBUG "/MDd")
+ENDIF()
+
+SET(CMAKE_CXX_FLAGS_DEBUG "/Od ${WINCRT_DEBUG} /RTCu /Zi") # Only the debug configuration uses WINCRT_DEBUG
+SET(CMAKE_CXX_FLAGS_CHECKED "/Ox ${WINCRT_NDEBUG} /Zi")
+SET(CMAKE_CXX_FLAGS_PROFILE "/Ox ${WINCRT_NDEBUG} /Zi")
+SET(CMAKE_CXX_FLAGS_RELEASE "/Ox ${WINCRT_NDEBUG} /Zi")
+
+# Build PDBs for all configurations
+SET(CMAKE_SHARED_LINKER_FLAGS "/DEBUG")
+
+# Value given to PX_NVTX in the debug, checked and profile definitions below
+set(PXSHARED_WINDOWS_ENABLE_NVTX 0)
+
+# Definitions shared by every configuration
+set(PXSHARED_WINDOWS_COMPILE_DEFS WIN32 _CRT_SECURE_NO_DEPRECATE _CRT_NONSTDC_NO_DEPRECATE _WINSOCK_DEPRECATED_NO_WARNINGS)
+if(NOT DEFINED PX_GENERATE_GPU_PROJECTS)
+	# Without GPU projects, cuda and dx are disabled for all projects on windows
+	set(PXSHARED_WINDOWS_COMPILE_DEFS ${PXSHARED_WINDOWS_COMPILE_DEFS} DISABLE_CUDA_PHYSX)
+endif()
+set(PXSHARED_WINDOWS_DEBUG_COMPILE_DEFS _DEBUG PX_DEBUG=1 PX_CHECKED=1 PX_NVTX=${PXSHARED_WINDOWS_ENABLE_NVTX})
+set(PXSHARED_WINDOWS_CHECKED_COMPILE_DEFS NDEBUG PX_CHECKED=1 PX_NVTX=${PXSHARED_WINDOWS_ENABLE_NVTX})
+set(PXSHARED_WINDOWS_PROFILE_COMPILE_DEFS NDEBUG PX_PROFILE=1 PX_NVTX=${PXSHARED_WINDOWS_ENABLE_NVTX})
+set(PXSHARED_WINDOWS_RELEASE_COMPILE_DEFS NDEBUG)
+
+# When CMAKE_CL_64 is set, WIN64 is defined and the library suffix is x64;
+# otherwise the suffix is x86 and WIN64 stays undefined.
+if(CMAKE_CL_64)
+	add_definitions(-DWIN64)
+	set(LIBPATH_SUFFIX "x64")
+else()
+	set(LIBPATH_SUFFIX "x86")
+endif()
+
+# Append "_<suffix>" to whatever postfix each configuration already has
+set(CMAKE_DEBUG_POSTFIX "${CMAKE_DEBUG_POSTFIX}_${LIBPATH_SUFFIX}")
+set(CMAKE_PROFILE_POSTFIX "${CMAKE_PROFILE_POSTFIX}_${LIBPATH_SUFFIX}")
+set(CMAKE_CHECKED_POSTFIX "${CMAKE_CHECKED_POSTFIX}_${LIBPATH_SUFFIX}")
+set(CMAKE_RELEASE_POSTFIX "${CMAKE_RELEASE_POSTFIX}_${LIBPATH_SUFFIX}")
+
+# Pull in the per-project cmake files; PX_SELECT_COMPONENTS, when defined, narrows the set
+if(DEFINED PX_SELECT_COMPONENTS)
+	if("PxFoundation" IN_LIST PX_SELECT_COMPONENTS)
+		include(PxFoundation.cmake)
+	endif()
+	if("PsFastXml" IN_LIST PX_SELECT_COMPONENTS)
+		include(PsFastXml.cmake)
+	endif()
+	if("PxPvdSDK" IN_LIST PX_SELECT_COMPONENTS)
+		include(PxPvdSDK.cmake)
+	endif()
+	if("PxTask" IN_LIST PX_SELECT_COMPONENTS)
+		include(PxTask.cmake)
+	endif()
+	# The CUDA project additionally requires PX_GENERATE_GPU_PROJECTS to be defined
+	if("PxCudaContextManager" IN_LIST PX_SELECT_COMPONENTS AND DEFINED PX_GENERATE_GPU_PROJECTS)
+		include(PxCudaContextManager.cmake)
+	endif()
+else()
+	# No selection given: every project is included
+	include(PxFoundation.cmake)
+	include(PsFastXml.cmake)
+	include(PxPvdSDK.cmake)
+	include(PxTask.cmake)
+	if(DEFINED PX_GENERATE_GPU_PROJECTS)
+		include(PxCudaContextManager.cmake)
+	endif()
+endif()
+
diff --git a/PxShared/src/compiler/cmake/windows/PsFastXml.cmake b/PxShared/src/compiler/cmake/windows/PsFastXml.cmake
new file mode 100644
index 0000000..862b06e
--- /dev/null
+++ b/PxShared/src/compiler/cmake/windows/PsFastXml.cmake
@@ -0,0 +1,21 @@
+#
+# Build PsFastXml
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/fastxml)
+
+# Use generator expressions to set config specific preprocessor definitions
+SET(PSFASTXML_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_WINDOWS_COMPILE_DEFS} PX_FOUNDATION_DLL=0
+	# Quoted so that each expression, ';'-separated definitions included, stays a single argument
+	"$<$<CONFIG:debug>:${PXSHARED_WINDOWS_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_WINDOWS_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_WINDOWS_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_WINDOWS_RELEASE_COMPILE_DEFS}>"
+)
+
+# include PsFastXml common
+INCLUDE(../common/PsFastXml.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/windows/PxCudaContextManager.cmake b/PxShared/src/compiler/cmake/windows/PxCudaContextManager.cmake
new file mode 100644
index 0000000..6b59d6e
--- /dev/null
+++ b/PxShared/src/compiler/cmake/windows/PxCudaContextManager.cmake
@@ -0,0 +1,32 @@
+#
+# Build PxCudaContextManager
+#
+FIND_PACKAGE(CUDA REQUIRED)
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/cudamanager)
+
+SET(CUDA_COMPILER_OPTION_DEBUG "--compiler-options=/W3,/nologo,/Ot,/Ox,/Zi,${WINCRT_DEBUG}") # NOTE(review): debug also passes /Ot,/Ox - confirm this is intended
+SET(CUDA_COMPILER_OPTION_CHECKED "--compiler-options=/W3,/nologo,/Ot,/Ox,/Zi,${WINCRT_NDEBUG}")
+SET(CUDA_COMPILER_OPTION_PROFILE "--compiler-options=/W3,/nologo,/Ot,/Ox,/Zi,${WINCRT_NDEBUG}")
+SET(CUDA_COMPILER_OPTION_RELEASE "--compiler-options=/W3,/nologo,/Ot,/Ox,/Zi,${WINCRT_NDEBUG}")
+
+# include PxCudaContextManager common
+INCLUDE(../common/PxCudaContextManager.cmake)
+
+# No linked libraries
+
+# Use generator expressions to set config specific preprocessor definitions
+TARGET_COMPILE_DEFINITIONS(PxCudaContextManager
+	# Common to all configurations
+	PRIVATE ${PXSHARED_WINDOWS_COMPILE_DEFS}
+
+	# Quoted so that each expression, ';'-separated definitions included, stays a single argument
+	PRIVATE "$<$<CONFIG:debug>:${PXSHARED_WINDOWS_DEBUG_COMPILE_DEFS}>"
+	PRIVATE "$<$<CONFIG:checked>:${PXSHARED_WINDOWS_CHECKED_COMPILE_DEFS}>"
+	PRIVATE "$<$<CONFIG:profile>:${PXSHARED_WINDOWS_PROFILE_COMPILE_DEFS}>"
+	PRIVATE "$<$<CONFIG:release>:${PXSHARED_WINDOWS_RELEASE_COMPILE_DEFS}>"
+)
+
+#TODO: Link flags
diff --git a/PxShared/src/compiler/cmake/windows/PxFoundation.cmake b/PxShared/src/compiler/cmake/windows/PxFoundation.cmake
new file mode 100644
index 0000000..31de53b
--- /dev/null
+++ b/PxShared/src/compiler/cmake/windows/PxFoundation.cmake
@@ -0,0 +1,70 @@
+#
+# Build PxFoundation
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/foundation)
+
+SET(PXFOUNDATION_LIBTYPE SHARED)
+
+SET(PXFOUNDATION_RESOURCE_FILE # The resource directory is chosen by LIBPATH_SUFFIX
+	${PXSHARED_SOURCE_DIR}/compiler/resource_${LIBPATH_SUFFIX}/PxFoundation.rc
+)
+SOURCE_GROUP(resource FILES ${PXFOUNDATION_RESOURCE_FILE})
+
+SET(PXFOUNDATION_PLATFORM_HEADERS
+	${PXSHARED_SOURCE_DIR}/../include/foundation/windows/PxWindowsIntrinsics.h
+	${PXSHARED_SOURCE_DIR}/../include/foundation/windows/PxWindowsFoundationDelayLoadHook.h
+)
+SOURCE_GROUP(include\\windows FILES ${PXFOUNDATION_PLATFORM_HEADERS})
+
+SET(PXFOUNDATION_PLATFORM_SOURCE
+	${LL_SOURCE_DIR}/src/windows/PsWindowsAtomic.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsCpu.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsFPU.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsMutex.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsPrintString.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsSList.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsSocket.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsSync.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsThread.cpp
+	${LL_SOURCE_DIR}/src/windows/PsWindowsTime.cpp
+)
+SOURCE_GROUP(src\\src\\windows FILES ${PXFOUNDATION_PLATFORM_SOURCE})
+
+SET(PXFOUNDATION_PLATFORM_SOURCE_HEADERS
+	${LL_SOURCE_DIR}/include/windows/PsWindowsAoS.h
+	${LL_SOURCE_DIR}/include/windows/PsWindowsFPU.h
+	${LL_SOURCE_DIR}/include/windows/PsWindowsInclude.h
+	${LL_SOURCE_DIR}/include/windows/PsWindowsInlineAoS.h
+	${LL_SOURCE_DIR}/include/windows/PsWindowsIntrinsics.h
+	${LL_SOURCE_DIR}/include/windows/PsWindowsLoadLibrary.h
+	${LL_SOURCE_DIR}/include/windows/PsWindowsTrigConstants.h
+)
+SOURCE_GROUP(src\\include\\windows FILES ${PXFOUNDATION_PLATFORM_SOURCE_HEADERS})
+
+
+SET(PXFOUNDATION_PLATFORM_FILES # Everything listed above, gathered into one variable
+	${PXFOUNDATION_PLATFORM_SOURCE}
+	${PXFOUNDATION_PLATFORM_SOURCE_HEADERS}
+	${PXFOUNDATION_PLATFORM_HEADERS}
+	${PXFOUNDATION_RESOURCE_FILE}
+)
+
+SET(PXFOUNDATION_PLATFORM_INCLUDES
+	${LL_SOURCE_DIR}/include/windows
+)
+
+SET(PXFOUNDATION_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_WINDOWS_COMPILE_DEFS} PX_FOUNDATION_DLL=1
+	# Quoted so that each expression, ';'-separated definitions included, stays a single argument
+	"$<$<CONFIG:debug>:${PXSHARED_WINDOWS_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_WINDOWS_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_WINDOWS_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_WINDOWS_RELEASE_COMPILE_DEFS}>"
+)
+
+# include PxFoundation common
+INCLUDE(../common/PxFoundation.cmake)
+\ No newline at end of file
diff --git a/PxShared/src/compiler/cmake/windows/PxPvdSDK.cmake b/PxShared/src/compiler/cmake/windows/PxPvdSDK.cmake
new file mode 100644
index 0000000..287ec1c
--- /dev/null
+++ b/PxShared/src/compiler/cmake/windows/PxPvdSDK.cmake
@@ -0,0 +1,55 @@
+#
+# Build PxPvdSDK
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(LL_SOURCE_DIR ${PXSHARED_SOURCE_DIR}/pvd)
+
+FIND_PACKAGE(nvToolsExt REQUIRED)
+
+SET(PXPVDSDK_LIBTYPE SHARED)
+
+SET(PXPVDSDK_RESOURCE_FILE # The resource directory is chosen by LIBPATH_SUFFIX
+	${PXSHARED_SOURCE_DIR}/compiler/resource_${LIBPATH_SUFFIX}/PxPvdSDK.rc
+)
+SOURCE_GROUP(resource FILES ${PXPVDSDK_RESOURCE_FILE})
+
+SET(PXPVDSDK_PLATFORM_HEADERS
+	${PXSHARED_SOURCE_DIR}/../include/pvd/windows/PxWindowsPvdDelayLoadHook.h
+)
+SOURCE_GROUP(include\\windows FILES ${PXPVDSDK_PLATFORM_HEADERS})
+
+SET(PXPVDSDK_PLATFORM_SOURCE
+	${PXSHARED_SOURCE_DIR}/pvd/src/windows/PxWindowsPvdDelayLoadHook.cpp
+)
+SOURCE_GROUP(src\\src\\windows FILES ${PXPVDSDK_PLATFORM_SOURCE})
+
+SET(PXPVDSDK_PLATFORM_FILES # Everything listed above, gathered into one variable
+	${PXPVDSDK_RESOURCE_FILE}
+	${PXPVDSDK_PLATFORM_HEADERS}
+	${PXPVDSDK_PLATFORM_SOURCE}
+)
+
+SET(PXPVDSDK_PLATFORM_INCLUDES
+	${NVTOOLSEXT_INCLUDE_DIRS}
+)
+
+# Use generator expressions to set config specific preprocessor definitions
+SET(PXPVDSDK_COMPILE_DEFS
+	# Common to all configurations
+	${PXSHARED_WINDOWS_COMPILE_DEFS} PX_PVDSDK_DLL=1 PX_FOUNDATION_DLL=1
+	# Quoted so that each expression, ';'-separated definitions included, stays a single argument
+	"$<$<CONFIG:debug>:${PXSHARED_WINDOWS_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_WINDOWS_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_WINDOWS_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_WINDOWS_RELEASE_COMPILE_DEFS}>"
+)
+
+# include PxPvdSDK common
+INCLUDE(../common/PxPvdSDK.cmake)
+
+# Add linked libraries
+TARGET_LINK_LIBRARIES(PxPvdSDK PUBLIC ${NVTOOLSEXT_LIBRARIES} PxFoundation)
+
+
diff --git a/PxShared/src/compiler/cmake/windows/PxTask.cmake b/PxShared/src/compiler/cmake/windows/PxTask.cmake
new file mode 100644
index 0000000..32d4b39
--- /dev/null
+++ b/PxShared/src/compiler/cmake/windows/PxTask.cmake
@@ -0,0 +1,19 @@
+#
+# Build PxTask
+#
+
+SET(PXSHARED_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../../src)
+
+SET(PXTASK_COMPILE_DEFS
+	${PXSHARED_WINDOWS_COMPILE_DEFS} _LIB
+	# Quoted so that each expression, ';'-separated definitions included, stays a single argument
+	"$<$<CONFIG:debug>:${PXSHARED_WINDOWS_DEBUG_COMPILE_DEFS}>"
+	"$<$<CONFIG:checked>:${PXSHARED_WINDOWS_CHECKED_COMPILE_DEFS}>"
+	"$<$<CONFIG:profile>:${PXSHARED_WINDOWS_PROFILE_COMPILE_DEFS}>"
+	"$<$<CONFIG:release>:${PXSHARED_WINDOWS_RELEASE_COMPILE_DEFS}>"
+)
+
+SET(PXTASK_LIBTYPE STATIC)
+
+# include PxTask common
+INCLUDE(../common/PxTask.cmake)
diff --git a/PxShared/src/compiler/resource_x64/PxFoundation.rc b/PxShared/src/compiler/resource_x64/PxFoundation.rc
new file mode 100644
index 0000000..dfcfde2
--- /dev/null
+++ b/PxShared/src/compiler/resource_x64/PxFoundation.rc
diff --git a/PxShared/src/compiler/resource_x64/PxPvdSDK.rc b/PxShared/src/compiler/resource_x64/PxPvdSDK.rc
new file mode 100644
index 0000000..37308cf
--- /dev/null
+++ b/PxShared/src/compiler/resource_x64/PxPvdSDK.rc
diff --git a/PxShared/src/compiler/resource_x64/resource.h b/PxShared/src/compiler/resource_x64/resource.h
new file mode 100644
index 0000000..b421bea
--- /dev/null
+++ b/PxShared/src/compiler/resource_x64/resource.h
@@ -0,0 +1,44 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by PxFoundation.rc
+//
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        101
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1000
+#define _APS_NEXT_SYMED_VALUE           101
+#endif // !APSTUDIO_READONLY_SYMBOLS
+#endif // APSTUDIO_INVOKED
diff --git a/PxShared/src/compiler/resource_x86/PxFoundation.rc b/PxShared/src/compiler/resource_x86/PxFoundation.rc
new file mode 100644
index 0000000..8b1085d
--- /dev/null
+++ b/PxShared/src/compiler/resource_x86/PxFoundation.rc
diff --git a/PxShared/src/compiler/resource_x86/PxPvdSDK.rc b/PxShared/src/compiler/resource_x86/PxPvdSDK.rc
new file mode 100644
index 0000000..90636c2
--- /dev/null
+++ b/PxShared/src/compiler/resource_x86/PxPvdSDK.rc
diff --git a/PxShared/src/compiler/resource_x86/resource.h b/PxShared/src/compiler/resource_x86/resource.h
new file mode 100644
index 0000000..b421bea
--- /dev/null
+++ b/PxShared/src/compiler/resource_x86/resource.h
@@ -0,0 +1,44 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by PxFoundation.rc
+//
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        101
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1000
+#define _APS_NEXT_SYMED_VALUE           101
+#endif // !APSTUDIO_READONLY_SYMBOLS
+#endif // APSTUDIO_INVOKED
diff --git a/PxShared/src/cudamanager/include/CudaContextManager.h b/PxShared/src/cudamanager/include/CudaContextManager.h
new file mode 100644
index 0000000..3d68f82
--- /dev/null
+++ b/PxShared/src/cudamanager/include/CudaContextManager.h
@@ -0,0 +1,51 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXCUDACONTEXTMANAGER_CUDACONTEXTMANAGER_H
+#define PXCUDACONTEXTMANAGER_CUDACONTEXTMANAGER_H
+
+#include "task/PxTaskDefine.h"
+
+#if PX_SUPPORT_GPU_PHYSX // Everything below is declared only when PX_SUPPORT_GPU_PHYSX is non-zero
+
+namespace physx
+{
+
+class PxCudaContextManager; // Forward declarations: this header only names these types
+class PxCudaContextManagerDesc;
+class PxErrorCallback;
+
+/**
+Creates cuda context manager for PhysX and APEX from a descriptor; errorCallback is presumably where creation problems are reported (TODO confirm).
+*/
+PxCudaContextManager* createCudaContextManager(const PxCudaContextManagerDesc& desc, PxErrorCallback& errorCallback);
+
+} // namespace physx
+
+#endif // PX_SUPPORT_GPU_PHYSX
+
+#endif // PXCUDACONTEXTMANAGER_CUDACONTEXTMANAGER_H
diff --git a/PxShared/src/cudamanager/include/CudaKernelWrangler.h b/PxShared/src/cudamanager/include/CudaKernelWrangler.h
new file mode 100644
index 0000000..36a2cc8
--- /dev/null
+++ b/PxShared/src/cudamanager/include/CudaKernelWrangler.h
@@ -0,0 +1,331 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef __CUDA_KERNEL_WRANGLER__
+#define __CUDA_KERNEL_WRANGLER__
+
+// Make this header safe for inclusion in headers that are shared with device code.
+#if !defined(__CUDACC__)
+
+#include "task/PxTaskDefine.h"
+#include "task/PxGpuDispatcher.h"
+
+#include "PsUserAllocated.h"
+#include "PsArray.h"
+
+#include <cuda.h>
+
+namespace physx
+{
+
+/**
+Keeps arrays of CUDA driver function and module handles and hands them out by
+function index. The constructor receives the list of kernel function names;
+how the handles are resolved lives in the out-of-line constructor, which is
+not part of this header.
+*/
+class KernelWrangler : public shdfnd::UserAllocated
+{
+	PX_NOCOPY(KernelWrangler)
+public:
+	KernelWrangler(PxGpuDispatcher& gd, PxErrorCallback& errorCallback, const char** funcNames, uint16_t numFuncs);
+	~KernelWrangler();
+
+	/** Returns the function handle stored at position \a funcIndex. */
+	CUfunction getCuFunction(uint16_t funcIndex) const
+	{
+		return mCuFunctions[ funcIndex ];
+	}
+
+	/**
+	Returns the module handle associated with \a funcIndex.
+	mCuFuncModIndex translates the function index into an index into mCuModules.
+	*/
+	CUmodule getCuModule(uint16_t funcIndex) const
+	{
+		return mCuModules[ mCuFuncModIndex[ funcIndex ] ];
+	}
+
+	// Defined elsewhere; presumably the embedded kernel images and their count (TODO confirm).
+	static void const* const* getImages();
+	static int getNumImages();
+
+	/** Reports the value of the internal error flag. */
+	bool hadError() const
+	{
+		return mError;
+	}
+
+protected:
+	bool						mError;
+	shdfnd::Array<CUfunction>	mCuFunctions;
+	shdfnd::Array<uint16_t>		mCuFuncModIndex;	// function index -> index into mCuModules
+	shdfnd::Array<CUmodule>	    mCuModules;
+	PxGpuDispatcher&			mGpuDispatcher;
+	PxErrorCallback&			mErrorCallback;
+};
+
+/* SJB - These were "borrowed" from an Ignacio Llamas email to devtech-compute.
+ * If we feel this is too clumsy, we can steal the boost based bits from APEX
+ */
+
+/**
+Countdown helper: after a configured number of decrementFlushCount() calls it
+issues a cuStreamQuery(0) to flush the current push buffer.
+
+A count of zero disables the countdown. Once the countdown has fired it stays
+at zero until resetCudaFlushCount() or setCudaFlushCount() is called.
+*/
+class ExplicitCudaFlush
+{
+public:
+	/** Starts the countdown at \a cudaFlushCount and remembers it as the default. */
+	ExplicitCudaFlush(int cudaFlushCount)
+		: mCudaFlushCount(cudaFlushCount)
+		, mDefaultCudaFlushCount(cudaFlushCount)
+	{
+	}
+	~ExplicitCudaFlush() {}
+
+	/** Replaces both the current countdown and the stored default with \a value. */
+	void setCudaFlushCount(int value)
+	{
+		mDefaultCudaFlushCount = value;
+		mCudaFlushCount = value;
+	}
+
+	/** Returns the current countdown value, converted to unsigned. */
+	unsigned int getCudaFlushCount() const
+	{
+		return (unsigned int)mCudaFlushCount;
+	}
+
+	/** Restores the countdown to the stored default. */
+	void resetCudaFlushCount()
+	{
+		mCudaFlushCount = mDefaultCudaFlushCount;
+	}
+
+	/** Counts down by one; when the countdown reaches zero, flushes via cuStreamQuery(0). */
+	void decrementFlushCount()
+	{
+		if (mCudaFlushCount != 0)
+		{
+			mCudaFlushCount -= 1;
+			if (mCudaFlushCount == 0)
+			{
+				// Querying stream 0 flushes the current push buffer.
+				CUresult ret = cuStreamQuery(0);
+				PX_UNUSED(ret);
+				PX_ASSERT(ret == CUDA_SUCCESS || ret == CUDA_ERROR_NOT_READY);
+
+				// The countdown is intentionally NOT re-armed here in the current
+				// implementation; callers re-arm it with resetCudaFlushCount().
+			}
+		}
+	}
+
+private:
+	int mCudaFlushCount;
+	int mDefaultCudaFlushCount;
+};
+
+}
+
+/**
+Launches \a func with 1 kernel argument on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 2 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 3 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 4 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 5 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 6 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 7 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 8 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 9 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 10 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 11 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 12 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10, typename T11>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 13 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10, typename T11, typename T12>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 14 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10, typename T11, typename T12, typename T13>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+								  T13 v13)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 15 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10, typename T11, typename T12, typename T13, typename T14>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+								  T13 v13, T14 v14)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] = { &v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, &v8, &v9, &v10, &v11, &v12, &v13, &v14 };
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 16 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10, typename T11, typename T12, typename T13, typename T14, typename T15>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+								  T13 v13, T14 v14, T15 v15)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] =
+	{
+		&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7,
+		&v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15
+	};
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 17 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10, typename T11, typename T12, typename T13, typename T14, typename T15,
+          typename T16>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+								  T13 v13, T14 v14, T15 v15, T16 v16)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] =
+	{
+		&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7,
+		&v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15,
+		&v16
+	};
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+/**
+Launches \a func with 18 kernel arguments on a numBlocks x 1 x 1 grid of
+numThreads x 1 x 1 blocks, forwarding \a sharedMem and \a stream to
+cuLaunchKernel, and returns its result.
+*/
+template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7,
+          typename T8, typename T9, typename T10, typename T11, typename T12, typename T13, typename T14, typename T15,
+          typename T16, typename T17>
+PX_NOINLINE CUresult launchKernel(CUfunction func, unsigned int numBlocks, unsigned int numThreads, unsigned int sharedMem, CUstream stream,
+								  T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+								  T13 v13, T14 v14, T15 v15, T16 v16, T17 v17)
+{
+	// One pointer per kernel argument, in call order.
+	void* args[] =
+	{
+		&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7,
+		&v8, &v9, &v10, &v11, &v12, &v13, &v14, &v15,
+		&v16, &v17
+	};
+	return cuLaunchKernel(func, numBlocks, 1, 1, numThreads, 1, 1, sharedMem, stream, args, NULL);
+}
+
+#endif
+
+#endif
diff --git a/PxShared/src/cudamanager/include/GpuDispatcher.h b/PxShared/src/cudamanager/include/GpuDispatcher.h
new file mode 100644
index 0000000..aedb345
--- /dev/null
+++ b/PxShared/src/cudamanager/include/GpuDispatcher.h
@@ -0,0 +1,334 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXTASK_GPUDISPATCHER_H
+#define PXTASK_GPUDISPATCHER_H
+
+#include "task/PxTask.h"
+#include "task/PxTaskDefine.h"
+#include "task/PxGpuTask.h"
+#include "task/PxTaskManager.h"
+#include "task/PxGpuDispatcher.h"
+#include "foundation/PxProfiler.h"
+
+#include "PsUserAllocated.h"
+#include "PsThread.h"
+#include "PsAtomic.h"
+#include "PsMutex.h"
+#include "PsSync.h"
+#include "PsArray.h"
+
+#include <cuda.h>
+
+namespace physx { 
+
+// 16-bit identifier type. NOTE(review): presumably used for profiling event ids - its uses
+// are not visible in this header.
+typedef uint16_t EventID;
+
+// Defined elsewhere; by its name, disposes of a dispatcher instance (TODO confirm against the implementation).
+void releaseGpuDispatcher(PxGpuDispatcher&);
+
+// Forward declarations for types that this header only refers to by pointer or reference
+// ahead of their definition.
+class KernelWrangler;
+class BlockingWaitThread;
+class FanoutTask;
+class LaunchTask;
+class BlockTask;
+class PxGpuWorkerThread;
+
+/**
+Concrete PxGpuDispatcher. All member functions are defined out of line; this
+header only shows the interface and the helper objects the dispatcher holds
+pointers to (worker thread, blocking-wait thread and three internal tasks).
+*/
+class GpuDispatcherImpl : public PxGpuDispatcher, public shdfnd::UserAllocated
+{
+public:
+	GpuDispatcherImpl(PxErrorCallback& errorCallback, PxCudaContextManager& ctx);
+	virtual ~GpuDispatcherImpl();
+
+	// Dispatcher entry points. NOTE(review): most of these presumably implement the
+	// PxGpuDispatcher interface - confirm against task/PxGpuDispatcher.h.
+	void	start();
+	void    startSimulation();
+	void	startGroup();
+	void    submitTask(PxTask& task);
+	void    finishGroup();
+	void    addCompletionPrereq(PxBaseTask& task);
+	bool	failureDetected() const;
+	void    forceFailureMode();
+	void    stopSimulation();
+	void    launchCopyKernel(PxGpuCopyDesc* desc, uint32_t count, CUstream stream);
+	
+	PxBaseTask&	getPreLaunchTask();
+	void		addPreLaunchDependent(PxBaseTask& dependent);
+
+	PxBaseTask&	getPostLaunchTask();
+	void		addPostLaunchDependent(PxBaseTask& dependent);
+
+	PxCudaContextManager* getCudaContextManager();
+
+	// Public data members. Ownership and lifetime are handled in the out-of-line
+	// constructor/destructor, which are not visible here.
+	PxGpuWorkerThread* mDispatcher;
+	BlockingWaitThread* mBlockingThread;
+	LaunchTask* mLaunchTask; // predecessor of tasks launching kernels
+	BlockTask* mBlockTask; // continuation of tasks launching kernels
+	FanoutTask* mSyncTask; // predecessor of tasks waiting for cuda context synchronize
+};
+
+/**
+Mutex-guarded stack of PxTask pointers (last pushed is first popped).
+
+push() and popBack() hold the internal mutex while they touch the array;
+size() and empty() read the array without taking it.
+*/
+class JobQueue
+{
+	PX_NOCOPY(JobQueue)
+public:
+	JobQueue() : taskarray(PX_DEBUG_EXP("PxTask*")) {}
+
+	/** Appends \a t to the back of the queue. */
+	void push(PxTask* t)
+	{
+		access.lock();
+		taskarray.pushBack(t);
+		access.unlock();
+	}
+
+	/** Removes and returns the most recently pushed task, or NULL if the queue is empty. */
+	PxTask* popBack()
+	{
+		PxTask* last = NULL;
+
+		access.lock();
+		if (taskarray.size() != 0)
+		{
+			last = taskarray.popBack();
+		}
+		access.unlock();
+
+		return last;
+	}
+
+	/** Number of queued tasks (read without locking). */
+	uint32_t size()
+	{
+		return taskarray.size();
+	}
+
+	/** True when no task is queued (read without locking). */
+	bool empty()
+	{
+		return !taskarray.size();
+	}
+
+private:
+	shdfnd::Array<PxTask*> taskarray;
+	shdfnd::Mutex		  access;
+};
+
+/**
+Mutex-guarded pool of reusable CUDA events.
+
+get() hands out a cached event when one is available and otherwise creates a
+new one with the flags given at construction; add() returns an event to the
+pool; clear() destroys every cached event.
+*/
+class EventPool
+{
+	PX_NOCOPY(EventPool)
+public:
+	/** \param inflags Flags passed to cuEventCreate for every event this pool creates. */
+	EventPool(uint32_t inflags) : flags(inflags), evarray(PX_DEBUG_EXP("CUevent")) {}
+
+	/** Puts \a ev into the pool so a later get() can reuse it. */
+	void add(CUevent ev)
+	{
+		access.lock();
+		evarray.pushBack(ev);
+		access.unlock();
+	}
+
+	/**
+	Returns a cached event, or a newly created one when the pool is empty.
+	Returns NULL if a new event had to be created and cuEventCreate failed
+	(previously the handle was left uninitialized in that case).
+	*/
+	CUevent get()
+	{
+		access.lock();
+		CUevent ev = NULL;
+		if (evarray.size())
+		{
+			ev = evarray.popBack();
+		}
+		else if (cuEventCreate(&ev, flags) != CUDA_SUCCESS)
+		{
+			// Do not hand out whatever a failed create may have left behind.
+			ev = NULL;
+		}
+		access.unlock();
+		return ev;
+	}
+
+	/** True when no event is cached (read without locking). */
+	bool empty() const
+	{
+		return evarray.size() == 0;
+	}
+
+	/**
+	Destroys every cached event and empties the pool, so destroyed handles can
+	neither be returned by a later get() nor be destroyed a second time.
+	*/
+	void clear()
+	{
+		access.lock();
+		for (uint32_t i = 0; i < evarray.size(); i++)
+		{
+			cuEventDestroy(evarray[i]);
+		}
+		evarray.clear();
+		access.unlock();
+	}
+
+private:
+	uint32_t flags;
+	shdfnd::Array<CUevent> evarray;
+	shdfnd::Mutex access;
+};
+
+/**
+Cache of CUDA streams addressed by 1-based index.
+
+sarray owns the stream handles in creation order; freeIndices lists the
+1-based indices that are currently available. Index 0 is never handed out
+(get() asserts on it). No locking is done in this class.
+*/
+class StreamCache
+{
+public:
+	StreamCache() : sarray(PX_DEBUG_EXP("CUstream")), freeIndices(PX_DEBUG_EXP("freeIndices"))
+	{
+	}
+
+	/** Returns the stream for the 1-based index \a s. */
+	CUstream get(uint32_t s)
+	{
+		PX_ASSERT(s);
+		return sarray[ s - 1 ];
+	}
+
+	/** Marks the 1-based index \a s as available again. */
+	void push(uint32_t s)
+	{
+		freeIndices.pushBack(s);
+	}
+
+	/**
+	Returns an available 1-based stream index, creating a new stream when none
+	is free. If cuStreamCreate fails, the new slot holds the NULL (default)
+	stream instead of an uninitialized handle.
+	*/
+	uint32_t popBack()
+	{
+		if (freeIndices.size())
+		{
+			return freeIndices.popBack();
+		}
+		else
+		{
+			CUstream s = NULL;
+			if (cuStreamCreate(&s, 0) != CUDA_SUCCESS)
+			{
+				// Keep the slot well-defined: fall back to the default stream.
+				s = NULL;
+			}
+			sarray.pushBack(s);
+			return sarray.size();
+		}
+	}
+
+	/** Marks every stream created so far as available. */
+	void reset()
+	{
+		freeIndices.resize(sarray.size());
+		for (uint32_t i = 0 ; i < sarray.size() ; i++)
+		{
+			freeIndices[i] = i + 1;
+		}
+	}
+
+	/** True when no index is currently available (a popBack() would create a stream). */
+	bool empty()
+	{
+		return freeIndices.size() == 0;
+	}
+
+private:
+	shdfnd::Array<CUstream>	 sarray;
+	shdfnd::Array<uint32_t>	 freeIndices;
+};
+
+/**
+Pair of 32-bit start/stop values. A fresh or reset bar has start at the
+maximum 32-bit value and stop at zero.
+NOTE(review): presumably a min/max accumulator for kernel timing - confirm at the use sites.
+*/
+class KernelBar
+{
+public:
+	KernelBar() : start(0xffffffff), stop(0)
+	{
+	}
+
+	/** Puts the bar back into its initial state. */
+	void reset()
+	{
+		stop = 0;
+		start = 0xffffffff;
+	}
+
+	uint32_t start;
+	uint32_t stop;
+};
+
+// Number of CudaBatch slots in PxGpuWorkerThread::mCompletionRing; the ring's pop index
+// wraps modulo this value.
+const int SIZE_COMPLETION_RING = 1024;
+
+// One entry of the completion ring processed by PxGpuWorkerThread::blockingWaitFunc().
+struct CudaBatch
+{
+	CUevent		blockingEvent;    // NULL means there is nothing to wait for
+	CUstream    blockingStream; // sync on stream instead of event if lsb is zero (faster)
+	PxBaseTask*   continuationTask; // released via removeReference() once the entry has been processed
+};
+
+// A GPU task paired with an iteration counter; element type of PxGpuWorkerThread::mReady.
+// NOTE(review): the meaning of 'iteration' is not visible in this header - presumably the
+// launch iteration of a multi-pass task; confirm in the dispatcher implementation.
+struct ReadyTask
+{
+	PxGpuTask* 	task;
+	uint32_t       iteration;
+};
+
+/**
+Worker thread of the GPU dispatcher. All member functions are defined out of
+line; the data members are public and are accessed directly by
+BlockingWaitThread (which waits on mRecordEventQueued and then calls
+blockingWaitFunc()).
+*/
+class PxGpuWorkerThread : public shdfnd::Thread
+{
+	PX_NOCOPY(PxGpuWorkerThread)
+public:
+	PxGpuWorkerThread();
+	~PxGpuWorkerThread();
+
+	void					setCudaContext(PxCudaContextManager& ctx);
+	// Emit a start/stop marker with the given name (used around the blocking wait).
+	void					emitStartEvent(const char *id);
+	void					emitStopEvent(const char *id);
+
+	/* API to TaskManager */
+	void					startSimulation();
+	void					stopSimulation();
+
+	/* API to GPU tasks */
+	void					addCompletionPrereq(PxBaseTask& task);
+
+	/* PxGpuTask execution thread */
+	void					execute();
+	void					pollSubmitted(shdfnd::Array<ReadyTask> *ready);
+	void					processActiveTasks();
+	void					flushBatch(CUevent endEvent, CUstream, PxBaseTask* task);
+	void					launchCopyKernel(PxGpuCopyDesc* desc, uint32_t count, CUstream stream);
+
+	/* Blocking wait thread */
+	// Drains mCompletionRing; invoked from BlockingWaitThread::execute().
+	void					blockingWaitFunc();
+
+	StreamCache				mCachedStreams;
+	shdfnd::Array<PxBaseTask*> mCompletionTasks;
+	JobQueue      			mSubmittedTaskList;
+	volatile int			mActiveGroups;
+	shdfnd::Sync			mInputReady;
+	// Signal that BlockingWaitThread waits on before draining the completion ring.
+	shdfnd::Sync			mRecordEventQueued;
+	// Context manager; blockingWaitFunc() acquires/releases the context through it.
+	PxCudaContextManager* 	mCtxMgr;
+	bool                    mNewTasksSubmitted;
+	// Set to true by GD_CHECK_CALL when a CUDA driver call fails.
+	bool                    mFailureDetected;
+
+	bool                    mUsingConcurrentStreams;
+
+	// Ring of pending batches: entries between mCompletionRingPop and mCompletionRingPush
+	// are consumed by blockingWaitFunc(), which advances the pop index.
+	CudaBatch				mCompletionRing[ SIZE_COMPLETION_RING ];
+	volatile int            mCompletionRingPush;
+	volatile int            mCompletionRingPop;
+
+	// Event pools; blockingWaitFunc() returns finished blocking events to mCachedBlockingEvents.
+	EventPool               mCachedBlockingEvents;
+	EventPool               mCachedNonBlockingEvents;
+
+	volatile int			mCountActiveScenes;
+
+	uint32_t*				mSmStartTimes;
+	uint32_t                 mSmClockFreq;
+
+	// One ready list per GPU task hint value.
+	shdfnd::Array<ReadyTask> mReady[ PxGpuTaskHint::NUM_GPU_TASK_HINTS ];
+
+	KernelWrangler*         mUtilKernelWrapper;
+
+	CUevent                 mStartEvent;
+
+	shdfnd::Mutex			mMutex;
+};
+
+/**
+Helper thread bound to one PxGpuWorkerThread. Its execute() loop waits on the
+worker's mRecordEventQueued signal and then calls the worker's
+blockingWaitFunc().
+*/
+class BlockingWaitThread : public shdfnd::Thread
+{
+public:
+	BlockingWaitThread(PxGpuWorkerThread& worker) : mWorker(worker) {}
+	~BlockingWaitThread() {}
+
+	void		    execute();
+
+protected:
+	PxGpuWorkerThread& mWorker;
+
+private:
+	// Declared but not defined: the reference member makes assignment meaningless.
+	BlockingWaitThread& operator=(const BlockingWaitThread&);
+};
+
+// Evaluates a CUDA driver call and, if the result is not CUDA_SUCCESS, sets mFailureDetected
+// and asserts. Must be used where a member/variable named mFailureDetected is in scope
+// (e.g. inside PxGpuWorkerThread member functions). Expands to a braced block, not a
+// do/while(0) statement, and declares a local named 'ret'.
+#define GD_CHECK_CALL(call)  { CUresult ret = call;                          \
+		if( CUDA_SUCCESS != ret ) { mFailureDetected=true; PX_ASSERT(!ret); } }
+
+}
+
+#endif // PXTASK_GPUDISPATCHER_H
diff --git a/PxShared/src/cudamanager/include/PhysXDeviceSettings.h b/PxShared/src/cudamanager/include/PhysXDeviceSettings.h
new file mode 100644
index 0000000..5358915
--- /dev/null
+++ b/PxShared/src/cudamanager/include/PhysXDeviceSettings.h
@@ -0,0 +1,56 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXCUDACONTEXTMANAGER_PHYSXDEVICESETTINGS_H
+#define PXCUDACONTEXTMANAGER_PHYSXDEVICESETTINGS_H
+
+#include "task/PxTaskDefine.h"
+
+#if PX_SUPPORT_GPU_PHYSX
+
+namespace physx
+{
+	class PxErrorCallback;
+
+	/**
+	Helper functions to expose control panel functionality 
+	*/
+	class PhysXDeviceSettings
+	{
+	private:
+		PhysXDeviceSettings() {}
+
+	public:
+		static int getSuggestedCudaDeviceOrdinal(PxErrorCallback& errc);
+		static int isUsingDedicatedGPU();
+		static bool isSLIEnabled(void* graphicsDevice);
+	};
+}
+
+#endif
+
+#endif // PXCUDACONTEXTMANAGER_PHYSXDEVICESETTINGS_H
diff --git a/PxShared/src/cudamanager/src/BlockingWait.cpp b/PxShared/src/cudamanager/src/BlockingWait.cpp
new file mode 100644
index 0000000..fada532
--- /dev/null
+++ b/PxShared/src/cudamanager/src/BlockingWait.cpp
@@ -0,0 +1,120 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "task/PxTaskDefine.h"
+
+#if PX_SUPPORT_GPU_PHYSX
+
+#include "task/PxTask.h"
+#include "task/PxGpuTask.h"
+#include "cudamanager/PxCudaContextManager.h"
+
+#include "PsString.h"
+
+#if PX_SUPPORT_PXTASK_PROFILING
+#include "foundation/PxFoundation.h"
+#include "foundation/PxProfiler.h"
+#endif
+
+#include "GpuDispatcher.h"
+
+using namespace physx;
+
+/* Blocking thread / GPU Profiling Event Code */
+
+/**
+Processes every entry of the completion ring between the pop and push indices.
+
+For each entry: waits for its stream or event (unless there is no event or a
+failure has already been detected), returns the event to the blocking-event
+pool, drops one reference on the continuation task, and advances the pop
+index. The CUDA context is acquired for the duration of the call.
+*/
+void PxGpuWorkerThread::blockingWaitFunc()
+{
+	mCtxMgr->acquireContext();
+
+	// The ring is empty when the pop index has caught up with the push index.
+	while (mCompletionRingPop != mCompletionRingPush)
+	{
+		CudaBatch& b = mCompletionRing[ mCompletionRingPop ];
+		PxBaseTask* t = b.continuationTask;
+
+		if (!b.blockingEvent)
+		{
+			PX_ASSERT(b.continuationTask != 0);
+
+			/* No blocking necessary, just allow continuation task to run */
+		}
+		else if (!mFailureDetected)
+		{
+			// After a detected failure the wait is skipped; the entry is still retired below.
+			emitStartEvent("GpuDispatcher.BlockingWaitEvent");
+
+			// Non-zero when the least significant bit of the stream handle is clear:
+			// in that case synchronize on the stream, otherwise on the event.
+			if (1 & ~intptr_t(b.blockingStream))
+			{
+				GD_CHECK_CALL(cuStreamSynchronize(b.blockingStream));
+			}
+			else
+			{
+				GD_CHECK_CALL(cuEventSynchronize(b.blockingEvent));
+			}
+
+			emitStopEvent("GpuDispatcher.BlockingWaitEvent");
+		}
+
+		// Recycle the event through the pool rather than destroying it.
+		if (b.blockingEvent)
+		{
+			mCachedBlockingEvents.add(b.blockingEvent);
+		}
+		// Drop this batch's reference on the continuation task.
+		if (t)
+		{
+			t->removeReference();
+		}
+		// Retire the slot; the index wraps at the ring size.
+		mCompletionRingPop = (mCompletionRingPop + 1) % SIZE_COMPLETION_RING;
+	}
+
+	mCtxMgr->releaseContext();
+}
+
+
+/* Blocking wait thread
+
+   All this thread does is block waiting for CUDA Record Events to
+   be signaled.
+ */
+
+/**
+Thread body: repeatedly waits for the worker's "record event queued" signal,
+clears it and drains the completion ring. When a quit request is observed
+after a wake-up, the ring is drained one last time and the thread quits.
+*/
+void BlockingWaitThread::execute()
+{
+	setName("GpuDispatcher.BlockingWait");
+
+	bool quitRequested;
+	do
+	{
+		mWorker.mRecordEventQueued.wait();
+
+		// Sample the quit flag before resetting the signal and draining.
+		quitRequested = quitIsSignalled();
+
+		mWorker.mRecordEventQueued.reset();
+		mWorker.blockingWaitFunc();
+	}
+	while (!quitRequested);
+
+	quit();
+}
+
+#endif
+
diff --git a/PxShared/src/cudamanager/src/CUDA/UtilKernels.cu b/PxShared/src/cudamanager/src/CUDA/UtilKernels.cu
new file mode 100644
index 0000000..3c73364
--- /dev/null
+++ b/PxShared/src/cudamanager/src/CUDA/UtilKernels.cu
@@ -0,0 +1,164 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "cudamanager/PxGpuCopyDesc.h"
+#include "foundation/PxSimpleTypes.h"
+
+#include <cuda.h>
+
+using namespace physx;
+
+// Intentionally empty host function.  createCudaContextManager() calls it so
+// that this translation unit is pulled in when the library is linked statically.
+extern "C" __host__ void initUtilKernels() {}
+
+// Kernel with an empty body: a launch executes no device-side work.
+// NOTE(review): presumably used to keep the GPU occupied / for timing; confirm with the launch site.
+extern "C" __global__
+void Saturate( )
+{
+    // NOP
+}
+
+/*
+ * Device-side copy / 32-bit fill worker shared by the kernels in this file.
+ *
+ * desc        - describes the operation.  desc.type selects the memset path;
+ *               desc.dest and desc.source are used as addresses (for
+ *               DeviceMemset32, desc.source is the 32-bit fill value) and
+ *               desc.bytes is the size in bytes.
+ * totalBlocks - number of thread blocks cooperating on this descriptor; each
+ *               thread advances by blockDim.x * totalBlocks elements.
+ *
+ * Sizes are processed in whole 4-byte or 8-byte elements (bytes >> 2 or
+ * bytes >> 3); any remainder bytes are not touched.  Element counts are kept
+ * in size_t, matching the index type, so that large sizes are not truncated
+ * to 32 bits.
+ */
+__device__
+void performCopy( const physx::PxGpuCopyDesc& desc, uint32_t totalBlocks )
+{
+	if( desc.type == physx::PxGpuCopyDesc::DeviceMemset32 )
+	{
+		uint32_t *dest = (uint32_t*) desc.dest;
+		size_t wordCount = desc.bytes >> 2;
+		size_t word = blockIdx.x * blockDim.x + threadIdx.x;
+		size_t stride = blockDim.x * totalBlocks;
+		for( ; word < wordCount ; word += stride )
+			dest[ word ] = (uint32_t) desc.source;
+		/* Note: the memset path returns without the system-wide fence below */
+		return;
+	}
+
+	/* The idea here is to maximize throughput with minimal register and thread counts */
+	/* Manually unrolled 6 times, the compiler refuses to do it for me */
+
+	if( (desc.source & 0x7) != 0 || (desc.dest & 0x7) != 0 || (desc.bytes & 0x7) != 0)
+	{
+		/* Source, destination or size is not a multiple of 8: copy 32-bit words */
+
+		uint32_t *dest = (uint32_t*) desc.dest;
+		uint32_t *source = (uint32_t*) desc.source;
+		size_t wordCount = desc.bytes >> 2;
+		size_t word = blockIdx.x * blockDim.x + threadIdx.x;
+		size_t stride = blockDim.x * totalBlocks;
+		while( word < wordCount )
+		{
+			/* All loads are issued before any store; a1..a5 are only
+			   read under the same bounds check that loaded them. */
+			uint32_t a0, a1, a2, a3, a4, a5;
+			a0 = source[ word ];
+			if( word + stride < wordCount )
+				a1 = source[ word + stride ];
+			if( word + stride*2 < wordCount )
+				a2 = source[ word + stride*2 ];
+			if( word + stride*3 < wordCount )
+				a3 = source[ word + stride*3 ];
+			if( word + stride*4 < wordCount )
+				a4 = source[ word + stride*4 ];
+			if( word + stride*5 < wordCount )
+				a5 = source[ word + stride*5 ];
+
+			dest[ word ] = a0;
+			if( word + stride < wordCount )
+				dest[ word + stride ] = a1;
+			if( word + stride*2 < wordCount )
+				dest[ word + stride*2 ] = a2;
+			if( word + stride*3 < wordCount )
+				dest[ word + stride*3 ] = a3;
+			if( word + stride*4 < wordCount )
+				dest[ word + stride*4 ] = a4;
+			if( word + stride*5 < wordCount )
+				dest[ word + stride*5 ] = a5;
+
+			word += stride*6;
+		}
+	}
+	else
+	{
+		/* Source, destination and size are all multiples of 8: copy 64-bit elements */
+
+		uint2 *dest = (uint2*) desc.dest;
+		uint2 *source = (uint2*) desc.source;
+		size_t dwordCount = desc.bytes >> 3;
+		size_t word = blockIdx.x * blockDim.x + threadIdx.x;
+		size_t stride = blockDim.x * totalBlocks;
+		while( word < dwordCount )
+		{
+			uint2 a0, a1, a2, a3, a4, a5;
+			a0 = source[ word ];
+			if( word + stride < dwordCount )
+				a1 = source[ word + stride ];
+			if( word + stride*2 < dwordCount )
+				a2 = source[ word + stride*2 ];
+			if( word + stride*3 < dwordCount )
+				a3 = source[ word + stride*3 ];
+			if( word + stride*4 < dwordCount )
+				a4 = source[ word + stride*4 ];
+			if( word + stride*5 < dwordCount )
+				a5 = source[ word + stride*5 ];
+
+			dest[ word ] = a0;
+			if( word + stride < dwordCount )
+				dest[ word + stride ] = a1;
+			if( word + stride*2 < dwordCount )
+				dest[ word + stride*2 ] = a2;
+			if( word + stride*3 < dwordCount )
+				dest[ word + stride*3 ] = a3;
+			if( word + stride*4 < dwordCount )
+				dest[ word + stride*4 ] = a4;
+			if( word + stride*5 < dwordCount )
+				dest[ word + stride*5 ] = a5;
+
+			word += stride*6;
+		}
+	}
+
+	/* Order this thread's writes with respect to the whole system before returning */
+	__threadfence_system();
+}
+
+// Kernel entry point for a single copy descriptor passed by value.
+// All gridDim.x blocks of the launch cooperate on that one descriptor.
+extern "C" __global__
+void MemCopyAsync( physx::PxGpuCopyDesc desc )
+{
+	performCopy( desc, gridDim.x );
+}
+
+
+// Kernel entry point for an array of copy descriptors.
+// blockIdx.y selects the descriptor; the gridDim.x blocks sharing that
+// blockIdx.y cooperate on it.
+extern "C" __global__
+void MemCopyBatchedAsync( physx::PxGpuCopyDesc *desc )
+{
+	// Per-block copy of the selected descriptor
+	__shared__ physx::PxGpuCopyDesc sdesc;
+
+	// The first sizeof(PxGpuCopyDesc)/4 threads each load one 32-bit word of the descriptor.
+	// NOTE(review): assumes blockDim.x is at least that many threads - confirm against the launch configuration.
+	if( threadIdx.x < sizeof(physx::PxGpuCopyDesc) / sizeof(uint32_t) )
+	{
+		uint32_t *dest = (uint32_t*)&sdesc;
+		uint32_t *source = (uint32_t*)(desc + blockIdx.y);
+		dest[ threadIdx.x ] = source[ threadIdx.x ];
+		__threadfence_block();
+	}
+	// Barrier: every thread waits here before sdesc is read below
+	__syncthreads();
+
+	performCopy( sdesc, gridDim.x );
+}
diff --git a/PxShared/src/cudamanager/src/CudaContextManager.cpp b/PxShared/src/cudamanager/src/CudaContextManager.cpp
new file mode 100644
index 0000000..b5b6efc
--- /dev/null
+++ b/PxShared/src/cudamanager/src/CudaContextManager.cpp
@@ -0,0 +1,988 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxErrorCallback.h"
+#include "foundation/PxMath.h"
+#include "foundation/PxPreprocessor.h"
+
+#include "cudamanager/PxCudaContextManager.h"
+#include "task/PxGpuDispatcher.h"
+
+#include "CudaMemoryManager.h"
+#include "GpuDispatcher.h"
+#include "PhysXDeviceSettings.h"
+
+#include "PsMutex.h"
+#include "PsThread.h"
+#include "PsUserAllocated.h"
+#include "PsString.h"
+
+#include <cuda.h>
+
+#if PX_WIN32 || PX_WIN64
+
+#ifdef PX_SECURE_LOAD_LIBRARY
+#include "nvSecureLoadLibrary.h"
+#endif
+
+#pragma warning (push)
+#pragma warning (disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#include <windows.h>
+#pragma warning (pop)
+
+class IDirect3DDevice9;
+class IDirect3DResource9;
+class IDirect3DVertexBuffer9;
+#include <cudad3d9.h>
+
+class IDXGIAdapter;
+class ID3D10Device;
+class ID3D10Resource;
+#include <cudad3d10.h>
+
+struct ID3D11Device;
+struct ID3D11Resource;
+#include <cudad3d11.h>
+
+#endif // PX_WIN32 || PX_WIN64
+
+#if PX_LINUX
+#include <dlfcn.h>
+static void* GetProcAddress(void* handle, const char* name) { return dlsym(handle, name); }
+#endif
+
+#include <GL/gl.h>
+#include <cudaGL.h>
+#include <assert.h>
+
+#include "foundation/PxErrors.h"
+#include "foundation/PxErrorCallback.h"
+
+#define CU_INIT_UUID
+#include "CudaNode3DLowLatencyInterface.h"
+
+#define ENABLE_DEVICE_INFO_BRINGUP	0
+
+#include "GPUProfile.h"
+
+#if ENABLE_CUDA_DEVICE_RESET
+#include "cudaProfiler.h"
+#endif
+
+#if USE_PERFKIT
+#pragma warning (push)
+#pragma warning (disable : 4099)
+#pragma warning (disable : 4191)
+#define NVPM_INITGUID
+#include <stdio.h>
+#include "cuda.h"
+#include "../../../../../../../externals/nvPerfKit/4.1.0.14260/inc/NvPmApi.Manager.h"
+static NvPmApiManager S_NVPMManager;
+extern NvPmApiManager *GetNvPmApiManager() {return &S_NVPMManager;}
+const NvPmApi *GetNvPmApi() {return S_NVPMManager.Api();}
+NVPMContext hNVPMContext(0);
+
+// Starts a PerfKit measurement interval: waits for the GPU to go idle, then
+// takes a sample whose only purpose is to reset the counters.
+void initPerfKit()
+{
+	//Sync with GPU
+	cuCtxSynchronize();
+	
+	// Reset counters
+	uint32_t nCount;	// receives a count from Sample(); not used afterwards
+	GetNvPmApi()->Sample(hNVPMContext, NULL, &nCount);
+}
+
+// Ends a PerfKit measurement interval: waits for the GPU to go idle, samples
+// the counters and prints whichever counter set was selected at compile time
+// through the COUNT_* macros.
+void endPerfKit()
+{
+	//Sync with GPU
+	cuCtxSynchronize();
+
+	uint32_t nCount;
+	GetNvPmApi()->Sample(hNVPMContext, NULL, &nCount);
+
+	uint64_t value;
+	uint64_t cycle;
+
+	uint64_t sum = 0;
+	uint64_t maxVal = 0;
+	char name[512];
+
+	int nvStatus = 0;
+
+	// Depending on the COUNT_* configuration some of these stay unused
+	PX_UNUSED(value);
+	PX_UNUSED(cycle);
+	PX_UNUSED(sum);
+	PX_UNUSED(maxVal);
+	PX_UNUSED(name);
+	PX_UNUSED(nvStatus);
+
+	printf("counters:\n");
+
+	// value is a 64-bit unsigned integer, so it needs a 64-bit conversion
+	// specifier (the file already uses the I64 size prefix below).
+#if COUNT_L2_TO_L1_BYTES
+	nvStatus |= GetNvPmApi()->GetCounterValueByName(hNVPMContext, "l2_read_bytes", 0, &value, &cycle);
+	printf("L2->L1 bytes %I64u\n",value);
+#elif COUNT_SM_TO_L1_QUERIES
+	nvStatus |= GetNvPmApi()->GetCounterValueByName(hNVPMContext, "tex_cache_sector_queries", 0, &value, &cycle);
+	printf("SM->L1 queries %I64u\n",value);
+#endif
+
+#if COUNT_INST_EXECUTED || COUNT_STORE_INST_EXECUTED || COUNT_ACTIVE_CYCLES || COUNT_ACTIVE_WARPS
+	// Accumulate the selected per-SM counter over all SMs
+	for (int i = 0; i != SM_COUNT; i++)
+	{
+#if COUNT_INST_EXECUTED
+		sprintf_s(name, 512, "sm_inst_executed_vsm%d", i);
+#elif COUNT_STORE_INST_EXECUTED
+		sprintf_s(name, 512, "sm_inst_executed_global_stores_vsm%d", i);
+#elif COUNT_ACTIVE_CYCLES
+		sprintf_s(name, 512, "sm_active_cycles_vsm%d", i);
+#elif COUNT_ACTIVE_WARPS
+		sprintf_s(name, 512, "sm_active_warps_vsm%d", i);
+#endif
+		nvStatus |= GetNvPmApi()->GetCounterValueByName(hNVPMContext, name, 0, &value, &cycle);
+
+		sum += value;
+		maxVal = physx::PxMax(maxVal, value);
+	}
+	printf("sum %I64d\n", sum);
+
+	// A non-zero accumulated status means at least one query failed; this is
+	// the same convention the counter registration code in this file uses.
+	if (nvStatus != 0)
+	{
+		PX_ASSERT(false);
+	}
+#endif
+}
+
+#pragma warning (pop)
+#endif
+
+namespace physx
+{
+
+#if PX_VC
+#pragma warning(disable: 4191)	//'operator/operation' : unsafe conversion from 'type of expression' to 'type required'
+#endif
+
+#define MIN_SM_MAJOR_VERSION	2
+#define MIN_SM_MINOR_VERSION	0
+
+/*
+ * Concrete PxCudaContextManager.  The constructor either creates a CUDA
+ * context or adopts the one passed in the descriptor, caches a set of device
+ * attributes, and creates the memory manager and GPU dispatcher.  Callers
+ * must check contextIsValid() because construction failures are reported
+ * through the error callback rather than by throwing.
+ */
+class CudaCtxMgr : public PxCudaContextManager, public shdfnd::UserAllocated
+{
+public:
+	CudaCtxMgr(const PxCudaContextManagerDesc& desc, PxErrorCallback& errorCallback);
+	~CudaCtxMgr();
+
+	// Loads the CUDA driver library manually and checks its version before the CUDA API is first used
+	bool            safeDelayImport(PxErrorCallback& errorCallback);
+	// Makes this manager's context current on the calling thread and returns it
+	CUcontext       acquireContext();
+	// Counterpart of acquireContext()
+	void            releaseContext();
+
+	/* All these methods can be called without acquiring the context */
+
+	// Both return NULL when the context is not valid
+	PxCudaMemoryManager* getMemoryManager();
+	PxGpuDispatcher* getGpuDispatcher();
+
+	bool            contextIsValid() const;
+	bool            supportsArchSM10() const;  // G80
+	bool            supportsArchSM11() const;  // G92
+	bool            supportsArchSM12() const;
+	bool            supportsArchSM13() const;  // GT200
+	bool            supportsArchSM20() const;  // GF100
+	bool            supportsArchSM30() const;  // GK100
+	bool            supportsArchSM35() const;  // GK110
+	bool            supportsArchSM50() const;  // GM100
+	bool            supportsArchSM52() const;  // GM200
+	bool            isIntegrated() const;      // true if GPU is integrated (MCP) part
+	bool            canMapHostMemory() const;  // true if GPU map host memory to GPU
+	int             getDriverVersion() const;
+	size_t          getDeviceTotalMemBytes() const;
+	int				getMultiprocessorCount() const;
+	int             getSharedMemPerBlock() const;
+	int             getSharedMemPerMultiprocessor() const;
+	unsigned int	getMaxThreadsPerBlock() const;
+	unsigned int	getClockRate() const;
+
+	// Returns "Invalid" when the context is not valid
+	const char*     getDeviceName() const;
+	const CUdevprop* getDeviceProperties() const;
+
+	// May differ from the requested mode: the constructor can downgrade D3D interop to NO_INTEROP
+	PxCudaInteropMode::Enum	getInteropMode() const;
+
+	void			setUsingConcurrentStreams(bool);
+	bool			getUsingConcurrentStreams() const;
+
+	// Graphics interop helpers; each returns true when the CUDA call succeeded
+	bool registerResourceInCudaD3D(CUgraphicsResource& resource, void* resourcePointer, PxCudaInteropRegisterFlags flags);
+	bool registerResourceInCudaGL(CUgraphicsResource& resource, uint32_t buffer, PxCudaInteropRegisterFlags flags);
+	bool unregisterResourceInCuda(CUgraphicsResource resource);
+
+	/*
+	\brief Determine if the user has configured a dedicated PhysX GPU in the NV Control Panel
+	\returns 1 if there is a dedicated PhysX GPU
+	\returns 0 if there is NOT a dedicated PhysX GPU
+	\returns -1 if the routine is not implemented
+	*/
+	int				usingDedicatedGPU() const;
+
+	// Deletes this object
+	void            release();
+
+private:
+
+	int             mSceneCount;	// set to 0 by the constructor
+	bool            mIsValid;		// true only once construction fully succeeded
+	bool			mOwnContext;	// true when the constructor created mCtx; the destructor destroys it only then
+	CUdevice        mDevHandle;
+	CUcontext       mCtx;
+	CudaMemMgr* 	mMemMgr;
+
+	GpuDispatcherImpl* mDispatcher;
+    CUetblPhysXInterface* m_physXInterface;	// driver export table; stays 0 when not used/available
+
+	/* Cached device attributes, so threads can query w/o context */
+	int             mComputeCapMajor;
+	int             mComputeCapMinor;
+	int				mIsIntegrated;
+	int				mCanMapHost;
+	int				mDriverVersion;
+	size_t			mTotalMemBytes;
+	int				mMultiprocessorCount;
+	int				mMaxThreadsPerBlock;
+	char            mDeviceName[128];
+	int				mSharedMemPerBlock;
+	int				mSharedMemPerMultiprocessor;
+	int				mClockRate;
+	PxCudaInteropMode::Enum mInteropMode;
+	bool            mUsingConcurrentStreams;
+
+#if PX_DEBUG
+	// Debug-only bookkeeping shared by all instances
+	static uint32_t    mManagerRefCount;		// live-manager count maintained by constructor/destructor
+	static uint32_t    mContextRefCountTls;	// TLS slot holding the per-thread acquire count
+#endif
+};
+
+#if PX_DEBUG
+// Debug-only bookkeeping shared by all CudaCtxMgr instances.
+// mManagerRefCount:    incremented in the constructor, decremented in the destructor.
+// mContextRefCountTls: slot obtained from shdfnd::TlsAlloc(); acquireContext() and
+//                      releaseContext() keep a per-thread count in it.
+uint32_t CudaCtxMgr::mManagerRefCount = 0;
+uint32_t CudaCtxMgr::mContextRefCountTls = 0;
+#endif
+
+// True once the constructor has completed successfully.
+bool CudaCtxMgr::contextIsValid() const
+{
+	return mIsValid;
+}
+
+// The supportsArchSMxy() predicates below compare the cached compute
+// capability against x.y; all of them are false for an invalid context.
+
+bool CudaCtxMgr::supportsArchSM10() const
+{
+	// Lowest level: every valid context qualifies
+	return mIsValid;
+}
+bool CudaCtxMgr::supportsArchSM11() const
+{
+	if (!mIsValid)
+		return false;
+	return mComputeCapMajor > 1 || mComputeCapMinor >= 1;
+}
+bool CudaCtxMgr::supportsArchSM12() const
+{
+	if (!mIsValid)
+		return false;
+	return mComputeCapMajor > 1 || mComputeCapMinor >= 2;
+}
+bool CudaCtxMgr::supportsArchSM13() const
+{
+	if (!mIsValid)
+		return false;
+	return mComputeCapMajor > 1 || mComputeCapMinor >= 3;
+}
+bool CudaCtxMgr::supportsArchSM20() const
+{
+	if (!mIsValid)
+		return false;
+	return mComputeCapMajor >= 2;
+}
+bool CudaCtxMgr::supportsArchSM30() const
+{
+	if (!mIsValid)
+		return false;
+	return mComputeCapMajor >= 3;
+}
+bool CudaCtxMgr::supportsArchSM35() const
+{
+	if (!mIsValid)
+		return false;
+	const bool newerMajor = mComputeCapMajor > 3;
+	const bool sameMajorNewerMinor = (mComputeCapMajor == 3) && (mComputeCapMinor >= 5);
+	return newerMajor || sameMajorNewerMinor;
+}
+bool CudaCtxMgr::supportsArchSM50() const
+{
+	if (!mIsValid)
+		return false;
+	return mComputeCapMajor >= 5;
+}
+bool CudaCtxMgr::supportsArchSM52() const
+{
+	if (!mIsValid)
+		return false;
+	const bool newerMajor = mComputeCapMajor > 5;
+	const bool sameMajorNewerMinor = (mComputeCapMajor == 5) && (mComputeCapMinor >= 2);
+	return newerMajor || sameMajorNewerMinor;
+}
+
+// Accessors for the device attributes cached by the constructor.  Only the
+// two boolean queries take mIsValid into account; the numeric getters return
+// the cached member as-is.
+
+bool CudaCtxMgr::isIntegrated() const
+{
+	if (!mIsValid)
+		return false;
+	return mIsIntegrated != 0;
+}
+bool CudaCtxMgr::canMapHostMemory() const
+{
+	if (!mIsValid)
+		return false;
+	return mCanMapHost != 0;
+}
+int CudaCtxMgr::getDriverVersion() const
+{
+	return mDriverVersion;
+}
+size_t CudaCtxMgr::getDeviceTotalMemBytes() const
+{
+	return mTotalMemBytes;
+}
+int CudaCtxMgr::getMultiprocessorCount() const
+{
+	return mMultiprocessorCount;
+}
+int CudaCtxMgr::getSharedMemPerBlock() const
+{
+	return mSharedMemPerBlock;
+}
+int CudaCtxMgr::getSharedMemPerMultiprocessor() const
+{
+	return mSharedMemPerMultiprocessor;
+}
+unsigned int CudaCtxMgr::getMaxThreadsPerBlock() const
+{
+	// Cached as int, exposed as unsigned
+	return static_cast<unsigned int>(mMaxThreadsPerBlock);
+}
+unsigned int CudaCtxMgr::getClockRate() const
+{
+	// Cached as int, exposed as unsigned
+	return static_cast<unsigned int>(mClockRate);
+}
+
+// Returns the cached device name, or the literal "Invalid" when the context
+// is not valid.
+const char* CudaCtxMgr::getDeviceName() const
+{
+	if (!mIsValid)
+	{
+		return "Invalid";
+	}
+	return mDeviceName;
+}
+
+// Returns the interop mode in effect.  The constructor starts from
+// desc.interopMode and may replace a D3D mode with NO_INTEROP.
+PxCudaInteropMode::Enum CudaCtxMgr::getInteropMode() const
+{
+	return mInteropMode;
+}
+
+// Stores the concurrent-streams flag (the constructor defaults it to true).
+void CudaCtxMgr::setUsingConcurrentStreams(bool value)
+{
+	mUsingConcurrentStreams = value;
+}
+
+// Returns the flag stored by setUsingConcurrentStreams().
+bool CudaCtxMgr::getUsingConcurrentStreams() const
+{
+	return mUsingConcurrentStreams;
+}
+
+// Returns the memory manager, or NULL when the context is not valid.
+PxCudaMemoryManager* CudaCtxMgr::getMemoryManager()
+{
+	if (!mIsValid)
+	{
+		return NULL;
+	}
+	return mMemMgr;
+}
+
+// Returns the GPU dispatcher, or NULL when the context is not valid.
+PxGpuDispatcher* CudaCtxMgr::getGpuDispatcher()
+{
+	if (!mIsValid)
+	{
+		return NULL;
+	}
+	return mDispatcher;
+}
+
+// With any graphics interop active the answer is always 0 (not dedicated);
+// otherwise the result comes from PhysXDeviceSettings.
+int CudaCtxMgr::usingDedicatedGPU() const
+{
+	if (getInteropMode() != PxCudaInteropMode::NO_INTEROP)
+	{
+		return 0; // not a dedicated GPU
+	}
+	return PhysXDeviceSettings::isUsingDedicatedGPU();
+}
+
+// Evaluates a CUDA driver call once and raises PX_ASSERT(0) when the result
+// is not CUDA_SUCCESS.  The failure is not propagated to the caller.
+// Expands to a braced block, so do not use it as the body of an if that has an else.
+#define CUT_SAFE_CALL(call)  { CUresult ret = call;	\
+		if( CUDA_SUCCESS != ret ) { PX_ASSERT(0); } }
+
+/* If a context is not provided, an ordinal must be given */
+/*
+ * Builds the context manager described by desc.
+ *
+ * desc          - when desc.ctx is 0 a new context is created (honouring
+ *                 desc.interopMode / desc.graphicsDevice); otherwise the
+ *                 supplied context is adopted and never destroyed here.
+ * errorCallback - receives every diagnostic; the constructor does not throw.
+ *
+ * On any failure the constructor returns early and contextIsValid() stays
+ * false.  All cached attributes are zero-initialised up front so that the
+ * getters return defined values in that case, and the object can always be
+ * destroyed safely through release().
+ */
+CudaCtxMgr::CudaCtxMgr(const PxCudaContextManagerDesc& desc, PxErrorCallback& errorCallback)
+	: mSceneCount(0)
+	, mIsValid(false)
+	, mOwnContext(false)
+	, mDevHandle(0)
+	, mCtx(0)
+	, mMemMgr(0)
+	, mDispatcher(0)
+	, m_physXInterface(0)
+	, mComputeCapMajor(0)
+	, mComputeCapMinor(0)
+	, mIsIntegrated(0)
+	, mCanMapHost(0)
+	, mDriverVersion(0)
+	, mTotalMemBytes(0)
+	, mMultiprocessorCount(0)
+	, mMaxThreadsPerBlock(0)
+	, mSharedMemPerBlock(0)
+	, mSharedMemPerMultiprocessor(0)
+	, mClockRate(0)
+	, mInteropMode(desc.interopMode)
+	, mUsingConcurrentStreams(true)
+{
+	CUresult status;
+	mDeviceName[0] = 0;
+
+#if PX_DEBUG
+	// The destructor always drops one manager reference, so take it before
+	// any early return.  This also makes sure the TLS slot exists before
+	// anything created below can call acquireContext().
+	if(!mManagerRefCount++)
+		mContextRefCountTls = shdfnd::TlsAlloc();
+#endif
+
+	if (safeDelayImport(errorCallback) == false)
+	{
+		// The table where this info is found is here: https://wiki.nvidia.com/nvcompute/index.php/NVCompute#CUDA_Planning
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "NVIDIA Release 331 graphics driver and above is required for GPU acceleration.", __FILE__, __LINE__);
+		return;
+	}
+
+	if (desc.ctx == 0)
+	{
+		int flags = CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST;
+
+		// Thin forwarding adapter handed to PhysXDeviceSettings
+		class FoundationErrorReporter : public PxErrorCallback
+		{
+		public:
+			FoundationErrorReporter(PxErrorCallback& ec)
+			: errorCallback(&ec)
+			{
+			}
+
+			virtual void reportError(PxErrorCode::Enum code, const char* message, const char* file, int line)
+			{
+				errorCallback->reportError( code, message, file, line);
+			}
+
+			PxErrorCallback* errorCallback;
+		} foundationErrorReporter(errorCallback);
+
+		int devOrdinal = PhysXDeviceSettings::getSuggestedCudaDeviceOrdinal(foundationErrorReporter);
+		if (devOrdinal < 0)
+		{
+			errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "No PhysX capable GPU suggested.", __FILE__, __LINE__);
+			errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "If you have a PhysX capable GPU, verify that PhysX is not set to CPU in the NVIDIA Control Panel.", __FILE__, __LINE__);
+			return;
+		}
+
+		status = cuInit(0);
+		if (CUDA_SUCCESS != status)
+		{
+			errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuInit failed", __FILE__, __LINE__);
+			return;
+		}
+
+		// Try to create the context on Node3DLowLatency.
+		// If that does not work, try to create the cuda context using cuCtxCreatePhysX,
+		// since we must be on a driver that does not support cuCtxCreateOnNode3DLowLatency.
+		cuGetExportTable((const void**)&m_physXInterface, (const CUuuid*)&CU_ETID_PhysXInterface);
+
+		// if using a dedicated GPU or SLI we disable D3D interop (which is not supported over multiple GPUs)
+		// this ensures the users control panel setting is always respected
+		bool sliEnabled = false;
+		if (mInteropMode != PxCudaInteropMode::NO_INTEROP && desc.graphicsDevice != NULL)
+		{
+			sliEnabled = PhysXDeviceSettings::isSLIEnabled(desc.graphicsDevice) == 1 ? true : false;
+		}
+
+		if (PhysXDeviceSettings::isUsingDedicatedGPU() == 1 || sliEnabled)
+		{
+			if (mInteropMode == PxCudaInteropMode::D3D9_INTEROP ||
+				mInteropMode == PxCudaInteropMode::D3D10_INTEROP ||
+				mInteropMode == PxCudaInteropMode::D3D11_INTEROP)
+			{
+				mInteropMode = PxCudaInteropMode::NO_INTEROP;
+				if (sliEnabled)
+				{
+					errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "D3D/CUDA interop cannot be used in tandem with SLI, disabling interop.  Query PxCudaContextManager::getInteropMode() for interop status.",
+						__FILE__,__LINE__);
+				}
+			}
+		}
+
+		if (mInteropMode == PxCudaInteropMode::NO_INTEROP)
+		{
+			status = cuDeviceGet(&mDevHandle, devOrdinal);
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuDeviceGet failed",__FILE__,__LINE__);
+				return;
+			}
+
+			if (m_physXInterface)
+				status = m_physXInterface->cuCtxCreateOnNode3DLowLatency(&mCtx, (unsigned int)flags, mDevHandle);
+			else
+				status = cuCtxCreate(&mCtx, (unsigned int)flags, mDevHandle);
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuCtxCreate failed",__FILE__,__LINE__);
+				return;
+			}
+			mOwnContext = true;
+		}
+		else if (mInteropMode == PxCudaInteropMode::OGL_INTEROP)
+		{
+			status = cuDeviceGet(&mDevHandle, devOrdinal);
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuDeviceGet failed",__FILE__,__LINE__);
+				return;
+			}
+
+			status = cuGLCtxCreate(&mCtx, (unsigned int)flags, mDevHandle);
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuCtxGLCreate failed",__FILE__,__LINE__);
+				return;
+			}
+			// The context exists from this point on: record ownership right
+			// away so the destructor destroys it even if cuGLInit fails.
+			mOwnContext = true;
+
+			status = cuGLInit();
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuGLInit failed",__FILE__,__LINE__);
+				return;
+			}
+		}
+#if PX_WIN32 || PX_WIN64
+		else if (mInteropMode == PxCudaInteropMode::D3D9_INTEROP)
+		{
+			status = cuD3D9CtxCreate(&mCtx, &mDevHandle, (unsigned int)flags,
+			                         reinterpret_cast<IDirect3DDevice9*>(desc.graphicsDevice));
+
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuD3D9CtxCreate failed",__FILE__,__LINE__);
+				return;
+			}
+			mOwnContext = true;
+		}
+		else if (mInteropMode == PxCudaInteropMode::D3D10_INTEROP)
+		{
+			status = cuD3D10CtxCreate(&mCtx, &mDevHandle, (unsigned int)flags,
+			                          reinterpret_cast<ID3D10Device*>(desc.graphicsDevice));
+
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuD3D10CtxCreate failed",__FILE__,__LINE__);
+				return;
+			}
+			mOwnContext = true;
+		}
+		else if (mInteropMode == PxCudaInteropMode::D3D11_INTEROP)
+		{
+			status = cuD3D11CtxCreate(&mCtx, &mDevHandle, (unsigned int)flags,
+			                          reinterpret_cast<ID3D11Device*>(desc.graphicsDevice));
+
+			if (CUDA_SUCCESS != status)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuD3D11CtxCreate failed",__FILE__,__LINE__);
+				return;
+			}
+			mOwnContext = true;
+		}
+#endif //PX_WIN32 || PX_WIN64
+		else
+		{
+			errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "Requested interop type is not supported!",__FILE__,__LINE__);
+			return;
+		}
+	}
+	else
+	{
+		// Adopt the caller's context; mOwnContext stays false so it is never destroyed here
+		mCtx = *desc.ctx;
+		status = cuCtxGetDevice(&mDevHandle);
+		if (CUDA_SUCCESS != status)
+		{
+			errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuCtxGetDevice failed",__FILE__,__LINE__);
+			return;
+		}
+	}
+
+	// Verify we can at least allocate a CUDA event from this context
+	CUevent testEvent;
+	if (CUDA_SUCCESS == cuEventCreate(&testEvent, 0))
+	{
+		cuEventDestroy(testEvent);
+	}
+	else
+	{
+		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "CUDA context validation failed",__FILE__,__LINE__);
+		return;
+	}
+
+	status = cuDeviceGetName(mDeviceName, sizeof(mDeviceName), mDevHandle);
+	if (CUDA_SUCCESS != status)
+	{
+		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuDeviceGetName failed",__FILE__,__LINE__);
+		return;
+	}
+
+	// Cache the attributes served by the getters
+	cuDeviceGetAttribute(&mSharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, mDevHandle);
+	cuDeviceGetAttribute(&mSharedMemPerMultiprocessor, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, mDevHandle);
+	cuDeviceGetAttribute(&mClockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, mDevHandle);
+	cuDeviceGetAttribute(&mComputeCapMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, mDevHandle);
+	cuDeviceGetAttribute(&mComputeCapMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, mDevHandle);
+	cuDeviceGetAttribute(&mIsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, mDevHandle);
+	cuDeviceGetAttribute(&mCanMapHost, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, mDevHandle);
+	cuDeviceGetAttribute(&mMultiprocessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, mDevHandle);
+	cuDeviceGetAttribute(&mMaxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, mDevHandle);
+
+	status = cuDeviceTotalMem((size_t*)&mTotalMemBytes, mDevHandle);
+	if (CUDA_SUCCESS != status)
+	{
+		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuDeviceTotalMem failed",__FILE__,__LINE__);
+		return;
+	}
+
+	// minimum compute capability is MIN_SM_MAJOR_VERSION.MIN_SM_MINOR_VERSION
+	if ((mComputeCapMajor < MIN_SM_MAJOR_VERSION)	||
+		(mComputeCapMajor == MIN_SM_MAJOR_VERSION && mComputeCapMinor < MIN_SM_MINOR_VERSION))
+	{
+		char buffer[256];
+		physx::shdfnd::snprintf(buffer, 256, "Minimum GPU compute capability %d.%d is required", MIN_SM_MAJOR_VERSION, MIN_SM_MINOR_VERSION);
+		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING,buffer,__FILE__,__LINE__);
+		return;
+	}
+
+	mMemMgr = PX_NEW(CudaMemMgr)(*this, errorCallback);
+	if (mMemMgr == NULL)
+	{
+		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "CudaMemMgr failed: Unable to allocate heaps",__FILE__,__LINE__);
+		return;
+	}
+
+	// Apply the per-memory-space heap parameters from the descriptor
+	bool succ = true;
+	for (uint32_t i = 0; i < PxCudaBufferMemorySpace::COUNT; i++)
+	{
+		PxCudaBufferType type(PxCudaBufferMemorySpace::Enum(i), PxCudaBufferFlags::F_READ_WRITE);
+		succ &= mMemMgr->setBaseSize(type, desc.memoryBaseSize[i]);
+		succ &= mMemMgr->setPageSize(type, desc.memoryPageSize[i]);
+		succ &= mMemMgr->setMaxMemorySize(type, desc.maxMemorySize[i]);
+		PX_ASSERT(succ);
+		if (!succ)
+		{
+			errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "CudaMemMgr failed: Invalid memory parameter",__FILE__,__LINE__);
+			return;
+		}
+	}
+
+#if PX_DEBUG
+	// If this thread holds no acquire on the context, make sure none is left current
+	if(!shdfnd::TlsGet(mContextRefCountTls))
+		CUT_SAFE_CALL(cuCtxSetCurrent(0));
+#endif
+
+	// The dispatcher is constructed with the manager already marked valid;
+	// the flag is cleared again if the dispatcher turns out to be unusable.
+	mIsValid = true;
+	mDispatcher = PX_NEW(GpuDispatcherImpl)(errorCallback, *this);
+	if (!mDispatcher || mDispatcher->failureDetected())
+	{
+		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "Failed to create functional GPU dispatcher",__FILE__,__LINE__);
+		mIsValid = false;
+		return;
+	}
+
+	mDispatcher->start();
+
+#if ENABLE_DEVICE_INFO_BRINGUP
+	// Device info (Enable for Amodel and Emulator testing)
+	// reportError() takes a finished message plus file/line, so format first.
+	{
+		char info[256];
+		physx::shdfnd::snprintf(info, 256, "Device Name: %s", mDeviceName);
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, info, __FILE__, __LINE__);
+		physx::shdfnd::snprintf(info, 256, "Shared Memory Per Block: %d", mSharedMemPerBlock);
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, info, __FILE__, __LINE__);
+		physx::shdfnd::snprintf(info, 256, "Shared Memory Per Multiprocessor: %d", mSharedMemPerMultiprocessor);
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, info, __FILE__, __LINE__);
+		physx::shdfnd::snprintf(info, 256, "Number of SM: %d", mMultiprocessorCount);
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, info, __FILE__, __LINE__);
+		physx::shdfnd::snprintf(info, 256, "Max Threads Per Block: %d", mMaxThreadsPerBlock);
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, info, __FILE__, __LINE__);
+	}
+#endif
+
+#if USE_PERFKIT
+	{
+#if _WIN64
+		wchar_t * dllName = L"..\\..\\..\\..\\..\\externals\\nvPerfKit\\4.1.0.14260\\bin\\win7_x64\\NvPmApi.Core.dll";
+#else
+		wchar_t * dllName = L"..\\..\\..\\..\\..\\externals\\nvPerfKit\\4.1.0.14260\\bin\\win7_x86\\NvPmApi.Core.dll";
+#endif
+
+		NVPMRESULT nvResult;
+
+		if ((nvResult = GetNvPmApiManager()->Construct(dllName)) != NVPM_OK)
+		{
+			printf("perfkit error 1\n");
+			return; 
+		}
+
+		if ((nvResult = GetNvPmApi()->Init()) != NVPM_OK)
+		{
+			printf("perfkit error 2\n");
+			return; 
+		}
+
+		acquireContext();
+
+		CUcontext ctx;
+		cuCtxGetCurrent(&ctx);
+		if ((nvResult = GetNvPmApi()->CreateContextFromCudaContext((APIContextHandle)ctx, &hNVPMContext)) != NVPM_OK)
+		{
+			printf("perfkit error 3\n");
+			return; // This is an error condition
+		}
+
+		uint32_t nvStatus = 0;
+
+#if COUNT_L2_TO_L1_BYTES
+		nvStatus |= GetNvPmApi()->AddCounterByName(hNVPMContext, "l2_read_bytes");
+#elif COUNT_SM_TO_L1_QUERIES
+		nvStatus |= GetNvPmApi()->AddCounterByName(hNVPMContext, "tex_cache_sector_queries");
+#endif
+
+#if COUNT_INST_EXECUTED || COUNT_STORE_INST_EXECUTED || COUNT_ACTIVE_CYCLES || COUNT_ACTIVE_WARPS
+		char name[512];
+		for (int i = 0; i != SM_COUNT; i++)
+		{
+#if COUNT_INST_EXECUTED
+			sprintf_s(name,512,"sm_inst_executed_vsm%d",i);
+#elif COUNT_STORE_INST_EXECUTED
+			sprintf_s(name, 512, "sm_inst_executed_global_stores_vsm%d",i);
+#elif COUNT_ACTIVE_CYCLES
+			sprintf_s(name, 512, "sm_active_cycles_vsm%d",i);
+#elif COUNT_ACTIVE_WARPS
+			sprintf_s(name, 512, "sm_active_warps_vsm%d",i);
+#endif
+			nvStatus |= GetNvPmApi()->AddCounterByName(hNVPMContext, name);
+		}
+#elif COUNT_GPU_BUSY
+		nvStatus |= GetNvPmApi()->AddCounterByName(hNVPMContext, "gpu_busy");
+#endif
+
+		if (nvStatus != 0)
+		{
+			printf("perfkit error 4\n");
+			return; // This is an error condition
+		}
+	}
+#endif
+}
+
+/* Some driver version mismatches can cause delay import crashes.  Load NVCUDA.dll
+ * manually, verify its version number, then allow delay importing to bind all the
+ * APIs.
+ */
+/*
+ * Loads the CUDA driver library by hand, checks that it exports
+ * cuDriverGetVersion and that the reported version is at least the
+ * CUDA_VERSION this file was compiled against, then calls cuDriverGetVersion
+ * directly to trigger delay-import binding.
+ *
+ * errorCallback - receives a diagnostic for whichever step failed.
+ * Returns true when every step succeeded; mDriverVersion is written by the
+ * version queries.  The manually loaded library handle is released on every
+ * path once it has been obtained.
+ */
+bool CudaCtxMgr::safeDelayImport(PxErrorCallback& errorCallback)
+{
+#if PX_WIN32 || PX_WIN64
+#ifdef PX_SECURE_LOAD_LIBRARY
+	HMODULE hCudaDriver = nvLoadSystemLibrary("nvcuda.dll");
+#else
+	HMODULE hCudaDriver = LoadLibrary("nvcuda.dll");
+#endif
+#elif PX_LINUX
+	void*	hCudaDriver = dlopen("libcuda.so", RTLD_NOW);
+#endif
+	if (!hCudaDriver)
+	{
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "nvcuda.dll not found or could not be loaded.", __FILE__, __LINE__);
+		return false;
+	}
+
+	/* Let's require a driver version >= to the version we compile against
+	 * Currently, CUDA_VERSION is 6000 or 6.0, but APEX still uses CUDA 5.0 so we can't assert on 6.0 yet.
+	 */
+	PX_COMPILE_TIME_ASSERT(5000 <= CUDA_VERSION);
+
+	bool bound = false;
+
+	typedef CUresult(CUDAAPI * pfnCuDriverGetVersion_t)(int*);
+	pfnCuDriverGetVersion_t pfnCuDriverGetVersion = (pfnCuDriverGetVersion_t) GetProcAddress(hCudaDriver, "cuDriverGetVersion");
+	if (!pfnCuDriverGetVersion)
+	{
+		errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "cuDriverGetVersion missing in nvcuda.dll.", __FILE__, __LINE__);
+	}
+	else
+	{
+		CUresult status = pfnCuDriverGetVersion(&mDriverVersion);
+		if (status != CUDA_SUCCESS)
+		{
+			errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "Retrieving CUDA driver version failed.", __FILE__, __LINE__);
+		}
+		else if (mDriverVersion < CUDA_VERSION)
+		{
+			char buffer[256];
+			physx::shdfnd::snprintf(buffer, 256, "CUDA driver version is %u, expected at least %u.", mDriverVersion, CUDA_VERSION);
+			errorCallback.reportError(PxErrorCode::eDEBUG_INFO, buffer, __FILE__,__LINE__);
+		}
+		else
+		{
+			/* Now trigger delay import and API binding */
+			status = cuDriverGetVersion(&mDriverVersion);
+			if (status != CUDA_SUCCESS)
+			{
+				errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "Failed to bind CUDA API.", __FILE__, __LINE__);
+			}
+			else
+			{
+				bound = true;
+			}
+		}
+	}
+
+	/* Drop the manual reference on success and on failure alike, so a
+	 * rejected driver does not stay pinned by this probe. */
+#if PX_WIN32 || PX_WIN64
+	FreeLibrary(hCudaDriver);
+#elif PX_LINUX
+	dlclose(hCudaDriver);
+#endif
+
+	return bound;
+}
+
+// Destroys this manager.  The object must not be used after this call.
+void CudaCtxMgr::release()
+{
+	delete this;
+}
+
+// Tears down in order: dispatcher, memory manager, then the CUDA context
+// (only when this manager created it).
+CudaCtxMgr::~CudaCtxMgr()
+{
+	if (mDispatcher != NULL)
+	{
+		releaseGpuDispatcher(*mDispatcher);
+	}
+
+	if (mMemMgr != NULL)
+	{
+		delete mMemMgr;
+	}
+
+	// An adopted (caller-supplied) context is left alone
+	if (mOwnContext)
+	{
+		CUT_SAFE_CALL(cuCtxDestroy(mCtx));
+	}
+
+#if PX_DEBUG
+	// Last manager gone: give the debug TLS slot back
+	--mManagerRefCount;
+	if (mManagerRefCount == 0)
+	{
+		shdfnd::TlsFree(mContextRefCountTls);
+	}
+#endif
+
+#if ENABLE_CUDA_DEVICE_RESET
+	CUT_SAFE_CALL(cuProfilerStop());
+#endif
+}
+
+// Registers an OpenGL buffer object with CUDA.
+// resource receives the CUDA graphics resource handle; buffer is the GL
+// buffer name; flags are passed through as an integer.
+// Returns true when the driver call reported CUDA_SUCCESS.
+bool CudaCtxMgr::registerResourceInCudaGL(CUgraphicsResource& resource, uint32_t buffer, PxCudaInteropRegisterFlags flags)
+{
+	acquireContext();
+
+	PX_ASSERT(mInteropMode == PxCudaInteropMode::OGL_INTEROP);
+
+	const CUresult status = cuGraphicsGLRegisterBuffer(&resource, (GLuint) buffer, uint32_t(flags));
+
+	releaseContext();
+
+	return CUDA_SUCCESS == status;
+}
+
+// Registers a Direct3D resource with CUDA using the API that matches the
+// active interop mode.  resourcePointer is reinterpreted as the resource type
+// of that D3D version.  Returns true when the driver call reported
+// CUDA_SUCCESS; always false on non-Windows builds and for non-D3D modes.
+bool CudaCtxMgr::registerResourceInCudaD3D(CUgraphicsResource& resource, void* resourcePointer, PxCudaInteropRegisterFlags flags)
+{
+	CUresult status = CUDA_ERROR_UNKNOWN;
+#if PX_WINDOWS_FAMILY
+	acquireContext();
+
+	if (mInteropMode == PxCudaInteropMode::D3D9_INTEROP)
+	{
+		status = cuGraphicsD3D9RegisterResource(&resource, (IDirect3DResource9*)resourcePointer, uint32_t(flags));
+	}
+	else if (mInteropMode == PxCudaInteropMode::D3D10_INTEROP)
+	{
+		status = cuGraphicsD3D10RegisterResource(&resource, (ID3D10Resource*)resourcePointer, uint32_t(flags));
+	}
+	else if (mInteropMode == PxCudaInteropMode::D3D11_INTEROP)
+	{
+		status = cuGraphicsD3D11RegisterResource(&resource, (ID3D11Resource*)resourcePointer, uint32_t(flags));
+	}
+	else
+	{
+		// NO_INTEROP, OGL_INTEROP, COUNT or anything else: not a D3D mode
+		PX_ALWAYS_ASSERT_MESSAGE("unexpected state in registerResourceInCuda3D");
+	}
+
+	releaseContext();
+#else
+	PX_UNUSED(resource);
+	PX_UNUSED(resourcePointer);
+	PX_UNUSED(flags);
+#endif //PX_WINDOWS_FAMILY
+	return CUDA_SUCCESS == status;
+}
+
+// Unregisters a graphics resource previously registered with CUDA.
+// Returns true when the driver call reported CUDA_SUCCESS.
+bool CudaCtxMgr::unregisterResourceInCuda(CUgraphicsResource resource)
+{
+	acquireContext();
+
+	const CUresult status = cuGraphicsUnregisterResource(resource);
+
+	releaseContext();
+
+	return CUDA_SUCCESS == status;
+}
+
+// Makes this manager's context current on the calling thread (if it is not
+// already) and returns it.  Pair every call with releaseContext().
+CUcontext CudaCtxMgr::acquireContext()
+{
+	CUcontext ctx = 0;
+	CUT_SAFE_CALL(cuCtxGetCurrent(&ctx));
+
+	if (ctx != mCtx)
+	{
+#if PX_DEBUG
+		// A different context is current, so this thread must not be holding an acquire
+		PX_ASSERT(!shdfnd::TlsGet(mContextRefCountTls));
+#endif
+		CUT_SAFE_CALL(cuCtxSetCurrent(mCtx));
+	}
+
+#if PX_DEBUG
+	// The TLS slot's pointer value is used as a per-thread integer counter:
+	// incrementing a char* adds exactly one.
+	char* refCount = (char*)shdfnd::TlsGet(mContextRefCountTls);
+	shdfnd::TlsSet(mContextRefCountTls, ++refCount);
+#endif
+
+	return mCtx;
+}
+
+// Counterpart of acquireContext().  Only PX_DEBUG builds do any work here:
+// they decrement the per-thread acquire count and unbind the context once it
+// reaches zero.  In other builds the body is empty, so the context stays
+// current on the thread.
+void CudaCtxMgr::releaseContext()
+{
+#if PX_DEBUG
+	// Same pointer-as-counter scheme as in acquireContext()
+	char* refCount = (char*)shdfnd::TlsGet(mContextRefCountTls);
+	shdfnd::TlsSet(mContextRefCountTls, --refCount);
+	// see DE8475
+	if(!refCount)
+		CUT_SAFE_CALL(cuCtxSetCurrent(0));
+#endif
+}
+
+#if PX_SUPPORT_GPU_PHYSX
+extern "C" void initUtilKernels();
+
+// Factory for the CUDA context manager.  The returned object may be invalid;
+// callers are expected to check contextIsValid() on it.
+PxCudaContextManager* createCudaContextManager(const PxCudaContextManagerDesc& desc, PxErrorCallback& errorCallback)
+{
+	//this call is needed to force UtilKernels linkage in case someone links PxCudaContextManager as Static Library!
+	initUtilKernels();
+
+	CudaCtxMgr* manager = PX_NEW(CudaCtxMgr)(desc, errorCallback);
+	return manager;
+}
+
+#endif
+
+} // end physx namespace
+
+
diff --git a/PxShared/src/cudamanager/src/CudaKernelWrangler.cpp b/PxShared/src/cudamanager/src/CudaKernelWrangler.cpp
new file mode 100644
index 0000000..7579d63
--- /dev/null
+++ b/PxShared/src/cudamanager/src/CudaKernelWrangler.cpp
@@ -0,0 +1,242 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "task/PxGpuDispatcher.h"
+#include "cudamanager/PxCudaContextManager.h"
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxErrorCallback.h"
+
+#include "PsString.h"
+
+#include "CudaKernelWrangler.h"
+
+#include <cuda.h>
+#include <texture_types.h>
+
+/**
+ * Workaround hacks for using nvcc --compiler output object files
+ * without linking with CUDART.  We must implement our own versions
+ * of these functions that the object files are hard-coded to call into.
+ */
+
+// Evaluates a CUDA driver call and raises PX_ASSERT when the result is not CUDA_SUCCESS.
+#define CUT_SAFE_CALL(call)  { CUresult ret = call; if( CUDA_SUCCESS != ret ) { PX_ASSERT(!ret); } }
+
+#define MAX_MODULES					64  // Max number of .cu files you will build
+// Fat binary images recorded by __cudaRegisterFatBinary; the returned handle is the index into this table.
+static void* gModuleTable[ MAX_MODULES ];
+static int gNumModules = 0;
+
+#define MAX_FUNCTIONS				256 // Max number of kernel entry points
+// Pairs a registered kernel name with the index of the module it was registered for.
+typedef struct
+{
+	int moduleIndex;
+	const char* functionName;
+} cuFuncDesc;
+// Filled by __cudaRegisterFunction and searched by the KernelWrangler constructor.
+static cuFuncDesc gFunctionTable[ MAX_FUNCTIONS ];
+static int gNumFunctions = 0;
+
+using namespace physx::shdfnd;
+using namespace physx;
+		
+// Loads every image recorded in gModuleTable as a CUDA module and resolves
+// each name in funcNames to a CUfunction handle via gFunctionTable.
+// On any failure the problem is reported through the error callback, mError
+// is set to true and construction stops early.
+KernelWrangler::KernelWrangler(PxGpuDispatcher& gd, PxErrorCallback& errorCallback, const char** funcNames, uint16_t numFuncs)
+	: mError(false)
+	, mCuFunctions(PX_DEBUG_EXP("CuFunctions"))
+	, mCuModules(PX_DEBUG_EXP("CuModules"))
+	, mGpuDispatcher(gd)
+	, mErrorCallback(errorCallback)
+{
+	PxScopedCudaLock _lock_(*gd.getCudaContextManager());
+
+	// Step 1: turn each registered image into a CUmodule handle.
+	const uint32_t moduleCount = (uint32_t)gNumModules;
+	mCuModules.resize(moduleCount);
+	for (uint32_t m = 0; m < moduleCount; ++m)
+	{
+		const CUresult loadResult = cuModuleLoadDataEx(&mCuModules[m], gModuleTable[m], 0, NULL, NULL);
+
+		// An image without a binary for this GPU is tolerated; any other failure aborts.
+		const bool tolerated = (loadResult == CUDA_SUCCESS) || (loadResult == CUDA_ERROR_NO_BINARY_FOR_GPU);
+		if (!tolerated)
+		{
+			mErrorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, "Failed to load CUDA module data.", __FILE__, __LINE__);
+			mError = true;
+			return;
+		}
+	}
+
+	// Step 2: look each requested kernel name up in the registration table.
+	mCuFunctions.resize(numFuncs);
+	mCuFuncModIndex.resize(numFuncs);
+	for (uint32_t f = 0; f < numFuncs; ++f)
+	{
+		const char* const wanted = funcNames[f];
+
+		// Linear search for the first table entry whose name matches.
+		int slot = 0;
+		while (slot != gNumFunctions && physx::shdfnd::strcmp(gFunctionTable[slot].functionName, wanted))
+		{
+			++slot;
+		}
+
+		if (slot == gNumFunctions)
+		{
+			char message[256];
+			physx::shdfnd::snprintf(message, 256, "Could not find registered CUDA function '%s'.", wanted);
+			mErrorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, message, __FILE__, __LINE__);
+			mError = true;
+			return;
+		}
+
+		mCuFuncModIndex[f] = (uint16_t)gFunctionTable[slot].moduleIndex;
+		const CUresult lookupResult = cuModuleGetFunction(&mCuFunctions[f], mCuModules[mCuFuncModIndex[f]], wanted);
+		if (lookupResult != CUDA_SUCCESS)
+		{
+			char message[256];
+			physx::shdfnd::snprintf(message, 256, "Could not find CUDA module containing function '%s'.", wanted);
+			mErrorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, message, __FILE__, __LINE__);
+			mError = true;
+			return;
+		}
+	}
+}
+
+// Unloads every non-null CUDA module held in mCuModules. The CUDA lock is
+// only taken when there is at least one module slot.
+KernelWrangler::~KernelWrangler()
+{
+	const uint32_t moduleCount = mCuModules.size();
+	if (moduleCount == 0)
+	{
+		return;
+	}
+
+	PxScopedCudaLock _lock_(*mGpuDispatcher.getCudaContextManager());
+
+	for (uint32_t m = 0; m < moduleCount; ++m)
+	{
+		if (!mCuModules[m])
+		{
+			continue;
+		}
+		CUT_SAFE_CALL(cuModuleUnload(mCuModules[m]));
+	}
+}
+
+// Returns the file-level table of registered fat binary images (gModuleTable).
+void const* const* KernelWrangler::getImages()
+{
+	return gModuleTable;
+}
+
+// Returns how many images have been registered in gModuleTable so far.
+int KernelWrangler::getNumImages()
+{
+	return gNumModules;
+}
+
+/*
+ * These calls are all made _before_ main() during static initialization
+ * of this DLL.
+ */
+
+#include <driver_types.h>
+
+#if PX_WINDOWS_FAMILY
+#define CUDARTAPI __stdcall
+#endif
+
+struct uint3;
+struct dim3;
+
+// Local implementation of the fat binary registration entry point. Stores the
+// image pointer in gModuleTable and returns the table index cast to a handle.
+// Returns NULL when MAX_MODULES images are already registered.
+// NOTE(review): the first registered image gets index 0, which is
+// indistinguishable from the NULL failure value.
+extern "C"
+void** CUDARTAPI __cudaRegisterFatBinary(void* fatBin)
+{
+	//HACK to get real fatbin in CUDA 4.0
+	struct CUIfatbinStruct
+	{
+		int magic;
+		int version;
+		void *fatbinArray;
+		char *fatbinFile;
+	};
+	const CUIfatbinStruct *fatbinStruct = (const CUIfatbinStruct *)fatBin;
+	// When the wrapper magic matches, unwrap to the embedded image pointer.
+	if (fatbinStruct->magic == 0x466243B1)
+	{
+		fatBin = fatbinStruct->fatbinArray;
+	}
+
+	if (gNumModules < MAX_MODULES)
+	{
+		gModuleTable[ gNumModules ] = fatBin;
+		return (void**)(size_t) gNumModules++;
+	}
+	return NULL;
+}
+
+// Local implementation of the fat binary unregistration entry point. The
+// handle is the table index handed out by __cudaRegisterFatBinary; the
+// matching gModuleTable slot is cleared. Handles that do not map to a
+// registered slot are ignored instead of writing outside the table.
+extern "C"
+void CUDARTAPI __cudaUnregisterFatBinary(void** fatCubinHandle)
+{
+	const size_t moduleIndex = (size_t) fatCubinHandle;
+	if (moduleIndex < (size_t) gNumModules)
+	{
+		gModuleTable[ moduleIndex ] = 0;
+	}
+}
+
+// Intentionally empty: texture registration requests are accepted and ignored.
+extern "C"
+void CUDARTAPI __cudaRegisterTexture(void**, const struct textureReference*, const void**, const char*, int, int, int)
+{
+}
+
+// Intentionally empty: variable registration requests are accepted and ignored.
+extern "C" void CUDARTAPI __cudaRegisterVar(void**, char*, char*, const char*, int, int, int, int)
+{
+}
+
+
+// Intentionally empty: shared-variable registration requests are accepted and ignored.
+extern "C" void CUDARTAPI __cudaRegisterShared(void**, void**)
+{
+}
+
+// Local implementation of the kernel registration entry point. Appends
+// (module index, deviceName) to gFunctionTable. Only the deviceName pointer is
+// stored, not a copy of the string. Once MAX_FUNCTIONS entries exist, further
+// registrations are dropped without any error.
+extern "C"
+void CUDARTAPI __cudaRegisterFunction(void** fatCubinHandle, const char*, 
+	char*, const char* deviceName, int, uint3*, uint3*, dim3*, dim3*, int*)
+{
+	if (gNumFunctions < MAX_FUNCTIONS)
+	{
+		// We need this association of function to module in order to find textures and globals
+		gFunctionTable[ gNumFunctions ].moduleIndex = (int)(size_t) fatCubinHandle;
+		gFunctionTable[ gNumFunctions ].functionName = deviceName;
+		gNumFunctions++;
+	}
+}
+
+/* These functions are implemented just to resolve link dependencies */
+
+// Link-time stub: ignores its argument and always reports cudaSuccess.
+extern "C"
+cudaError_t CUDARTAPI cudaLaunch(const char* entry)
+{
+	PX_UNUSED(entry);
+	return cudaSuccess;
+}
+
+// Link-time stub: ignores its arguments and always reports cudaSuccess.
+extern "C"
+cudaError_t CUDARTAPI cudaSetupArgument(const void*, size_t, size_t)
+{
+	return cudaSuccess;
+}
+
+// Builds a cudaChannelFormatDesc by copying the five arguments into the
+// fields of the same name and returns it by value.
+extern "C"
+struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(
+    int x, int y, int z, int w, enum cudaChannelFormatKind f)
+{
+	struct cudaChannelFormatDesc desc;
+	desc.x = x;
+	desc.y = y;
+	desc.z = z;
+	desc.w = w;
+	desc.f = f;
+	return desc;
+}
+
diff --git a/PxShared/src/cudamanager/src/CudaMemoryManager.cpp b/PxShared/src/cudamanager/src/CudaMemoryManager.cpp
new file mode 100644
index 0000000..b1c6f94
--- /dev/null
+++ b/PxShared/src/cudamanager/src/CudaMemoryManager.cpp
@@ -0,0 +1,649 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "cudamanager/PxCudaContextManager.h"
+#include "foundation/PxMath.h"
+#include "foundation/PxMemory.h"
+
+#include "CudaMemoryManager.h"
+#include "HeapManagerRef.h"
+
+#include <cuda.h>
+
+// Default base size and page size for each memory space; the CudaMemMgr
+// constructor copies them into mMemoryBaseSize / mMemoryPageSize.
+// NOTE(review): units are presumably bytes - confirm against HeapManagerRef::init.
+#define DEVICE_BASE_SIZE (0)
+#define DEVICE_PAGE_SIZE ( 2 * 1024*1024)
+#define PINNED_BASE_SIZE (0)
+#define PINNED_PAGE_SIZE ( 2 * 1024*1024)
+#define WC_BASE_SIZE (0)
+#define WC_PAGE_SIZE ( 2 * 1024*1024)
+// Passed as the last argument of HeapManagerInterface::init() for every heap.
+#define MIN_BLOCK_SIZE 2048
+
+
+// Delete helpers that skip null pointers. They do not reset the pointer afterwards.
+#define CMM_DELETE_SINGLE(x)	{ if(x) delete x; }
+#define CMM_DELETE_ARRAY(x)		{ if(x) delete [] x; }
+
+using namespace physx;
+
+// Sets up the manager in its uninitialized state: no heaps or allocators
+// exist yet (they are created by initialize()), every memory space starts
+// with a maximum size of size_t(-1), and base/page sizes take their defaults.
+CudaMemMgr::CudaMemMgr(PxCudaContextManager& mgr, physx::PxErrorCallback& errorCallback)
+	: mErrorCallback(errorCallback)
+	, mBufferPool("mBufferPool", 1024)
+	, mInitialized(false)
+	, mMgr(mgr)
+	, mDebugDisableAllocs(false)
+{
+	for (uint32_t i = 0; i < PxCudaBufferMemorySpace::COUNT; i++)
+	{
+		mHeap[i] = NULL;
+		mMemoryAllocator[i] = NULL;
+		mMemoryMaxSize[i] = size_t(-1);
+	}
+
+	mMemoryBaseSize[PxCudaBufferMemorySpace::T_GPU] = DEVICE_BASE_SIZE;
+	mMemoryBaseSize[PxCudaBufferMemorySpace::T_PINNED_HOST] = PINNED_BASE_SIZE;
+	mMemoryBaseSize[PxCudaBufferMemorySpace::T_WRITE_COMBINED] = WC_BASE_SIZE;
+	mMemoryBaseSize[PxCudaBufferMemorySpace::T_HOST] = 0;
+
+	mMemoryPageSize[PxCudaBufferMemorySpace::T_GPU] = DEVICE_PAGE_SIZE;
+	mMemoryPageSize[PxCudaBufferMemorySpace::T_PINNED_HOST] = PINNED_PAGE_SIZE;
+	mMemoryPageSize[PxCudaBufferMemorySpace::T_WRITE_COMBINED] = WC_PAGE_SIZE;
+	// Plain host memory reuses the pinned page size.
+	mMemoryPageSize[PxCudaBufferMemorySpace::T_HOST] = PINNED_PAGE_SIZE;
+}
+
+
+// Destroys the heap and then the allocator of every memory space, in
+// ascending memory-space order. Null entries are skipped by the macro.
+CudaMemMgr::~CudaMemMgr()
+{
+	uint32_t space = 0;
+	while (space < PxCudaBufferMemorySpace::COUNT)
+	{
+		CMM_DELETE_SINGLE(mHeap[space]);
+		CMM_DELETE_SINGLE(mMemoryAllocator[space]);
+		++space;
+	}
+}
+
+
+// Creates one heap and one allocator per memory space and initializes each
+// heap with its allocator, base size, page size and MIN_BLOCK_SIZE.
+// Returns true when the manager is (or already was) initialized. On failure
+// everything created here is destroyed again, the pointers are reset to NULL
+// and false is returned, so a later call can retry and the destructor does
+// not delete the same objects a second time.
+PX_INLINE bool CudaMemMgr::initialize()
+{
+	if (mInitialized)
+	{
+		return true;
+	}
+
+	for (uint32_t i = 0; i < PxCudaBufferMemorySpace::COUNT; i++)
+	{
+		mHeap[i] = PX_NEW(HeapManagerRef)(mErrorCallback, false);
+		PX_ASSERT(mHeap[i]);
+	}
+
+	mMemoryAllocator[PxCudaBufferMemorySpace::T_GPU]				= PX_NEW(DeviceMemAllocator)(mMgr, mMemoryMaxSize[PxCudaBufferMemorySpace::T_GPU]);
+	mMemoryAllocator[PxCudaBufferMemorySpace::T_PINNED_HOST]		= PX_NEW(PinnedMemAllocator)(mMgr, mMemoryMaxSize[PxCudaBufferMemorySpace::T_PINNED_HOST]);
+	mMemoryAllocator[PxCudaBufferMemorySpace::T_WRITE_COMBINED]	= PX_NEW(WriteCombinedMemAllocator)(mMgr, mMemoryMaxSize[PxCudaBufferMemorySpace::T_WRITE_COMBINED]);
+	mMemoryAllocator[PxCudaBufferMemorySpace::T_HOST]				= PX_NEW(HostMemAllocator)(mMemoryMaxSize[PxCudaBufferMemorySpace::T_HOST]);
+
+	// Verify every object exists before any of them is dereferenced.
+	bool succ = true;
+	for (uint32_t i = 0; i < PxCudaBufferMemorySpace::COUNT; i++)
+	{
+		succ = succ && mHeap[i] && mMemoryAllocator[i];
+	}
+
+	// Stop at the first heap that fails to initialize; everything is torn down below anyway.
+	for (uint32_t i = 0; succ && i < PxCudaBufferMemorySpace::COUNT; i++)
+	{
+		succ = mHeap[i]->init(mMemoryAllocator[i], mMemoryBaseSize[i], mMemoryPageSize[i], MIN_BLOCK_SIZE);
+		PX_ASSERT(succ);
+	}
+
+	if (!succ)
+	{
+		for (uint32_t i = 0; i < PxCudaBufferMemorySpace::COUNT; i++)
+		{
+			// The heap goes first, then its allocator; clear both so nothing dangles.
+			CMM_DELETE_SINGLE(mHeap[i]);
+			mHeap[i] = NULL;
+			CMM_DELETE_SINGLE(mMemoryAllocator[i]);
+			mMemoryAllocator[i] = NULL;
+		}
+	}
+
+	mInitialized = succ;
+	return succ;
+}
+
+
+// Changes the page size of one memory space.
+// Before initialization the stored page size can only grow (maximum of the
+// old and the requested value) and the call always succeeds. After
+// initialization the request is forwarded to the heap, and the stored value
+// is updated only when the heap accepts it.
+bool CudaMemMgr::setPageSize(const PxCudaBufferType& type, size_t size)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	size_t& pageSize = mMemoryPageSize[type.memorySpace];
+
+	if (mInitialized)
+	{
+		const bool accepted = mHeap[type.memorySpace]->setPageSize(size);
+		if (accepted)
+		{
+			pageSize = size;
+		}
+		return accepted;
+	}
+
+	pageSize = PxMax(pageSize, size);
+	return true;
+}
+
+
+// Raises the base size of one memory space. Only allowed before the manager
+// is initialized, and only for sizes where (size - 1) & size is zero, i.e.
+// zero or a power of two. The stored value never shrinks.
+// Returns false when the request is rejected.
+bool CudaMemMgr::setBaseSize(const PxCudaBufferType& type, size_t size)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	const bool sizeIsValid = ((size - 1) & size) == 0;
+	if (mInitialized || !sizeIsValid)
+	{
+		return false;
+	}
+
+	size_t& baseSize = mMemoryBaseSize[type.memorySpace];
+	baseSize = PxMax(baseSize, size);
+	return true;
+}
+
+
+// Returns the stored base size for the memory space of 'type'. Reads the
+// value without taking mMutex.
+size_t CudaMemMgr::getBaseSize(const PxCudaBufferType& type)
+{
+	return mMemoryBaseSize[type.memorySpace];
+}
+
+
+// Returns the stored page size for the memory space of 'type'. Reads the
+// value without taking mMutex.
+size_t CudaMemMgr::getPageSize(const PxCudaBufferType& type)
+{
+	return mMemoryPageSize[type.memorySpace];
+}
+
+
+// Sets the upper limit on memory the allocator of one memory space may hold.
+// Before initialization the stored limit is replaced by the maximum of the
+// old and the requested value; afterwards the limit is handed directly to
+// the matching allocator. Returns false only for an unknown memory space.
+// NOTE(review): the stored limit starts at size_t(-1) in the constructor, so
+// the PxMax below cannot lower it before initialization - confirm intent.
+bool CudaMemMgr::setMaxMemorySize(const PxCudaBufferType& type, size_t size)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	if (!mInitialized)
+	{
+		size_t& pendingLimit = mMemoryMaxSize[type.memorySpace];
+		pendingLimit = PxMax(pendingLimit, size);
+		return true;
+	}
+
+	switch (type.memorySpace)
+	{
+	case PxCudaBufferMemorySpace::T_GPU:
+		static_cast<DeviceMemAllocator*>(mMemoryAllocator[type.memorySpace])->setMaxSize(size);
+		return true;
+	case PxCudaBufferMemorySpace::T_PINNED_HOST:
+		static_cast<PinnedMemAllocator*>(mMemoryAllocator[type.memorySpace])->setMaxSize(size);
+		return true;
+	case PxCudaBufferMemorySpace::T_HOST:
+		static_cast<HostMemAllocator*>(mMemoryAllocator[type.memorySpace])->setMaxSize(size);
+		return true;
+	case PxCudaBufferMemorySpace::T_WRITE_COMBINED:
+		static_cast<WriteCombinedMemAllocator*>(mMemoryAllocator[type.memorySpace])->setMaxSize(size);
+		return true;
+	case PxCudaBufferMemorySpace::COUNT:
+	default:
+		PX_ASSERT(!"unknown memory type");
+		break;
+	}
+
+	return false;
+}
+
+// Returns the limit currently held by the allocator of the given memory
+// space. Yields 0 while the manager is not initialized or when the memory
+// space is unknown. mMutex is not taken here.
+size_t CudaMemMgr::getMaxMemorySize(const PxCudaBufferType& type)
+{
+	if (!mInitialized)
+	{
+		return 0;
+	}
+
+	switch (type.memorySpace)
+	{
+	case PxCudaBufferMemorySpace::T_GPU:
+		return static_cast<DeviceMemAllocator*>(mMemoryAllocator[type.memorySpace])->getMaxSize();
+	case PxCudaBufferMemorySpace::T_PINNED_HOST:
+		return static_cast<PinnedMemAllocator*>(mMemoryAllocator[type.memorySpace])->getMaxSize();
+	case PxCudaBufferMemorySpace::T_HOST:
+		return static_cast<HostMemAllocator*>(mMemoryAllocator[type.memorySpace])->getMaxSize();
+	case PxCudaBufferMemorySpace::T_WRITE_COMBINED:
+		return static_cast<WriteCombinedMemAllocator*>(mMemoryAllocator[type.memorySpace])->getMaxSize();
+	case PxCudaBufferMemorySpace::COUNT:
+	default:
+		PX_ASSERT(!"unknown memory type");
+		break;
+	}
+
+	return 0;
+}
+
+// Forwards a reserve request to the heap of the given memory space,
+// initializing the manager on first use. Returns false when initialization
+// fails, otherwise whatever the heap's reserve() returns.
+bool CudaMemMgr::reserve(const PxCudaBufferType& type, size_t size)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	const bool ready = mInitialized || initialize();
+	if (!ready)
+	{
+		return false;
+	}
+
+	return mHeap[type.memorySpace]->reserve(size);
+}
+
+
+// Allocates 'size' from the heap of type.memorySpace and wraps the address in
+// a CudaBuffer taken from mBufferPool.
+// Returns NULL when the heap allocation fails or when the pool cannot supply
+// a buffer object; in the latter case the heap memory is returned to the heap
+// so it is not leaked.
+PxCudaBuffer* CudaMemMgr::alloc(const PxCudaBufferType& type, size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	PxCudaBufferPtr addr = alloc(type.memorySpace, size, PX_ALLOC_INFO_PARAMS_INPUT());
+
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+	CudaBuffer* buffer = NULL;
+	if (addr)
+	{
+		buffer = mBufferPool.construct(type);
+		if (buffer)
+		{
+			buffer->init(addr, size, *this, PX_ALLOC_INFO_PARAMS_INPUT());
+		}
+		else
+		{
+			// A non-zero address can only come from an initialized heap, so it is safe to
+			// hand the block straight back while mMutex is still held.
+			mHeap[type.memorySpace]->free((void*)(addr));
+		}
+	}
+	return buffer;
+}
+
+// Allocates 'size' from the heap of 'memorySpace' and returns the address,
+// or 0 on failure. The manager is initialized on first use. Once
+// debugDisableAllocs() has been called every request returns 0.
+PxCudaBufferPtr CudaMemMgr::alloc(PxCudaBufferMemorySpace::Enum memorySpace, size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	if (!mInitialized)
+	{
+		if (!initialize())
+		{
+			return 0;
+		}
+	}
+
+	// Checked after initialization, so a disabled manager still creates its heaps.
+	if (mDebugDisableAllocs)
+	{
+		return 0;
+	}
+
+	return reinterpret_cast<PxCudaBufferPtr>(mHeap[memorySpace]->alloc(size, PX_ALLOC_INFO_PARAMS_INPUT()));
+}
+
+// Returns an address to the heap of 'memorySpace'.
+// Yields false when the manager is not initialized or 'addr' is 0; otherwise
+// the result of the heap's free() is passed through.
+bool CudaMemMgr::free(PxCudaBufferMemorySpace::Enum memorySpace, PxCudaBufferPtr addr)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	if (!mInitialized || !addr)
+	{
+		return false;
+	}
+
+	return mHeap[memorySpace]->free((void*)(addr));
+}
+
+// Asks the heap of 'memorySpace' to resize the allocation at 'addr'.
+// Returns false when the manager is not initialized, 'size' is 0,
+// allocations have been disabled via debugDisableAllocs(), or 'addr' is 0;
+// otherwise the heap's result is returned.
+bool CudaMemMgr::realloc(PxCudaBufferMemorySpace::Enum memorySpace, PxCudaBufferPtr addr, size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	if (!mInitialized)
+	{
+		return false;
+	}
+
+	if (!size)
+	{
+		return false;
+	}
+
+	if (mDebugDisableAllocs)
+	{
+		// The function returns bool, so report failure with 'false' rather than NULL.
+		return false;
+	}
+
+	bool ret = false;
+	if (addr)
+	{
+		ret = mHeap[memorySpace]->realloc((void*)(addr), size, PX_ALLOC_INFO_PARAMS_INPUT());
+	}
+
+	return ret;
+}
+
+// Copies the heap statistics of the given memory space into 'outStats'.
+// When the manager is not initialized 'outStats' is left untouched.
+void CudaMemMgr::getStats(const PxCudaBufferType& type, PxCudaMemoryManagerStats& outStats)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	if (!mInitialized)
+	{
+		return;
+	}
+
+	ApexHeapStats hpStats;
+	mHeap[type.memorySpace]->getStats(hpStats, HeapStatsFlags::F_BASIC_STATS | HeapStatsFlags::F_ALLOC_ID_STATS);
+
+	outStats.heapSize = hpStats.heapSize;
+	outStats.totalAllocated = hpStats.totalAllocated;
+	outStats.maxAllocated = hpStats.maxAllocated;
+	// Copies PxAllocId::NUM_IDS entries of per-allocation-id statistics.
+	PxMemCopy(outStats.allocIdStats, hpStats.allocIdStats, sizeof(PxAllocIdStats)*PxAllocId::NUM_IDS);
+}
+
+
+// Destroys a CudaBuffer object and releases the memory it refers to.
+// Returns the result of free(memorySpace, addr).
+bool CudaMemMgr::free(CudaBuffer& buffer)
+{
+	// Read everything needed from the buffer first; it is destroyed below.
+	PxCudaBufferMemorySpace::Enum memSpace = buffer.getTypeFast().memorySpace;
+	PxCudaBufferPtr addr = buffer.getPtrFast();
+
+	{
+		// The lock is scoped to the pool operation; free() below takes mMutex itself.
+		shdfnd::Mutex::ScopedLock lock(mMutex);
+		mBufferPool.destroy(&buffer);
+	}
+
+	return free(memSpace, addr);
+}
+
+
+// Resizes the allocation behind 'buffer' by forwarding its memory space and
+// address to realloc(memorySpace, addr, size, ...).
+// NOTE(review): the size stored in 'buffer' is not updated here - confirm
+// whether callers rely on getSize() after a successful realloc.
+bool CudaMemMgr::realloc(CudaBuffer& buffer, size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	return realloc(buffer.getTypeFast().memorySpace, buffer.getPtrFast(), size, PX_ALLOC_INFO_PARAMS_INPUT());
+}
+
+// Translates a pinned-host address into the corresponding mapped address by
+// finding the base of the containing heap block and adding the offset
+// recorded for that base by the pinned allocator.
+// Returns 0 when the manager is not initialized (no heaps exist yet) or when
+// the pinned heap does not know the address.
+PxCudaBufferPtr CudaMemMgr::getMappedPinnedPtr(PxCudaBufferPtr hostPtr)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	// mHeap[] and mMemoryAllocator[] are NULL until initialize() has succeeded.
+	if (!mInitialized)
+	{
+		return 0;
+	}
+
+	void* base = mHeap[PxCudaBufferMemorySpace::T_PINNED_HOST]->findBaseAddress((void*)hostPtr);
+	if (base)
+	{
+		size_t offset = ((PinnedMemAllocator*)mMemoryAllocator[PxCudaBufferMemorySpace::T_PINNED_HOST])->getMappedPinnedOffset(base);
+		return hostPtr + offset;
+	}
+	return 0;
+}
+
+// Releases this buffer through its owning memory manager. The manager
+// destroys this object, so it must not be used after the call.
+bool CudaBuffer::free()
+{
+	return mMemManager->free(*this);
+}
+
+
+// Asks the owning memory manager to resize this buffer's allocation.
+bool CudaBuffer::realloc(size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	return mMemManager->realloc(*this, size, PX_ALLOC_INFO_PARAMS_INPUT());
+}
+
+
+// Stores the context manager and the size limit; nothing is allocated yet.
+DeviceMemAllocator::DeviceMemAllocator(PxCudaContextManager& mgr, size_t maxSize)
+	: mMgr(mgr)
+	, mMaxSize(maxSize)
+	, mAllocSize(0)
+{}
+
+
+// Asserts that everything handed out by this allocator has been freed.
+DeviceMemAllocator::~DeviceMemAllocator()
+{
+	PX_ASSERT(mAllocSize == 0);
+}
+
+
+// Allocates 'size' bytes with cuMemAlloc while holding the CUDA lock.
+// Returns NULL when the request would push the running total past mMaxSize
+// or when the driver call fails (the latter also asserts).
+void* DeviceMemAllocator::alloc(const size_t size)
+{
+	if (mAllocSize + size > mMaxSize)
+	{
+		return NULL;
+	}
+	else
+	{
+		PxScopedCudaLock lock(mMgr);
+		CUdeviceptr dPtr;
+		// Pass the full size_t: narrowing to 32 bits would silently shrink
+		// requests of 4 GiB or more while mAllocSize is still charged in full.
+		CUresult result = cuMemAlloc(&dPtr, size);
+
+		if (result == CUDA_SUCCESS)
+		{
+			mAllocSize += size;
+			return (void*)(size_t)(dPtr);
+		}
+		else
+		{
+			PX_ASSERT_WITH_MESSAGE(0, "Failed to allocate device memory.");
+			return NULL;
+		}
+	}
+}
+
+
+// Releases memory obtained from alloc() with cuMemFree while holding the
+// CUDA lock, then subtracts 'size' from the running total. A failing driver
+// call is only asserted on.
+void DeviceMemAllocator::free(void* addr, const size_t size)
+{
+	PxScopedCudaLock lock(mMgr);
+	PX_ASSERT(mAllocSize >= size);
+
+	const CUdeviceptr devicePtr = (CUdeviceptr)(size_t)(addr);
+	CUresult status = cuMemFree(devicePtr);
+	PX_ASSERT(status == CUDA_SUCCESS);
+	PX_UNUSED(status);
+
+	mAllocSize -= size;
+}
+
+
+// Stores the context manager and the size limit; nothing is allocated yet.
+PinnedMemAllocator::PinnedMemAllocator(PxCudaContextManager& mgr, size_t maxSize)
+	: mMgr(mgr)
+	, mMaxSize(maxSize)
+	, mAllocSize(0)
+{
+}
+
+
+// Asserts that everything handed out by this allocator has been freed.
+PinnedMemAllocator::~PinnedMemAllocator()
+{
+	PX_ASSERT(mAllocSize == 0);
+}
+
+
+// Allocates 'size' bytes with cuMemHostAlloc while holding the CUDA lock.
+// When the context manager reports that host memory can be mapped, the
+// allocation is requested with CU_MEMHOSTALLOC_DEVICEMAP and the device
+// pointer for it is recorded in mMappedPinnedPtrs, keyed by the host pointer.
+// Returns NULL when the request would push the running total past mMaxSize
+// or when the driver call fails (the latter also asserts).
+void* PinnedMemAllocator::alloc(const size_t size)
+{
+	if (mAllocSize + size > mMaxSize)
+	{
+		return NULL;
+	}
+	else
+	{
+		PxScopedCudaLock lock(mMgr);
+		void* hPtr;
+		unsigned int flags = 0;
+
+		if (mMgr.canMapHostMemory())
+		{
+			flags |= CU_MEMHOSTALLOC_DEVICEMAP;
+		}
+		// Pass the full size_t: narrowing to 32 bits would silently shrink
+		// requests of 4 GiB or more while mAllocSize is still charged in full.
+		CUresult result = cuMemHostAlloc(&hPtr, size, flags);
+
+		if (result == CUDA_SUCCESS)
+		{
+			if (hPtr)
+			{
+				mAllocSize += size;
+			}
+
+			if (mMgr.canMapHostMemory())
+			{
+				CUdeviceptr dptr = 0;
+				// A failed lookup leaves dptr at 0; surface that in checked builds
+				// instead of silently recording a bogus mapping.
+				CUresult mapResult = cuMemHostGetDevicePointer(&dptr, hPtr, 0);
+				PX_UNUSED(mapResult);
+				PX_ASSERT(mapResult == CUDA_SUCCESS);
+				mMappedPinnedPtrs.insert(hPtr, size_t(dptr));
+			}
+
+			return hPtr;
+		}
+		else
+		{
+			PX_ASSERT_WITH_MESSAGE(0, "Failed to allocate pinned memory.");
+			return NULL;
+		}
+	}
+}
+
+
+// Releases memory obtained from alloc() with cuMemFreeHost while holding the
+// CUDA lock and subtracts 'size' from the running total. A failing driver
+// call is only asserted on.
+void PinnedMemAllocator::free(void* addr, const size_t size)
+{
+	PxScopedCudaLock lock(mMgr);
+	PX_ASSERT(mAllocSize >= size);
+
+	// Drop the host-to-device mapping that alloc() recorded for this pointer.
+	if (mMgr.canMapHostMemory())
+	{
+		PX_ASSERT(mMappedPinnedPtrs.find(addr));
+		mMappedPinnedPtrs.erase(addr);
+	}
+	
+	CUresult result = cuMemFreeHost(addr);
+	PX_UNUSED(result);
+	PX_ASSERT(result == CUDA_SUCCESS);
+	mAllocSize -= size;
+}
+
+
+// Stores the context manager and the size limit, and decides once whether
+// write-combined memory is usable: it is treated as supported when the
+// driver version reported by the context manager is at least 2020.
+// NOTE(review): the mCudaOrdinal member is not initialized here.
+WriteCombinedMemAllocator::WriteCombinedMemAllocator(PxCudaContextManager& mgr, size_t maxSize)
+	: mMgr(mgr)
+	, mMaxSize(maxSize)
+	, mAllocSize(0)
+{
+	mWcMemSupport = mMgr.getDriverVersion() >= 2020 ? WcMem::SUPPORTED : WcMem::NOT_SUPPORTED;
+}
+
+
+// Asserts that everything handed out by this allocator has been freed.
+WriteCombinedMemAllocator::~WriteCombinedMemAllocator()
+{
+	PX_ASSERT(mAllocSize == 0);
+}
+
+
+// Reports the support state decided in the constructor. Any state other
+// than SUPPORTED is expected to be NOT_SUPPORTED, which is asserted.
+bool WriteCombinedMemAllocator::isWcMemSupported()
+{
+	const bool supported = (mWcMemSupport == WcMem::SUPPORTED);
+	PX_ASSERT(supported || mWcMemSupport == WcMem::NOT_SUPPORTED);
+	return supported;
+}
+
+
+// Allocates 'size' bytes with cuMemHostAlloc using
+// CU_MEMHOSTALLOC_WRITECOMBINED (plus CU_MEMHOSTALLOC_DEVICEMAP when host
+// memory can be mapped) while holding the CUDA lock.
+// Returns NULL when the request would push the running total past mMaxSize,
+// when write-combined memory is not supported, or when the driver call fails;
+// the last two cases also assert.
+void* WriteCombinedMemAllocator::alloc(const size_t size)
+{
+	if (mAllocSize + size > mMaxSize)
+	{
+		return NULL;
+	}
+	else
+	{
+		PxScopedCudaLock lock(mMgr);
+		void* hPtr = NULL;
+
+		unsigned int flags = CU_MEMHOSTALLOC_WRITECOMBINED;
+
+		if (mMgr.canMapHostMemory())
+		{
+			flags |= CU_MEMHOSTALLOC_DEVICEMAP;
+		}
+
+		// The driver is only called when support was detected (short-circuit &&).
+		bool success = isWcMemSupported() && (cuMemHostAlloc(&hPtr, size, flags) == CUDA_SUCCESS);
+		if (success)
+		{
+			if (hPtr)
+			{
+				mAllocSize += size;
+			}
+
+			return hPtr;
+		}
+		else
+		{
+			PX_ASSERT_WITH_MESSAGE(0, "Failed to allocate write combined memory.");			
+			return NULL;
+		}
+	}
+}
+
+
+// Releases memory obtained from alloc() with cuMemFreeHost while holding the
+// CUDA lock and subtracts 'size' from the running total. A failing driver
+// call is only asserted on.
+void WriteCombinedMemAllocator::free(void* addr, const size_t size)
+{
+	PxScopedCudaLock lock(mMgr);
+	PX_ASSERT(mAllocSize >= size);
+	CUresult result = cuMemFreeHost(addr);
+	PX_ASSERT(result == CUDA_SUCCESS);
+	PX_UNUSED(result);
+	mAllocSize -= size;
+}
+
+
+// Stores the size limit; nothing is allocated yet.
+HostMemAllocator::HostMemAllocator(size_t maxSize)
+	: mMaxSize(maxSize)
+	, mAllocSize(0)
+{
+}
+
+
+// Asserts that everything handed out by this allocator has been freed.
+HostMemAllocator::~HostMemAllocator()
+{
+	PX_ASSERT(mAllocSize == 0);
+}
+
+
+// Allocates 'size' bytes through PX_ALLOC.
+// Returns NULL when the request would push the running total past mMaxSize
+// or when PX_ALLOC yields no memory (the latter also asserts).
+void* HostMemAllocator::alloc(const size_t size)
+{
+	if (mAllocSize + size > mMaxSize)
+	{
+		return NULL;
+	}
+
+	void* block = PX_ALLOC(size, "host memory");
+	if (!block)
+	{
+		PX_ASSERT_WITH_MESSAGE(0, "Failed to allocate host memory.");
+		return NULL;
+	}
+
+	mAllocSize += size;
+	return block;
+}
+
+
+// Releases memory obtained from alloc() through PX_FREE and subtracts 'size'
+// from the running total.
+void HostMemAllocator::free(void* addr, const size_t size)
+{
+	PX_ASSERT(mAllocSize >= size);
+	PX_FREE(addr);
+	mAllocSize -= size;
+}
+
diff --git a/PxShared/src/cudamanager/src/CudaMemoryManager.h b/PxShared/src/cudamanager/src/CudaMemoryManager.h
new file mode 100644
index 0000000..071b4ab
--- /dev/null
+++ b/PxShared/src/cudamanager/src/CudaMemoryManager.h
@@ -0,0 +1,297 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXCUDACONTEXTMANAGER_CUDAMEMORYMANAGER_H
+#define PXCUDACONTEXTMANAGER_CUDAMEMORYMANAGER_H
+
+#include "task/PxTaskDefine.h"
+#include "HeapManagerInterface.h"
+
+#include "PsPool.h"
+#include "PsMutex.h"
+#include "PsUserAllocated.h"
+#include "PsHashMap.h"
+
+namespace physx
+{
+
+class CudaBuffer;
+class HeapManagerInterface;
+class CudaMemMgr;
+class PxCudaContextManager;
+
+// Concrete PxCudaBuffer: a small record holding the buffer type, the address,
+// the size, the owning CudaMemMgr and the allocation info. The virtual
+// getters forward to the non-virtual *Fast() accessors.
+class CudaBuffer: public PxCudaBuffer
+{
+public:
+	// Creates an empty buffer of the given type. Address, size and owner are
+	// zeroed so the object is in a defined state until init() is called.
+	PX_INLINE CudaBuffer(const PxCudaBufferType& type)
+		: mType(type)
+		, mPtr(0)
+		, mSize(0)
+		, mMemManager(NULL)
+	{}
+
+// Ni Interface
+	bool									free();
+	bool									realloc(size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	const PxCudaBufferType&					getType() const
+	{
+		return getTypeFast();
+	}
+	PxCudaBufferPtr							getPtr() const
+	{
+		return getPtrFast();
+	}
+	size_t									getSize() const
+	{
+		return getSizeFast();
+	}
+	PxCudaMemoryManager*					getCudaMemoryManager() const
+	{
+		return getCudaMemoryManagerFast();
+	}
+//
+	PX_INLINE	const PxCudaBufferType&		getTypeFast() const
+	{
+		return mType;
+	}
+	PX_INLINE	PxCudaBufferPtr				getPtrFast() const
+	{
+		return mPtr;
+	}
+	PX_INLINE	void						setPtr(PxCudaBufferPtr val)
+	{
+		mPtr = val;
+	}
+	PX_INLINE	size_t						getSizeFast() const
+	{
+		return mSize;
+	}
+	PX_INLINE	void						setSize(size_t val)
+	{
+		mSize = val;
+	}
+	// CudaMemMgr is only forward-declared at this point, hence the reinterpret_cast.
+	PX_INLINE	PxCudaMemoryManager*			getCudaMemoryManagerFast() const
+	{
+		return reinterpret_cast<PxCudaMemoryManager*>(mMemManager);
+	}
+	// Binds the buffer to an address, a size and its owning manager, and
+	// records the allocation info passed by the caller.
+	PX_INLINE	void						init(PxCudaBufferPtr ptr, size_t size, CudaMemMgr& manager, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED))
+	{
+		mPtr = ptr;
+		mSize = size;
+		mMemManager = &manager;
+		mAllocInfo = PxAllocInfo(PX_ALLOC_INFO_PARAMS_INPUT());
+	}
+
+	// Copies every field, including the allocation info, so the copy describes
+	// the same allocation as 'in'. mType is declared const, so it is written
+	// through a const_cast.
+	void operator=(const CudaBuffer& in)
+	{
+		const_cast<PxCudaBufferType&>(mType) = in.mType;
+		mPtr = in.mPtr;
+		mSize = in.mSize;
+		mMemManager = in.mMemManager;
+		mAllocInfo = in.mAllocInfo;
+	}
+
+private:
+	const PxCudaBufferType	mType;
+	PxCudaBufferPtr			mPtr;
+	size_t					mSize;
+	CudaMemMgr*				mMemManager;
+	PxAllocInfo				mAllocInfo;
+};
+
+
+// Implementation of PxCudaMemoryManager. Keeps one heap and one allocator per
+// PxCudaBufferMemorySpace entry, created by the private initialize() method.
+class CudaMemMgr: public PxCudaMemoryManager, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(CudaMemMgr)
+public:
+	CudaMemMgr(PxCudaContextManager& mMgr, physx::PxErrorCallback& errorCallback);
+	virtual ~CudaMemMgr();
+
+	PxCudaBuffer*				alloc(const PxCudaBufferType& type, size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	PxCudaBufferPtr				alloc(PxCudaBufferMemorySpace::Enum memorySpace, size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	bool						free(PxCudaBufferMemorySpace::Enum memorySpace, PxCudaBufferPtr addr);
+	bool						realloc(PxCudaBufferMemorySpace::Enum memorySpace, PxCudaBufferPtr addr, size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	void						getStats(const PxCudaBufferType& type, PxCudaMemoryManagerStats& outStats);
+	bool						reserve(const PxCudaBufferType& type, size_t size);
+	bool						setPageSize(const PxCudaBufferType& type, size_t size);
+	bool						setMaxMemorySize(const PxCudaBufferType& type, size_t size);
+	size_t						getBaseSize(const PxCudaBufferType& type);
+	size_t						getPageSize(const PxCudaBufferType& type);
+	size_t						getMaxMemorySize(const PxCudaBufferType& type);
+	// Sets a flag that is never cleared again by this class.
+	void						debugDisableAllocs()
+	{
+		mDebugDisableAllocs = true;
+	}
+	PxCudaBufferPtr				getMappedPinnedPtr(PxCudaBufferPtr hostPtr);
+
+	// internals
+	bool						free(CudaBuffer& buffer);
+	bool						realloc(CudaBuffer& buffer, size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	bool						setBaseSize(const PxCudaBufferType& type, size_t size);
+
+private:
+	PX_INLINE	bool						initialize();
+	physx::PxErrorCallback&				mErrorCallback;
+	// Per-memory-space state, all indexed by PxCudaBufferMemorySpace::Enum.
+	HeapManagerInterface*					mHeap[PxCudaBufferMemorySpace::COUNT];
+	HeapManagerInterface::Allocator*		mMemoryAllocator[PxCudaBufferMemorySpace::COUNT];
+	size_t									mMemoryBaseSize[PxCudaBufferMemorySpace::COUNT];
+	size_t									mMemoryPageSize[PxCudaBufferMemorySpace::COUNT];
+	size_t									mMemoryMaxSize[PxCudaBufferMemorySpace::COUNT];
+	// Storage for the CudaBuffer objects returned by alloc(type, ...).
+	shdfnd::Pool<CudaBuffer>				mBufferPool;
+	bool									mInitialized;
+	PxCudaContextManager&					mMgr;
+	shdfnd::Mutex							mMutex;
+	bool									mDebugDisableAllocs;	
+};
+
+// TODO, give MemoryAllocator prefix or namespace
+// HeapManagerInterface::Allocator implementation that tracks the total size
+// it has handed out (mAllocSize) against a configurable limit (mMaxSize).
+class DeviceMemAllocator: public HeapManagerInterface::Allocator, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(DeviceMemAllocator)
+public:
+	DeviceMemAllocator(PxCudaContextManager& mgr, size_t maxSize);
+	virtual ~DeviceMemAllocator();
+
+	virtual void*	alloc(const size_t size);
+	virtual void	free(void* addr, const size_t size);
+
+	// Replaces the limit; the amount already allocated is not checked against it.
+	void	setMaxSize(size_t maxSize)
+	{
+		mMaxSize = maxSize;
+	}
+	size_t	getMaxSize()
+	{
+		return mMaxSize;
+	}
+
+private:
+	PxCudaContextManager& mMgr;
+	size_t			mMaxSize;
+	size_t			mAllocSize;
+};
+
+
+// HeapManagerInterface::Allocator implementation that tracks the total size
+// it has handed out (mAllocSize) against a configurable limit (mMaxSize) and
+// keeps a map from host pointer to a size_t value stored at allocation time.
+class PinnedMemAllocator: public HeapManagerInterface::Allocator, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(PinnedMemAllocator)
+public:
+	PinnedMemAllocator(PxCudaContextManager& mMgr, size_t maxSize);
+	virtual ~PinnedMemAllocator();
+
+	virtual void*	alloc(const size_t size);
+	virtual void	free(void* addr, const size_t size);
+
+	// Replaces the limit; the amount already allocated is not checked against it.
+	void	setMaxSize(size_t maxSize)
+	{
+		mMaxSize = maxSize;
+	}
+	size_t	getMaxSize()
+	{
+		return mMaxSize;
+	}
+
+	// Returns the difference between the value recorded for 'base' in
+	// mMappedPinnedPtrs and 'base' itself.
+	// NOTE(review): a missing map entry is only caught by PX_ASSERT; without
+	// asserts 'entry' would be dereferenced while NULL.
+	size_t getMappedPinnedOffset(void* base)
+	{
+
+		PX_ASSERT(base);
+		const shdfnd::HashMap<void*, size_t>::Entry* entry = mMappedPinnedPtrs.find(base);
+		PX_ASSERT(entry);
+		return entry->second - size_t(base);
+	}
+
+private:
+	PxCudaContextManager& mMgr;
+	size_t			mMaxSize;
+	size_t			mAllocSize;
+	shdfnd::HashMap<void*, size_t> mMappedPinnedPtrs;
+};
+
+
+// HeapManagerInterface::Allocator implementation that tracks the total size
+// it has handed out (mAllocSize) against a configurable limit (mMaxSize).
+// Unlike the other allocators it holds no context manager reference.
+class HostMemAllocator: public HeapManagerInterface::Allocator, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(HostMemAllocator)
+public:
+	HostMemAllocator(size_t maxSize);
+	virtual ~HostMemAllocator();
+
+	virtual void*	alloc(const size_t size);
+	virtual void	free(void* addr, const size_t size);
+
+	// Replaces the limit; the amount already allocated is not checked against it.
+	void	setMaxSize(size_t maxSize)
+	{
+		mMaxSize = maxSize;
+	}
+	size_t	getMaxSize()
+	{
+		return mMaxSize;
+	}
+
+private:
+	size_t			mMaxSize;
+	size_t			mAllocSize;
+};
+
+
+// HeapManagerInterface::Allocator implementation that tracks the total size
+// it has handed out (mAllocSize) against a configurable limit (mMaxSize) and
+// remembers whether write-combined memory is supported.
+class WriteCombinedMemAllocator: public HeapManagerInterface::Allocator, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(WriteCombinedMemAllocator)
+public:
+	WriteCombinedMemAllocator(PxCudaContextManager& mgr, size_t maxSize);
+	virtual ~WriteCombinedMemAllocator();
+
+	virtual void*	alloc(const size_t size);
+	virtual void	free(void* addr, const size_t size);
+
+	// Replaces the limit; the amount already allocated is not checked against it.
+	void	setMaxSize(size_t maxSize)
+	{
+		mMaxSize = maxSize;
+	}
+	size_t	getMaxSize()
+	{
+		return mMaxSize;
+	}
+
+private:
+	// Support state for write-combined memory.
+	struct WcMem
+	{
+		enum Enum
+		{
+			NOT_CHECKED,
+			SUPPORTED,
+			NOT_SUPPORTED
+		};
+	};
+
+	bool	isWcMemSupported();
+
+private:
+	// NOTE(review): mCudaOrdinal appears unused in this header - confirm before removing.
+	int					mCudaOrdinal;
+	PxCudaContextManager& mMgr;
+	WcMem::Enum			mWcMemSupport;
+	size_t				mMaxSize;
+	size_t				mAllocSize;
+};
+
+} // end physx namespace
+
+#endif // PXCUDACONTEXTMANAGER_CUDAMEMORYMANAGER_H
diff --git a/PxShared/src/cudamanager/src/CudaNode3DLowLatencyInterface.h b/PxShared/src/cudamanager/src/CudaNode3DLowLatencyInterface.h
new file mode 100644
index 0000000..f20d87a
--- /dev/null
+++ b/PxShared/src/cudamanager/src/CudaNode3DLowLatencyInterface.h
@@ -0,0 +1,128 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXCUDACONTEXTMANAGER_CUDANODE3DLOWLATENCYINTERFACE_H
+#define PXCUDACONTEXTMANAGER_CUDANODE3DLOWLATENCYINTERFACE_H
+
+#include <cuda.h>
+
+namespace physx
+{
+// Utility macros for defining and using UUID values for use with
+// the CUDA driver.
+//
+// CU_INIT_UUID must be #defined in exactly one translation unit
+// per linkage unit (i.e. one .c or .cpp file per binary).  This
+// allows multiple .c and .cpp files to include headers that define
+// UUIDs using CU_DEFINE_UUID: The translation unit that #defines
+// CU_INIT_UUID will define and initialize the UUIDs, and all other
+// translation units will link to that definition.
+
+// Define helper macro: CU_INIT_EXTERN_CONST 
+// In C, global consts have external linkage by default.  In C++,
+// global consts have internal linkage by default, and require the
+// "extern" storage class specifier to have external linkage.  C++
+// allows using "extern" with initializers, but it is illegal in C.
+// Thus, there is no common syntax for C and C++ to declare and
+// initialize global constants with external linkage.  This macro
+// helps reduce duplication of other macros by factoring out the
+// C/C++ discrepancy.
+#ifdef __cplusplus
+#define CU_INIT_EXTERN_CONST extern const
+#else // C: global consts already have external linkage
+#define CU_INIT_EXTERN_CONST const
+#endif // __cplusplus
+
+// Define macro CU_DEFINE_UUID.  The parameters are the commonly
+// used "int-short-short-char[8]" style, which can be generated by
+// Microsoft's guidgen.exe tool, Visual Studio's "Create GUID"
+// option in the Tools menu (select style #2), and many web-based
+// UUID generator tools.  Here's an example of what "Create GUID"
+// style #2 generates:
+//
+//   DEFINE_GUID( <<name>>, 
+//   0x2446054, 0xbb8e, 0x4b2f, 0x8b, 0xfc, 0xa4, 0xfe, 0x44, 0x9, 0x8f, 0xb8);
+//
+// So to use one of these with CUDA, just change the macro to
+// CU_DEFINE_UUID and choose a symbol name.  For example:
+//
+//   CU_DEFINE_UUID( MyUuid, 
+//   0x2446054, 0xbb8e, 0x4b2f, 0x8b, 0xfc, 0xa4, 0xfe, 0x44, 0x9, 0x8f, 0xb8);
+//
+#if defined(CU_INIT_UUID)
+#define CU_CHAR(x) (char)(unsigned char)((x) & 0xff)
+// Define the symbol as exportable to other translation units, and
+// initialize the value.  The inner set of braces is necessary because
+// the "bytes" array needs its own braces within the struct initializer,
+// which also needs braces.
+#define CU_DEFINE_UUID(name, a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)          \
+    CU_INIT_EXTERN_CONST CUuuid name =                                         \
+    {                                                                          \
+      {                                                                        \
+        CU_CHAR(a), CU_CHAR((a) >> 8), CU_CHAR((a) >> 16), CU_CHAR((a) >> 24), \
+        CU_CHAR(b), CU_CHAR((b) >> 8),                                         \
+        CU_CHAR(c), CU_CHAR((c) >> 8),                                         \
+        CU_CHAR(d0),                                                           \
+        CU_CHAR(d1),                                                           \
+        CU_CHAR(d2),                                                           \
+        CU_CHAR(d3),                                                           \
+        CU_CHAR(d4),                                                           \
+        CU_CHAR(d5),                                                           \
+        CU_CHAR(d6),                                                           \
+        CU_CHAR(d7)                                                            \
+      }                                                                        \
+    }
+#else
+// Declare the symbol to be imported from another translation unit.
+#define CU_DEFINE_UUID(name, a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+    extern const CUuuid name
+#endif // CU_INIT_UUID
+
+//------------------------------------------------------------------
+// Cuda Private API Interfaces for PhysX
+//------------------------------------------------------------------
+
+// This provides backdoor interfaces used by PhysX
+CU_DEFINE_UUID(CU_ETID_PhysXInterface, 0x8c0ba50c, 0x0410, 0x9a92, 0x89, 0xa7, 0xd0, 0xdf, 0x10, 0xe7, 0x72, 0x86);
+
+typedef struct CUetblPhysXInterface_st
+{
+    /* Size of this structure */
+    size_t size;
+    
+    /* Create a new CUDA context on Node3dLowLatency.
+     * - usually it will just call cuCtxCreateOnNode3DLowLatency.
+     */
+    CUresult (CUDAAPI *cuCtxCreateOnNode3DLowLatency)(
+        CUcontext *pctx,
+        unsigned int flags,
+        CUdevice dev);
+
+}	CUetblPhysXInterface;
+}
+
+#endif // PXCUDACONTEXTMANAGER_CUDANODE3DLOWLATENCYINTERFACE_H
diff --git a/PxShared/src/cudamanager/src/GpuDispatcher.cpp b/PxShared/src/cudamanager/src/GpuDispatcher.cpp
new file mode 100644
index 0000000..0d05a97
--- /dev/null
+++ b/PxShared/src/cudamanager/src/GpuDispatcher.cpp
@@ -0,0 +1,942 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "task/PxTaskDefine.h"
+
+#if PX_SUPPORT_GPU_PHYSX
+
+#include "task/PxGpuDispatcher.h"
+#include "task/PxCpuDispatcher.h"
+#include "cudamanager/PxCudaContextManager.h"
+#include "cudamanager/PxGpuCopyDesc.h"
+
+#include "CudaKernelWrangler.h"
+#include "GpuDispatcher.h"
+
+#if PX_SUPPORT_PXTASK_PROFILING
+#include "foundation/PxProfiler.h"
+#endif
+
+#include "PsArray.h"
+#include "PsSync.h"
+#include "PsInlineArray.h"
+#include "PsUserAllocated.h"
+#include "PsAtomic.h"
+
+#if PX_VC
+#pragma warning(disable: 4062)	//enumerator 'identifier' in switch of enum 'enumeration' is not handled
+#endif
+
+using namespace physx;
+
+// hack to run CUDA calls in a single thread
+// used to capture CUDA APIC traces
+// #define SINGLE_CUDA_THREAD 1
+
+/* Kernels in UtilsKernels.cu; this name table is handed to the KernelWrangler built in the GpuDispatcherImpl constructor */
+const char* UtilKernelNames[] =
+{
+	"Saturate",
+	"MemCopyAsync",
+	"MemCopyBatchedAsync",
+};
+enum // values are passed to KernelWrangler::getCuFunction(); presumably indices into UtilKernelNames, so keep both in the same order - TODO confirm
+{
+	KERNEL_SATURATE,
+	KERNEL_MEMCOPY,
+	KERNEL_MEMCOPY_BATCHED
+};
+
+/* ==================== LaunchTask =========================== */
+
+/**
+\brief Task that fans one completion out to a list of dependent tasks.
+
+Every addDependent() call stores the dependent, takes a reference on it and raises
+this task's own reference count.  Once the count reaches zero the task submits itself
+to the CPU dispatcher; release() then drops the references held on the stored
+dependents.
+*/
+class physx::FanoutTask : public PxBaseTask, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(FanoutTask)
+public:
+	FanoutTask(const char* name)
+		: mRefCount(0)
+		, mName(name)
+		, mNotifySubmission(false)
+	{
+	}
+
+	/** \brief Intentionally empty; the work happens in removeReference() and release(). */
+	virtual void run()
+	{
+	}
+
+	virtual const char* getName(void) const
+	{
+		return mName;
+	}
+
+	/**
+	\brief Drops one reference; on reaching zero, parks the dependents and submits this task.
+	*/
+	virtual void removeReference()
+	{
+		shdfnd::Mutex::ScopedLock guard(mMutex);
+		if (shdfnd::atomicDecrement(&mRefCount) != 0)
+			return;
+
+		// hold one reference so mReferencesToRemove is not touched again before release()
+		shdfnd::atomicIncrement(&mRefCount);
+		mNotifySubmission = false;
+		PX_ASSERT(mReferencesToRemove.empty());
+
+		// park the current dependents; release() drops their references
+		const uint32_t dependentCount = mDependents.size();
+		for (uint32_t d = 0; d != dependentCount; ++d)
+			mReferencesToRemove.pushBack(mDependents[d]);
+		mDependents.clear();
+
+		mTm->getCpuDispatcher()->submitTask(*this);
+	}
+
+	/**
+	\brief Raises the reference count and flags that a submission is pending.
+	*/
+	virtual void addReference()
+	{
+		shdfnd::Mutex::ScopedLock guard(mMutex);
+		shdfnd::atomicIncrement(&mRefCount);
+		mNotifySubmission = true;
+	}
+
+	/**
+	\brief Returns the current reference count (read without taking the mutex).
+	*/
+	PX_INLINE int32_t getReference() const
+	{
+		return mRefCount;
+	}
+
+	/**
+	\brief Registers a dependent task.
+
+	The task manager is taken from the dependent, the dependent gains a reference,
+	and this task's reference count goes up by one.
+	*/
+	void addDependent(PxBaseTask& dependent)
+	{
+		shdfnd::Mutex::ScopedLock guard(mMutex);
+		shdfnd::atomicIncrement(&mRefCount);
+		mTm = dependent.getTaskManager();
+		mDependents.pushBack(&dependent);
+		dependent.addReference();
+		mNotifySubmission = true;
+	}
+
+	/**
+	\brief Drops the references on the parked dependents and empties the parked list.
+
+	The parked list is copied to a local array under the mutex; the references are
+	removed only after the mutex has been released.
+	*/
+	virtual void release()
+	{
+		shdfnd::InlineArray<PxBaseTask*, 10> parked;
+
+		{
+			shdfnd::Mutex::ScopedLock guard(mMutex);
+
+			const uint32_t parkedCount = mReferencesToRemove.size();
+			parked.reserve(parkedCount);
+			for (uint32_t p = 0; p != parkedCount; ++p)
+				parked.pushBack(mReferencesToRemove[p]);
+			mReferencesToRemove.clear();
+
+			// mReferencesToRemove may be refilled from here on
+			if (!mNotifySubmission)
+			{
+				physx::shdfnd::atomicDecrement(&mRefCount);
+			}
+			else
+			{
+				removeReference();
+			}
+
+			// The scoped lock has to be gone before the parked tasks are (potentially) submitted:
+			// they may trigger events that delete this task and with it the mutex.  Example: this
+			// task is a member of the scene, the submitted tasks finish the simulation, the scene is
+			// released and deletes this task.  Touching the mutex afterwards would corrupt the heap.
+		}
+
+		const uint32_t notifyCount = parked.size();
+		for (uint32_t p = 0; p != notifyCount; ++p)
+			parked[p]->removeReference();
+	}
+
+private:
+	friend class LaunchTask;
+
+	volatile int32_t mRefCount;
+	const char* mName;
+	shdfnd::InlineArray<PxBaseTask*, 4> mDependents;
+	shdfnd::InlineArray<PxBaseTask*, 4> mReferencesToRemove;
+	bool mNotifySubmission;
+	shdfnd::Mutex mMutex; // guarding mDependents and mNotifySubmission
+};
+
+/**
+\brief FanoutTask named "GpuDispatcher.launch" that records whether a batch has started.
+
+run() raises mIsBatchStarted when at least one parked dependent exists; BlockTask::run()
+reads the flag and lowers it again.
+*/
+class physx::LaunchTask : public physx::FanoutTask
+{
+public:
+	LaunchTask()
+		: FanoutTask("GpuDispatcher.launch")
+		, mIsBatchStarted(false)
+	{
+	}
+
+	virtual void run()
+	{
+		const bool hasParkedDependents = mReferencesToRemove.size() > 0;
+		if (hasParkedDependents)
+		{
+			mIsBatchStarted = true;
+		}
+	}
+
+	bool mIsBatchStarted;
+};
+
+/**
+\brief Light CPU task named "GpuDispatcher.block" that flushes the started batch.
+
+When the associated LaunchTask reports a started batch, run() queues a flush on the
+worker thread with the sync task as continuation.
+*/
+class physx::BlockTask : public PxLightCpuTask, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(BlockTask)
+public:
+	BlockTask(PxGpuWorkerThread* dispatcher, physx::LaunchTask* launchTask)
+		: mDispatcher(dispatcher)
+		, mLaunchTask(launchTask)
+		, mSyncTask(NULL)
+	{
+	}
+
+	virtual const char* getName(void) const
+	{
+		return "GpuDispatcher.block";
+	}
+
+	/**
+	\brief Drops one reference; on reaching zero, moves the continuation into mSyncTask and submits this task.
+	*/
+	virtual void removeReference()
+	{
+		shdfnd::Mutex::ScopedLock guard(mMutex);
+		if (physx::shdfnd::atomicDecrement(&mRefCount) != 0)
+			return;
+
+		// hold one reference so mSyncTask is not touched again before release()
+		physx::shdfnd::atomicIncrement(&mRefCount);
+		PX_ASSERT(!mSyncTask);
+		shdfnd::swap(mSyncTask, mCont);
+		mTm->getCpuDispatcher()->submitTask(*this);
+	}
+
+	/**
+	\brief Flushes the batch (with the context acquired) if the launch task started one.
+	*/
+	virtual void run()
+	{
+		if (!mLaunchTask->mIsBatchStarted)
+			return;
+
+		mDispatcher->mCtxMgr->acquireContext();
+
+		CUevent blockingEvent = mDispatcher->mCachedBlockingEvents.get();
+		CUstream nullStream = (CUstream)0;
+		mSyncTask->addReference();
+		mDispatcher->flushBatch(blockingEvent, nullStream, mSyncTask);
+
+		mDispatcher->mCtxMgr->releaseContext();
+		mLaunchTask->mIsBatchStarted = false;
+	}
+
+	/**
+	\brief Raises the reference count; only the first call stores the continuation.
+	*/
+	virtual void setContinuation(PxBaseTask* continuation)
+	{
+		// called several times; everything past the increment is skipped once mCont is set
+		shdfnd::Mutex::ScopedLock guard(mMutex);
+		physx::shdfnd::atomicIncrement(&mRefCount);
+		if (mCont)
+			return;
+
+		mCont = continuation;
+		mTm = mCont->getTaskManager();
+		mCont->addReference();
+	}
+
+	/**
+	\brief Drops the reference on the sync task and re-arms this task.
+	*/
+	virtual void release()
+	{
+		shdfnd::Mutex::ScopedLock guard(mMutex);
+		mSyncTask->removeReference();
+		mSyncTask = NULL;
+
+		// mSyncTask may be set again from here on
+		if (!mCont)
+		{
+			physx::shdfnd::atomicDecrement(&mRefCount);
+		}
+		else
+		{
+			removeReference();
+		}
+	}
+
+	PxGpuWorkerThread* mDispatcher;
+	physx::LaunchTask* mLaunchTask;
+	PxBaseTask* mSyncTask;
+	shdfnd::Mutex mMutex; // guarding mCont
+};
+
+/* ==================== API functions =========================== */
+
+/* Deletes the dispatcher; the reference is treated as a GpuDispatcherImpl. */
+void physx::releaseGpuDispatcher(PxGpuDispatcher& gd)
+{
+	GpuDispatcherImpl* dispatcherImpl = (GpuDispatcherImpl*) &gd;
+	delete dispatcherImpl;
+}
+
+/* Returns the context manager stored on the worker thread. */
+PxCudaContextManager* GpuDispatcherImpl::getCudaContextManager()
+{
+	PxCudaContextManager* ctxMgr = mDispatcher->mCtxMgr;
+	return ctxMgr;
+}
+
+/* Builds the worker thread object, the utility kernel wrangler and the three helper
+ * tasks.  Failure to create the worker or the wrangler switches to failure mode and
+ * leaves the helper tasks NULL.
+ */
+GpuDispatcherImpl::GpuDispatcherImpl(PxErrorCallback& errorCallback, PxCudaContextManager& ctx)
+	: mDispatcher(NULL)
+	, mBlockingThread(NULL)
+	, mLaunchTask(NULL)
+	, mBlockTask(NULL)
+	, mSyncTask(NULL)
+{
+	mDispatcher = PX_NEW(PxGpuWorkerThread);
+	if (!mDispatcher)
+	{
+		forceFailureMode();
+		return;
+	}
+	mDispatcher->setCudaContext(ctx);
+
+	// utility kernels (see UtilKernelNames)
+	KernelWrangler* utilKernels = PX_NEW(KernelWrangler)(*this, errorCallback, UtilKernelNames, sizeof(UtilKernelNames) / sizeof(char*));
+	mDispatcher->mUtilKernelWrapper = utilKernels;
+	if (!utilKernels || utilKernels->hadError())
+	{
+		forceFailureMode();
+		return;
+	}
+
+	// helper tasks
+	mLaunchTask = PX_NEW(LaunchTask);
+	mBlockTask = PX_NEW(BlockTask)(mDispatcher, mLaunchTask);
+	mSyncTask = PX_NEW(FanoutTask)("GpuDispatcher.sync");
+}
+
+/* Stops and deletes the blocking-wait thread first, then the worker thread, then the
+ * helper tasks.  Each thread is signalled to quit, woken through its sync object and
+ * joined before deletion.
+ */
+GpuDispatcherImpl::~GpuDispatcherImpl()
+{
+	if (mBlockingThread)
+	{
+		mBlockingThread->signalQuit();
+		PX_ASSERT(mDispatcher);
+		mDispatcher->mRecordEventQueued.set();	// wake the thread so it can see the quit signal
+		mBlockingThread->waitForQuit();
+		delete mBlockingThread;
+	}
+
+	if (mDispatcher)
+	{
+		mDispatcher->signalQuit();
+		mDispatcher->mInputReady.set();			// wake the thread so it can see the quit signal
+		mDispatcher->waitForQuit();
+		delete mDispatcher;
+	}
+
+	if (mLaunchTask)
+	{
+		PX_DELETE(mLaunchTask);
+	}
+	if (mBlockTask)
+	{
+		PX_DELETE(mBlockTask);
+	}
+	if (mSyncTask)
+	{
+		PX_DELETE(mSyncTask);
+	}
+}
+
+/* Starts the worker thread and creates the blocking-wait thread.  With
+ * SINGLE_CUDA_THREAD defined no thread is started: execute() is called directly and
+ * the blocking-wait thread object is only created.
+ */
+void GpuDispatcherImpl::start()
+{
+#ifdef SINGLE_CUDA_THREAD
+	mDispatcher->execute();
+#else
+	mDispatcher->start(shdfnd::Thread::getDefaultStackSize());
+#endif
+
+	mBlockingThread = PX_NEW(BlockingWaitThread)(*mDispatcher);
+
+#ifndef SINGLE_CUDA_THREAD
+	mBlockingThread->start(shdfnd::Thread::getDefaultStackSize());
+#endif
+}
+
+/* Forwards to PxGpuWorkerThread::startSimulation(). */
+void GpuDispatcherImpl::startSimulation()
+{
+	PxGpuWorkerThread& worker = *mDispatcher;
+	worker.startSimulation();
+}
+
+/* Forwards to PxGpuWorkerThread::stopSimulation(). */
+void GpuDispatcherImpl::stopSimulation()
+{
+	PxGpuWorkerThread& worker = *mDispatcher;
+	worker.stopSimulation();
+}
+
+/* Opens a task group: atomically raises the worker's active-group counter.
+ * finishGroup() is the matching decrement.
+ */
+void GpuDispatcherImpl::startGroup()
+{
+	shdfnd::atomicIncrement(&(mDispatcher->mActiveGroups));
+}
+
+/* Pushes the task onto the worker's submitted-task list; the list is drained in
+ * PxGpuWorkerThread::pollSubmitted().
+ */
+void GpuDispatcherImpl::submitTask(PxTask& task)
+{
+	PxTask* submitted = &task;
+	mDispatcher->mSubmittedTaskList.push(submitted);
+}
+
+/* Closes a task group.  When the active-group counter reaches zero the worker's
+ * input-ready sync is set; with SINGLE_CUDA_THREAD defined the pending tasks are
+ * processed and waited for on the calling thread first.
+ */
+void GpuDispatcherImpl::finishGroup()
+{
+	if (shdfnd::atomicDecrement(&mDispatcher->mActiveGroups) != 0)
+	{
+		return;
+	}
+
+#ifdef SINGLE_CUDA_THREAD
+	mDispatcher->mCtxMgr->acquireContext();
+	mDispatcher->processActiveTasks();
+	mDispatcher->mCtxMgr->releaseContext();
+	mDispatcher->blockingWaitFunc();
+#endif
+	mDispatcher->mInputReady.set();
+}
+
+/* Forwards to PxGpuWorkerThread::addCompletionPrereq(). */
+void GpuDispatcherImpl::addCompletionPrereq(PxBaseTask& task)
+{
+	PxGpuWorkerThread& worker = *mDispatcher;
+	worker.addCompletionPrereq(task);
+}
+
+/* Reports whether the dispatcher is in failure mode.
+ *
+ * The constructor leaves mDispatcher NULL when the worker thread object could not be
+ * allocated.  That state is reported as a failure instead of dereferencing the NULL
+ * pointer.
+ */
+bool GpuDispatcherImpl::failureDetected() const
+{
+	if (!mDispatcher)
+	{
+		return true;
+	}
+	return mDispatcher->mFailureDetected;
+}
+
+/* Puts the dispatcher into failure mode by raising the worker's mFailureDetected flag.
+ *
+ * The constructor calls this on the path where allocating the worker failed, i.e. with
+ * mDispatcher still NULL; in that case there is no flag to raise and the call is a no-op.
+ */
+void GpuDispatcherImpl::forceFailureMode()
+{
+	if (mDispatcher)
+	{
+		mDispatcher->mFailureDetected = true;
+	}
+}
+
+/* Forwards to PxGpuWorkerThread::launchCopyKernel(). */
+void GpuDispatcherImpl::launchCopyKernel(PxGpuCopyDesc* desc, uint32_t count, CUstream stream)
+{
+	PxGpuWorkerThread& worker = *mDispatcher;
+	worker.launchCopyKernel(desc, count, stream);
+}
+
+/* Returns the launch task ("GpuDispatcher.launch"); it must exist (asserted). */
+PxBaseTask& GpuDispatcherImpl::getPreLaunchTask()
+{
+	PX_ASSERT(mLaunchTask);
+	LaunchTask& preLaunch = *mLaunchTask;
+	return preLaunch;
+}
+
+/* Registers a dependent on the launch task; the launch task must exist (asserted). */
+void GpuDispatcherImpl::addPreLaunchDependent(PxBaseTask& dependent)
+{
+	PX_ASSERT(mLaunchTask);
+	LaunchTask& preLaunch = *mLaunchTask;
+	preLaunch.addDependent(dependent);
+}
+
+/* Returns the block task ("GpuDispatcher.block"); it must exist (asserted). */
+PxBaseTask& GpuDispatcherImpl::getPostLaunchTask()
+{
+	PX_ASSERT(mBlockTask);
+	BlockTask& postLaunch = *mBlockTask;
+	return postLaunch;
+}
+
+/* Registers a dependent that runs after the batch has been flushed.
+ *
+ * The dependent is attached to the sync task, the sync task becomes the continuation
+ * of the block task, and the reference addDependent() took on the sync task is dropped
+ * again.  The three calls must stay in this order.
+ */
+void GpuDispatcherImpl::addPostLaunchDependent(PxBaseTask& dependent)
+{
+	PX_ASSERT(mSyncTask && mBlockTask);
+
+	FanoutTask* const syncTask = mSyncTask;
+	syncTask->addDependent(dependent);
+	mBlockTask->setContinuation(syncTask);
+	syncTask->removeReference();
+}
+
+/* ==================== Worker Thread =========================== */
+
+/* Zero-initialises counters and pointers.  The blocking event cache is built with
+ * CU_EVENT_BLOCKING_SYNC, the non-blocking one with flags 0.  The context manager is
+ * supplied later through setCudaContext().
+ */
+PxGpuWorkerThread::PxGpuWorkerThread() :
+	mActiveGroups(0),
+	mCtxMgr(NULL),
+	mFailureDetected(false),
+	mCompletionRingPush(0),
+	mCompletionRingPop(0),
+	mCachedBlockingEvents(CU_EVENT_BLOCKING_SYNC),
+	mCachedNonBlockingEvents(0),
+	mCountActiveScenes(0),
+	mSmStartTimes(0),
+	mUtilKernelWrapper(0)
+{
+}
+
+/* Stores the address of the context manager used for all later acquire/release calls. */
+void PxGpuWorkerThread::setCudaContext(PxCudaContextManager& ctx)
+{
+	PxCudaContextManager* manager = &ctx;
+	mCtxMgr = manager;
+}
+
+/* Releases the cached events and streams while holding the CUDA context, then frees
+ * the SM start-time buffer and the utility kernel wrangler.
+ */
+PxGpuWorkerThread::~PxGpuWorkerThread()
+{
+	mCtxMgr->acquireContext();
+
+	mCachedBlockingEvents.clear();
+	mCachedNonBlockingEvents.clear();
+
+	// destroy every cached stream
+	mCachedStreams.reset();
+	for (; !mCachedStreams.empty(); )
+	{
+		GD_CHECK_CALL(cuStreamDestroy(mCachedStreams.get(mCachedStreams.popBack())));
+	}
+
+	mCtxMgr->releaseContext();
+
+	if (mSmStartTimes)
+	{
+		PX_FREE(mSmStartTimes);
+	}
+
+	// the wrangler acquires the context on its own, so this happens after releaseContext()
+	if (mUtilKernelWrapper)
+	{
+		PX_DELETE(mUtilKernelWrapper);
+	}
+}
+
+/* Emits a cross-thread profiler start event named id; compiles to nothing unless
+ * PX_SUPPORT_PXTASK_PROFILING is enabled.
+ */
+void PxGpuWorkerThread::emitStartEvent(const char* id)
+{
+	PX_UNUSED(id);
+
+#if PX_SUPPORT_PXTASK_PROFILING
+	PX_PROFILE_START_CROSSTHREAD(id,0);
+#endif
+}
+
+/* Emits a cross-thread profiler stop event named id; compiles to nothing unless
+ * PX_SUPPORT_PXTASK_PROFILING is enabled.
+ */
+void PxGpuWorkerThread::emitStopEvent(const char* id)
+{
+	PX_UNUSED(id);
+
+#if PX_SUPPORT_PXTASK_PROFILING
+	PX_PROFILE_STOP_CROSSTHREAD(id,0);
+#endif
+}
+
+/* A TaskManager reports that its simulation is being stepped: refresh the
+ * concurrent-streams setting from the context manager and count the scene as active.
+ */
+void PxGpuWorkerThread::startSimulation()
+{
+	mUsingConcurrentStreams = mCtxMgr->getUsingConcurrentStreams();
+	shdfnd::atomicIncrement(&mCountActiveScenes);
+}
+
+
+/* A TaskManager reports that its simulation has ended: when the last active scene is
+ * gone the stream cache is reset.
+ */
+void PxGpuWorkerThread::stopSimulation()
+{
+	const bool lastSceneStopped = (shdfnd::atomicDecrement(&mCountActiveScenes) == 0);
+	if (lastSceneStopped)
+	{
+		mCachedStreams.reset();
+	}
+}
+
+
+/* PxGpuDispatcher worker thread main loop.
+ *
+ * Waits on mInputReady, leaves when a quit was signalled and otherwise processes the
+ * submitted tasks with the CUDA context acquired.  With SINGLE_CUDA_THREAD defined
+ * only the thread name is set.
+ */
+void PxGpuWorkerThread::execute()
+{
+	setName("GpuDispatcher.Worker");
+
+#ifndef SINGLE_CUDA_THREAD
+	for (;;)
+	{
+		mInputReady.wait();
+
+		if (quitIsSignalled())
+		{
+			break;
+		}
+
+		if (mSubmittedTaskList.empty())
+		{
+			continue;
+		}
+
+		mCtxMgr->acquireContext();
+		processActiveTasks();
+		mCtxMgr->releaseContext();
+	}
+
+	quit();
+#endif
+}
+
+/* Expected to be called by a GPU task, or by a function a GPU task calls.  The
+ * original author notes this probably needs no locking, because only one GPU task's
+ * launchInstance() runs at a time per GpuDispatcherImpl (per CUDA context).
+ *
+ * Takes a reference on the task and queues it as a completion task; does nothing in
+ * failure mode.
+ */
+void PxGpuWorkerThread::addCompletionPrereq(PxBaseTask& task)
+{
+	if (!mFailureDetected)
+	{
+		emitStartEvent("GpuDispatcher.AddCompletionEvent");
+		task.addReference();
+		mCompletionTasks.pushBack(&task);
+		emitStopEvent("GpuDispatcher.AddCompletionEvent");
+	}
+}
+
+namespace
+{
+	/* Launches func on a gridWidth x gridHeight x 1 grid of numThreads x 1 x 1 blocks
+	 * with a single kernel argument v0 and returns the driver result.
+	 */
+	template <typename T0>
+	PX_NOINLINE CUresult launchKernelGrid(CUfunction func, unsigned int gridWidth, unsigned int gridHeight, unsigned int numThreads, unsigned int sharedMem, CUstream stream, T0 v0)
+	{
+		void* args[] = { &v0 };
+		return cuLaunchKernel(func,
+			gridWidth, gridHeight, 1,
+			numThreads, 1, 1,
+			sharedMem, stream, args, NULL);
+	}
+}
+
+/* Issues count copy/memset descriptors on stream.
+ *
+ * Three paths:
+ *  - host memory cannot be mapped: one asynchronous driver call per descriptor;
+ *  - exactly one descriptor: the MemCopyAsync utility kernel;
+ *  - otherwise: the MemCopyBatchedAsync utility kernel, one grid row per descriptor.
+ * On the kernel paths the host-side pointer of DeviceToHost / HostToDevice descriptors
+ * is overwritten in place with its mapped pinned pointer.
+ */
+void PxGpuWorkerThread::launchCopyKernel(PxGpuCopyDesc* desc, uint32_t count, CUstream stream)
+{
+	if (!mCtxMgr->canMapHostMemory())
+	{
+		for (uint32_t i = 0; i < count; ++i, ++desc)
+		{
+			PX_ASSERT(desc->isValid());
+			switch (desc->type)
+			{
+			case PxGpuCopyDesc::DeviceMemset32:
+				// bytes >> 2: the driver call takes a count of 32-bit words
+				GD_CHECK_CALL(cuMemsetD32Async(desc->dest, (uint32_t) desc->source, desc->bytes >> 2, stream));
+				break;
+			case PxGpuCopyDesc::DeviceToDevice:
+				GD_CHECK_CALL(cuMemcpyDtoDAsync(desc->dest, desc->source, desc->bytes, stream));
+				break;
+			case PxGpuCopyDesc::DeviceToHost:
+				GD_CHECK_CALL(cuMemcpyDtoHAsync((void*) desc->dest, desc->source, desc->bytes, stream));
+				break;
+			case PxGpuCopyDesc::HostToDevice:
+				GD_CHECK_CALL(cuMemcpyHtoDAsync(desc->dest, (void*) desc->source, desc->bytes, stream));
+				break;
+			}
+		}
+		return;
+	}
+
+	if (count == 1)
+	{
+		CUfunction copyKernel = mUtilKernelWrapper->getCuFunction(KERNEL_MEMCOPY);
+		uint32_t smCount = (uint32_t)mCtxMgr->getMultiprocessorCount();
+
+		PX_ASSERT(desc->isValid());
+
+		CUdeviceptr mapped;
+		switch (desc->type)
+		{
+		case PxGpuCopyDesc::DeviceToHost:
+			mapped = mCtxMgr->getMemoryManager()->getMappedPinnedPtr(PxCudaBufferPtr(desc->dest));
+			desc->dest = mapped;
+			break;
+		case PxGpuCopyDesc::HostToDevice:
+			mapped = mCtxMgr->getMemoryManager()->getMappedPinnedPtr(PxCudaBufferPtr(desc->source));
+			desc->source = mapped;
+			break;
+		case PxGpuCopyDesc::DeviceMemset32:
+		case PxGpuCopyDesc::DeviceToDevice:
+			// nothing to remap; the cases exist to keep GCC's switch warning quiet
+			break;
+		}
+
+		const uint32_t numThreads = mCtxMgr->supportsArchSM20() ? 256 : 128;
+
+		// one block per (numThreads * 4 * 6) bytes, at least one, then capped at smCount
+		uint32_t blocks = uint32_t(desc->bytes / (numThreads * 4 * 6));
+		if (blocks == 0)
+		{
+			blocks = 1;
+		}
+		if (blocks > smCount)
+		{
+			blocks = smCount;
+		}
+
+		GD_CHECK_CALL(
+			launchKernel(copyKernel, blocks, numThreads, 0, stream, *desc)
+			);
+		return;
+	}
+
+	CUfunction batchedKernel = mUtilKernelWrapper->getCuFunction(KERNEL_MEMCOPY_BATCHED);
+	CUdeviceptr mapped;
+
+	for (uint32_t i = 0; i < count; ++i)
+	{
+		PxGpuCopyDesc& d = desc[i];
+		PX_ASSERT(d.isValid());
+
+		switch (d.type)
+		{
+		case PxGpuCopyDesc::DeviceToHost:
+			mapped = mCtxMgr->getMemoryManager()->getMappedPinnedPtr(PxCudaBufferPtr(d.dest));
+			d.dest = mapped;
+			break;
+		case PxGpuCopyDesc::HostToDevice:
+			mapped = mCtxMgr->getMemoryManager()->getMappedPinnedPtr(PxCudaBufferPtr(d.source));
+			d.source = mapped;
+			break;
+		case PxGpuCopyDesc::DeviceMemset32:
+		case PxGpuCopyDesc::DeviceToDevice:
+			// nothing to remap; the cases exist to keep GCC's switch warning quiet
+			break;
+		}
+	}
+
+	uint32_t numThreads, numBlocksX;
+	if (!mCtxMgr->supportsArchSM20())
+	{
+		numThreads = 128;
+		numBlocksX = 2;
+	}
+	else
+	{
+		numThreads = 256;
+		numBlocksX = 1;
+	}
+
+	// the kernel receives the mapped pinned pointer of the descriptor array itself
+	mapped = mCtxMgr->getMemoryManager()->getMappedPinnedPtr(PxCudaBufferPtr(desc));
+
+	GD_CHECK_CALL(
+		launchKernelGrid(batchedKernel, numBlocksX, count, numThreads, 0, stream,
+			mapped)
+		);
+}
+
+/* Flushes the current push buffer and appends a batch record to the completion ring.
+ *
+ * endEvent is recorded on stream 0 only when it is non-null and bit 0 of syncStream
+ * is set; otherwise stream 0 is merely queried.  The record stores the event, the
+ * stream and the continuation task, after which mRecordEventQueued is set.  Does
+ * nothing in failure mode.
+ */
+void PxGpuWorkerThread::flushBatch(CUevent endEvent, CUstream syncStream, PxBaseTask* task)
+{
+	shdfnd::Mutex::ScopedLock guard(mMutex);
+
+	if (mFailureDetected)
+	{
+		return;
+	}
+
+	const bool recordEndEvent = endEvent && (1 & intptr_t(syncStream));
+	if (recordEndEvent)
+	{
+		// blocking record event on stream 0, flushes current push buffer
+		GD_CHECK_CALL(cuEventRecord(endEvent, 0));
+	}
+	else
+	{
+		CUresult queryResult = cuStreamQuery(0); // flushes current push buffer
+		PX_ASSERT(queryResult == CUDA_SUCCESS || queryResult == CUDA_ERROR_NOT_READY);
+		PX_UNUSED(queryResult);
+	}
+
+	// wait for a free slot: the ring is full while the slot after the push index is the pop index
+	int next = (mCompletionRingPush + 1) % SIZE_COMPLETION_RING;
+	while (next == mCompletionRingPop)
+	{
+		// not expected to happen; asserts, then backs off with the context released
+		PX_ALWAYS_ASSERT();
+		mCtxMgr->releaseContext();
+		sleep(1);
+		mCtxMgr->acquireContext();
+	}
+
+	CudaBatch& batch = mCompletionRing[ mCompletionRingPush ];
+	batch.blockingEvent = endEvent;
+	batch.blockingStream = syncStream;
+	batch.continuationTask = task;
+
+	mCompletionRingPush = next;
+	mRecordEventQueued.set();
+}
+
+// Drains the submitted-task list into the ready list that matches each task's hint.
+// Resets mInputReady first.  With concurrent streams enabled, a task whose stream
+// index is still 0 gets one popped from the stream cache.
+void PxGpuWorkerThread::pollSubmitted(shdfnd::Array<ReadyTask>* ready)
+{
+	mInputReady.reset();
+
+	for (PxGpuTask* gpuTask = (PxGpuTask*) mSubmittedTaskList.popBack();
+		 gpuTask != 0;
+		 gpuTask = (PxGpuTask*) mSubmittedTaskList.popBack())
+	{
+		PxGpuTaskHint::Enum hint = gpuTask->getTaskHint();
+
+		ReadyTask& slot = ready[ hint ].insert();
+		slot.task = gpuTask;
+		slot.iteration = 0;
+
+		if (mUsingConcurrentStreams && gpuTask->mStreamIndex == 0)
+		{
+			gpuTask->mStreamIndex = mCachedStreams.popBack();
+		}
+	}
+}
+
+/* Runs all submitted GPU tasks until none remain.
+ *
+ * Tasks are sorted into per-hint ready lists and executed phase by phase
+ * (HostToDevice -> Kernel -> DeviceToHost -> HostToDevice ...).  Each task's
+ * launchInstance() is called repeatedly until it reports that it is no longer active,
+ * at which point the task is released.  After a phase, outstanding work is flushed
+ * through flushBatch() together with the queued completion tasks.
+ *
+ * In failure mode every pending task is simply released.  All exits, including the
+ * failure exit inside the launch loop, hand the non-blocking event back to its cache
+ * and emit the profiler stop event that matches the start event emitted on entry.
+ */
+void PxGpuWorkerThread::processActiveTasks()
+{
+	emitStartEvent("GpuDispatcher.ProcessTasksEvent");
+	
+	if (mFailureDetected)
+	{
+		while (!mSubmittedTaskList.empty())
+		{
+			mInputReady.reset();
+			mSubmittedTaskList.popBack()->release();
+		}
+		emitStopEvent("GpuDispatcher.ProcessTasksEvent");
+		return;
+	}
+
+	for (uint32_t i = 0; i < PxGpuTaskHint::NUM_GPU_TASK_HINTS; i++)
+	{
+		mReady[i].clear();
+	}
+
+	//OutputDebugString("batch beginning\n");
+
+	// event used for the wait-for-idle (WFI) points inserted between tasks
+	const CUevent nonBlockEv = mCachedNonBlockingEvents.get();
+	bool workToFlush = false;
+	bool tasksRemain = false;
+	PxGpuTaskHint::Enum curMode = PxGpuTaskHint::HostToDevice;
+
+	pollSubmitted(mReady);
+
+	do
+	{
+		// cycle current run mode when necessary
+		if (mReady[ curMode ].size() == 0)
+		{
+			if (curMode == PxGpuTaskHint::HostToDevice)
+			{
+				curMode = PxGpuTaskHint::Kernel;
+			}
+			else if (curMode == PxGpuTaskHint::Kernel)
+			{
+				curMode = PxGpuTaskHint::DeviceToHost;
+			}
+			else
+			{
+				curMode = PxGpuTaskHint::HostToDevice;
+			}
+		}
+
+		// stays non-zero only while every launched task of this phase used the same stream
+		uint32_t singleStream = mReady[curMode].empty() ? 0 : mReady[curMode].front().task->mStreamIndex;
+
+		while (mReady[ curMode ].size())
+		{
+			bool needwfi = false;
+			for (uint32_t i = 0 ; i < mReady[ curMode ].size() ; i++)
+			{
+				ReadyTask& r = mReady[ curMode ][ i ];
+
+				if (r.task->mPreSyncRequired)
+				{
+					// If mPreSyncRequired is set *before* the task is run, it implies
+					// a WFI must be inserted before this task issues any work.  Multiple
+					// ready tasks may have this flag, so to avoid inserting multiple WFI
+					// requests, we skip marked tasks in this pass and note a WFI is needed.
+					needwfi = true;
+					r.task->mPreSyncRequired = false;
+				}
+				else
+				{
+					const CUstream s = (r.task->mStreamIndex > 0) ? mCachedStreams.get(r.task->mStreamIndex) : 0;
+#if PX_PROFILE
+					r.task->mTm->emitStartEvent(*r.task);
+#endif
+					bool active = r.task->launchInstance(s, int(r.iteration++));
+#if PX_PROFILE
+					r.task->mTm->emitStopEvent(*r.task);
+#endif
+					if(singleStream != r.task->mStreamIndex)
+						singleStream = 0;
+
+					// If the launchInstance() call reported a non-recoverable error, gracefully
+					// release all scheduled tasks
+					if (mFailureDetected)
+					{
+						// Release all ready tasks
+						for (uint32_t h = 0; h < PxGpuTaskHint::NUM_GPU_TASK_HINTS; h++)
+						{
+							for (uint32_t j = 0 ; j < mReady[ h ].size() ; j++)
+							{
+								mReady[ h ][ j ].task->release();
+							}
+							mReady[ h ].clear();
+						}
+
+						// Release all submitted tasks, until idle
+						while (!mSubmittedTaskList.empty())
+						{
+							mInputReady.reset();
+							mSubmittedTaskList.popBack()->release();
+						}
+
+						// Same bookkeeping as the regular exit: give the event back to the
+						// cache (otherwise it is never returned or destroyed) and close the
+						// profiler range opened at the top of this function.
+						mCachedNonBlockingEvents.add(nonBlockEv);
+						emitStopEvent("GpuDispatcher.ProcessTasksEvent");
+						return;
+					}
+
+					workToFlush = true;
+					if (r.task->mPreSyncRequired)
+					{
+						// This task has asked for a sync point, meaning it has launched a copy
+						// or a kernel that must be completed before any later tasks are allowed
+						// to start.  Insert a WFI and clear the needwfi flag
+						GD_CHECK_CALL(cuEventRecord(nonBlockEv, 0));
+						needwfi = false;
+						r.task->mPreSyncRequired = false;
+					}
+
+					if (!active)
+					{
+						r.task->release();
+						mReady[ curMode ].replaceWithLast(i);
+						pollSubmitted(mReady);
+						// revisit slot i, which now holds the former last entry
+						// (unsigned wrap at i == 0 is undone by the loop's i++)
+						i -= 1;
+					}
+				}
+			}
+
+			if (needwfi)
+			{
+				GD_CHECK_CALL(cuEventRecord(nonBlockEv, 0));
+			}
+		}
+
+		/* We have completed one of the three phases */
+
+		tasksRemain = false;
+		for (int e = (int) PxGpuTaskHint::HostToDevice ; e != (int) PxGpuTaskHint::NUM_GPU_TASK_HINTS ; e++)
+		{
+			tasksRemain |= (mReady[ e ].size() != 0);
+		}
+
+		if (!mCompletionTasks.empty())
+		{
+			workToFlush = true;
+		}
+
+		if (workToFlush && (tasksRemain == false || curMode == PxGpuTaskHint::DeviceToHost))
+		{
+			//OutputDebugString("batch ending\n");
+
+			// only the first completion task carries the blocking event; the rest are
+			// queued without one
+			while (mCompletionTasks.size())
+			{
+				PxBaseTask* t = mCompletionTasks.popBack();
+				if (workToFlush)
+				{
+					CUevent stopEv = mCachedBlockingEvents.get();
+					CUstream stream = singleStream ? mCachedStreams.get(singleStream) : (CUstream)1;
+					flushBatch(stopEv, stream, t);
+					workToFlush = false;
+				}
+				else
+				{
+					flushBatch(0, 0, t);
+				}
+			}
+			if (workToFlush)
+			{
+				/* Getting here is probably an indication of a bug in your task graph,
+				 * but it is possible to get this warning if you have CPU tasks that
+				 * can delay GpuTasks.  So, consider this warning "training wheels" and
+				 * disable it if you know your graph is correct.
+				 */
+				// SJB - Disabling this warning, APEX does this every frame because
+				// of how BasicIOS and IOFX interact.
+				//shdfnd::getFoundation().error(PX_WARN,
+				//	"CUDA work generated without a completion dependency!");
+				CUevent stopEv = mCachedBlockingEvents.get();
+				flushBatch(stopEv, (CUstream)1, NULL);
+			}
+		}
+	}
+	while (tasksRemain);
+
+	mCachedNonBlockingEvents.add(nonBlockEv);
+
+	emitStopEvent("GpuDispatcher.ProcessTasksEvent");
+}
+
+#endif
diff --git a/PxShared/src/cudamanager/src/HeapManagerInterface.h b/PxShared/src/cudamanager/src/HeapManagerInterface.h
new file mode 100644
index 0000000..7fe7f2e
--- /dev/null
+++ b/PxShared/src/cudamanager/src/HeapManagerInterface.h
@@ -0,0 +1,156 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+//----------------------------------------------------------------------------//
+// HeapManagerInterface.h
+//----------------------------------------------------------------------------//
+
+#ifndef PXCUDACONTEXTMANAGER_HEAPMANAGERINTERFACE_H
+#define PXCUDACONTEXTMANAGER_HEAPMANAGERINTERFACE_H
+
+#include "task/PxTaskDefine.h"
+#include "foundation/PxSimpleTypes.h"
+#include "cudamanager/PxCudaMemoryManager.h"
+
+#include <string.h>
+
+namespace physx
+{
+// Bit flags used to select which groups of ApexHeapStats fields are gathered.
+// The group each flag refers to is labelled by the matching comment inside ApexHeapStats.
+struct HeapStatsFlags
+{
+	enum Enum
+	{
+		F_BASIC_STATS				= 1 << 0,
+		F_INTERNAL_FRAGMENTATION	= 1 << 1,
+		F_BIGGEST_FREE_BLOCK		= 1 << 2,
+		F_HISTOGRAM					= 1 << 3,
+		F_ALLOC_ID_STATS			= 1 << 4,
+		// all bits set: selects every group above
+		F_ALL						= 0xFFFFFFFF,
+	};
+};
+
+// Number of bits in a size_t. The replacement list is parenthesized so the macro
+// keeps its value when it appears inside a larger expression (e.g. x / BITSPERWORD).
+#define BITSPERWORD (sizeof(size_t)*8)
+
+// Plain statistics record filled in by HeapManagerInterface::getStats().
+// The fields are grouped by the HeapStatsFlags bit that selects them.
+class ApexHeapStats
+{
+public:
+	// Starts with every counter at zero, including the histogram and per-ID arrays
+	// (which the initializer list alone would leave uninitialized).
+	ApexHeapStats():
+		heapSize(0),
+		totalAllocated(0),
+		maxAllocated(0),
+		internalFragmentation(0),
+		maxInternalFragmentation(0),
+		biggestFreeBlock(0),
+		numEntries(0)
+	{
+		memset(freeBuddyHistogram, 0, sizeof(freeBuddyHistogram));
+		memset(allocatedBuddyHistogram, 0, sizeof(allocatedBuddyHistogram));
+		memset(allocIdStats, 0, sizeof(allocIdStats));
+	}
+
+	// Zeroes the whole object in one go.
+	PX_INLINE void reset()
+	{
+		memset(this, 0, sizeof(ApexHeapStats));
+	}
+
+	// F_BASIC_STATS
+	size_t	heapSize;
+	size_t	totalAllocated;
+	size_t	maxAllocated;
+
+	// F_INTERNAL_FRAGMENTATION
+	size_t	internalFragmentation;
+	size_t	maxInternalFragmentation;
+
+	// F_BIGGEST_FREE_BLOCK
+	size_t	biggestFreeBlock;
+
+	// F_HISTOGRAM
+	size_t	freeBuddyHistogram[BITSPERWORD];
+	size_t	allocatedBuddyHistogram[BITSPERWORD];
+	size_t	numEntries;
+
+	// F_ALLOC_ID_STATS
+	PxAllocIdStats allocIdStats[PxAllocId::NUM_IDS];
+};
+
+
+// Abstract interface of a paged heap manager: it obtains raw memory through an
+// Allocator and hands out sub-blocks of it.
+class HeapManagerInterface
+{
+public:
+	// simple allocator interface over which the heap manager does its base allocation and allocates further pages
+	class Allocator
+	{
+	public:
+		virtual ~Allocator() {};
+
+		// returns a block of `size` bytes
+		virtual		void*		alloc(const size_t size) = 0;
+		// releases a block previously returned by alloc(); `size` is the size of that block
+		virtual		void		free(void* addr, const size_t size) = 0;
+	};
+
+	virtual ~HeapManagerInterface() {};
+
+	// INTERFACE METHODS
+	// init the HeapManager by passing it a block of memory and the smallest size of a memory block.
+	// returns true if init was successful
+	virtual		bool		init(Allocator* memAllocator, const size_t baseSize, const size_t pageSize, const size_t minBlockSize, const size_t maxIntFrag = size_t(-1)) = 0;
+
+	// Changes the page size. The size of allocations over the supplied Allocator are a multiple of the pageSize.
+	// returns true if the page size was valid. (!0, >minBlockSize, pow2)
+	virtual		bool		setPageSize(size_t pageSize) = 0;
+
+	// returns the address of an allocated block for the requested size.
+	// returns a NULL ptr if alloc failed.
+	virtual		void*		alloc(const size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED)) = 0;
+
+	// returns true if the block at the given address could be resized to size
+	// returns false if this failed. Manual free and alloc is still possible but needs a memcopy.
+	virtual		bool		realloc(void* addr, const size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED)) = 0;
+
+	// frees a given block.
+	// returns true if the operation was successful
+	virtual		bool		free(void* addr) = 0;
+
+	// deallocates all empty pages
+	virtual		void		freeEmptyPages() = 0;
+
+	// ensures that there is free memory of at least the requested size
+	// returns true if the operation was successful. Free memory was already big enough or new pages were allocated successfully.
+	virtual		bool		reserve(size_t size) = 0;
+
+	// returns stats into an ApexHeapStats object, stats can be selected with HeapStatsFlags.
+	// returns true if the operation was successful
+	virtual		bool		getStats(ApexHeapStats& stats, const uint32_t flags) = 0;
+
+	// discretizes memory into an array such that it can be visualized
+	// returns true if the operation was successful
+	virtual		bool		visualizeMemory(uint8_t* array, const size_t size) = 0;
+
+	// returns the base address of the page containing the memory block at addr.
+	// returns NULL if addr doesn't correspond to a page
+	virtual		void*		findBaseAddress(void* addr) = 0;
+};
+
+} // end physx namespace
+
+#endif // PXCUDACONTEXTMANAGER_HEAPMANAGERINTERFACE_H
diff --git a/PxShared/src/cudamanager/src/HeapManagerLinkedList.h b/PxShared/src/cudamanager/src/HeapManagerLinkedList.h
new file mode 100644
index 0000000..45a359d
--- /dev/null
+++ b/PxShared/src/cudamanager/src/HeapManagerLinkedList.h
@@ -0,0 +1,204 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+//----------------------------------------------------------------------------//
+// HeapManagerLinkedList.h
+//----------------------------------------------------------------------------//
+
+#ifndef PXCUDACONTEXTMANAGER_HEAPMANAGERLINKEDLIST_H
+#define PXCUDACONTEXTMANAGER_HEAPMANAGERLINKEDLIST_H
+
+#include "foundation/PxAssert.h"
+
+namespace physx
+{
+
+// Minimal intrusive singly linked list. T must provide a `next` pointer and an
+// `addr` member used for ordering. The list only links elements; it never
+// allocates or frees them.
+template <typename T>
+class LinkedList
+{
+public:
+
+	PX_INLINE LinkedList()
+		: mHead(NULL)
+		, mNumElements(0)
+	{
+	}
+
+	// Pushes elt onto the front of the list; no ordering is maintained.
+	PX_INLINE void insert(T*& elt)
+	{
+		// Chain to the current head (NULL when the list is empty) so that the
+		// elements already in the list stay reachable behind the new one.
+		elt->next = mHead;
+		mHead = elt;
+		mNumElements++;
+	}
+
+	// Inserts elt so that the list stays sorted by ascending addr.
+	PX_INLINE void insertSorted(T*& elt)
+	{
+		if (!mHead)
+		{
+			mHead = elt;
+			mHead->next = NULL;
+		}
+		else if (!mHead->next || (mHead->addr > elt->addr))
+		{
+			if (mHead->addr > elt->addr)
+			{
+				// new smallest element becomes the head
+				elt->next = mHead;
+				mHead = elt;
+			}
+			else
+			{
+				// single-element list: append behind the head
+				mHead->next = elt;
+				elt->next = NULL;
+			}
+		}
+		else
+		{
+			// advance to the last node whose successor is not larger than elt
+			T* cur = mHead;
+			while (cur->next && (elt->addr > cur->next->addr))
+			{
+				cur = cur->next;
+			}
+			elt->next = cur->next;
+			cur->next = elt;
+		}
+		mNumElements++;
+	}
+
+	// Unlinks and returns the head element, or NULL if the list is empty.
+	PX_INLINE T* pop()
+	{
+		if (mHead)
+		{
+			T* ret = mHead;
+			mHead = mHead->next;
+			mNumElements--;
+			return ret;
+		}
+		return NULL;
+	}
+
+	// Unlinks elt from the list. Returns false if elt is not an element of the list.
+	PX_INLINE bool remove(const T* elt)
+	{
+		PX_ASSERT(elt);
+		if (mHead && mHead == elt)
+		{
+			mHead = mHead->next;
+			mNumElements--;
+			return true;
+		}
+		else
+		{
+			// look for the predecessor of elt
+			T* cur = mHead;
+			while (cur && cur->next != elt)
+			{
+				PX_ASSERT(cur->addr < elt->addr); // assert for sorted list property.
+				cur = cur->next;
+			}
+			if (cur && elt)
+			{
+				cur->next = elt->next;
+				mNumElements--;
+				return true;
+			}
+		}
+		return false;
+	}
+
+	// Returns the element whose addr equals the argument, or NULL. Stops at the
+	// first element with a larger addr, i.e. it relies on the list being sorted.
+	PX_INLINE T* find(const size_t addr)
+	{
+		T* cur = mHead;
+		while (cur && cur->addr < addr)
+		{
+			cur = cur->next;
+		}
+
+		return cur && (cur->addr == addr) ? cur : NULL;
+	}
+
+	// Like find(), but also unlinks the element that was found.
+	PX_INLINE T* findAndPop(const size_t addr)
+	{
+		if (mHead == NULL)
+		{
+			return NULL;
+		}
+
+		if (mHead->addr == addr)
+		{
+			return pop();
+		}
+
+		T* cur = mHead;
+		T* last = mHead;
+		while (cur)
+		{
+			if (cur->addr == addr)
+			{
+				last->next = cur->next;
+				mNumElements--;
+				return cur;
+			}
+			else if (cur->addr > addr)
+			{
+				return NULL; // because list is sorted.
+			}
+			else
+			{
+				last = cur;
+				cur = cur->next;
+			}
+		}
+		return NULL;
+	}
+
+	// Number of elements currently accounted for.
+	PX_INLINE	size_t	getSize()
+	{
+		return mNumElements;
+	}
+	PX_INLINE	T*		getHead()
+	{
+		return mHead;
+	}
+
+	// hacky: these overwrite the bookkeeping directly without touching any links.
+	PX_INLINE	void	setSize(size_t s)
+	{
+		mNumElements = s;
+	}
+	PX_INLINE	void	setHead(T* h)
+	{
+		mHead = h;
+	}
+private:
+	T*		mHead;
+	size_t	mNumElements;
+};
+
+}  // end physx namespace
+
+#endif // PXCUDACONTEXTMANAGER_HEAPMANAGERLINKEDLIST_H
diff --git a/PxShared/src/cudamanager/src/HeapManagerRef.cpp b/PxShared/src/cudamanager/src/HeapManagerRef.cpp
new file mode 100644
index 0000000..bf3847f
--- /dev/null
+++ b/PxShared/src/cudamanager/src/HeapManagerRef.cpp
@@ -0,0 +1,1380 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "CudaMemoryManager.h"
+#include "HeapManagerRef.h"
+
+#include "PsSort.h"
+#include "PsArray.h"
+#include "PsAllocator.h"
+#include "PsString.h"
+
+#include "foundation/PxMath.h"
+#include "foundation/PxErrorCallback.h"
+#include "foundation/PxMemory.h"
+
+
+#if DUMP_HEAP_USAGE_TO_FILE
+#include "PsWindowsInclude.h"
+#endif
+
+using namespace physx::shdfnd;
+using namespace physx;
+
+
+// Delete helpers: release x if it is non-NULL, then reset it to NULL.
+// The argument is expanded more than once, so it must be a side-effect-free lvalue.
+#define CMM_DELETE_SINGLE(x)	{ if(x) delete x; x = NULL; }
+#define CMM_DELETE_ARRAY(x)		{ if(x) delete [] x; x = NULL; }
+
+// Builds an empty manager: no pages, no allocator, all counters zero.
+// init() has to be called before the manager can serve allocations.
+HeapManagerRef::HeapManagerRef(physx::PxErrorCallback& errorCallback, bool enableMutex)
+	: mHeaps(PX_DEBUG_EXP("HeapManagerRef:mHeaps"))
+	, mBuddyPool("mBuddyPool", 1024)
+	, mPageSize(0)
+	, mMinBlockSize(0)
+	, mMaxIntFrag(size_t(-1))
+	, mNewEmptyPage(false)
+	, mMemAllocator(NULL)
+	, mGlobalAllocMem(0)
+	, mGlobalMaxAllocMem(0)
+	, mGlobalInternalFragmentation(0)
+	, mGlobalMaxInternalFragmentation(0)
+	, mErrorCallback(errorCallback)
+
+{
+	// enableMutex is accepted but currently ignored.
+	PX_UNUSED(enableMutex);  // SJB: heap alloc of shdfnd::Mutex not working for me
+}
+
+// Hands the backing memory of every page back to the allocator and destroys
+// the per-page Heap objects.
+HeapManagerRef::~HeapManagerRef()
+{
+	const uint32_t pageCount = mHeaps.size();
+	for (uint32_t p = 0; p != pageCount; ++p)
+	{
+		HeapManagerPage& page = mHeaps[p];
+		const bool canRelease = mMemAllocator && page.baseAddr;
+		if (canRelease)
+		{
+			void* pageMem = reinterpret_cast<void*>(page.baseAddr);
+			mMemAllocator->free(pageMem, page.heap->getTotalMemorySize());
+		}
+		CMM_DELETE_SINGLE(page.heap);
+	}
+#if DUMP_HEAP_USAGE_TO_FILE
+	fclose(mLogFile);
+#endif
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// INTERFACE METHODS
+// One-time setup of the manager.
+//   memAllocator  source of page memory; must not be NULL
+//   baseSize      size of an initial persistent heap, or 0 to start without one;
+//                 when non-zero it must be a multiple of minBlockSize
+//   pageSize      power of two, not smaller than minBlockSize
+//   minBlockSize  non-zero power of two
+//   maxIntFrag    forwarded unchanged to every Heap that gets created
+// Returns false when a parameter is rejected, when init was already done, or
+// when the initial heap could not be allocated.
+bool HeapManagerRef::init(Allocator* memAllocator, const size_t baseSize, const size_t pageSize, const size_t minBlockSize, const size_t maxIntFrag)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+	// init Heap and do some basic checks.
+
+	// init only once
+	if (mHeaps.size())
+	{
+		return false;
+	}
+
+	// A zero block size can never be valid; reject it up front so the modulo
+	// below cannot divide by zero.
+	if (minBlockSize == 0)
+	{
+		return false;
+	}
+
+	if (baseSize && (minBlockSize > baseSize))
+	{
+		return false;
+	}
+
+	if (minBlockSize > pageSize)
+	{
+		return false;
+	}
+
+	if (baseSize && (baseSize % minBlockSize))
+	{
+		return false;
+	}
+
+	// minBlockSize has to be an exact power of two
+	uint8_t minBlockSizeLog2;
+	if (minBlockSize != findNextPow2(minBlockSizeLog2, minBlockSize, 0, BITSPERWORD))
+	{
+		return false;
+	}
+
+	// pageSize has to be an exact power of two, at least minBlockSize
+	if (pageSize != findNextPow2(pageSize, minBlockSizeLog2, BITSPERWORD))
+	{
+		return false;
+	}
+
+	if (!memAllocator)
+	{
+		return false;
+	}
+
+	mMemAllocator = memAllocator;
+	mPageSize = pageSize;
+	mMinBlockSize = minBlockSize;
+	mMaxIntFrag = maxIntFrag;
+
+	memset(&mGlobalAllocIdStats, 0, sizeof(PxAllocIdStats)*PxAllocId::NUM_IDS);
+
+#if DUMP_HEAP_USAGE_TO_FILE
+	char fileName[1024];
+	sprintf_s(fileName, 1024, "HeapLog_%p.txt", this);
+	fopen_s(&mLogFile, fileName, "w");
+	fprintf(mLogFile, "HeapSize: %d, BlockSize: %d Addr: 0x0\n", baseSize, minBlockSize);
+	QueryPerformanceCounter((LARGE_INTEGER*)&m_qpc);
+	QueryPerformanceFrequency((LARGE_INTEGER*)&m_qpf);
+#endif
+
+	// init heap
+	if (baseSize)
+	{
+		// the initial heap is persistent
+		return allocateNewHeap(baseSize, true) != NULL;
+	}
+	else
+	{
+		return true;
+	}
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Sets the page size used for pages allocated from now on. The value must be a
+// non-zero power of two that is not smaller than the minimum block size;
+// otherwise nothing changes and false is returned.
+bool HeapManagerRef::setPageSize(size_t pageSize)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	const bool rejected = (pageSize == 0)
+		|| (mMinBlockSize > pageSize)
+		|| (pageSize != findNextPow2(pageSize, 0, BITSPERWORD));
+	if (!rejected)
+	{
+		mPageSize = pageSize;
+	}
+	return !rejected;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Allocates `size` bytes: tries each existing heap in list order and, if none
+// can serve the request, allocates new pages and retries there.
+// Returns NULL when no heap could provide the block.
+void* HeapManagerRef::alloc(const size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+#if DUMP_HEAP_USAGE_TO_FILE
+	unsigned __int64 qpc;
+	QueryPerformanceCounter((LARGE_INTEGER*)&qpc);
+	float dtime = (float)((double)(qpc - m_qpc) / (double)m_qpf);
+	fprintf(mLogFile, "alloc t: %f s: %d", dtime, size);
+#endif
+
+	void* ret = NULL;
+	// try to allocate it in one of the heaps/pages
+	// (the loop stops at the first heap that returns a block)
+	for (uint32_t i = 0; !ret && i < mHeaps.size(); i++)
+	{
+		ret = mHeaps[i].heap->alloc(size, PX_ALLOC_INFO_PARAMS_INPUT());
+	}
+
+	// create a new page
+	if (!ret)
+	{
+		Heap* heap = allocateNewPages(size);
+		if (heap)
+		{
+			ret = heap->alloc(size, PX_ALLOC_INFO_PARAMS_INPUT());
+		}
+	}
+
+#if DUMP_HEAP_USAGE_TO_FILE
+	fprintf(mLogFile, " a: 0x%p\n", ret);
+#endif
+
+	return ret;
+}
+
+//(10/20/2009 feodorb) TODO: decide whether we move the binary search
+//somewhere away from here.
+// Binary search over a sorted array: returns the index of the first element
+// that compares greater than refValue (refArray.size() if there is none).
+// Only operator< of T is used.
+template<typename T>
+static uint32_t findUpperBound(const physx::shdfnd::Array<T>& refArray, const T& refValue)
+{
+	uint32_t lo = 0;
+	uint32_t hi = refArray.size();
+	while (lo < hi)
+	{
+		const uint32_t mid = lo + ((hi - lo) >> 1);
+		if (refValue < refArray[mid])
+		{
+			// answer is mid or something to its left
+			hi = mid;
+		}
+		else
+		{
+			// refArray[mid] <= refValue: answer lies strictly to the right
+			lo = mid + 1;
+		}
+	}
+	return lo;
+}
+
+// Returns the heap of the last page whose base address is not greater than addr,
+// or 0 if addr lies below every page. mHeaps must be sorted for the binary search.
+// NOTE(review): whether addr really falls inside that page is only checked by the
+// assert, so with asserts compiled out an address past the page's end still yields that page.
+Heap* HeapManagerRef::findHeap(void* addr) const
+{
+	// dummy page that carries only the address being searched for
+	HeapManagerPage searchPage;
+	searchPage.baseAddr = reinterpret_cast<size_t>(addr);
+
+	uint32_t upperBound = findUpperBound(mHeaps, searchPage);
+	PX_ASSERT(upperBound == 0 ||
+	          (searchPage.baseAddr >= mHeaps[upperBound - 1].baseAddr &&
+	           searchPage.baseAddr < mHeaps[upperBound - 1].baseAddr + mHeaps[upperBound - 1].heap->getTotalMemorySize())
+	         );
+
+	// the candidate page is the one just before the upper bound
+	return (upperBound > 0) ? mHeaps[upperBound - 1].heap : 0;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Asks the heap that owns addr to resize the block in place.
+// Returns false if no heap is found for addr or if the heap refuses the resize.
+// After a successful resize to a non-zero size, pending empty pages are trimmed.
+bool HeapManagerRef::realloc(void* addr, const size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+#if DUMP_HEAP_USAGE_TO_FILE
+	unsigned __int64 qpc;
+	QueryPerformanceCounter((LARGE_INTEGER*)&qpc);
+	float dtime = (float)((double)(qpc - m_qpc) / (double)m_qpf);
+	fprintf(mLogFile, "realloc t: %f s: %d, a: 0x%p\n", dtime, size, addr);
+#endif
+
+	Heap* owner = findHeap(addr);
+	if (owner == 0)
+	{
+		return false;
+	}
+
+	const bool resized = owner->realloc(addr, size, PX_ALLOC_INFO_PARAMS_INPUT());
+	if (resized && size > 0 && mNewEmptyPage)
+	{
+		shrinkMemory();
+	}
+	return resized;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Releases the block at addr through the heap that owns it.
+// Returns false for a NULL address, when no heap is found for addr, or when
+// the heap rejects the free. A successful free trims pending empty pages.
+bool HeapManagerRef::free(void* addr)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+#if DUMP_HEAP_USAGE_TO_FILE
+	unsigned __int64 qpc;
+	QueryPerformanceCounter((LARGE_INTEGER*)&qpc);
+	float dtime = (float)((double)(qpc - m_qpc) / (double)m_qpf);
+	fprintf(mLogFile, "free t: %f a: 0x%p\n", dtime, addr);
+#endif
+
+	if (addr == NULL)
+	{
+		return false;
+	}
+
+	Heap* owner = findHeap(addr);
+	if (owner == 0)
+	{
+		return false;
+	}
+
+	const bool released = owner->free(addr);
+	if (released && mNewEmptyPage)
+	{
+		shrinkMemory();
+	}
+	return released;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Folds the stats of one heap (src) into an accumulated total (dst).
+// Only the groups selected by flags are touched: sizes and histograms are
+// summed, biggestFreeBlock and numEntries take the maximum.
+static void addStats(ApexHeapStats& dst, const ApexHeapStats& src, uint32_t flags)
+{
+	if (flags & HeapStatsFlags::F_BASIC_STATS)
+	{
+		dst.heapSize += src.heapSize;
+	}
+	if (flags & HeapStatsFlags::F_BIGGEST_FREE_BLOCK)
+	{
+		dst.biggestFreeBlock = PxMax(dst.biggestFreeBlock, src.biggestFreeBlock);
+	}
+	if (flags & HeapStatsFlags::F_HISTOGRAM)
+	{
+		dst.numEntries = PxMax(dst.numEntries, src.numEntries);
+		for (uint32_t i = 0; i < BITSPERWORD; i++)
+		{
+			dst.freeBuddyHistogram[i] += src.freeBuddyHistogram[i];
+			dst.allocatedBuddyHistogram[i] += src.allocatedBuddyHistogram[i];
+		}
+	}
+}
+
+// Records one new allocation of `size` bytes under allocation ID `id`, adds
+// `fragmentation` to the global fragmentation counter and updates all
+// high-water marks.
+PX_INLINE void HeapManagerRef::addToStats(PxAllocId::Enum id, const size_t size, const size_t fragmentation)
+{
+	PxAllocIdStats& idStats = mGlobalAllocIdStats[id];
+	idStats.elements++;
+	idStats.size += size;
+	idStats.maxElements = PxMax(idStats.maxElements, idStats.elements);
+	idStats.maxSize = PxMax(idStats.maxSize, idStats.size);
+	mGlobalAllocMem += size;
+	mGlobalMaxAllocMem = PxMax(mGlobalMaxAllocMem, mGlobalAllocMem);
+	mGlobalInternalFragmentation += fragmentation;
+	mGlobalMaxInternalFragmentation = PxMax(mGlobalMaxInternalFragmentation, mGlobalInternalFragmentation);
+}
+
+
+// Reverses addToStats() for one released allocation: decrements the element
+// count and the byte counters. High-water marks are left untouched.
+PX_INLINE void HeapManagerRef::removeFromStats(PxAllocId::Enum id, const size_t size, const size_t fragmentation)
+{
+	PxAllocIdStats& idStats = mGlobalAllocIdStats[id];
+	PX_ASSERT(idStats.elements);
+	PX_ASSERT(idStats.size >= size);
+
+	idStats.elements--;
+	idStats.size -= size;
+	mGlobalAllocMem -= size;
+	mGlobalInternalFragmentation -= fragmentation;
+}
+
+// Grows the byte counters of allocation ID `id` by `change` without changing
+// its element count, adds `fragmentation` to the global fragmentation counter
+// and updates the high-water marks.
+PX_INLINE void HeapManagerRef::incStats(PxAllocId::Enum id, const size_t change, const size_t fragmentation)
+{
+	PxAllocIdStats& idStats = mGlobalAllocIdStats[id];
+	idStats.size += change;
+	idStats.maxSize = PxMax(idStats.maxSize, idStats.size);
+	mGlobalAllocMem += change;
+	mGlobalMaxAllocMem = PxMax(mGlobalMaxAllocMem, mGlobalAllocMem);
+	mGlobalInternalFragmentation += fragmentation;
+	mGlobalMaxInternalFragmentation = PxMax(mGlobalMaxInternalFragmentation, mGlobalInternalFragmentation);
+}
+
+// Shrinks the byte counters of allocation ID `id` by `change` without changing
+// its element count. The global allocated total goes down by the same amount
+// as the per-ID size. High-water marks are left untouched.
+PX_INLINE void HeapManagerRef::decStats(PxAllocId::Enum id, const size_t change, const size_t fragmentation)
+{
+	PxAllocIdStats& idStats = mGlobalAllocIdStats[id];
+	PX_ASSERT(idStats.size >= change);
+	PX_ASSERT(mGlobalAllocMem >= change);
+	idStats.size -= change;
+	mGlobalAllocMem -= change;
+	// NOTE(review): fragmentation is still added here, as before; presumably the
+	// caller passes the growth in internal fragmentation of the shrunk block - confirm.
+	mGlobalInternalFragmentation += fragmentation;
+}
+
+// Fills `stats` with the groups selected by `flags` (HeapStatsFlags bits).
+// Per-heap values are accumulated over all pages; allocation totals,
+// fragmentation and per-ID stats come from the manager's global counters.
+// Always returns true.
+bool HeapManagerRef::getStats(ApexHeapStats& stats, const uint32_t flags)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	ApexHeapStats tmpStats;
+	stats.reset();
+	for (uint32_t i = 0; i < mHeaps.size(); i++)
+	{
+		mHeaps[i].heap->getStats(tmpStats, flags);
+		addStats(stats, tmpStats, flags);
+	}
+	if (flags & HeapStatsFlags::F_BASIC_STATS)
+	{
+		stats.totalAllocated = mGlobalAllocMem;
+		stats.maxAllocated = mGlobalMaxAllocMem;
+
+	}
+	if (flags & HeapStatsFlags::F_INTERNAL_FRAGMENTATION)
+	{
+		stats.internalFragmentation = mGlobalInternalFragmentation;
+		stats.maxInternalFragmentation = mGlobalMaxInternalFragmentation;
+	}
+	if (flags & HeapStatsFlags::F_ALLOC_ID_STATS)
+	{
+		// stats per allocation ID
+		PxMemCopy(stats.allocIdStats, mGlobalAllocIdStats, sizeof(PxAllocIdStats)*PxAllocId::NUM_IDS);
+	}
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Writes a visualization of all heaps into `array`: each heap gets a slice
+// proportional to its share of the total heap size and fills that slice itself.
+// Always returns true.
+bool HeapManagerRef::visualizeMemory(uint8_t* array, const size_t arraySize)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	// NOTE(review): getStats() locks mMutex again while it is already held here;
+	// this relies on the mutex being re-entrant - confirm.
+	ApexHeapStats tmpStats;
+	getStats(tmpStats, HeapStatsFlags::F_BASIC_STATS);
+	// array entries per byte of heap
+	float scale = float(arraySize) / float(tmpStats.heapSize);
+	uint8_t* start = array;
+	for (uint32_t i = 0; i < mHeaps.size(); i++)
+	{
+		size_t heapSize = mHeaps[i].heap->getTotalMemorySize();
+		size_t numVis = size_t(float(heapSize) * scale);
+		PX_ASSERT(start + numVis <= array + arraySize);
+		mHeaps[i].heap->visualizeMemory(start, numVis);
+		start += numVis;
+	}
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Returns the base address reported by the heap that findHeap() selects for
+// addr, or NULL when no heap is found.
+void* HeapManagerRef::findBaseAddress(void* addr)
+{
+	Heap* owner = findHeap(addr);
+	return owner ? owner->getBaseAddress() : NULL;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Allocates heapSize bytes from the memory allocator, wraps them in a new Heap
+// and registers the page in the sorted page list.
+// isPersistent is stored on the page entry.
+// Returns the new heap, or NULL if there is no allocator, the memory could not
+// be obtained, or the Heap could not be created or initialised.
+Heap* HeapManagerRef::allocateNewHeap(size_t heapSize, bool isPersistent)
+{
+	if (!mMemAllocator)
+	{
+		return NULL;
+	}
+
+	void* newPage = mMemAllocator->alloc(heapSize);
+	if (newPage)
+	{
+		HeapManagerPage page;
+		page.baseAddr = reinterpret_cast<size_t>(newPage);
+		page.heap = PX_NEW(Heap)(*this, mErrorCallback);
+		page.isPersistent = isPersistent;
+		if (page.heap && page.heap->init(page.baseAddr, heapSize, mMinBlockSize, mMaxIntFrag))
+		{
+			mHeaps.pushBack(page);
+			// keep the page list sorted; findHeap() binary-searches it
+			shdfnd::sort(mHeaps.begin(), (uint32_t) mHeaps.size());
+			return page.heap;
+		}
+		else
+		{
+			// Release with the size that was requested: page.heap may be NULL here,
+			// and a heap whose init() failed need not report a valid size.
+			mMemAllocator->free(newPage, heapSize);
+			CMM_DELETE_SINGLE(page.heap);
+		}
+	}
+	return NULL;
+}
+
+// Allocates a new heap big enough for requestedSize: the size is rounded up to
+// a power of two that is at least the page size.
+// Returns the new heap or NULL on failure.
+Heap* HeapManagerRef::allocateNewPages(size_t requestedSize)
+{
+	uint8_t pageSizeLog2;
+	uint8_t minBlockSizeLog2;
+	// these two calls are made only for the exponents they write
+	findNextPow2(minBlockSizeLog2, mMinBlockSize, 0, BITSPERWORD);
+	findNextPow2(pageSizeLog2, mPageSize, minBlockSizeLog2, BITSPERWORD);
+	const size_t allocSize = findNextPow2(requestedSize, pageSizeLog2, BITSPERWORD);
+	return allocateNewHeap(allocSize);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Compacts mHeaps in place by dropping every entry whose heap pointer is NULL,
+// preserving the order of the remaining entries.
+// numDeletes must equal the number of NULL entries; it determines the new size.
+void HeapManagerRef::removeDeletedHeapsFromList(uint32_t numDeletes)
+{
+	// remove pages from list, keeping it sorted.
+	if (numDeletes)
+	{
+		const uint32_t numEntries = (uint32_t) mHeaps.size();
+
+		//seek
+		// w = write cursor, stops on the first hole
+		uint32_t w = 0;
+		while (w < (numEntries) && mHeaps[w].heap != NULL)
+		{
+			w++;
+		}
+
+		// remove holes
+		// r = read cursor; live entries are shifted down onto the write cursor
+		uint32_t r = w + 1;
+		while (r < numEntries)
+		{
+			if (mHeaps[r].heap == NULL)
+			{
+				r++;
+			}
+			else
+			{
+				mHeaps[w++] = mHeaps[r++];
+			}
+		}
+
+		mHeaps.resize(numEntries - numDeletes);
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Replaces the (empty) heap of a page with a freshly initialised one covering
+// the same memory. If the replacement cannot be created or initialised, the
+// page keeps its current heap.
+void HeapManagerRef::resetHeap(HeapManagerPage& page)
+{
+	PX_ASSERT(page.heap->getAllocatedMemorySize() == 0);
+
+	Heap* fresh = PX_NEW(Heap)(*this, mErrorCallback);
+	if (!fresh)
+	{
+		return;
+	}
+
+	if (!fresh->init(page.baseAddr, page.heap->getTotalMemorySize(), mMinBlockSize, mMaxIntFrag))
+	{
+		CMM_DELETE_SINGLE(fresh);
+		return;
+	}
+
+	// swap in the new heap only once it is fully set up
+	CMM_DELETE_SINGLE(page.heap);
+	page.heap = fresh;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Releases every non-persistent page that has no allocated memory and removes
+// it from the page list. Empty persistent pages keep their memory and only get
+// their heap reset.
+void HeapManagerRef::freeEmptyPages()
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	// release heaps
+	uint32_t numDeletes = 0;
+	const uint32_t numEntries = (uint32_t) mHeaps.size();
+	for (uint32_t i = 0; i < numEntries; i++)
+	{
+		HeapManagerPage& page = mHeaps[i];
+		PX_ASSERT(page.heap);
+		if (page.isPersistent)
+		{
+			// for persistent pages: reset without release.
+			if (page.heap->getAllocatedMemorySize() == 0)
+			{
+				resetHeap(page);
+			}
+		}
+		else if (page.heap->getAllocatedMemorySize() == 0)
+		{
+			mMemAllocator->free(reinterpret_cast<void*>(page.baseAddr), page.heap->getTotalMemorySize());
+			// leaves page.heap == NULL, which marks the entry for removal below
+			CMM_DELETE_SINGLE(page.heap);
+			numDeletes++;
+		}
+	}
+
+	if (numDeletes)
+	{
+		removeDeletedHeapsFromList(numDeletes);
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Trims memory when at most half of the total page memory is allocated.
+// Empty non-persistent pages are released, walking the page list from the back,
+// as long as they fit into the release budget; the other empty pages are reset.
+// Clears the mNewEmptyPage flag.
+void HeapManagerRef::shrinkMemory()
+{
+	mNewEmptyPage = false;
+
+	// collect stats
+	size_t totalAllocated = 0;
+	size_t totalPageSize = 0;
+	const uint32_t numEntries = (uint32_t) mHeaps.size();
+	for (uint32_t i = 0; i < numEntries; i++)
+	{
+		HeapManagerPage& page = mHeaps[i];
+		totalAllocated += page.heap->getAllocatedMemorySize();
+		totalPageSize += page.heap->getTotalMemorySize();
+		PX_ASSERT(totalAllocated <= totalPageSize);
+	}
+
+	// shrink memory if free non-persistent space is half or more of the allocated pages.
+	// releasing from the back of address sorted list, other strategies like LRU, best fit are also possible.
+	if (totalPageSize)
+	{
+		float allocScale = float(totalAllocated) / float(totalPageSize);
+		if (allocScale <= 0.5f)
+		{
+			// budget: half of the free space, or everything if nothing is allocated
+			size_t sizeToRelease = totalAllocated ? (totalPageSize - totalAllocated) >> 1 : totalPageSize;
+			uint32_t numDeletes = 0;
+			for (uint32_t i = 0; i < numEntries; i++)
+			{
+				HeapManagerPage& page = mHeaps[numEntries - i - 1];
+				PX_ASSERT(page.heap);
+				if (page.heap->getAllocatedMemorySize() == 0)
+				{
+					if (!page.isPersistent && page.heap->getTotalMemorySize() <= sizeToRelease)
+					{
+						mMemAllocator->free(reinterpret_cast<void*>(page.baseAddr), page.heap->getTotalMemorySize());
+						sizeToRelease -= page.heap->getTotalMemorySize();
+						// leaves page.heap == NULL, which marks the entry for removal below
+						CMM_DELETE_SINGLE(page.heap);
+						numDeletes++;
+					}
+					else
+					{
+						resetHeap(page);
+					}
+				}
+			}
+
+			if (numDeletes)
+			{
+				removeDeletedHeapsFromList(numDeletes);
+			}
+		}
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Makes sure the summed free memory of all heaps is at least `size`.
+// Allocates new pages for the shortfall if necessary; returns false only when
+// that allocation fails.
+bool HeapManagerRef::reserve(size_t size)
+{
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+
+	size_t available = 0;
+	for (uint32_t page = 0; page < mHeaps.size(); page++)
+	{
+		Heap* h = mHeaps[page].heap;
+		available += h->getTotalMemorySize() - h->getAllocatedMemorySize();
+	}
+
+	if (available >= size)
+	{
+		return true;
+	}
+	return allocateNewPages(size - available) != NULL;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Returns the smallest power of two 2^i with min <= i <= max that is >= val.
+// If no exponent in the range satisfies that, the largest power tried is
+// returned; if the range is empty (min > max), val itself is returned.
+// Exponents are capped at the highest bit of size_t, because shifting by the
+// full word width is undefined.
+PX_INLINE size_t HeapManagerRef::findNextPow2(const size_t val, const uint8_t min, const uint8_t max)
+{
+	const uint8_t topBit = uint8_t(sizeof(size_t) * 8 - 1);
+	const uint8_t last = (max < topBit) ? max : topBit;
+
+	size_t ret = val;
+	for (uint8_t i = min; i <= last; i++)
+	{
+		ret = size_t(1) << i;
+		if (ret >= val)
+		{
+			break;
+		}
+	}
+	return ret;
+}
+
+
+// Same search as the three-argument overload, but also reports the exponent:
+// on success `pow` is the exponent of the returned power of two. If no
+// exponent in [min, max] yields a value >= val, `pow` ends one past the last
+// exponent tried and the last power tried (or val for an empty range) is returned.
+// Exponents are capped at the highest bit of size_t, because shifting by the
+// full word width is undefined.
+PX_INLINE size_t HeapManagerRef::findNextPow2(uint8_t& pow, const size_t val, const uint8_t min, const uint8_t max)
+{
+	const uint8_t topBit = uint8_t(sizeof(size_t) * 8 - 1);
+	const uint8_t last = (max < topBit) ? max : topBit;
+
+	size_t ret = val;
+	for (pow = min; pow <= last; pow++)
+	{
+		ret = size_t(1) << pow;
+		if (ret >= val)
+		{
+			break;
+		}
+	}
+	return ret;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Empties the list, handing every buddy back to the owning heap's buddy pool.
+void BuddyList::clear()
+{
+	for (Buddy* b = pop(); b != NULL; b = pop())
+	{
+		heap->getBuddyPool().destroy(b);
+	}
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Releases the per-level free lists, reports every buddy that is still
+// allocated as a memory leak through the error callback, then clears the
+// allocated list.
+Heap::~Heap()
+{
+	// init() releases the per-level array on its failure path, so it can be NULL
+	// here; indexing it unconditionally would dereference a null pointer.
+	// NOTE(review): assumes the constructor starts mFreeBuddiesAtLevel at NULL - confirm.
+	if (mFreeBuddiesAtLevel)
+	{
+		for (uint32_t i = 0; i <= mMaxLevel; i++)
+		{
+			mFreeBuddiesAtLevel[i].clear();
+		}
+		CMM_DELETE_ARRAY(mFreeBuddiesAtLevel);
+	}
+
+	for (Buddy* buddy = mAllocatedBuddies.getHead(); buddy != NULL; buddy = buddy->next)
+	{
+		// buddy->addr is in units of minimum blocks relative to the heap base
+		void* address = reinterpret_cast<void*>((buddy->addr << mMinBlockLog2) + mBaseAddr);
+#if KEEP_DEBUG_INFO
+		char buffer[256];
+		physx::shdfnd::snprintf(buffer, 256, "Memory leak!\naddress %p file %s, line %d, name %s",  address, buddy->file, buddy->line, buddy->allocName);
+		mErrorCallback.reportError(PxErrorCode::eDEBUG_WARNING, buffer, __FILE__, __LINE__);
+#else
+		char buffer[256];
+		physx::shdfnd::snprintf(buffer, 256, "Memory leak at address %p", address);
+		mErrorCallback.reportError(PxErrorCode::eDEBUG_WARNING, buffer, __FILE__, __LINE__);
+#endif
+	}
+
+	//clear it anyway
+	mAllocatedBuddies.clear();
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Sets the heap up over the memory range [baseAddr, baseAddr + baseSize).
+//   minBlockSize  must be a power of two
+//   baseSize      must be a power-of-two multiple of minBlockSize (at least one block)
+//   maxIntFrag    either size_t(-1) or a power of two
+// Creates one free-buddy list per level and inserts a single free buddy at the
+// top level. Returns false if a parameter is rejected, baseAddr is 0, or an
+// allocation fails.
+bool Heap::init(size_t baseAddr, const size_t baseSize, const size_t minBlockSize, const size_t maxIntFrag)
+{
+	// also stores log2(minBlockSize) in mMinBlockLog2
+	if (minBlockSize != HeapManagerRef::findNextPow2(mMinBlockLog2, minBlockSize, 0, BITSPERWORD))
+	{
+		return false;
+	}
+
+	if ((maxIntFrag != size_t(-1)) && (maxIntFrag != HeapManagerRef::findNextPow2(maxIntFrag, 0, BITSPERWORD)))
+	{
+		return false;
+	}
+
+	mMaxIntFrag = maxIntFrag;
+
+	mMinBlockSize = minBlockSize;
+	mTotalSize = baseSize;
+	mBaseAddr = baseAddr;
+
+	if (mBaseAddr == 0)
+	{
+		return false;
+	}
+
+	size_t numBlocks = baseSize >> mMinBlockLog2;
+	// allow only memory blocks which have a power of 2 in size. and numblocks must be at least 1.
+	// (mMaxLevel receives log2(numBlocks); the exponent search is limited to half the word width)
+	if (numBlocks != HeapManagerRef::findNextPow2(mMaxLevel, numBlocks, 0, sizeof(size_t) * 4))
+	{
+		return false;
+	}
+
+	mFreeBuddiesAtLevel = PX_NEW(BuddyList)[(unsigned int)(mMaxLevel + 1)];
+	if (!mFreeBuddiesAtLevel)
+	{
+		return false;
+	}
+
+	// init size of buddy arrays
+	for (uint32_t i = 0; i <= mMaxLevel; i++)
+	{
+		mFreeBuddiesAtLevel[i].buddySize = size_t(1) << i;
+		mFreeBuddiesAtLevel[i].heap = this;
+	}
+	mAllocatedBuddies.heap = this;
+
+	Buddy* b = mManager.getBuddyPool().construct();
+	if (!b)
+	{
+		// leaves mFreeBuddiesAtLevel == NULL
+		CMM_DELETE_ARRAY(mFreeBuddiesAtLevel);
+		return false;
+	}
+	b->level = mMaxLevel;
+
+	// add buddy to its array
+	mFreeBuddiesAtLevel[mMaxLevel].insert(b);
+
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Allocates `size` bytes from this heap and returns the address, or NULL when
+// size is 0, exceeds the heap, or no suitable buddy can be produced.
+//
+// Strategy:
+//   1. round size up to a power of two that is at least the minimum block size -> level
+//   2. take a free buddy at that level if there is one
+//   3. otherwise split a larger free buddy, and if that fails merge smaller ones
+//   4. if internal-fragmentation reduction is enabled (mMaxIntFrag != size_t(-1)),
+//      trim the buddy via reduceIntFragment(); the allocation may then span
+//      several buddies
+// The allocation info parameters are stored in the buddy (file/line/name only
+// when KEEP_DEBUG_INFO is set) and forwarded to the manager statistics.
+void* Heap::alloc(const size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	PX_UNUSED(allocId);
+	PX_UNUSED(allocName);
+	PX_UNUSED(line);
+	PX_UNUSED(file);
+
+	if (size == 0 || size > mTotalSize)
+	{
+		return NULL;
+	}
+
+	PX_ASSERT(allocId < PxAllocId::NUM_IDS);
+	//PX_ASSERT(allocId != PxAllocId::UNASSIGNED);	// enable to track unassigned memory
+
+	// compute needed buddysize -> level
+	uint8_t level = 0;
+	HeapManagerRef::findNextPow2(level, size, mMinBlockLog2, BITSPERWORD);
+	level = uint8_t(level - mMinBlockLog2);
+
+	// mTotalSize is the unrounded baseSize passed to init(), so a request can
+	// pass the size check above and still round up past the top level. There
+	// is no free list for such a level; indexing it would read out of bounds.
+	if (level > mMaxLevel)
+	{
+		return NULL;
+	}
+
+	Buddy* ret = NULL;
+	if (mFreeBuddiesAtLevel[level].getSize() > 0)
+	{
+		ret = mFreeBuddiesAtLevel[level].pop();
+	}
+	else
+	{
+		// prefer splitting
+		if (level != mMaxLevel)
+		{
+			ret = findBySplitting(level);
+		}
+		// else try merging
+		if (!ret && level != 0)
+		{
+			ret = findByMerging(level);
+		}
+	}
+
+	if (ret)
+	{
+		ret->occupiedSize = size;
+		// Remember the block index now: reduceIntFragment() may move `ret`
+		// to a different address or hand it to a free list.
+		size_t addr = ret->addr;
+		ret->allocId = uint16_t(allocId);
+#if KEEP_DEBUG_INFO
+		ret->file = file;
+		ret->line = (uint32_t)line;
+		ret->allocName = allocName;
+#endif
+
+		size_t allocSize;
+		if (mMaxIntFrag != size_t(-1))
+		{
+			// Inserts the resulting buddies into mAllocatedBuddies itself.
+			allocSize = reduceIntFragment(*ret, mMaxIntFrag);
+		}
+		else
+		{
+			allocSize = size_t(1) << (level + mMinBlockLog2);
+			mAllocatedBuddies.insertSorted(ret);
+		}
+		mAllocMem += allocSize;
+		mInternalFragmentation += allocSize - size;
+		mMaxAllocMem = PxMax(mAllocMem, mMaxAllocMem);
+		mMaxInternalFragmentation = PxMax(mInternalFragmentation, mMaxInternalFragmentation);
+		mManager.addToStats(allocId, allocSize, allocSize - size);
+
+		PX_ASSERT(sanityTest());
+		return reinterpret_cast<void*>((addr << mMinBlockLog2) + mBaseAddr);
+	}
+	else
+	{
+		// heap full or too fragmented
+		PX_ASSERT(sanityTest());
+		return 	NULL;
+	}
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Resizes the allocation at `addr` in place; the address never changes.
+// Returns false when addr is NULL, size is 0 or larger than the heap, addr is
+// not the start of an allocation of this heap, or growing would need more
+// room than the slack left in the allocation's last buddy.
+// Shrinking releases trailing buddies of a multi-buddy allocation to the free
+// lists and adjusts the fragmentation statistics.
+bool Heap::realloc(void* addr, const size_t size, PX_ALLOC_INFO_PARAMS_DEF())
+{
+	PX_UNUSED(allocId);
+	PX_UNUSED(allocName);
+	PX_UNUSED(line);
+	PX_UNUSED(file);
+
+	if (addr == NULL || size > mTotalSize)
+	{
+		return false;
+	}
+
+	if (size == 0)
+	{
+		//realloc to 0 bytes can't keep the pointer as it was.
+		return false;
+		//return free(addr);
+	}
+
+	// Convert the pointer into a block index relative to the heap base.
+	size_t inernal_addr = reinterpret_cast<size_t>(addr) - mBaseAddr;
+	inernal_addr >>= mMinBlockLog2;
+
+	// collect all buddies which are associated with this addr
+	// (an allocation is a run of adjacent buddies ending at the one flagged isLastBuddy)
+	shdfnd::Array<Buddy*, shdfnd::TempAllocator> budyList;
+	size_t totalAllocated = 0;	// sum of occupied bytes over the run
+	size_t buddyAllocated = 0;	// sum of buddy sizes in bytes over the run
+	Buddy* found = NULL;
+	do
+	{
+		found = mAllocatedBuddies.find(inernal_addr);
+		if (!found)
+		{
+			return false;
+		}
+		budyList.pushBack(found);
+		inernal_addr += size_t(1) << found->level;
+		totalAllocated += found->occupiedSize;
+		buddyAllocated += size_t(1) << (found->level + mMinBlockLog2);
+	}
+	while (found && !found->isLastBuddy);
+
+	// Start from the last buddy of the run.
+	Buddy* cur = budyList.popBack();
+
+	// increase size
+	if (totalAllocated < size)
+	{
+		// Growing is only possible within the unused tail of the last buddy.
+		size_t leftSpace = (size_t(1) << (cur->level + mMinBlockLog2)) - cur->occupiedSize;
+		size_t neededSpace = size - totalAllocated;
+		if (neededSpace <= leftSpace)
+		{
+			cur->occupiedSize += neededSpace;
+#if KEEP_DEBUG_INFO
+			cur->file = file;
+			cur->line = (uint32_t)line;
+			cur->allocName = allocName;
+#endif
+
+			// The newly used bytes no longer count as internal fragmentation.
+			mInternalFragmentation -= neededSpace;
+			mManager.decStats(PxAllocId::Enum(cur->allocId), 0, neededSpace);
+
+			// replace
+			mAllocatedBuddies.remove(cur);
+			mAllocatedBuddies.insertSorted(cur);
+			PX_ASSERT(sanityTest());
+			return true;
+		}
+		else
+		{
+			return false;
+			// NOTE(review): everything below follows an unconditional return and
+			// is additionally compiled out unless UNREACHABLE is defined.
+#ifdef UNREACHABLE
+			// TODO:try merge free buddies until big enough,
+			// then add buddy and do internal fragmentation reduction.
+
+			// search for free blocks next to this one.
+			size_t addr = cur->addr + (size_t(1) << cur->level);
+			if (!mAllocatedBuddies.find(addr))
+			{
+				return false;
+			}
+
+			// if not found, return null, let user reallocate
+			PX_ASSERT(sanityTest());
+			return false;
+#endif
+		}
+	}
+	// reduce size
+	else
+	{
+		// successively remove buddies until the requested size is reached.
+		// if internal fragmentation reduction is turned on, then an allocation can consist of multiple buddies.
+		// First take the current last buddy's slack out of the statistics; the
+		// slack of whichever buddy ends up last is added back at the end.
+		mInternalFragmentation -= (size_t(1) << (cur->level + mMinBlockLog2)) - cur->occupiedSize;
+		mManager.decStats(PxAllocId::Enum(cur->allocId), 0, (size_t(1) << (cur->level + mMinBlockLog2)) - cur->occupiedSize);
+		size_t diff = totalAllocated - size;	// bytes still to give back
+		while (diff >= cur->occupiedSize)
+		{
+			// This buddy is no longer needed at all: move it to its free list.
+			diff -= cur->occupiedSize;
+			cur->occupiedSize = 0;
+			bool succ = mAllocatedBuddies.remove(cur);
+			PX_UNUSED(succ);
+			PX_ASSERT(succ);
+			mFreeBuddiesAtLevel[cur->level].insertSorted(cur);
+			size_t allocSize = size_t(1) << (cur->level + mMinBlockLog2);
+			mAllocMem -= allocSize;
+			mManager.decStats(PxAllocId::Enum(cur->allocId), allocSize, 0);
+			cur = budyList.popBack();
+		}
+		// `cur` is now the new tail of the allocation.
+		cur->isLastBuddy = true;
+		cur->occupiedSize -= diff;
+
+#if KEEP_DEBUG_INFO
+		cur->file = file;
+		cur->line =(uint32_t)line;
+		cur->allocName = allocName;
+#endif
+
+		// replace
+		bool succ = mAllocatedBuddies.remove(cur);
+		PX_UNUSED(succ);
+		PX_ASSERT(succ);
+		mAllocatedBuddies.insertSorted(cur);
+		mInternalFragmentation += (size_t(1) << (cur->level + mMinBlockLog2)) - cur->occupiedSize;
+		mManager.incStats(PxAllocId::Enum(cur->allocId), 0, (size_t(1) << (cur->level + mMinBlockLog2)) - cur->occupiedSize);
+		PX_ASSERT(sanityTest());
+		return true;
+	}
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Releases the allocation starting at `addr`. Every buddy of the allocation is
+// moved from the allocated list to the free list of its level and the
+// statistics are updated. Returns false for NULL or when a buddy of the
+// allocation cannot be found. Notifies the manager once the heap is empty.
+bool Heap::free(void* addr)
+{
+	if (!addr)
+	{
+		return false;
+	}
+
+	// Block index of the first buddy, relative to the heap base.
+	size_t blockIndex = (reinterpret_cast<size_t>(addr) - mBaseAddr) >> mMinBlockLog2;
+
+	for (;;)
+	{
+		Buddy* released = mAllocatedBuddies.findAndPop(blockIndex);
+		if (released == NULL)
+		{
+			return false;
+		}
+
+		const size_t blockBytes = size_t(1) << (released->level + mMinBlockLog2);
+		const size_t slackBytes = blockBytes - released->occupiedSize;
+		mAllocMem -= blockBytes;
+		mInternalFragmentation -= slackBytes;
+		mManager.removeFromStats(PxAllocId::Enum(released->allocId), blockBytes, slackBytes);
+		released->occupiedSize = 0;
+
+		mFreeBuddiesAtLevel[released->level].insertSorted(released);
+
+		// An allocation may continue in the buddy right behind this one.
+		if (released->isLastBuddy)
+		{
+			break;
+		}
+		blockIndex += size_t(1) << released->level;
+	}
+
+	if (mAllocMem == 0)
+	{
+		mManager.notifyEmptyPage();
+	}
+
+	PX_ASSERT(sanityTest());
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Fills the parts of `stats` selected by `flags` (HeapStatsFlags bits):
+//   F_BASIC_STATS            - heap size, current and peak allocated bytes
+//   F_INTERNAL_FRAGMENTATION - current and peak internal fragmentation
+//   F_BIGGEST_FREE_BLOCK     - size in bytes of the largest free buddy, 0 if none
+//   F_HISTOGRAM              - free/allocated buddy counts per level
+// Always returns true.
+bool Heap::getStats(ApexHeapStats& stats, const uint32_t flags)
+{
+	if (flags & HeapStatsFlags::F_BASIC_STATS)
+	{
+		stats.heapSize = mTotalSize;
+		stats.totalAllocated = mAllocMem;
+		stats.maxAllocated = mMaxAllocMem;
+	}
+	if (flags & HeapStatsFlags::F_INTERNAL_FRAGMENTATION)
+	{
+		// internal fragmentation
+		stats.internalFragmentation = mInternalFragmentation;
+		stats.maxInternalFragmentation = mMaxInternalFragmentation;
+	}
+	if (flags & HeapStatsFlags::F_BIGGEST_FREE_BLOCK)
+	{
+		// biggest free block
+		stats.biggestFreeBlock = 0;
+		// Walk from the top level down to and including level 0. A wider
+		// counter is used so the loop terminates cleanly at 0 instead of
+		// wrapping around and skipping (or running past) the lowest level.
+		for (uint32_t level = uint32_t(mMaxLevel) + 1; level-- > 0;)
+		{
+			if (mFreeBuddiesAtLevel[level].getSize())
+			{
+				stats.biggestFreeBlock = mFreeBuddiesAtLevel[level].buddySize << mMinBlockLog2;
+				break;
+			}
+		}
+	}
+	if (flags & HeapStatsFlags::F_HISTOGRAM)
+	{
+		// histograms
+		for (uint8_t i = 0; i <= mMaxLevel; i++)
+		{
+			stats.freeBuddyHistogram[i] = mFreeBuddiesAtLevel[i].getSize();
+			stats.allocatedBuddyHistogram[i] = 0;
+		}
+		// The allocated list is not bucketed by level, so count it in one pass.
+		Buddy* b = mAllocatedBuddies.getHead();
+		while (b)
+		{
+			stats.allocatedBuddyHistogram[b->level]++;
+			b = b->next;
+		}
+		stats.numEntries = size_t(mMaxLevel + 1);
+	}
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Consistency check over all bookkeeping lists. Returns true when
+//  - every free buddy sits in the list matching its level and does not claim
+//    more occupied bytes than it spans,
+//  - each list's stored size matches its node count and does not exceed the
+//    number of buddies that fit at that level,
+//  - free and allocated buddies together cover exactly the managed range.
+bool Heap::sanityTest()
+{
+	size_t coveredBytes = 0;
+
+	for (uint8_t level = 0; level <= mMaxLevel; level++)
+	{
+		BuddyList& freeList = mFreeBuddiesAtLevel[level];
+		size_t nodeCount = 0;
+
+		Buddy* node = freeList.getHead();
+		while (node)
+		{
+			const bool wrongLevel = (size_t(1) << node->level) != freeList.buddySize;
+			const bool overfull = node->occupiedSize > (size_t(1) << (node->level + mMinBlockLog2));
+			if (wrongLevel || overfull)
+			{
+				return false;
+			}
+			coveredBytes += freeList.buddySize << mMinBlockLog2;
+			nodeCount++;
+			node = node->next;
+		}
+
+		if (freeList.getSize() != nodeCount)
+		{
+			return false;
+		}
+		if (nodeCount > (size_t(1) << (mMaxLevel - level)))
+		{
+			return false;
+		}
+	}
+
+	size_t allocatedCount = 0;
+	Buddy* used = mAllocatedBuddies.getHead();
+	while (used)
+	{
+		coveredBytes += size_t(1) << (used->level + mMinBlockLog2);
+		allocatedCount++;
+		used = used->next;
+	}
+
+	if (allocatedCount != mAllocatedBuddies.getSize())
+	{
+		return false;
+	}
+
+	// Everything together must add up to the full power-of-two range.
+	return coveredBytes == (size_t(1) << (mMaxLevel + mMinBlockLog2));
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Renders the allocated buddies into `array` (`size` entries): the heap's
+// block range is scaled onto the array, free space is written as 0 and every
+// allocated buddy as its level + 1. Always returns true.
+bool Heap::visualizeMemory(uint8_t* array, const size_t size)
+{
+	// Array cells per minimum-size block.
+	float scale = (float)size / (size_t(1) << mMaxLevel);
+
+	for (size_t i = 0; i < size; i++)
+	{
+		array[i] = 0;
+	}
+	for (Buddy* cur = mAllocatedBuddies.getHead(); cur; cur = cur->next)
+	{
+		size_t start = (size_t)((float)(cur->addr) * scale);
+		size_t end = (size_t)((float)(cur->addr + (size_t(1) << size_t(cur->level))) * scale);
+		PX_ASSERT(start <= size);
+		PX_ASSERT(end <= size);
+		// The bounds come from float arithmetic and the asserts vanish in
+		// release builds, so clamp to make sure nothing is written past the array.
+		if (end > size)
+		{
+			end = size;
+		}
+		for (size_t i = start; i < end; i++)
+		{
+			PX_ASSERT(i < size);
+			array[i] = uint8_t(cur->level + 1);
+		}
+	}
+
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Splits `b` (which the caller has already removed from its list) into two
+// buddies one level lower and inserts both into that level's free list.
+// `b` itself is reused as the lower half; a new buddy is taken from the pool
+// for the upper half.
+// Returns false without touching any list when b is already at level 0.
+// Returns false after putting `b` back on its original free list when the
+// pool cannot provide the second buddy.
+bool Heap::splitBuddy(Buddy* b)
+{
+	if (b->level == 0)
+	{
+		return false;
+	}
+
+	// Obtain the second half before modifying `b`, so a failed pool
+	// allocation leaves the buddy intact instead of dereferencing NULL.
+	Buddy* upper = mManager.getBuddyPool().construct(*b);
+	if (!upper)
+	{
+		mFreeBuddiesAtLevel[b->level].insertSorted(b);
+		return false;
+	}
+
+	const uint8_t childLevel = uint8_t(b->level - 1);
+	const size_t childBlocks = size_t(1) << childLevel;
+
+	b->level = childLevel;
+	upper->level = childLevel;
+	upper->addr = b->addr + childBlocks;
+
+	mFreeBuddiesAtLevel[childLevel].insertSorted(b);
+	mFreeBuddiesAtLevel[childLevel].insertSorted(upper);
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Joins two free buddies into one buddy of the next level. b0 must be the
+// lower neighbour: same level as b1, below the top level, and exactly one
+// buddy width in front of b1. On success b0 is reused as the merged buddy and
+// inserted into the next level's free list, while b1 goes back to the pool.
+// Returns false (changing nothing) when the two cannot be merged.
+bool Heap::mergeBuddies(Buddy* b0, Buddy* b1)
+{
+	if (b0->level != b1->level)
+	{
+		return false;
+	}
+	if (b0->level >= mMaxLevel)
+	{
+		return false;
+	}
+	const size_t buddyBlocks = size_t(1) << size_t(b0->level);
+	if ((b1->addr - b0->addr) != buddyBlocks)
+	{
+		return false;
+	}
+
+	b0->level++;
+	b0->next = NULL;
+	b0->isLastBuddy = true;
+	b0->occupiedSize = 0;
+	mFreeBuddiesAtLevel[b0->level].insertSorted(b0);
+
+	mManager.getBuddyPool().destroy(b1);
+	return true;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Produces a free buddy of `searchLevel` by taking a free buddy from the
+// lowest non-empty level at or above searchLevel and splitting it repeatedly.
+// Returns NULL when no such buddy exists.
+Buddy* Heap::findBySplitting(uint8_t searchLevel)
+{
+	// Climb until a level with a free buddy is found (or the top is reached).
+	uint8_t level = searchLevel;
+	while (!mFreeBuddiesAtLevel[level].getSize() && (level < mMaxLevel))
+	{
+		level++;
+	}
+
+	// Split on the way back down; each split leaves two halves on the level
+	// below, one of which is taken for the next round.
+	Buddy* candidate = mFreeBuddiesAtLevel[level].pop();
+	while (candidate && (level != searchLevel) && level > 0)
+	{
+		splitBuddy(candidate);
+		candidate = mFreeBuddiesAtLevel[level - 1].pop();
+		level--;
+	}
+	return candidate;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Tries to produce a free buddy of `searchLevel` by merging pairs of smaller
+// free buddies. Returns the buddy popped from the searchLevel free list, or
+// NULL when searchLevel is 0 or no pair can be merged at any lower level.
+Buddy* Heap::findByMerging(uint8_t searchLevel)
+{
+	/*
+		while there is no pair to merge on this level, ask lower level to merge
+		may ask lower level to merge more than 1 pair after each failure
+		or just merge all pairs of lower levels
+	*/
+	if (searchLevel == 0)
+	{
+		return NULL;
+	}
+
+	// Start directly below the requested level.
+	uint8_t curLevel = uint8_t(searchLevel - 1);
+	bool dummy = true;	// never changes; the loop is left via break/return
+	while (dummy)
+	{
+		// Upper bound on how many pairs to merge at this level in one go.
+		// NOTE(review): the exponent subtracts 1 << (curLevel + 1) from
+		// mMaxLevel and is clamped at 0 - confirm this is the intended heuristic.
+		int32_t shift = (mMaxLevel - (1 << (curLevel + 1)));
+		shift = shift >= 0 ? shift : 0;
+		size_t numToFind = size_t(1) << shift;
+		size_t found = findPairAndMerge(mFreeBuddiesAtLevel[curLevel], numToFind);
+		if (found)
+		{
+			// A merge at searchLevel - 1 put a buddy on the searchLevel list: done.
+			if (curLevel == searchLevel - 1)
+			{
+				break;
+			}
+			// Otherwise the merged buddies landed one level up; try merging there next.
+			curLevel++;
+		}
+		else
+		{
+			// Nothing to merge here; look for pairs one level further down.
+			if (curLevel > 0)
+			{
+				curLevel--;
+			}
+			else
+			{
+				return NULL;
+			}
+		}
+	}
+	return mFreeBuddiesAtLevel[searchLevel].pop();
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Scans `list` for up to `numToFind` mergeable pairs, unlinks each pair from
+// the list and merges it via mergeBuddies(). Returns the number of pairs that
+// were merged successfully.
+size_t Heap::findPairAndMerge(BuddyList& list, size_t numToFind)
+{
+	size_t found = 0;
+	Buddy* cur = list.getHead();
+	Buddy* last = list.getHead();	// node preceding `cur`, used to relink after removal
+	size_t diff = list.buddySize;	// address distance between two neighbouring buddies of this level
+	while ((found != numToFind) && cur && cur->next)
+	{
+		// find buddy pair b0 and b1, b0 must be at an even address, and b0 and b1 must be neighbours in address space.
+		// since the list is sorted, we do only compare neighbours in the list.
+		// ("even" here means the bit at position `level` of the address is clear)
+		if (((cur->addr & (size_t(1) << size_t(cur->level))) == 0) && (cur->next->addr - cur->addr == diff))
+		{
+			Buddy* b0 = cur;
+			Buddy* b1 = cur->next;
+
+			// Unlink both nodes and continue with the node after the pair.
+			if (cur == list.getHead())
+			{
+				list.setHead(cur->next->next);
+				cur = list.getHead();
+				last = cur;
+			}
+			else
+			{
+				cur = cur->next->next;
+				last->next = cur;
+			}
+			list.setSize(list.getSize() - 2);
+			// NOTE(review): if mergeBuddies() returns false the pair has already
+			// been removed from this list and is not put back.
+			if (mergeBuddies(b0, b1))
+			{
+				found++;
+			}
+		}
+		else
+		{
+			last = cur;
+			cur = cur->next;
+		}
+	}
+	return found;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Trims the unused tail of a freshly chosen buddy `b` until the remaining
+// internal fragmentation is at most `threshold` (or level 0 is reached).
+// The resulting allocation may consist of several adjacent buddies; all of
+// them are inserted into mAllocatedBuddies here and only the final one has
+// isLastBuddy set. Freed halves go to the free list of their level.
+// Returns the total size in bytes of the buddies kept for the allocation.
+// Note that the object referred to by `b` may end up at a different address
+// or on a free list.
+size_t Heap::reduceIntFragment(Buddy& b, size_t threshold)
+{
+	/*
+	while internalFragmentation > threshold
+		cut buddy in the middle
+		if cut goes through occupied space
+			left buddy is full, can be pushed to alloc list clear last buddy flag
+		else
+			right buddy is free, push it to free list
+	*/
+	size_t allocSize = 0;	// accumulated in units of minimum blocks, converted to bytes on return
+	Buddy* curB = &b;
+	curB->isLastBuddy = false;
+	while (curB->level && ((size_t(1) << (curB->level + mMinBlockLog2)) - curB->occupiedSize) > threshold)
+	{
+		//split
+		// b0 is a fresh copy that becomes the left half; the existing node
+		// (b1) is moved up to become the right half.
+		// NOTE(review): the result of construct() is not checked for NULL here.
+		Buddy* b0 = mManager.getBuddyPool().construct(*curB);
+		Buddy* b1 = curB;
+		b0->level--;
+		b1->level--;
+		b1->addr += size_t(1) << size_t(b1->level);
+		if ((size_t(1) << (b0->level + mMinBlockLog2)) < b0->occupiedSize)
+		{
+			// The occupied bytes spill into the right half: the left half is
+			// completely used and joins the allocation, continue with the right.
+			b0->occupiedSize = size_t(1) << (b0->level + mMinBlockLog2);
+			b1->occupiedSize -= b0->occupiedSize;
+			mAllocatedBuddies.insertSorted(b0);
+			allocSize += size_t(1) << b1->level;	// b0 and b1 have the same level
+			curB = b1;
+		}
+		else
+		{
+			// Everything fits into the left half: the right half is free again.
+			b1->occupiedSize = 0;
+			mFreeBuddiesAtLevel[b1->level].insertSorted(b1);
+			curB = b0;
+		}
+	}
+	curB->isLastBuddy = true;
+	allocSize += size_t(1) << curB->level;
+	mAllocatedBuddies.insertSorted(curB);
+	return (allocSize << mMinBlockLog2);
+}
+
+
diff --git a/PxShared/src/cudamanager/src/HeapManagerRef.h b/PxShared/src/cudamanager/src/HeapManagerRef.h
new file mode 100644
index 0000000..e6e585e
--- /dev/null
+++ b/PxShared/src/cudamanager/src/HeapManagerRef.h
@@ -0,0 +1,297 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+//----------------------------------------------------------------------------//
+// HeapManagerRef.h
+//----------------------------------------------------------------------------//
+
+#ifndef PXCUDACONTEXTMANAGER_HEAPMANAGERREF_H
+#define PXCUDACONTEXTMANAGER_HEAPMANAGERREF_H
+
+#include "task/PxTaskDefine.h"
+
+#include "HeapManagerInterface.h"
+#include "HeapManagerLinkedList.h"
+#include "PsPool.h"
+#include "PsMutex.h"
+#include "PsArray.h"
+#include "PsUserAllocated.h"
+
+// Set to 1 to compile in the log-file members of HeapManagerRef (and <stdio.h>).
+#define DUMP_HEAP_USAGE_TO_FILE 0
+
+// When enabled, every Buddy additionally records file, line and allocation
+// name of the request; enabled automatically for _DEBUG builds.
+#ifdef _DEBUG
+#define KEEP_DEBUG_INFO 1
+#else
+#define KEEP_DEBUG_INFO 0
+#endif
+
+#if DUMP_HEAP_USAGE_TO_FILE
+#include "stdio.h"
+#endif
+
+namespace physx
+{
+
+class Heap;
+// Bookkeeping node for one block of the buddy allocator. A buddy covers
+// 2^level minimum-size blocks and lives either in one of the heap's per-level
+// free lists or in its allocated list.
+struct Buddy
+{
+	// Creates an empty level-0 buddy at block index 0 that is marked as last buddy.
+	PX_INLINE Buddy()
+		: addr(0)
+		, next(0)
+		, occupiedSize(0)
+		, allocId(PxAllocId::UNASSIGNED)
+		, isLastBuddy(true)
+		, level(0)
+#if KEEP_DEBUG_INFO
+		, file(NULL)
+		, allocName(NULL)
+		, line(0)
+#endif
+	{}
+
+	// Member-wise copy, including the `next` link of the source.
+	PX_INLINE Buddy(Buddy& b)
+		: addr(b.addr)
+		, next(b.next)
+		, occupiedSize(b.occupiedSize)
+		, allocId(b.allocId)
+		, isLastBuddy(b.isLastBuddy)
+		, level(b.level)
+#if KEEP_DEBUG_INFO
+		, file(b.file)
+		, allocName(b.allocName)
+		, line(b.line)
+#endif
+	{}
+
+	size_t		addr;			// block index relative to the heap base, in units of the minimum block size
+	Buddy*		next;			// next node in the list this buddy currently belongs to
+	size_t		occupiedSize;	// bytes of this buddy used by the allocation; 0 while free
+	uint16_t	allocId;		// PxAllocId value of the allocation
+	uint8_t 	isLastBuddy;	// non-zero if this buddy ends an allocation (allocations may span several buddies)
+	uint8_t		level;			// buddy spans 2^level minimum-size blocks
+#if KEEP_DEBUG_INFO
+	const char* file;			// source file of the allocation request
+	const char* allocName;		// name passed with the allocation request
+	uint32_t	line;			// source line of the allocation request
+#endif
+};
+
+// Linked list of Buddy nodes that knows the heap it belongs to, so that
+// clear() can return its nodes to that heap's buddy pool.
+struct BuddyList: public LinkedList<Buddy>, public shdfnd::UserAllocated
+{
+	BuddyList()
+		: buddySize(0)
+		, heap(NULL)
+	{}
+	// Pops every node and destroys it through heap->getBuddyPool(); `heap` must be set.
+	void clear();
+
+	size_t		buddySize;  // = 2^level
+	Heap*		heap;		// owning heap; assigned by Heap::init()
+};
+
+// One entry of the manager's heap list. Entries are ordered by base address.
+struct HeapManagerPage
+{
+	// Orders pages by ascending base address.
+	PX_INLINE bool operator < (const HeapManagerPage& p) const
+	{
+		return baseAddr < p.baseAddr;
+	}
+
+	PX_INLINE bool operator > (const HeapManagerPage& p) const
+	{
+		return baseAddr > p.baseAddr;
+	}
+
+	size_t	baseAddr;		// base address used for ordering the pages
+	Heap*	heap;			// heap associated with this page
+	bool	isPersistent;	// NOTE(review): presumably marks pages that are never released - confirm in HeapManagerRef.cpp
+};
+
+
+// Implementation of HeapManagerInterface that manages a list of Heap pages
+// and shares one Buddy pool between them. It also accumulates global and
+// per-allocation-id statistics reported by the heaps.
+class HeapManagerRef: public HeapManagerInterface, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(HeapManagerRef)
+public:
+	HeapManagerRef(physx::PxErrorCallback& errorCallback, bool enableMutex = true);
+	virtual ~HeapManagerRef();
+
+	// INTERFACE METHODS
+	virtual		bool		init(Allocator* memAllocator, const size_t baseSize, const size_t pageSize, const size_t minBlockSize, const size_t maxIntFrag);
+	virtual		bool		setPageSize(size_t pageSize);
+	virtual		void*		alloc(const size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	virtual		bool		realloc(void* addr, const size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	virtual		bool		free(void* addr);
+	virtual		void		freeEmptyPages();
+	virtual		bool		reserve(size_t size);
+	virtual		bool		getStats(ApexHeapStats& stats, const uint32_t flags);
+	virtual		bool		visualizeMemory(uint8_t* array, const size_t size);
+	virtual		void*		findBaseAddress(void* addr);
+
+	// INTERNALS
+	// searches 2^pow >= val,  searches pow in [min, max]
+	// The second overload additionally reports the exponent through `pow`.
+	static PX_INLINE size_t	findNextPow2(const size_t val, const uint8_t min = 0, const uint8_t max = BITSPERWORD);
+	static PX_INLINE size_t	findNextPow2(uint8_t& pow, const size_t val, const uint8_t min = 0, const uint8_t max = BITSPERWORD);
+
+	// Statistics hooks called by Heap whenever memory is handed out, released or resized.
+	PX_INLINE	void		addToStats(PxAllocId::Enum id, const size_t size, const size_t fragmentation);
+	PX_INLINE	void		removeFromStats(PxAllocId::Enum id, const size_t size, const size_t fragmentation);
+	PX_INLINE	void		incStats(PxAllocId::Enum id, const size_t change, const size_t fragmentation);
+	PX_INLINE	void		decStats(PxAllocId::Enum id, const size_t change, const size_t fragmentation);
+
+	// Called by a Heap once its allocated memory drops to zero.
+	PX_INLINE	void					notifyEmptyPage()
+	{
+		mNewEmptyPage = true;
+	}
+	// Pool all heaps of this manager allocate their Buddy nodes from.
+	PX_INLINE	shdfnd::Pool<Buddy>&	getBuddyPool()
+	{
+		return mBuddyPool;
+	}
+
+private:
+	Heap*	allocateNewHeap(size_t heapSize, bool isPersistent = false);
+	Heap*	allocateNewPages(size_t requestedSize);
+	void	resetHeap(HeapManagerPage& page);
+	void	removeDeletedHeapsFromList(uint32_t numDeletes);
+	void	shrinkMemory();
+
+	Heap*	findHeap(void* addr) const;
+
+private:
+	// heaps
+	shdfnd::Array<HeapManagerPage> mHeaps;
+	shdfnd::Pool<Buddy>		    mBuddyPool;
+	size_t						mPageSize;
+	size_t						mMinBlockSize;
+	size_t						mMaxIntFrag;
+	bool						mNewEmptyPage;	// set by notifyEmptyPage()
+	// lock
+	shdfnd::Mutex				mMutex;
+	// page allocator
+	Allocator*					mMemAllocator;
+	// overall stats
+	size_t						mGlobalAllocMem;
+	size_t						mGlobalMaxAllocMem;
+	size_t						mGlobalInternalFragmentation;
+	size_t						mGlobalMaxInternalFragmentation;
+	// stats per allocation ID
+	PxAllocIdStats				mGlobalAllocIdStats[PxAllocId::NUM_IDS];
+	// error callback
+	physx::PxErrorCallback&	mErrorCallback;
+
+	// Only present when heap usage logging is compiled in.
+#if DUMP_HEAP_USAGE_TO_FILE
+	FILE*				mLogFile;
+	unsigned __int64    m_qpc;
+	unsigned __int64    m_qpf;
+#endif
+};
+
+
+// Buddy allocator over a single contiguous address range. The range is
+// divided into minimum-size blocks; a buddy of level L covers 2^L of them.
+// Free buddies are kept in one list per level, allocated ones in a single list.
+// Buddy nodes come from the pool of the owning HeapManagerRef.
+class Heap : public shdfnd::UserAllocated
+{
+public:
+	// Only stores the references and zeroes all state; init() must be called before use.
+	PX_INLINE Heap(HeapManagerRef& manager, physx::PxErrorCallback& errorCallback)
+		: mManager(manager)
+		, mErrorCallback(errorCallback)
+		, mBaseAddr(0)
+		, mMinBlockSize(0)
+		, mFreeBuddiesAtLevel(NULL)
+		, mMaxIntFrag(0)
+		, mTotalSize(0)
+		, mMaxLevel(0)
+		, mMinBlockLog2(0)
+		, mAllocMem(0)
+		, mMaxAllocMem(0)
+		, mInternalFragmentation(0)
+		, mMaxInternalFragmentation(0)
+	{}
+
+	// Reports buddies that are still allocated as leaks through the error callback.
+	PX_INLINE ~Heap();
+
+	// minBlockSize must be a power of two; maxIntFrag is size_t(-1) or a power of two.
+	bool	init(size_t baseAddr, const size_t baseSize, const size_t minBlockSize, const size_t maxIntFrag);
+	// Returns NULL if the request cannot be served.
+	void*	alloc(const size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	// Resizes in place only; returns false if that is not possible.
+	bool	realloc(void* addr, const size_t size, PX_ALLOC_INFO_PARAMS_DECL(NULL, 0, NULL, UNASSIGNED));
+	bool	free(void* addr);
+	bool	getStats(ApexHeapStats& stats, const uint32_t flags);
+	bool	visualizeMemory(uint8_t* array, const size_t size);
+
+	PX_INLINE	size_t					getTotalMemorySize()
+	{
+		return mTotalSize;
+	}
+	PX_INLINE	size_t					getAllocatedMemorySize()
+	{
+		return mAllocMem;
+	}
+	PX_INLINE	shdfnd::Pool<Buddy>&	getBuddyPool()
+	{
+		return mManager.getBuddyPool();
+	}
+	PX_INLINE	void*					getBaseAddress() 
+	{ 
+		return (void*)mBaseAddr; 
+	}
+
+private:
+	// split buddy b of level L into two buddies of level L-1 and insert both into the free list of that level.
+	// b is reused as the lower half, a new buddy is taken from the pool for the upper half.
+	// assumes that b was removed from its list before. returns false if b is already at level 0.
+	bool	splitBuddy(Buddy* b);
+	// merge 2 neighbouring buddies of the same level into one of the next level and insert it into that free list.
+	// b0 is reused as the merged buddy, b1 is returned to the pool. assumes that both were removed from their list before.
+	bool	mergeBuddies(Buddy* b0, Buddy* b1);
+
+	// split off right, free children of a buddy if the internal fragmentation of a buddy is bigger than a threshold
+	// returns the size of all allocated buddies
+	size_t	reduceIntFragment(Buddy& b, size_t threshold);
+
+	// obtain a free buddy of searchLevel by splitting a free buddy of a higher level
+	Buddy*	findBySplitting(uint8_t searchLevel);
+	// obtain a free buddy of searchLevel by merging pairs of free buddies of lower levels
+	Buddy*	findByMerging(uint8_t searchLevel);
+	// merge up to numToFind neighbouring pairs of the given free list, returns the number of merged pairs
+	size_t	findPairAndMerge(BuddyList& list, size_t numToFind);
+
+	// checks that the free and allocated lists are consistent and cover the whole heap
+	bool	sanityTest();
+
+	// assignment is not supported
+	void operator=(const Heap&)
+	{
+		PX_ASSERT(0);
+	}
+
+private:
+	HeapManagerRef&			mManager;
+	physx::PxErrorCallback& mErrorCallback;
+	size_t					mBaseAddr;				// address of block index 0
+	size_t					mMinBlockSize;
+	BuddyList*				mFreeBuddiesAtLevel;	// array of mMaxLevel + 1 free lists, NULL until init()
+	BuddyList				mAllocatedBuddies;
+	size_t					mMaxIntFrag;			// size_t(-1) disables fragmentation reduction in alloc()
+	size_t					mTotalSize;
+	uint8_t					mMaxLevel;		// 2^maxLevel <= memorySize
+	uint8_t					mMinBlockLog2;	// log2 of mMinBlockSize
+
+	size_t					mAllocMem;		// fragmented
+	size_t					mMaxAllocMem;
+	size_t					mInternalFragmentation;
+	size_t					mMaxInternalFragmentation;
+};
+
+} // end physx namespace
+
+#endif // PXCUDACONTEXTMANAGER_HEAPMANAGERREF_H
diff --git a/PxShared/src/cudamanager/src/PhysXDevice.h b/PxShared/src/cudamanager/src/PhysXDevice.h
new file mode 100644
index 0000000..b066bdc
--- /dev/null
+++ b/PxShared/src/cudamanager/src/PhysXDevice.h
@@ -0,0 +1,119 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef ___PHYS_X_DEVICE_
+#define ___PHYS_X_DEVICE_
+
+#include "foundation/PxPreprocessor.h"
+
+#if PX_WINDOWS
+#	pragma warning (push)
+#	pragma warning (disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#	include "windows/PsWindowsInclude.h"
+#	pragma warning (pop)
+#endif
+
+// Linkage/export decoration for the API functions declared below.
+// On Windows the public API is exported with C linkage; the "private" API is
+// exported only in _DEBUG builds. On all other platforms both macros are empty.
+#if PX_WINDOWS
+	#define PHYSX_DEV_DLL_API extern "C" __declspec(dllexport)
+	#ifdef _DEBUG
+	#   define PHYSX_DEV_DLL_PRIVATE_API extern "C" __declspec(dllexport)
+	#else
+	#   define PHYSX_DEV_DLL_PRIVATE_API
+	#endif
+#else
+	#define PHYSX_DEV_DLL_API
+	#define PHYSX_DEV_DLL_PRIVATE_API
+#endif
+
+/** typedefs */
+typedef int PHYSX_DEV_STATUS;			// holds one of the PHYSX_DEV_* values below
+typedef unsigned int PhysXDevHandle;	// identifies a PhysX GPU device
+
+/** PHYSX_DEV_STATUS values; PHYSX_DEV_OK is 0, the rest are numbered consecutively */
+enum
+{
+	PHYSX_DEV_OK = 0,
+	PHYSX_DEV_UNKNOWN_ERROR,
+	PHYSX_DEV_INVALID_HANDLE,
+	PHYSX_DEV_UNINITIALIZED,
+	PHYSX_DEV_NV_API_UNAVAILABLE,
+	PHYSX_DEV_CUDA_UNAVAILABLE,
+	PHYSX_DEV_CUDA_MEMORY_ALLOC_FAILURE,
+	PHYSX_DEV_LEGACY_MODE_GPU_HANDLE,	// returned by physxDevGet() on the legacy (R177) path
+	PHYSX_DEV_PHYSX_DEV_UNAVAILABLE,
+};
+
+
+/**
+ *  physxDevInit
+ *  Initialize the PhysX Device information functions.
+ *  Must be called before using any other API functions.
+ */
+PHYSX_DEV_DLL_API PHYSX_DEV_STATUS physxDevInit();
+
+/**
+ *  physxDevClose
+ *  Call this when finished with the PhysX Device API, it
+ *  frees memory that is allocated in physxDevInit
+ */
+PHYSX_DEV_DLL_API PHYSX_DEV_STATUS physxDevClose();
+
+/**
+ *  physxDevGetCudaOrdinal
+ *  Returns the CUDA device ordinal for the given PhysX GPU device
+ */
+PHYSX_DEV_DLL_API PHYSX_DEV_STATUS physxDevGetCudaOrdinal(int* cudaDevOrdinal, PhysXDevHandle devHandle);
+PHYSX_DEV_STATUS physxDevGetCudaOrdinalWrapper(int* cudaDevOrdinal);
+
+/**
+ *  physxDevGet
+ *  Returns the PhysX GPU device that the PhysX Engine
+ *  will use.  If the device is -1, the engine will
+ *  automatically choose which GPU to use.
+ *
+ *  This function handles the R177/R180 detection first, then decides accordingly
+ *
+ *  if(180+)
+ *      if(GPU Enabled) ? get NVAPI sel : -1
+ *  else (177)
+ *      if regkey ? regkey value : -1 (PHYSX_DEV_LEGACY_MODE_GPU_HANDLE returned)
+ */
+PHYSX_DEV_DLL_API PHYSX_DEV_STATUS physxDevGet(PhysXDevHandle* devHandle);
+
+/**
+ *  physxDevUsingDedicatedGPU
+ *  Returns whether or not PhysX has a dedicated GPU (set by the user in the NV CPL)
+ */
+PHYSX_DEV_DLL_API bool physxDevUsingDedicatedGPU();
+
+/**
+ *  physxDevSLIEnabled
+ *  Returns whether or not the device pointer specified (D3D device) is in an SLI group
+ */
+PHYSX_DEV_DLL_API bool physxDevSLIEnabled(void* graphicsDevice);
+
+#endif
diff --git a/PxShared/src/cudamanager/src/PhysXDeviceSettings.cpp b/PxShared/src/cudamanager/src/PhysXDeviceSettings.cpp
new file mode 100644
index 0000000..77896c2
--- /dev/null
+++ b/PxShared/src/cudamanager/src/PhysXDeviceSettings.cpp
@@ -0,0 +1,248 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "task/PxTaskDefine.h"
+
+#if PX_SUPPORT_GPU_PHYSX
+
+#include "foundation/PxErrorCallback.h"
+
+#include "PhysXDeviceSettings.h"
+#include "PhysXDevice.h"
+
+#if PX_VC
+#pragma warning(disable: 4191)	//'operator/operation' : unsafe conversion from 'type of expression' to 'type required'
+#endif
+
+namespace
+{
+#if PX_WIN32 || PX_WIN64
+	/** funcs for the dynamic loading of the PhysXDevice.dll file */
+	typedef PHYSX_DEV_STATUS __cdecl physxDevInit_t();
+	typedef PHYSX_DEV_STATUS __cdecl physxDevClose_t();
+	typedef PHYSX_DEV_STATUS __cdecl physxDevGet_t(PhysXDevHandle* devHandle);
+	typedef PHYSX_DEV_STATUS __cdecl physxDevGetCudaOrdinal_t(int* cudaDevOrdinal, PhysXDevHandle devHandle);
+	typedef bool __cdecl physxDevUsingDedicatedGPU_t();
+	typedef bool __cdecl physxDevSLIEnabled_t(void* graphicsDevice);
+
+	/** globals for cuda functions */
+	static physxDevInit_t* physxDevInit_f;
+	static physxDevClose_t* physxDevClose_f;
+	static physxDevGetCudaOrdinal_t* physxDevGetCudaOrdinal_f;
+	static physxDevGet_t* physxDevGet_f;
+	static physxDevUsingDedicatedGPU_t* physxDevUsingDedicatedGPU_f;
+	static physxDevSLIEnabled_t* physxDevSLIEnabled_f;
+
+	/** globals */
+	static HMODULE gPhysXDevModuleH;
+
+	/**
+	 *  Loads the PhysXDevice DLL matching the build's bitness, resolves its exported
+	 *  entry points into the file-level function pointers and calls physxDevInit.
+	 *
+	 *  Returns PHYSX_DEV_OK when the library is loaded and initialized. In every other
+	 *  case the library has already been freed again (if it was loaded) and the returned
+	 *  status says why: PHYSX_DEV_PHYSX_DEV_UNAVAILABLE if the DLL could not be loaded,
+	 *  PHYSX_DEV_CUDA_UNAVAILABLE if a mandatory export is missing, or whatever
+	 *  physxDevInit reported.
+	 */
+	PHYSX_DEV_STATUS initPhysXDeviceLib()
+	{
+#if PX_X86
+		const char* const dllName = "PhysXDevice.dll";
+#else
+		const char* const dllName = "PhysXDevice64.dll";
+#endif
+		gPhysXDevModuleH = LoadLibrary(dllName);
+		if (gPhysXDevModuleH == NULL)
+			return PHYSX_DEV_PHYSX_DEV_UNAVAILABLE;
+
+		HMODULE const lib = gPhysXDevModuleH;
+		physxDevInit_f = (physxDevInit_t*)GetProcAddress(lib, "physxDevInit");
+		physxDevClose_f = (physxDevClose_t*)GetProcAddress(lib, "physxDevClose");
+		physxDevGetCudaOrdinal_f = (physxDevGetCudaOrdinal_t*)GetProcAddress(lib, "physxDevGetCudaOrdinal");
+		physxDevGet_f = (physxDevGet_t*)GetProcAddress(lib, "physxDevGet");
+		// The next two exports are optional; their callers test the pointer before use.
+		physxDevUsingDedicatedGPU_f = (physxDevUsingDedicatedGPU_t*)GetProcAddress(lib, "physxDevUsingDedicatedGPU");
+		physxDevSLIEnabled_f = (physxDevSLIEnabled_t*)GetProcAddress(lib, "physxDevSLIEnabled");
+
+		const bool haveMandatoryExports =
+			physxDevInit_f && physxDevClose_f && physxDevGetCudaOrdinal_f && physxDevGet_f;
+
+		const PHYSX_DEV_STATUS result = haveMandatoryExports ? physxDevInit_f() : PHYSX_DEV_CUDA_UNAVAILABLE;
+		if (result != PHYSX_DEV_OK)
+			FreeLibrary(lib);
+
+		return result;
+	}
+#endif // PX_WIN32 || PX_WIN64
+
+	/**
+	 *  Writes the CUDA device ordinal PhysX should use into *cudaDevOrdinal.
+	 *
+	 *  Windows: loads the PhysXDevice library, asks it for the selected device and its
+	 *  CUDA ordinal, then shuts the library down again. Returns the initialization
+	 *  status if loading failed, PHYSX_DEV_LEGACY_MODE_GPU_HANDLE if physxDevGet
+	 *  reported legacy mode, and PHYSX_DEV_OK otherwise.
+	 *
+	 *  Linux: reads the PHYSX_GPU_DEVICE environment variable (ordinal 0 when it is
+	 *  not set) and always returns PHYSX_DEV_OK.
+	 */
+	PHYSX_DEV_STATUS getCudaOrdinal(int* cudaDevOrdinal)
+	{
+#if PX_WIN32 || PX_WIN64
+		const PHYSX_DEV_STATUS initStatus = initPhysXDeviceLib();
+		if (initStatus != PHYSX_DEV_OK)
+			return initStatus;
+
+		// NOTE(review): any physxDevGet status other than legacy mode is treated as
+		// success and the result of physxDevGetCudaOrdinal is ignored - confirm that
+		// the DLL always fills in the handle and the ordinal.
+		PhysXDevHandle device;
+		const PHYSX_DEV_STATUS queryStatus = physxDevGet_f(&device);
+		physxDevGetCudaOrdinal_f(cudaDevOrdinal, device);
+
+		physxDevClose_f();
+		FreeLibrary(gPhysXDevModuleH);
+
+		// R177 drivers report legacy mode; pass that on so the caller can tell.
+		return (queryStatus == PHYSX_DEV_LEGACY_MODE_GPU_HANDLE) ? PHYSX_DEV_LEGACY_MODE_GPU_HANDLE : PHYSX_DEV_OK;
+#elif PX_LINUX
+		const char* const requested = ::getenv("PHYSX_GPU_DEVICE");
+		*cudaDevOrdinal = requested ? atoi(requested) : 0;
+		return PHYSX_DEV_OK;
+#endif
+	}
+
+}
+
+namespace physx
+{
+
+	/**
+	 *  Returns the CUDA device ordinal reported by getCudaOrdinal, or -1 if the lookup
+	 *  left the ordinal untouched. Every status other than PHYSX_DEV_OK is reported to
+	 *  'errc' as a debug warning; statuses without a dedicated message use the generic
+	 *  "unknown error" text.
+	 */
+	int PhysXDeviceSettings::getSuggestedCudaDeviceOrdinal(physx::PxErrorCallback& errc)
+	{
+		int ordinal = -1;
+		const PHYSX_DEV_STATUS status = getCudaOrdinal(&ordinal);
+		if (status == PHYSX_DEV_OK)
+			return ordinal;
+
+		// Start from the generic text and specialize it for the statuses we recognize.
+		const char* warning = "unknown error during CUDA device detection\n";
+		if (status == PHYSX_DEV_NV_API_UNAVAILABLE)
+		{
+			warning = "NVAPI is not available\n";
+		}
+		else if (status == PHYSX_DEV_CUDA_UNAVAILABLE)
+		{
+			warning = "CUDA is not available\n";
+		}
+		else if (status == PHYSX_DEV_PHYSX_DEV_UNAVAILABLE)
+		{
+#if PX_X86
+			warning = "PhysXDevice.dll is not available\n";
+#else
+			warning = "PhysXDevice64.dll is not available\n";
+#endif
+		}
+
+		errc.reportError(PxErrorCode::eDEBUG_WARNING, warning, __FILE__, __LINE__);
+		return ordinal;
+	}
+
+	/**
+	 *  Asks the PhysXDevice library whether PhysX runs on a dedicated GPU.
+	 *
+	 *  Returns 1 or 0 for the library's answer, 0 if the library could not be
+	 *  initialized, and -1 if the loaded library does not export the query.
+	 *  Always returns 0 on Linux.
+	 */
+	int PhysXDeviceSettings::isUsingDedicatedGPU()
+	{
+#if PX_WIN32 || PX_WIN64
+		if (initPhysXDeviceLib() != PHYSX_DEV_OK)
+			return 0;
+
+		// -1 stays in place when the optional export is missing.
+		int answer = -1;
+		if (physxDevUsingDedicatedGPU_f)
+			answer = physxDevUsingDedicatedGPU_f() ? 1 : 0;
+
+		// The library is only kept loaded for the duration of this one query.
+		physxDevClose_f();
+		FreeLibrary(gPhysXDevModuleH);
+		return answer;
+#elif PX_LINUX
+		// need some way to set this
+		return 0;
+#endif
+	}
+
+	/**
+	 *  Asks the PhysXDevice library whether 'graphicsDevice' is part of an SLI group.
+	 *
+	 *  Returns false if the library cannot be initialized or does not export the
+	 *  query. Always returns false on Linux, where the query is not implemented.
+	 */
+	bool PhysXDeviceSettings::isSLIEnabled(void* graphicsDevice)
+	{
+#if PX_WIN32 || PX_WIN64
+		if (initPhysXDeviceLib() != PHYSX_DEV_OK)
+			return false;
+
+		bool sliActive = false;
+		if (physxDevSLIEnabled_f)
+			sliActive = physxDevSLIEnabled_f(graphicsDevice);
+
+		// The library is only kept loaded for the duration of this one query.
+		physxDevClose_f();
+		FreeLibrary(gPhysXDevModuleH);
+		return sliActive;
+#elif PX_LINUX
+		// Unimplemented for Linux because we don't need it, not because it's really always false.
+		PX_UNUSED(graphicsDevice);
+		return false;
+#endif
+	}
+
+} // end physx namespace
+
+#endif // PX_SUPPORT_GPU_PHYSX
+
+
diff --git a/PxShared/src/fastxml/include/PsFastXml.h b/PxShared/src/fastxml/include/PsFastXml.h
new file mode 100644
index 0000000..e1f1c69
--- /dev/null
+++ b/PxShared/src/fastxml/include/PsFastXml.h
@@ -0,0 +1,167 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFASTXML_PSFASTXML_H
+#define PSFASTXML_PSFASTXML_H
+
+#include "foundation/PxSimpleTypes.h" // defines basic data types; modify for your platform as needed.
+#include "foundation/PxIO.h"
+#include "foundation/PxAssert.h"
+#include "PsAllocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+/**
+* Abstract interface of the XML parser. Instances are obtained from createFastXml()
+* and disposed of with release(); the destructor is protected.
+*/
+class FastXml
+{
+	PX_NOCOPY(FastXml)
+
+  public:
+	/**
+	* Read-only view over the attributes of one element. The attributes are stored as
+	* a flat array of strings in which each key is immediately followed by its value.
+	* The view does not own the array.
+	*/
+	class AttributePairs
+	{
+		int mCount;            // number of strings in mStrings (keys and values together)
+		const char** mStrings; // keys at even indices, each followed by its value
+
+	  public:
+		AttributePairs() : mCount(0), mStrings(NULL)
+		{
+		}
+		AttributePairs(int c, const char** v) : mCount(c), mStrings(v)
+		{
+		}
+
+		// Number of complete key/value pairs.
+		PX_INLINE int getNbAttr() const
+		{
+			return mCount / 2;
+		}
+
+		// Key of the pair at 'index'; the index is only checked by an assert.
+		const char* getKey(uint32_t index) const
+		{
+			const uint32_t slot = index * 2;
+			PX_ASSERT(slot < uint32_t(mCount));
+			return mStrings[slot];
+		}
+
+		// Value of the pair at 'index'; the index is only checked by an assert.
+		const char* getValue(uint32_t index) const
+		{
+			const uint32_t slot = index * 2 + 1;
+			PX_ASSERT(slot < uint32_t(mCount));
+			return mStrings[slot];
+		}
+
+		// Value of the first pair whose key equals 'attr' (case-sensitive), or NULL if
+		// there is no such key.
+		const char* get(const char* attr) const
+		{
+			const int32_t pairs = mCount / 2;
+			const char* const* entry = mStrings;
+			for(int32_t p = 0; p < pairs; ++p, entry += 2)
+			{
+				if(strcmp(entry[0], attr) == 0)
+					return entry[1];
+			}
+
+			return NULL;
+		}
+	};
+
+	/***
+	* Receiver for the parse events, and provider of the memory the parser uses.
+	*/
+	class Callback
+	{
+	  public:
+		virtual ~Callback()
+		{
+		}
+
+		// Called for every comment found in the XML.
+		virtual bool processComment(const char* comment) = 0;
+
+		// Called when a previously reported element is closed.
+		// 'element' is the name of the element being closed, 'depth' its nesting depth.
+		// Return true to continue processing the XML file.
+		// Return false to stop processing; the read pointer of the stream is then left
+		// right after this close tag.
+		// Set 'isError' to tell the parser whether stopping is due to an error or an
+		// intentional early cancel.
+		virtual bool processClose(const char* element, uint32_t depth, bool& isError) = 0;
+
+		// Called for every element. Return true to continue processing the XML
+		// document, false to skip.
+		virtual bool processElement(const char* elementName,    // name of the element
+		                            const char* elementData,    // element data, null if none
+		                            const AttributePairs& attr, // attributes
+		                            int32_t lineno) = 0;        // line number in the source XML file
+
+		// Called for the XML declaration header; the default accepts it.
+		virtual bool processXmlDeclaration(const AttributePairs&, // attributes
+		                                   const char* /*elementData*/, int32_t /*lineno*/)
+		{
+			return true;
+		}
+
+		// Called for a DOCTYPE declaration; the default accepts it.
+		virtual bool processDoctype(const char* /*rootElement*/, // Root element tag
+		                            const char* /*type*/,        // SYSTEM or PUBLIC
+		                            const char* /*fpi*/,         // Formal Public Identifier
+		                            const char* /*uri*/)         // Path to schema file
+		{
+			return true;
+		}
+
+		// Memory hooks; the defaults forward to the foundation allocator.
+		virtual void* allocate(uint32_t size)
+		{
+			return getAllocator().allocate(size, "FastXml", __FILE__, __LINE__);
+		}
+
+		virtual void deallocate(void* ptr)
+		{
+			getAllocator().deallocate(ptr);
+		}
+	};
+
+	// Parses the XML delivered by 'buff', reporting its contents through the callback.
+	virtual bool processXml(PxInputData& buff, bool streamFromMemory = false) = 0;
+
+	// Reports the reason for a parsing error and, through 'lineno', the line number
+	// where it occurred.
+	virtual const char* getError(int32_t& lineno) = 0;
+
+	FastXml()
+	{
+	}
+
+	// Disposes of the parser; use this instead of delete.
+	virtual void release(void) = 0;
+
+  protected:
+	virtual ~FastXml()
+	{
+	}
+};
+
+FastXml* createFastXml(FastXml::Callback* iface);
+
+} // shdfnd
+} // physx
+
+#endif // PSFASTXML_PSFASTXML_H
diff --git a/PxShared/src/fastxml/src/PsFastXml.cpp b/PxShared/src/fastxml/src/PsFastXml.cpp
new file mode 100644
index 0000000..dcb8c37
--- /dev/null
+++ b/PxShared/src/fastxml/src/PsFastXml.cpp
@@ -0,0 +1,833 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxMemory.h"
+#include "Ps.h"
+#include "PsFastXml.h"
+#include <stdio.h>
+#include <string.h>
+#include <new>
+#include <ctype.h>
+
+using namespace physx;
+
+namespace
+{
+#define MIN_CLOSE_COUNT 2
+#define DEFAULT_READ_BUFFER_SIZE (16 * 1024)
+#define NUM_ENTITY 5
+
+// One predefined XML entity: its escaped spelling and the character it stands for.
+struct Entity
+{
+	const char* str;        // escaped form as it appears in the document, e.g. "&lt;"
+	unsigned int strLength; // number of characters in 'str'
+	char chr;               // character the escaped form decodes to
+};
+
+// The five entities the parser recognizes in element data.
+static const Entity entity[NUM_ENTITY] = {
+	{ "&lt;", 4, '<' }, { "&amp;", 5, '&' }, { "&gt;", 4, '>' }, { "&quot;", 6, '\"' }, { "&apos;", 6, '\'' }
+};
+
+class MyFastXml : public physx::shdfnd::FastXml
+{
+  public:
+	enum CharType
+	{
+		CT_DATA,
+		CT_EOF,
+		CT_SOFT,
+		CT_END_OF_ELEMENT, // either a forward slash or a greater than symbol
+		CT_END_OF_LINE
+	};
+
+	// Creates an idle parser that reports to, and allocates through, 'c'.
+	// Builds the character classification table and clears all parsing state.
+	MyFastXml(Callback* c)
+	{
+		mStreamFromMemory = true;
+		mCallback = c;
+
+		// Everything is plain data unless listed below. Assign element by element
+		// rather than memset so the table stays correct even if CT_DATA is not 0.
+		for(uint32_t i = 0; i < uint32_t(sizeof(mTypes) / sizeof(mTypes[0])); i++)
+		{
+			mTypes[i] = CT_DATA;
+		}
+		mTypes[0] = CT_EOF;
+		mTypes[uint8_t(' ')] = mTypes[uint8_t('\t')] = CT_SOFT;
+		mTypes[uint8_t('/')] = mTypes[uint8_t('>')] = mTypes[uint8_t('?')] = CT_END_OF_ELEMENT;
+		mTypes[uint8_t('\n')] = mTypes[uint8_t('\r')] = CT_END_OF_LINE;
+
+		mError = 0;
+		// processXml resets this to 1; give it a defined value so that getError()
+		// called before any parse does not read an uninitialized member.
+		mLineNo = 0;
+		mStackIndex = 0;
+		mFileBuf = NULL;
+		mReadBufferEnd = NULL;
+		mReadBuffer = NULL;
+		mReadBufferSize = DEFAULT_READ_BUFFER_SIZE;
+		mOpenCount = 0;
+		mLastReadLoc = 0;
+		for(uint32_t i = 0; i < (MAX_STACK + 1); i++)
+		{
+			mStack[i] = NULL;
+			mStackAllocated[i] = false;
+		}
+	}
+
+	// Tries to decode the XML reference that starts at 'amp', which must point at '&'.
+	// Handles decimal ("&#65;") and hexadecimal ("&#x41;") character references as well
+	// as the predefined entities in the 'entity' table. On success the decoded character
+	// is stored in 'decoded' and the number of source characters consumed is returned.
+	// Returns 0, leaving 'decoded' untouched, if the text is not a well-formed reference.
+	// Only a single char can be produced, so code points above 255 are truncated.
+	static uint32_t decodeReference(const char* amp, char& decoded)
+	{
+		if(amp[1] == '#')
+		{
+			const char* p = amp + 2;
+			const bool isHex = (*p == 'x');
+			if(isHex)
+				++p;
+
+			const uint32_t base = isHex ? 16u : 10u;
+			const char* const firstDigit = p;
+			uint32_t value = 0;
+			for(;; ++p)
+			{
+				const char d = *p;
+				uint32_t digit;
+				if(d >= '0' && d <= '9')
+					digit = uint32_t(d - '0');
+				else if(isHex && d >= 'a' && d <= 'f')
+					digit = uint32_t(d - 'a') + 10u;
+				else if(isHex && d >= 'A' && d <= 'F')
+					digit = uint32_t(d - 'A') + 10u;
+				else
+					break; // also stops at the terminating zero, so we never leave the buffer
+				value = value * base + digit;
+			}
+
+			// need at least one digit and the closing ';'
+			if(p == firstDigit || *p != ';')
+				return 0;
+
+			decoded = char(value);
+			return uint32_t(p - amp) + 1;
+		}
+
+		for(int i = 0; i < NUM_ENTITY; ++i)
+		{
+			if(strncmp(entity[i].str, amp, entity[i].strLength) == 0)
+			{
+				decoded = entity[i].chr;
+				return entity[i].strLength;
+			}
+		}
+
+		return 0;
+	}
+
+	// Finishes an element whose open tag has just been parsed.
+	//
+	// 'c' is the character that ended the tag: '/' or '?' for a tag that closes itself
+	// ("<a/>", "<?xml ... ?>"), anything else for a tag that is followed by element data.
+	// 'element' is the zero-terminated element name, 'scan' the read position right after
+	// the tag, and 'argc'/'argv' the attributes collected so far.
+	//
+	// For a tag with data, the data up to the next '<' is cleaned up in place: every line
+	// break together with the whitespace following it becomes a single space, and entity
+	// and character references are decoded. Text starting with '&' that is not a
+	// well-formed reference is kept literally.
+	//
+	// Returns the position at which parsing continues, or NULL if parsing has to stop.
+	// 'isError' is set to true on entry; the close callback may change it.
+	char* processClose(char c, const char* element, char* scan, int32_t argc, const char** argv,
+	                   FastXml::Callback* iface, bool& isError)
+	{
+		AttributePairs attr(argc, argv);
+		isError = true; // by default, if we return null it's due to an error.
+		if(c == '/' || c == '?')
+		{
+			// the terminator may still be glued to the element name ("<br/>"); cut it off
+			char* slash = const_cast<char*>(static_cast<const char*>(strchr(element, c)));
+			if(slash)
+				*slash = 0;
+
+			if(c == '?' && strcmp(element, "xml") == 0)
+			{
+				if(!iface->processXmlDeclaration(attr, 0, mLineNo))
+					return NULL;
+			}
+			else
+			{
+				if(!iface->processElement(element, 0, attr, mLineNo))
+				{
+					mError = "User aborted the parsing process";
+					return NULL;
+				}
+
+				// the element opens and closes in one go
+				pushElement(element);
+
+				const char* close = popElement();
+
+				if(!iface->processClose(close, mStackIndex, isError))
+				{
+					return NULL;
+				}
+			}
+
+			if(!slash)
+				++scan;
+		}
+		else
+		{
+			scan = skipNextData(scan);
+			char* data = scan; // this is the data portion of the element
+			char* dest = scan; // write position; trails 'scan' once something has been shortened
+			while(*scan && *scan != '<')
+			{
+				if(getCharType(scan) == CT_END_OF_LINE)
+				{
+					// Count the line here, before the character is overwritten;
+					// skipNextData counts any further '\n' in the run it skips.
+					if(*scan == '\n')
+						mLineNo++;
+					*dest++ = ' '; // replace the linefeed with a space...
+					scan = skipNextData(scan + 1);
+				}
+				else if(*scan == '&')
+				{
+					char decoded = 0;
+					const uint32_t consumed = decodeReference(scan, decoded);
+					if(consumed)
+					{
+						*dest++ = decoded;
+						scan += consumed;
+					}
+					else
+					{
+						// not a reference we understand: keep the '&' and move on
+						*dest++ = *scan++;
+					}
+				}
+				else
+				{
+					*dest++ = *scan++;
+				}
+			}
+
+			if(*scan == '<')
+			{
+				if(scan[1] != '/')
+				{
+					PX_ASSERT(mOpenCount > 0);
+					mOpenCount--;
+				}
+
+				// terminate the data; 'dest' equals 'scan' when nothing was shortened
+				*dest = 0;
+
+				scan++; // skip it..
+
+				if(*data == 0)
+					data = 0;
+
+				if(!iface->processElement(element, data, attr, mLineNo))
+				{
+					mError = "User aborted the parsing process";
+					return 0;
+				}
+
+				pushElement(element);
+
+				// check for the comment use case...
+				if(scan[0] == '!' && scan[1] == '-' && scan[2] == '-')
+				{
+					scan += 3;
+					while(*scan && *scan == ' ')
+						++scan;
+
+					char* comment = scan;
+					char* comment_end = strstr(scan, "-->");
+					if(comment_end)
+					{
+						*comment_end = 0;
+						scan = comment_end + 3;
+						if(!iface->processComment(comment))
+						{
+							mError = "User aborted the parsing process";
+							return 0;
+						}
+					}
+				}
+				else if(*scan == '/')
+				{
+					scan = processClose(scan, iface, isError);
+					if(scan == NULL)
+					{
+						return NULL;
+					}
+				}
+			}
+			else
+			{
+				mError = "Data portion of an element wasn't terminated properly";
+				return NULL;
+			}
+		}
+
+		// running low on buffered open tags: fetch more input
+		if(mOpenCount < MIN_CLOSE_COUNT)
+		{
+			scan = readData(scan);
+		}
+
+		return scan;
+	}
+
+	// Handles a closing tag. 'scan' points at the '/' of "</name>".
+	//
+	// Pops the innermost open element, checks that the tag closes that element and
+	// notifies the callback. Returns the position right after the tag, or NULL if the
+	// tags do not match (mError is set) or the callback asked to stop (the stream is
+	// then repositioned behind the tag). 'isError' is set to true on entry so that it
+	// has a defined value on every path; the callback may change it.
+	char* processClose(char* scan, FastXml::Callback* iface, bool& isError)
+	{
+		isError = true;
+
+		const char* start = popElement(), *close = start;
+		bool reachedEnd = false; // true if the buffer ended before the closing '>'
+		if(scan[1] != '>')
+		{
+			scan++;
+			close = scan;
+			while(*scan && *scan != '>')
+				scan++;
+			reachedEnd = (*scan == 0);
+			*scan = 0;
+		}
+
+		// 'start' is NULL when a closing tag shows up without any open element
+		if(!start || 0 != strcmp(start, close))
+		{
+			mError = "Open and closing tags do not match";
+			return 0;
+		}
+
+		if(!iface->processClose(close, mStackIndex, isError))
+		{
+			// we need to set the read pointer!
+			uint32_t offset = uint32_t(mReadBufferEnd - scan) - 1;
+			uint32_t readLoc = mLastReadLoc - offset;
+			mFileBuf->seek(readLoc);
+			return NULL;
+		}
+
+		// never step over the terminating zero of the read buffer
+		if(!reachedEnd)
+			++scan;
+
+		return scan;
+	}
+
+	// Entry point of the FastXml interface: parses the document delivered by 'fileBuf'
+	// and reports it to the callback given at creation. Returns the result of the parse.
+	virtual bool processXml(physx::PxInputData& fileBuf, bool streamFromMemory)
+	{
+		// Drop whatever an earlier run left behind. This also clears mFileBuf, so the
+		// new input may only be adopted afterwards.
+		releaseMemory();
+
+		mStreamFromMemory = streamFromMemory;
+		mFileBuf = &fileBuf;
+
+		return processXml(mCallback);
+	}
+
+	// Refills the read buffer once the pending data has been processed.
+	//
+	// 'scan' is the current read position inside the buffer, or NULL for the very first
+	// read. Element names on the stack that still point into the buffer are copied to
+	// their own allocations first, because the buffer contents are about to move.
+	//
+	// If mStreamFromMemory is false the whole remaining input is read by the first call
+	// and later calls return 'scan' unchanged. Otherwise the unread tail is moved to the
+	// front of the buffer and more input is appended, growing the buffer until it holds
+	// at least MIN_CLOSE_COUNT open tags or the input is exhausted.
+	//
+	// Returns the position to continue parsing at. The returned text is always
+	// zero-terminated, even when nothing could be read.
+	char* readData(char* scan)
+	{
+		for(uint32_t i = 0; i < (mStackIndex + 1); i++)
+		{
+			if(!mStackAllocated[i])
+			{
+				const char* text = mStack[i];
+				if(text)
+				{
+					uint32_t tlen = uint32_t(strlen(text));
+					mStack[i] = static_cast<const char*>(mCallback->allocate(tlen + 1));
+					PxMemCopy(const_cast<void*>(static_cast<const void*>(mStack[i])), text, tlen + 1);
+					mStackAllocated[i] = true;
+				}
+			}
+		}
+
+		if(!mStreamFromMemory)
+		{
+			if(scan == NULL)
+			{
+				// first read: size the buffer for everything that is left in the input
+				uint32_t seekLoc = mFileBuf->tell();
+				mReadBufferSize = (mFileBuf->getLength() - seekLoc);
+			}
+			else
+			{
+				return scan;
+			}
+		}
+
+		if(mReadBuffer == NULL)
+		{
+			mReadBuffer = static_cast<char*>(mCallback->allocate(mReadBufferSize + 1));
+		}
+		uint32_t offset = 0;
+		uint32_t readLen = mReadBufferSize;
+
+		if(scan)
+		{
+			offset = uint32_t(scan - mReadBuffer);
+			uint32_t copyLen = mReadBufferSize - offset;
+			if(copyLen)
+			{
+				PX_ASSERT(scan >= mReadBuffer);
+				memmove(mReadBuffer, scan, copyLen);
+				mReadBuffer[copyLen] = 0;
+				readLen = mReadBufferSize - copyLen;
+			}
+			offset = copyLen;
+		}
+
+		// Terminate what we have before reading: if the input delivers nothing (empty or
+		// exhausted), the caller must see the end of the text here rather than
+		// uninitialized or stale buffer contents.
+		mReadBuffer[offset] = 0;
+		mReadBufferEnd = &mReadBuffer[offset];
+
+		uint32_t readCount = mFileBuf->read(&mReadBuffer[offset], readLen);
+
+		while(readCount > 0)
+		{
+
+			mReadBuffer[readCount + offset] = 0; // end of string terminator...
+			mReadBufferEnd = &mReadBuffer[readCount + offset];
+
+			// count the open tags in the part that was just read
+			const char* scan_ = &mReadBuffer[offset];
+			while(*scan_)
+			{
+				if(*scan_ == '<' && scan_[1] != '/')
+				{
+					mOpenCount++;
+				}
+				scan_++;
+			}
+
+			if(mOpenCount < MIN_CLOSE_COUNT)
+			{
+				// not enough buffered yet: double the buffer and read on
+				uint32_t oldSize = uint32_t(mReadBufferEnd - mReadBuffer);
+				mReadBufferSize = mReadBufferSize * 2;
+				char* oldReadBuffer = mReadBuffer;
+				mReadBuffer = static_cast<char*>(mCallback->allocate(mReadBufferSize + 1));
+				PxMemCopy(mReadBuffer, oldReadBuffer, oldSize);
+				// The copy does not include the terminator and the old end pointer refers
+				// to memory that is about to be freed; fix both in case nothing more
+				// can be read.
+				mReadBuffer[oldSize] = 0;
+				mReadBufferEnd = &mReadBuffer[oldSize];
+				mCallback->deallocate(oldReadBuffer);
+				offset = oldSize;
+				uint32_t readSize = mReadBufferSize - oldSize;
+				readCount = mFileBuf->read(&mReadBuffer[offset], readSize);
+				if(readCount == 0)
+					break;
+			}
+			else
+			{
+				break;
+			}
+		}
+		mLastReadLoc = mFileBuf->tell();
+
+		return mReadBuffer;
+	}
+
+	// Called when one of the processClose overloads returned NULL. Records the generic
+	// abort message unless a more specific error is already pending, and returns the
+	// value processXml has to report: false for an error, true for an intentional stop.
+	bool finishStoppedParse(bool isError)
+	{
+		if(isError && !mError)
+		{
+			mError = "User aborted the parsing process";
+		}
+		return !isError;
+	}
+
+	// Main parse loop: walks the buffered text, splitting it in place into element
+	// names, attributes and data, and reports everything to 'iface'.
+	//
+	// Returns true if the document was processed completely or the callback stopped the
+	// parse without flagging an error. Returns false on error; getError() then tells why.
+	bool processXml(FastXml::Callback* iface)
+	{
+		bool ret = true;
+
+		const int MAX_ATTRIBUTE = 2048; // can't imagine having more than 2,048 attributes in a single element right?
+
+		mLineNo = 1;
+
+		char* element, *scan = readData(0);
+
+		while(*scan)
+		{
+
+			scan = skipNextData(scan);
+
+			if(*scan == 0)
+				break;
+
+			if(*scan == '<')
+			{
+
+				if(scan[1] != '/')
+				{
+					PX_ASSERT(mOpenCount > 0);
+					mOpenCount--;
+				}
+				scan++;
+
+				if(*scan == '?') // Allow xml declarations
+				{
+					scan++;
+				}
+				else if(scan[0] == '!' && scan[1] == '-' && scan[2] == '-')
+				{
+					scan += 3;
+					while(*scan && *scan == ' ')
+						scan++;
+					char* comment = scan, *comment_end = strstr(scan, "-->");
+					if(comment_end)
+					{
+						*comment_end = 0;
+						scan = comment_end + 3;
+						if(!iface->processComment(comment))
+						{
+							mError = "User aborted the parsing process";
+							return false;
+						}
+					}
+					continue;
+				}
+				else if(scan[0] == '!') // Allow doctype
+				{
+					scan++;
+
+					// DOCTYPE syntax differs from usual XML so we parse it here
+
+					// Read DOCTYPE. The keyword has to start right here; finding it
+					// somewhere later in the buffer does not make this a DOCTYPE.
+					const char* tag = "DOCTYPE";
+					if(strncmp(scan, tag, strlen(tag)) != 0)
+					{
+						mError = "Invalid DOCTYPE";
+						return false;
+					}
+
+					scan += strlen(tag);
+
+					// Skip whites
+					while(CT_SOFT == getCharType(scan))
+						++scan;
+
+					// Read rootElement
+					const char* rootElement = scan;
+					while(CT_DATA == getCharType(scan))
+						++scan;
+
+					char* endRootElement = scan;
+
+					// TODO: read remaining fields (fpi, uri, etc.)
+					// Skip up to and including the end of the declaration, but never
+					// beyond the end of the buffered text.
+					while(*scan && CT_END_OF_ELEMENT != getCharType(scan))
+						++scan;
+					if(*scan)
+						++scan;
+
+					*endRootElement = 0;
+
+					if(!iface->processDoctype(rootElement, 0, 0, 0))
+					{
+						mError = "User aborted the parsing process";
+						return false;
+					}
+
+					continue; // Restart loop
+				}
+			}
+
+			if(*scan == '/')
+			{
+				bool isError = true;
+				scan = processClose(scan, iface, isError);
+				if(!scan)
+				{
+					return finishStoppedParse(isError);
+				}
+			}
+			else
+			{
+				if(*scan == '?')
+					scan++;
+				element = scan;
+				int32_t argc = 0;
+				const char* argv[MAX_ATTRIBUTE];
+				bool close;
+				scan = nextSoftOrClose(scan, close);
+				if(close)
+				{
+					// tag without attributes; it may still end in "/>" or "?>"
+					char c = *(scan - 1);
+					if(c != '?' && c != '/')
+					{
+						c = '>';
+					}
+					*scan++ = 0;
+					bool isError = true;
+					scan = processClose(c, element, scan, argc, argv, iface, isError);
+					if(!scan)
+					{
+						return finishStoppedParse(isError);
+					}
+				}
+				else
+				{
+					if(*scan == 0)
+					{
+						return ret;
+					}
+
+					*scan = 0; // place a zero byte to indicate the end of the element name...
+					scan++;
+
+					while(*scan)
+					{
+						scan = skipNextData(scan); // advance past any soft seperators (tab or space)
+
+						if(getCharType(scan) == CT_END_OF_ELEMENT)
+						{
+							char c = *scan++;
+							if('?' == c)
+							{
+								if('>' != *scan) //?>
+								{
+									PX_ASSERT(0);
+									mError = "Expected '>' after '?'";
+									return false;
+								}
+
+								scan++;
+							}
+							bool isError = true;
+							scan = processClose(c, element, scan, argc, argv, iface, isError);
+							if(!scan)
+							{
+								return finishStoppedParse(isError);
+							}
+							break;
+						}
+						else
+						{
+							// each attribute needs two slots: one for the key, one for the value
+							if(argc + 2 > MAX_ATTRIBUTE)
+							{
+								mError = "encountered too many attributes";
+								return false;
+							}
+							argv[argc] = scan;
+							scan = nextSep(scan); // scan up to a space, or an equal
+							if(*scan)
+							{
+								if(*scan != '=')
+								{
+									*scan = 0;
+									scan++;
+									while(*scan && *scan != '=')
+										scan++;
+									if(*scan == '=')
+										scan++;
+								}
+								else
+								{
+									*scan = 0;
+									scan++;
+								}
+
+								if(*scan) // if not eof...
+								{
+									scan = skipNextData(scan);
+									if(*scan == '"')
+									{
+										scan++;
+										argc++;
+										argv[argc] = scan;
+										argc++;
+										while(*scan && *scan != '"')
+											scan++;
+										if(*scan == '"')
+										{
+											*scan = 0;
+											scan++;
+										}
+										else
+										{
+											mError = "Failed to find closing quote for attribute";
+											return false;
+										}
+									}
+									else
+									{
+										// PH: let's try to have a more graceful fallback
+										// The value is not quoted: ignore this attribute and
+										// skip to the end of the tag. argc has not been
+										// advanced for it, so the attributes accepted so far
+										// stay intact.
+										while(*scan != '/' && *scan != '>' && *scan != 0)
+											scan++;
+									}
+								}
+							} // if( *scan )
+						}     // else: attribute
+					}         // while( *scan ): attributes of this tag
+				}             // else: tag has attributes
+			}                 // else: open tag
+		}                     // while( *scan )
+
+		if(mStackIndex)
+		{
+			mError = "Invalid file format";
+			return false;
+		}
+
+		return ret;
+	}
+
+	const char* getError(int32_t& lineno)
+	{
+		const char* ret = mError;
+		lineno = mLineNo;
+		mError = 0;
+		return ret;
+	}
+
+	// Destroys the parser and returns its memory to the callback it was created with.
+	virtual void release(void)
+	{
+		// The destructor must not be relied on to leave members readable, so fetch
+		// the allocator interface before running it.
+		Callback* const owner = mCallback;
+		this->~MyFastXml();      // explicitly invoke the destructor for this class
+		owner->deallocate(this); // now free up the memory associated with it.
+	}
+
+  private:
+	// Frees everything the parser still owns. Private: instances are destroyed
+	// through release().
+	virtual ~MyFastXml(void)
+	{
+		releaseMemory();
+	}
+
+	// Returns the parser to its idle state: frees the read buffer and every element
+	// name the parser copied for the element stack, and clears the parsing state.
+	PX_INLINE void releaseMemory(void)
+	{
+		mFileBuf = NULL;
+		mCallback->deallocate(mReadBuffer);
+		mReadBuffer = NULL;
+		mReadBufferEnd = NULL;
+		mOpenCount = 0;
+		mLastReadLoc = 0;
+		mError = NULL;
+
+		// Walk the whole stack, not just the part below mStackIndex: a copied name can
+		// remain in a slot above the current depth, and an aborted parse can leave the
+		// stack at any depth.
+		for(uint32_t i = 0; i < uint32_t(MAX_STACK + 1); i++)
+		{
+			if(mStackAllocated[i])
+			{
+				mCallback->deallocate(const_cast<void*>(static_cast<const void*>(mStack[i])));
+				mStackAllocated[i] = false;
+			}
+			mStack[i] = NULL;
+		}
+		mStackIndex = 0;
+	}
+
+	// Classifies the character at 'scan' using the lookup table built in the constructor.
+	PX_INLINE CharType getCharType(char* scan) const
+	{
+		const uint8_t code = uint8_t(*scan);
+		return mTypes[code];
+	}
+
+	// Advances to the next soft separator (space or tab), '>' or end of text, whichever
+	// comes first. 'close' tells whether the scan stopped at a '>'.
+	PX_INLINE char* nextSoftOrClose(char* scan, bool& close)
+	{
+		for(; *scan; ++scan)
+		{
+			if(*scan == '>' || getCharType(scan) == CT_SOFT)
+				break;
+		}
+		close = (*scan == '>');
+		return scan;
+	}
+
+	// Advances to the next soft separator (space or tab), '=' or end of text, whichever
+	// comes first.
+	PX_INLINE char* nextSep(char* scan)
+	{
+		for(; *scan; ++scan)
+		{
+			if(*scan == '=' || getCharType(scan) == CT_SOFT)
+				break;
+		}
+		return scan;
+	}
+
+	// Skips a run of soft separators and line breaks, bumping the line counter for every
+	// '\n' that is passed. Returns the first character that is neither, or the end of text.
+	PX_INLINE char* skipNextData(char* scan)
+	{
+		for(; *scan; ++scan)
+		{
+			const CharType type = getCharType(scan);
+			if(type != CT_SOFT && type != CT_END_OF_LINE)
+				break;
+
+			if(*scan == '\n')
+				mLineNo++;
+		}
+		return scan;
+	}
+
+	// Puts 'element' on top of the open-element stack. The pointer itself is stored, the
+	// name is not copied here. A copy left over in the slot from earlier use is freed
+	// first. When the stack is full the element is dropped (asserts in debug builds).
+	void pushElement(const char* element)
+	{
+		PX_ASSERT(mStackIndex < uint32_t(MAX_STACK));
+		if(mStackIndex >= uint32_t(MAX_STACK))
+			return;
+
+		const uint32_t slot = mStackIndex++;
+		if(mStackAllocated[slot])
+		{
+			mCallback->deallocate(const_cast<void*>(static_cast<const void*>(mStack[slot])));
+			mStackAllocated[slot] = false;
+		}
+		mStack[slot] = element;
+	}
+
+	// Takes the innermost open element off the stack and returns its name, or NULL if
+	// the stack is empty (asserts in debug builds). The slot just above the top is
+	// cleared on the way, freeing a copied name that may still sit there. The returned
+	// name stays in its slot, so it remains valid until that slot is reused or cleared.
+	const char* popElement(void)
+	{
+		PX_ASSERT(mStackIndex > 0);
+
+		const uint32_t stale = mStackIndex;
+		if(mStackAllocated[stale])
+		{
+			mCallback->deallocate(const_cast<void*>(static_cast<const void*>(mStack[stale])));
+			mStackAllocated[stale] = false;
+		}
+		mStack[stale] = NULL;
+
+		if(mStackIndex == 0)
+			return NULL;
+
+		--mStackIndex;
+		return mStack[mStackIndex];
+	}
+
+	static const int MAX_STACK = 2048;
+
+	CharType mTypes[256];
+
+	physx::PxInputData* mFileBuf;
+
+	char* mReadBuffer;
+	char* mReadBufferEnd;
+
+	uint32_t mOpenCount;
+	uint32_t mReadBufferSize;
+	uint32_t mLastReadLoc;
+
+	int32_t mLineNo;
+	const char* mError;
+	uint32_t mStackIndex;
+	const char* mStack[MAX_STACK + 1];
+	bool mStreamFromMemory;
+	bool mStackAllocated[MAX_STACK + 1];
+	Callback* mCallback;
+};
+}
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Creates a parser that reports to 'iface'. The parser object itself is allocated
+// through iface->allocate(); NULL is returned if that allocation fails. Dispose of the
+// parser with FastXml::release().
+FastXml* createFastXml(FastXml::Callback* iface)
+{
+	void* const storage = iface->allocate(sizeof(MyFastXml));
+	if(storage == NULL)
+		return NULL;
+
+	// construct in place inside the memory obtained from the callback
+	return new (storage) MyFastXml(iface);
+}
+}
+}
diff --git a/PxShared/src/filebuf/include/PsAsciiConversion.h b/PxShared/src/filebuf/include/PsAsciiConversion.h
new file mode 100644
index 0000000..7c4fa3a
--- /dev/null
+++ b/PxShared/src/filebuf/include/PsAsciiConversion.h
@@ -0,0 +1,99 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PSFILEBUFFER_PSASCIICONVERSION_H
+#define PSFILEBUFFER_PSASCIICONVERSION_H
+
+/*!
+\file
+\brief The PxAsc namespace contains string/value conversion helper functions
+*/
+
+#include "PxMath.h"
+#include "PsString.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <float.h>
+
+namespace physx
+{
+namespace general_string_parsing2
+{
+namespace PxAsc
+{
+
+const uint32_t PxF32StrLen = 24; // presumably a suggested buffer size for a float written by valueToStr — TODO confirm
+const uint32_t PxF64StrLen = 32; // presumably a suggested buffer size for a double written by valueToStr — TODO confirm
+const uint32_t IntStrLen = 32; // presumably a suggested buffer size for an integer written by valueToStr — TODO confirm
+
+PX_INLINE bool isWhiteSpace(char c); // true for space, tab, CR, LF and ','
+PX_INLINE const char * skipNonWhiteSpace(const char *scan); // first separator at/after scan, or NULL at end of string
+PX_INLINE const char * skipWhiteSpace(const char *scan); // first non-separator at/after scan, or NULL at end of string
+
+//////////////////////////
+// str to value functions
+// Each parses the first token of 'str' (leading separators are skipped).
+// If 'endptr' is non-NULL it receives the first separator after the token,
+// or NULL when the token runs to the end of the string.
+//////////////////////////
+PX_INLINE bool strToBool(const char *str, const char **endptr);
+PX_INLINE int8_t  strToI8(const char *str, const char **endptr);
+PX_INLINE int16_t strToI16(const char *str, const char **endptr);
+PX_INLINE int32_t strToI32(const char *str, const char **endptr);
+PX_INLINE int64_t strToI64(const char *str, const char **endptr);
+PX_INLINE uint8_t  strToU8(const char *str, const char **endptr);
+PX_INLINE uint16_t strToU16(const char *str, const char **endptr);
+PX_INLINE uint32_t strToU32(const char *str, const char **endptr);
+PX_INLINE uint64_t strToU64(const char *str, const char **endptr);
+PX_INLINE float strToF32(const char *str, const char **endptr);
+PX_INLINE double strToF64(const char *str, const char **endptr);
+PX_INLINE void strToF32s(float *v,uint32_t count,const char *str, const char**endptr); // parses up to 'count' floats into v; a leading '(' is skipped
+
+
+//////////////////////////
+// value to str functions
+// Each formats 'val' into 'buf' (capacity 'n') and returns 'buf'.
+//////////////////////////
+PX_INLINE const char * valueToStr( bool val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( int8_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( int16_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( int32_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( int64_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( uint8_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( uint16_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( uint32_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( uint64_t val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( float val, char *buf, uint32_t n );
+PX_INLINE const char * valueToStr( double val, char *buf, uint32_t n );
+
+#include "PsAsciiConversion.inl" // definitions of the functions declared above, included inside namespace PxAsc
+
+} // end of namespace PxAsc
+} // end of namespace general_string_parsing2
+using namespace general_string_parsing2; // makes PxAsc visible directly as physx::PxAsc
+} // end of namespace physx
+
+
+#endif // PSFILEBUFFER_PSASCIICONVERSION_H
diff --git a/PxShared/src/filebuf/include/PsAsciiConversion.inl b/PxShared/src/filebuf/include/PsAsciiConversion.inl
new file mode 100644
index 0000000..9e1ba14
--- /dev/null
+++ b/PxShared/src/filebuf/include/PsAsciiConversion.inl
@@ -0,0 +1,566 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+/*!
+\file
+\brief The PxAsc namespace contains string/value conversion helper functions
+*/
+
+#include <ctype.h>
+
+// Returns true for the characters treated as token separators by this file:
+// space (32), tab (9), carriage return (13), line feed (10) and comma.
+PX_INLINE bool isWhiteSpace(char c)
+{
+	switch ( c )
+	{
+		case 32:
+		case 9:
+		case 13:
+		case 10:
+		case ',':
+			return true;
+		default:
+			return false;
+	}
+}
+
+// Returns a pointer to the first separator (see isWhiteSpace) at or after
+// 'scan', or NULL when the end of the string is reached first.
+// A NULL 'scan' yields NULL: skipWhiteSpace() reports end-of-string as NULL and
+// callers in this file pass its result straight in.
+PX_INLINE const char * skipNonWhiteSpace(const char *scan)
+{
+	if ( scan == NULL ) return NULL;
+	while ( *scan && !isWhiteSpace(*scan) ) scan++;
+	return *scan ? scan : NULL;
+}
+// Returns a pointer to the first non-separator character (see isWhiteSpace)
+// at or after 'scan', or NULL when only separators remain before the end of
+// the string.
+// A NULL 'scan' yields NULL, so the NULL end-of-string marker produced by the
+// skip functions can be fed back in safely.
+PX_INLINE const char * skipWhiteSpace(const char *scan)
+{
+	if ( scan == NULL ) return NULL;
+	while ( *scan && isWhiteSpace(*scan) ) scan++;
+	return *scan ? scan : NULL;
+}
+
+// Parses a floating point number from the first token of pString.
+// The extent of the number (sign, digits, fraction, exponent) is measured by
+// hand, the characters are copied into a small NUL-terminated stack buffer and
+// that buffer is handed to strtod(). Tokens too long for the buffer are parsed
+// by calling strtod() on the original string.
+// Returns 0.0 when pString is NULL or contains nothing but separators.
+static double strtod_fast(const char * pString)
+{
+	if( pString == NULL )
+		return 0.0;
+
+	//---
+	// Find the start of the number; skipWhiteSpace() returns NULL when the
+	// string holds no token at all
+	const char* pNumberStart = skipWhiteSpace(pString);
+	if( pNumberStart == NULL )
+		return 0.0;
+
+	//---
+	// Find the end of the number
+	const char* pNumberEnd = pNumberStart;
+
+	// skip optional sign
+	if( *pNumberEnd == '-' || *pNumberEnd == '+' )
+		++pNumberEnd;
+
+	// skip optional digits
+	// (isdigit() is only defined for unsigned char values and EOF, hence the casts)
+	while( isdigit((unsigned char)*pNumberEnd) )
+		++pNumberEnd;
+
+	// skip optional decimal and digits
+	if( *pNumberEnd == '.' )
+	{
+		++pNumberEnd;
+
+		while( isdigit((unsigned char)*pNumberEnd) )
+			++pNumberEnd;
+	}
+
+	// skip optional exponent
+	if(    *pNumberEnd == 'd'
+		|| *pNumberEnd == 'D'
+		|| *pNumberEnd == 'e'
+		|| *pNumberEnd == 'E' )
+	{
+		++pNumberEnd;
+
+		if( *pNumberEnd == '-' || *pNumberEnd == '+' )
+			++pNumberEnd;
+
+		while( isdigit((unsigned char)*pNumberEnd) )
+			++pNumberEnd;
+	}
+
+	//---
+	// Process the string
+	const uint32_t numberLen = (const uint32_t)(pNumberEnd-pNumberStart);
+	char buffer[32];
+	if( numberLen+1 < sizeof(buffer)/sizeof(buffer[0]) )
+	{
+		// copy into buffer and terminate with NUL before calling the
+		// standard function
+		memcpy( buffer, pNumberStart, numberLen*sizeof(buffer[0]) );
+		buffer[numberLen] = '\0';
+		return strtod( buffer, NULL );
+	}
+
+	// buffer was too small so just call the standard function on the
+	// source input to get a proper result
+	return strtod( pString, NULL );
+}
+
+// Single-precision convenience wrapper: parses with strtod_fast() and narrows
+// the result to float.
+static float strtof_fast(const char* pString)
+{
+	const double parsed = strtod_fast(pString);
+	return (float)parsed;
+}
+
+
+//////////////////////////
+// str to value functions
+//////////////////////////
+// Parses the first token of 'str' as a boolean.
+// The token is compared case-insensitively, over its own length, against
+// "true" and "1"; anything else yields false.
+// \param str    text to parse; NULL or separator-only text yields false
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string (or there
+//               is no token)
+PX_INLINE bool strToBool(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return false;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// token length is measured from 'begin', not 'str', so leading
+	// separators are not counted twice
+	const size_t len = end ? (size_t)(end - begin) : strlen(begin);
+
+	const bool ret = physx::shdfnd::strnicmp(begin,"true", len) == 0 || physx::shdfnd::strnicmp(begin,"1", len) == 0;
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+// Parses the first token of 'str' as an int8_t.
+// The symbolic names INT8_MIN / INT8_MAX / PX_MIN_I8 / PX_MAX_I8 (compared
+// over the token's length) map to the type limits; any other token goes
+// through strtol() with base 0 and is narrowed by a cast.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE int8_t  strToI8(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// measured from 'begin', not 'str', so leading separators are not counted
+	const size_t len = end ? (size_t)(end - begin) : strlen(begin);
+
+	int8_t ret;
+	if( strncmp(begin, "INT8_MIN", len) == 0)
+		ret = INT8_MIN;
+	else if( strncmp(begin, "INT8_MAX", len) == 0)
+		ret = INT8_MAX;
+	else if( strncmp(begin, "PX_MIN_I8", len) == 0)
+		ret = INT8_MIN;
+	else if( strncmp(begin, "PX_MAX_I8", len) == 0)
+		ret = INT8_MAX;
+	else
+		ret = (int8_t)strtol(begin, 0, 0); // FIXME: out-of-range values are not detected, only narrowed
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+// Parses the first token of 'str' as an int16_t.
+// The symbolic names INT16_MIN / INT16_MAX / PX_MIN_I16 / PX_MAX_I16 (compared
+// over the token's length) map to the type limits; any other token goes
+// through strtol() with base 0 and is narrowed by a cast.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE int16_t strToI16(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// measured from 'begin', not 'str', so leading separators are not counted
+	const size_t len = end ? (size_t)(end - begin) : strlen(begin);
+
+	int16_t ret;
+	if( strncmp(begin, "INT16_MIN", len) == 0)
+		ret = INT16_MIN;
+	else if( strncmp(begin, "INT16_MAX", len) == 0)
+		ret = INT16_MAX;
+	else if( strncmp(begin, "PX_MIN_I16", len) == 0)
+		ret = INT16_MIN;
+	else if( strncmp(begin, "PX_MAX_I16", len) == 0)
+		ret = INT16_MAX;
+	else
+		ret = (int16_t)strtol(begin, 0, 0); // FIXME: out-of-range values are not detected, only narrowed
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+// Parses the first token of 'str' as an int32_t.
+// The symbolic names INT32_MIN / INT32_MAX / PX_MIN_I32 / PX_MAX_I32 (compared
+// over the token's length) map to the type limits; any other token goes
+// through strtol() with base 0 and is converted by a cast.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE int32_t strToI32(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// measured from 'begin', not 'str', so leading separators are not counted
+	const size_t len = end ? (size_t)(end - begin) : strlen(begin);
+
+	int32_t ret;
+	if( strncmp(begin, "INT32_MIN", len) == 0)
+		ret = INT32_MIN;
+	else if( strncmp(begin, "INT32_MAX", len) == 0)
+		ret = INT32_MAX;
+	else if( strncmp(begin, "PX_MIN_I32", len) == 0)
+		ret = INT32_MIN;
+	else if( strncmp(begin, "PX_MAX_I32", len) == 0)
+		ret = INT32_MAX;
+	else
+		ret = (int32_t)strtol(begin, 0, 0); // FIXME: out-of-range values are not detected
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+// Parses the first token of 'str' as a base-10 int64_t.
+// Unlike the narrower integer parsers in this file, no symbolic limit names
+// are recognised and the base is fixed at 10.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE int64_t strToI64(const char *str, const char **endptr)
+{
+	int64_t ret;
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL,
+		// which must not reach the C library conversion below
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	//FIXME
+#ifdef _WIN32 //NV_WINDOWS, NV_XBOX
+	ret = (int64_t)_strtoi64(begin,0,10);
+#else
+	ret = (int64_t)strtoll(begin,0,10);
+#endif
+
+	if( endptr )
+		*endptr = skipNonWhiteSpace(begin);
+
+	return ret;
+}
+
+// Parses the first token of 'str' as a uint8_t via strtoul() with base 0; the
+// result is narrowed by a cast, so out-of-range values are not detected.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE uint8_t  strToU8(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL,
+		// which must not reach strtoul()
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	const uint8_t ret = (uint8_t)strtoul(begin, 0, 0);
+
+	if( endptr )
+		*endptr = skipNonWhiteSpace(begin);
+
+	return ret;
+}
+
+// Parses the first token of 'str' as a uint16_t.
+// The symbolic names UINT16_MAX / PX_MAX_U16 (compared over the token's
+// length) map to the type maximum; any other token goes through strtoul()
+// with base 0 and is narrowed by a cast.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE uint16_t strToU16(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// measured from 'begin', not 'str', so leading separators are not counted
+	const size_t len = end ? (size_t)(end - begin) : strlen(begin);
+
+	uint16_t ret;
+	if( strncmp(begin, "UINT16_MAX", len) == 0)
+		ret = UINT16_MAX;
+	else if( strncmp(begin, "PX_MAX_U16", len) == 0)
+		ret = UINT16_MAX;
+	else
+		ret = (uint16_t)strtoul(begin,0,0);
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+// Parses the first token of 'str' as a uint32_t.
+// The symbolic names UINT32_MAX, PX_U32_MAX and PX_MAX_U32 (compared over the
+// token's length) map to the type maximum; any other token goes through
+// strtoul() with base 0 and is converted by a cast.
+// PX_MAX_U32 is accepted in addition to the historical PX_U32_MAX spelling so
+// that the name follows the same PX_MAX_* pattern as the sibling parsers.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE uint32_t strToU32(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// measured from 'begin', not 'str', so leading separators are not counted
+	const size_t len = end ? (size_t)(end - begin) : strlen(begin);
+
+	uint32_t ret;
+	if( strncmp(begin, "UINT32_MAX", len) == 0)
+		ret = UINT32_MAX;
+	else if( strncmp(begin, "PX_U32_MAX", len) == 0)
+		ret = UINT32_MAX;
+	else if( strncmp(begin, "PX_MAX_U32", len) == 0)
+		ret = UINT32_MAX;
+	else
+		ret = (uint32_t)strtoul(begin,0,0);
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+// Parses the first token of 'str' as a base-10 uint64_t.
+// No symbolic limit names are recognised and the base is fixed at 10.
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE uint64_t strToU64(const char *str, const char **endptr)
+{
+	uint64_t ret;
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL,
+		// which must not reach the C library conversion below
+		if( endptr )
+			*endptr = NULL;
+		return 0;
+	}
+
+	//FIXME
+#ifdef _WIN32 //NV_WINDOWS, NV_XBOX
+	ret = (uint64_t)_strtoui64(begin,0,10);
+#else
+	ret = (uint64_t)strtoull(begin,0,10);
+#endif
+
+	if( endptr )
+		*endptr = skipNonWhiteSpace(begin);
+
+	return ret;
+}
+
+#ifndef DEBUGGING_MISMATCHES
+#define DEBUGGING_MISMATCHES 0
+#endif
+
+// Parses the first token of 'str' as a float.
+// The symbolic names PX_MIN_F32 / NV_MIN_F32 map to -PX_MAX_F32 and
+// PX_MAX_F32 / NV_MAX_F32 map to PX_MAX_F32; the comparison covers the shorter
+// of the token and the name. Any other token is parsed by strtof_fast().
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE float strToF32(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return 0.0f;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// measured from 'begin', not 'str', so leading separators are not counted
+	const uint32_t len = (uint32_t)(end ? (size_t)(end - begin) : strlen(begin));
+
+	const char F32_MIN[] = "NV_MIN_F32";
+	const char F32_MAX[] = "NV_MAX_F32";
+	const char PX_F32_MIN[] = "PX_MIN_F32";
+	const char PX_F32_MAX[] = "PX_MAX_F32";
+
+	float ret;
+	if( strncmp(begin, PX_F32_MIN, physx::PxMin(len, (uint32_t)(sizeof(PX_F32_MIN) - 1))) == 0)
+		ret = -PX_MAX_F32;
+	else if( strncmp(begin, PX_F32_MAX, physx::PxMin(len, (uint32_t)(sizeof(PX_F32_MAX) - 1))) == 0)
+		ret = PX_MAX_F32;
+	else if( strncmp(begin, F32_MIN, physx::PxMin(len, (uint32_t)(sizeof(F32_MIN) - 1))) == 0)
+		ret = -PX_MAX_F32;
+	else if( strncmp(begin, F32_MAX, physx::PxMin(len, (uint32_t)(sizeof(F32_MAX) - 1))) == 0)
+		ret = PX_MAX_F32;
+	else
+	{
+		ret = (float)strtof_fast(begin);
+	}
+
+#if DEBUGGING_MISMATCHES
+	// optional cross-check of the fast path against the C library
+	float testRet = (float)atof(begin);
+	if( ret != testRet )
+	{
+		PX_ASSERT(0 && "Inaccurate float string");
+	}
+#endif
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+
+// Parses the first token of 'str' as a double.
+// The symbolic names PX_MIN_F64 / NV_MIN_F64 map to -PX_MAX_F64 and
+// PX_MAX_F64 / NV_MAX_F64 map to PX_MAX_F64 (mirroring the name set accepted
+// by strToF32); the comparison covers the shorter of the token and the name.
+// Any other token is parsed by strtod_fast().
+// \param str    text to parse; NULL or separator-only text yields 0
+// \param endptr optional; receives the first separator after the token, or
+//               NULL when the token runs to the end of the string
+PX_INLINE double strToF64(const char *str, const char **endptr)
+{
+	const char *begin = str ? skipWhiteSpace(str) : NULL;
+	if( !begin )
+	{
+		// no token at all: skipWhiteSpace() signals end-of-string with NULL
+		if( endptr )
+			*endptr = NULL;
+		return 0.0;
+	}
+
+	const char *end = skipNonWhiteSpace(begin);
+	// measured from 'begin', not 'str', so leading separators are not counted
+	const uint32_t len = (uint32_t)(end ? (size_t)(end - begin) : strlen(begin));
+
+	// The first pair used to read "PX_MIN_F364" / "PX_MAX_F64": a typo plus a
+	// duplicate of the PX_ pair below. They now carry the NV_ spellings, as in
+	// strToF32.
+	const char F64_MIN[] = "NV_MIN_F64";
+	const char F64_MAX[] = "NV_MAX_F64";
+	const char PX_F64_MIN[] = "PX_MIN_F64";
+	const char PX_F64_MAX[] = "PX_MAX_F64";
+
+	double ret;
+	if( strncmp(begin, F64_MIN, physx::PxMin(len, (uint32_t)(sizeof(F64_MIN) - 1))) == 0)
+		ret = -PX_MAX_F64;
+	else if( strncmp(begin, F64_MAX, physx::PxMin(len, (uint32_t)(sizeof(F64_MAX) - 1))) == 0)
+		ret = PX_MAX_F64;
+	else if( strncmp(begin, PX_F64_MIN, physx::PxMin(len, (uint32_t)(sizeof(PX_F64_MIN) - 1))) == 0)
+		ret = -PX_MAX_F64;
+	else if( strncmp(begin, PX_F64_MAX, physx::PxMin(len, (uint32_t)(sizeof(PX_F64_MAX) - 1))) == 0)
+		ret = PX_MAX_F64;
+	else
+		ret = (double)strtod_fast(begin);
+
+	if( endptr )
+		*endptr = end;
+
+	return ret;
+}
+
+// Parses up to 'count' floats from 'str' into v[0..count-1].
+// Leading separators and one optional '(' are skipped. Parsing stops early
+// when the string runs out of tokens; the remaining entries of 'v' are left
+// untouched.
+// \param v      output array with room for 'count' floats
+// \param count  maximum number of values to parse
+// \param str    text to parse; may be NULL
+// \param endptr optional; receives the first separator after the last parsed
+//               token, or NULL when the end of the string was reached
+PX_INLINE void strToF32s(float *v,uint32_t count,const char *str, const char**endptr)
+{
+	const char *cursor = str ? skipWhiteSpace(str) : NULL;
+
+	if ( cursor && *cursor == '(' ) cursor++;
+	for (uint32_t i=0; i<count; i++)
+	{
+		// The skip functions and strToF32 report end-of-string as NULL, so the
+		// cursor has to be checked before it is dereferenced or passed on.
+		cursor = cursor ? skipWhiteSpace(cursor) : NULL;
+		if ( !cursor )
+			break;
+		v[i] = (float)strToF32(cursor, &cursor);
+	}
+
+	if( endptr )
+		*endptr = cursor ? skipNonWhiteSpace(cursor) : NULL;
+}
+
+
+//////////////////////////
+// value to str functions
+//////////////////////////
+// Writes "true" or "false" into buf (capacity n) and returns buf.
+PX_INLINE const char * valueToStr( bool val, char *buf, uint32_t n )
+{
+	const char *text = "false";
+	if( val )
+		text = "true";
+	physx::shdfnd::snprintf(buf, n,"%s",text);
+	return buf;
+}
+
+// Formats an int8_t into buf (capacity n) and returns buf.
+// The type limits are written symbolically as "INT8_MIN" / "INT8_MAX";
+// every other value is written in decimal.
+PX_INLINE const char * valueToStr( int8_t val, char *buf, uint32_t n )
+{
+	const char *symbolic = NULL;
+	if( val == INT8_MIN )
+		symbolic = "INT8_MIN";
+	else if( val == INT8_MAX )
+		symbolic = "INT8_MAX";
+
+	if( symbolic )
+		physx::shdfnd::snprintf(buf, n,"%s",symbolic );
+	else
+		physx::shdfnd::snprintf(buf, n, "%d", val);
+	return buf;
+}
+
+// Formats an int16_t into buf (capacity n) and returns buf.
+// The type limits are written symbolically as "INT16_MIN" / "INT16_MAX";
+// every other value is written in decimal.
+PX_INLINE const char * valueToStr( int16_t val, char *buf, uint32_t n )
+{
+	if( val == INT16_MIN )
+	{
+		physx::shdfnd::snprintf(buf, n,"%s","INT16_MIN" );
+		return buf;
+	}
+	if( val == INT16_MAX )
+	{
+		physx::shdfnd::snprintf(buf, n,"%s","INT16_MAX" );
+		return buf;
+	}
+	physx::shdfnd::snprintf(buf, n,"%d",val );
+	return buf;
+}
+
+// Formats an int32_t into buf (capacity n) and returns buf.
+// The type limits are written symbolically as "INT32_MIN" / "INT32_MAX";
+// every other value is written in decimal.
+PX_INLINE const char * valueToStr( int32_t val, char *buf, uint32_t n )
+{
+	switch( val )
+	{
+		case INT32_MIN:
+			physx::shdfnd::snprintf(buf, n,"%s","INT32_MIN" );
+			break;
+		case INT32_MAX:
+			physx::shdfnd::snprintf(buf, n,"%s","INT32_MAX" );
+			break;
+		default:
+			physx::shdfnd::snprintf(buf, n,"%d",val );
+			break;
+	}
+	return buf;
+}
+
+// Formats an int64_t in decimal into buf (capacity n) and returns buf.
+PX_INLINE const char * valueToStr( int64_t val, char *buf, uint32_t n )
+{
+	// %lld requires a 'long long' argument; int64_t is 'long' on LP64 targets,
+	// so convert explicitly instead of relying on the two having the same size
+	physx::shdfnd::snprintf(buf, n,"%lld",(long long)val );
+	return buf;
+}
+
+// Formats a uint8_t in decimal into buf (capacity n) and returns buf.
+PX_INLINE const char * valueToStr( uint8_t val, char *buf, uint32_t n )
+{
+	const char *const format = "%u";
+	physx::shdfnd::snprintf(buf, n, format, val);
+	return buf;
+}
+
+// Formats a uint16_t into buf (capacity n) and returns buf.
+// The type maximum is written symbolically as "UINT16_MAX"; every other
+// value is written in decimal.
+PX_INLINE const char * valueToStr( uint16_t val, char *buf, uint32_t n )
+{
+	if( val != UINT16_MAX )
+	{
+		physx::shdfnd::snprintf(buf, n,"%u",val );
+		return buf;
+	}
+	physx::shdfnd::snprintf(buf, n,"%s","UINT16_MAX" );
+	return buf;
+}
+
+// Formats a uint32_t into buf (capacity n) and returns buf.
+// The type maximum is written symbolically as "UINT32_MAX"; every other
+// value is written in decimal.
+PX_INLINE const char * valueToStr( uint32_t val, char *buf, uint32_t n )
+{
+	const bool isMax = ( val == UINT32_MAX );
+	if( !isMax )
+		physx::shdfnd::snprintf(buf, n,"%u",val );
+	else
+		physx::shdfnd::snprintf(buf, n,"%s","UINT32_MAX" );
+	return buf;
+}
+
+// Formats a uint64_t in decimal into buf (capacity n) and returns buf.
+PX_INLINE const char * valueToStr( uint64_t val, char *buf, uint32_t n )
+{
+	// %llu requires an 'unsigned long long' argument; uint64_t is
+	// 'unsigned long' on LP64 targets, so convert explicitly
+	physx::shdfnd::snprintf(buf, n,"%llu",(unsigned long long)val );
+	return buf;
+}
+
+// Formats a float into buf (capacity n) and returns buf.
+//  - non-finite values assert and are written as "0"
+//  - -PX_MAX_F32 / PX_MAX_F32 are written as "PX_MIN_F32" / "PX_MAX_F32"
+//  - 1, 0 and -1 are written as the short literals "1", "0", "-1"
+//  - everything else uses "%.9g"; when the result has a decimal point and no
+//    exponent, trailing zeros (and a then-dangling '.') are trimmed
+PX_INLINE const char * valueToStr( float val, char *buf, uint32_t n )
+{
+	if( !physx::PxIsFinite(val) )
+	{
+		PX_ASSERT( 0 && "invalid floating point" );
+		physx::shdfnd::snprintf(buf, n,"%s","0" );
+		return buf;
+	}
+	if( val == -PX_MAX_F32 )
+	{
+		physx::shdfnd::snprintf(buf, n,"%s","PX_MIN_F32" );
+		return buf;
+	}
+	if( val == PX_MAX_F32 )
+	{
+		physx::shdfnd::snprintf(buf, n,"%s","PX_MAX_F32" );
+		return buf;
+	}
+	if ( val == 1 )
+	{
+		physx::shdfnd::strlcpy(buf, n, "1");
+		return buf;
+	}
+	if ( val == 0 )
+	{
+		physx::shdfnd::strlcpy(buf, n, "0");
+		return buf;
+	}
+	if ( val == - 1 )
+	{
+		physx::shdfnd::strlcpy(buf, n, "-1");
+		return buf;
+	}
+
+	physx::shdfnd::snprintf(buf,n,"%.9g", (double)val ); // %g expects double
+
+	// only plain fixed-point output (has '.', no exponent) is trimmed
+	if ( strchr(buf,'.') == NULL || strchr(buf,'e') != NULL )
+		return buf;
+
+	char *last = &buf[(int32_t)strlen(buf) - 1];
+	while ( *last == '0' ) last--;
+	if ( *last == '.' )
+		*last = 0;		// drop the decimal point as well
+	else
+		last[1] = 0;	// cut right after the last significant digit
+	return buf;
+}
+
+// Formats a double into buf (capacity n) and returns buf.
+//  - non-finite values assert and are written as "0"
+//  - -PX_MAX_F64 / PX_MAX_F64 are written as "PX_MIN_F64" / "PX_MAX_F64"
+//  - 1, 0 and -1 are written as the short literals "1", "0", "-1"
+//  - everything else uses "%.18g"; when the result has a decimal point and no
+//    exponent, trailing zeros (and a then-dangling '.') are trimmed
+PX_INLINE const char * valueToStr( double val, char *buf, uint32_t n )
+{
+	if( !physx::PxIsFinite(val) )
+	{
+		PX_ASSERT( 0 && "invalid floating point" );
+		physx::shdfnd::snprintf(buf, n,"%s","0" );
+	}
+	else if( val == -PX_MAX_F64 || val == PX_MAX_F64 )
+	{
+		// the two limits are written by name
+		physx::shdfnd::snprintf(buf, n,"%s", val < 0 ? "PX_MIN_F64" : "PX_MAX_F64" );
+	}
+	else if ( val == 1 || val == 0 || val == - 1 )
+	{
+		// common values get their shortest spelling
+		physx::shdfnd::strlcpy(buf, n, val == 0 ? "0" : ( val > 0 ? "1" : "-1" ));
+	}
+	else
+	{
+		physx::shdfnd::snprintf(buf,n,"%.18g", val );
+		const bool hasDot = strchr(buf,'.') != NULL;
+		const bool hasExponent = strchr(buf,'e') != NULL;
+		if ( hasDot && !hasExponent )
+		{
+			// walk back over trailing zeros; the '.' guarantees the walk stops
+			int32_t idx = (int32_t)strlen(buf) - 1;
+			while ( buf[idx] == '0' ) idx--;
+			if ( buf[idx] == '.' )
+				buf[idx] = 0;
+			else
+				buf[idx+1] = 0;
+		}
+	}
+	return buf;
+}
diff --git a/PxShared/src/filebuf/include/PsFileBuffer.h b/PxShared/src/filebuf/include/PsFileBuffer.h
new file mode 100644
index 0000000..d768968
--- /dev/null
+++ b/PxShared/src/filebuf/include/PsFileBuffer.h
@@ -0,0 +1,250 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PSFILEBUFFER_PSFILEBUFFER_H
+#define PSFILEBUFFER_PSFILEBUFFER_H
+
+#include "filebuf/PxFileBuf.h"
+
+#include "Ps.h"
+#include "PsUserAllocated.h"
+#include <stdio.h>
+
+namespace physx
+{
+namespace general_PxIOStream2
+{
+	using namespace shdfnd;
+
+// File-backed PxFileBuf implemented on top of C stdio (FILE*).
+// The class keeps separate logical read and write offsets (mSeekRead and
+// mSeekWrite) and tracks the real position of the single FILE handle in
+// mSeekCurrent; the handle is repositioned lazily, right before a transfer,
+// whenever the wanted offset differs from mSeekCurrent.
+// Use this class if you want to use your own allocator.
+class PxFileBufferBase : public PxFileBuf
+{
+public:
+	// Opens 'fileName' with the stdio mode that corresponds to 'mode' and
+	// records the file length. If the file cannot be opened the open mode
+	// becomes OPEN_FILE_NOT_FOUND and all transfers return 0.
+	PxFileBufferBase(const char *fileName,OpenMode mode)
+	{
+		mOpenMode = mode;
+		mFph = NULL;
+		mFileLength = 0;
+		mSeekRead   = 0;
+		mSeekWrite  = 0;
+		mSeekCurrent = 0;
+		// mSeekType used to be left uninitialized, which made isSeekable()
+		// return an indeterminate value; it now reflects the open mode.
+		mSeekType = SEEKABLE_NO;
+		switch ( mode )
+		{
+			case OPEN_READ_ONLY:
+				mFph = fopen(fileName,"rb");
+				mSeekType = SEEKABLE_READ;
+				break;
+			case OPEN_WRITE_ONLY:
+				mFph = fopen(fileName,"wb");
+				mSeekType = SEEKABLE_WRITE;
+				break;
+			case OPEN_READ_WRITE_NEW:
+				mFph = fopen(fileName,"wb+");
+				mSeekType = SEEKABLE_READWRITE;
+				break;
+			case OPEN_READ_WRITE_EXISTING:
+				mFph = fopen(fileName,"rb+");
+				mSeekType = SEEKABLE_READWRITE;
+				break;
+			case OPEN_FILE_NOT_FOUND:
+				break;
+		}
+		if ( mFph )
+		{
+			// measure the file by seeking to its end, then rewind
+			fseek(mFph,0L,SEEK_END);
+			mFileLength = static_cast<uint32_t>(ftell(mFph));
+			fseek(mFph,0L,SEEK_SET);
+		}
+		else
+		{
+			mOpenMode = OPEN_FILE_NOT_FOUND;
+			mSeekType = SEEKABLE_NO;
+		}
+	}
+
+	virtual						~PxFileBufferBase()
+	{
+		close();
+	}
+
+	// Closes the underlying file; safe to call more than once.
+	virtual void close()
+	{
+		if( mFph )
+		{
+			fclose(mFph);
+			mFph = 0;
+		}
+	}
+
+	virtual SeekType isSeekable(void) const
+	{
+		return mSeekType;
+	}
+
+	// Reads up to 'size' bytes at the read offset and advances it.
+	// Returns the number of bytes actually read (0 if the file is not open).
+	virtual		uint32_t			read(void* buffer, uint32_t size)
+	{
+		uint32_t ret = 0;
+		if ( mFph )
+		{
+			setSeekRead();
+			ret = static_cast<uint32_t>(::fread(buffer,1,size,mFph));
+			mSeekRead+=ret;
+			mSeekCurrent+=ret;
+		}
+		return ret;
+	}
+
+	// Like read(), but leaves the logical read offset where it was.
+	virtual		uint32_t			peek(void* buffer, uint32_t size)
+	{
+		uint32_t ret = 0;
+		if ( mFph )
+		{
+			uint32_t loc = tellRead();
+			setSeekRead();
+			ret = static_cast<uint32_t>(::fread(buffer,1,size,mFph));
+			// the FILE position did move; only the logical read offset is restored
+			mSeekCurrent+=ret;
+			seekRead(loc);
+		}
+		return ret;
+	}
+
+	// Writes 'size' bytes at the write offset, advances it and grows the
+	// recorded file length if the write went past the previous end.
+	// Returns the number of bytes actually written (0 if the file is not open).
+	virtual		uint32_t		write(const void* buffer, uint32_t size)
+	{
+		uint32_t ret = 0;
+		if ( mFph )
+		{
+			setSeekWrite();
+			ret = static_cast<uint32_t>(::fwrite(buffer,1,size,mFph));
+			mSeekWrite+=ret;
+			mSeekCurrent+=ret;
+			if ( mSeekWrite > mFileLength )
+			{
+				mFileLength = mSeekWrite;
+			}
+		}
+		return ret;
+	}
+
+	virtual uint32_t tellRead(void) const
+	{
+		return mSeekRead;
+	}
+
+	virtual uint32_t tellWrite(void) const
+	{
+		return mSeekWrite;
+	}
+
+	// Sets the logical read offset, clamped to the file length; the FILE
+	// handle itself is only moved by the next read()/peek().
+	virtual uint32_t seekRead(uint32_t loc)
+	{
+		mSeekRead = loc;
+		if ( mSeekRead > mFileLength )
+		{
+			mSeekRead = mFileLength;
+		}
+		return mSeekRead;
+	}
+
+	// Sets the logical write offset, clamped to the file length; the FILE
+	// handle itself is only moved by the next write().
+	virtual uint32_t seekWrite(uint32_t loc)
+	{
+		mSeekWrite = loc;
+		if ( mSeekWrite > mFileLength )
+		{
+			mSeekWrite = mFileLength;
+		}
+		return mSeekWrite;
+	}
+
+	virtual void flush(void)
+	{
+		if ( mFph )
+		{
+			::fflush(mFph);
+		}
+	}
+
+	virtual OpenMode	getOpenMode(void) const
+	{
+		return mOpenMode;
+	}
+
+	virtual uint32_t getFileLength(void) const
+	{
+		return mFileLength;
+	}
+
+private:
+	// Moves the actual file pointer to the current read location
+	// NOTE(review): when the read and write offsets happen to be equal no
+	// fseek is issued between an fwrite and a following fread (or vice versa);
+	// C stdio requires a positioning call in between for update-mode streams.
+	void setSeekRead(void)
+	{
+		if ( mSeekRead != mSeekCurrent && mFph )
+		{
+			if ( mSeekRead >= mFileLength )
+			{
+				fseek(mFph,0L,SEEK_END);
+			}
+			else
+			{
+				fseek(mFph,static_cast<long>(mSeekRead),SEEK_SET);
+			}
+			// resynchronise both offsets with where the handle really ended up
+			mSeekCurrent = mSeekRead = static_cast<uint32_t>(ftell(mFph));
+		}
+	}
+	// Moves the actual file pointer to the current write location
+	void setSeekWrite(void)
+	{
+		if ( mSeekWrite != mSeekCurrent && mFph )
+		{
+			if ( mSeekWrite >= mFileLength )
+			{
+				fseek(mFph,0L,SEEK_END);
+			}
+			else
+			{
+				fseek(mFph,static_cast<long>(mSeekWrite),SEEK_SET);
+			}
+			// resynchronise both offsets with where the handle really ended up
+			mSeekCurrent = mSeekWrite = static_cast<uint32_t>(ftell(mFph));
+		}
+	}
+
+
+	FILE		*mFph;			// underlying stdio handle, NULL when not open
+	uint32_t		mSeekRead;		// logical read offset
+	uint32_t		mSeekWrite;		// logical write offset
+	uint32_t		mSeekCurrent;	// tracked position of mFph
+	uint32_t		mFileLength;	// length recorded at open, grown by write()
+	SeekType	mSeekType;		// value reported by isSeekable()
+	OpenMode	mOpenMode;		// OPEN_FILE_NOT_FOUND when the open failed
+};
+
+// Use this class if you want to use the PhysX memory allocator: it is a PxFileBufferBase that additionally derives from UserAllocated and adds no behavior of its own.
+class PsFileBuffer: public PxFileBufferBase, public UserAllocated
+{
+public:
+	PsFileBuffer(const char *fileName,OpenMode mode): PxFileBufferBase(fileName, mode) {} // forwards both arguments to the base class, which opens the file
+};
+
+}
+using namespace general_PxIOStream2;
+}
+
+#endif // PSFILEBUFFER_PSFILEBUFFER_H
diff --git a/PxShared/src/filebuf/include/PsIOStream.h b/PxShared/src/filebuf/include/PsIOStream.h
new file mode 100644
index 0000000..07c73f9
--- /dev/null
+++ b/PxShared/src/filebuf/include/PsIOStream.h
@@ -0,0 +1,137 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PSFILEBUFFER_PSIOSTREAM_H
+#define PSFILEBUFFER_PSIOSTREAM_H
+
+/*!
+\file
+\brief PsIOStream class
+*/
+#include "filebuf/PxFileBuf.h"
+
+#include "Ps.h"
+#include "PsString.h"
+#include <string.h>
+#include <stdlib.h>
+#include "PsAsciiConversion.h"
+
+#define safePrintf physx::shdfnd::snprintf
+
+PX_PUSH_PACK_DEFAULT
+
+namespace physx
+{
+	namespace general_PxIOStream2
+	{
+
+/**
+\brief A wrapper class for physx::PxFileBuf that provides both binary and ASCII streaming capabilities
+
+The stream object only holds a reference to the PxFileBuf; it starts out in binary mode
+(see setBinary()). The operator<< / operator>> overloads are defined in PsIOStream.inl.
+*/
+class PsIOStream
+{
+	static const uint32_t MAX_STREAM_STRING = 1024; // capacity of the mReadString scratch buffer
+public:
+	/**
+	\param [in] stream the physx::PxFileBuf through which all reads and writes will be performed
+	\param [in] streamLen the length of the input data stream when de-serializing
+	*/
+	PsIOStream(physx::PxFileBuf &stream,uint32_t streamLen) : mBinary(true), mStreamLen(streamLen), mStream(stream) { } // mReadString is left uninitialized here
+	~PsIOStream(void) { }
+
+	/**
+	\brief Set the stream to binary or ASCII
+
+	\param [in] state if true, stream is binary, if false, stream is ASCII
+
+	If the stream is binary, stream access is passed straight through to the respective
+	physx::PxFileBuf methods.  If the stream is ASCII, all stream reads and writes are converted to
+	human readable ASCII.
+	*/
+	PX_INLINE void setBinary(bool state) { mBinary = state; }
+	PX_INLINE bool getBinary() { return mBinary; }
+
+	PX_INLINE PsIOStream& operator<<(bool v);
+	PX_INLINE PsIOStream& operator<<(char c);
+	PX_INLINE PsIOStream& operator<<(uint8_t v);
+	PX_INLINE PsIOStream& operator<<(int8_t v);
+
+	PX_INLINE PsIOStream& operator<<(const char *c);
+	PX_INLINE PsIOStream& operator<<(int64_t v);
+	PX_INLINE PsIOStream& operator<<(uint64_t v);
+	PX_INLINE PsIOStream& operator<<(double v);
+	PX_INLINE PsIOStream& operator<<(float v);
+	PX_INLINE PsIOStream& operator<<(uint32_t v);
+	PX_INLINE PsIOStream& operator<<(int32_t v);
+	PX_INLINE PsIOStream& operator<<(uint16_t v);
+	PX_INLINE PsIOStream& operator<<(int16_t v);
+	PX_INLINE PsIOStream& operator<<(const physx::PxVec3 &v);
+	PX_INLINE PsIOStream& operator<<(const physx::PxQuat &v);
+	PX_INLINE PsIOStream& operator<<(const physx::PxBounds3 &v);
+
+	PX_INLINE PsIOStream& operator>>(const char *&c);
+	PX_INLINE PsIOStream& operator>>(bool &v);
+	PX_INLINE PsIOStream& operator>>(char &c);
+	PX_INLINE PsIOStream& operator>>(uint8_t &v);
+	PX_INLINE PsIOStream& operator>>(int8_t &v);
+	PX_INLINE PsIOStream& operator>>(int64_t &v);
+	PX_INLINE PsIOStream& operator>>(uint64_t &v);
+	PX_INLINE PsIOStream& operator>>(double &v);
+	PX_INLINE PsIOStream& operator>>(float &v);
+	PX_INLINE PsIOStream& operator>>(uint32_t &v);
+	PX_INLINE PsIOStream& operator>>(int32_t &v);
+	PX_INLINE PsIOStream& operator>>(uint16_t &v);
+	PX_INLINE PsIOStream& operator>>(int16_t &v);
+	PX_INLINE PsIOStream& operator>>(physx::PxVec3 &v);
+	PX_INLINE PsIOStream& operator>>(physx::PxQuat &v);
+	PX_INLINE PsIOStream& operator>>(physx::PxBounds3 &v);
+
+	uint32_t getStreamLen(void) const { return mStreamLen; } // the streamLen value passed to the constructor
+
+	physx::PxFileBuf& getStream(void) { return mStream; } // the wrapped PxFileBuf
+
+	PX_INLINE void storeString(const char *c,bool zeroTerminate=false); // defined in PsIOStream.inl
+
+private:
+	PsIOStream& operator=( const PsIOStream& ); // private and not defined in this header — presumably to forbid assignment (the class holds a reference member)
+
+
+	bool      mBinary; // true if we are serializing binary data.  Otherwise, everything is assumed converted to ASCII
+	uint32_t     mStreamLen; // the length of the input data stream when de-serializing.
+	physx::PxFileBuf &mStream; // the file buffer all reads and writes go through; not owned by this class
+	char			mReadString[MAX_STREAM_STRING]; // a temp buffer for streaming strings on input.
+};
+
+#include "PsIOStream.inl" // inline methods...
+
+	} // end of namespace
+	using namespace general_PxIOStream2;
+} // end of physx namespace
+
+PX_POP_PACK
+
+#endif // PSFILEBUFFER_PSIOSTREAM_H
diff --git a/PxShared/src/filebuf/include/PsIOStream.inl b/PxShared/src/filebuf/include/PsIOStream.inl
new file mode 100644
index 0000000..e821e6a
--- /dev/null
+++ b/PxShared/src/filebuf/include/PsIOStream.inl
@@ -0,0 +1,451 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+
+/*
+ * Copyright 2009-2011 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.  Users and possessors of this source code
+ * are hereby granted a nonexclusive, royalty-free license to use this code
+ * in individual and commercial software.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+/*!
+\file
+\brief PsIOStream inline implementation
+*/
+
+// Stream a bool: one byte in binary mode, otherwise the text produced by
+// PxAsc::valueToStr().
+PX_INLINE PsIOStream& PsIOStream::operator<<(bool v)
+{
+	if ( !mBinary )
+	{
+		char text[6];
+		storeString( physx::PxAsc::valueToStr(v, text, 6) );
+		return *this;
+	}
+	mStream.storeByte((uint8_t)v);
+	return *this;
+}
+
+
+// Stream a single character. The byte is written as-is regardless of mBinary.
+PX_INLINE PsIOStream& PsIOStream::operator<<(char c)
+{
+	const uint8_t raw = (uint8_t)c;
+	mStream.storeByte(raw);
+	return *this;
+}
+
+// Stream an unsigned 8-bit value: one byte in binary mode, otherwise its
+// PxAsc::valueToStr() text.
+PX_INLINE PsIOStream& PsIOStream::operator<<(uint8_t c)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(c, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+
+	mStream.storeByte((uint8_t)c);
+	return *this;
+}
+
+// Stream a signed 8-bit value: one byte (cast to uint8_t) in binary mode,
+// otherwise its PxAsc::valueToStr() text.
+PX_INLINE PsIOStream& PsIOStream::operator<<(int8_t c)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(c, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+
+	mStream.storeByte((uint8_t)c);
+	return *this;
+}
+
+// Stream a C string.
+//
+// Binary mode writes a 32-bit length followed by that many characters (no
+// terminator). ASCII mode writes the characters only. A null pointer is
+// treated as an empty string in both modes.
+PX_INLINE PsIOStream& PsIOStream::operator<<(const char *c)
+{
+	c = c ? c : ""; // if it is a null pointer, stream an empty string (storeString would dereference it otherwise).
+	if ( mBinary )
+	{
+		uint32_t len = (uint32_t)strlen(c);
+		PX_ASSERT( len < (MAX_STREAM_STRING-1));
+		// operator>>(const char *&) only accepts len < MAX_STREAM_STRING-1 and
+		// does not consume the payload otherwise, so truncate to the largest
+		// length the reader will take.
+		if ( len >= (MAX_STREAM_STRING-1) )
+		{
+			len = MAX_STREAM_STRING-2;
+		}
+		mStream.storeDword(len);
+		if ( len )
+			mStream.write(c,len);
+	}
+	else
+	{
+		storeString(c);
+	}
+	return *this;
+}
+
+// Stream an unsigned 64-bit value. In binary mode the value is converted to
+// double and written with storeDouble(); in ASCII mode its text form is written.
+PX_INLINE PsIOStream& PsIOStream::operator<<(uint64_t v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+	const double asDouble = (double) v;
+	mStream.storeDouble( asDouble );
+	return *this;
+}
+
+// Stream a signed 64-bit value. In binary mode the value is converted to
+// double and written with storeDouble(); in ASCII mode its text form is written.
+PX_INLINE PsIOStream& PsIOStream::operator<<(int64_t v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+	const double asDouble = (double) v;
+	mStream.storeDouble( asDouble );
+	return *this;
+}
+
+// Stream a double: storeDouble() in binary mode, PxAsc::valueToStr() text otherwise.
+PX_INLINE PsIOStream& PsIOStream::operator<<(double v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::PxF64StrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::PxF64StrLen) );
+		return *this;
+	}
+	mStream.storeDouble( v );
+	return *this;
+}
+
+// Stream a float: storeFloat() in binary mode, PxAsc::valueToStr() text otherwise.
+PX_INLINE PsIOStream& PsIOStream::operator<<(float v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::PxF32StrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::PxF32StrLen) );
+		return *this;
+	}
+	mStream.storeFloat(v);
+	return *this;
+}
+
+// Stream an unsigned 32-bit value: storeDword() in binary mode, text otherwise.
+PX_INLINE PsIOStream& PsIOStream::operator<<(uint32_t v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+	mStream.storeDword(v);
+	return *this;
+}
+
+// Stream a signed 32-bit value: storeDword() of the value cast to uint32_t in
+// binary mode, text otherwise.
+PX_INLINE PsIOStream& PsIOStream::operator<<(int32_t v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+	const uint32_t bits = (uint32_t) v;
+	mStream.storeDword( bits );
+	return *this;
+}
+
+// Stream an unsigned 16-bit value: storeWord() in binary mode, text otherwise.
+PX_INLINE PsIOStream& PsIOStream::operator<<(uint16_t v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+	mStream.storeWord(v);
+	return *this;
+}
+
+// Stream a signed 16-bit value: storeWord() of the value cast to uint16_t in
+// binary mode, text otherwise.
+PX_INLINE PsIOStream& PsIOStream::operator<<(int16_t v)
+{
+	if ( !mBinary )
+	{
+		char text[physx::PxAsc::IntStrLen];
+		storeString( physx::PxAsc::valueToStr(v, text, physx::PxAsc::IntStrLen) );
+		return *this;
+	}
+	const uint16_t bits = (uint16_t) v;
+	mStream.storeWord( bits );
+	return *this;
+}
+
+
+// Extract an unsigned 32-bit value. Only binary streams are read; in ASCII
+// mode v is left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(uint32_t &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = mStream.readDword();
+	return *this;
+}
+
+// Extract one character. Only binary streams are read; in ASCII mode v is
+// left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(char &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = (char)mStream.readByte();
+	return *this;
+}
+
+// Extract an unsigned 8-bit value. Only binary streams are read; in ASCII
+// mode v is left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(uint8_t &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = mStream.readByte();
+	return *this;
+}
+
+// Extract a signed 8-bit value. Only binary streams are read; in ASCII mode
+// v is left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(int8_t &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = (int8_t)mStream.readByte();
+	return *this;
+}
+
+// Extract a signed 64-bit value. Only binary streams are read; in ASCII mode
+// v is left unchanged.
+//
+// operator<<(int64_t) writes the value with storeDouble(), so it must be read
+// back with readDouble() (as operator>>(uint64_t&) does). Reading a dword here
+// would return the wrong value and leave the stream misaligned.
+PX_INLINE PsIOStream& PsIOStream::operator>>(int64_t &v)
+{
+	if ( mBinary )
+	{
+		v = (int64_t)mStream.readDouble();
+	}
+	return *this;
+}
+
+// Extract an unsigned 64-bit value, which is read as a double and cast back.
+// Only binary streams are read; in ASCII mode v is left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(uint64_t &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = (uint64_t)mStream.readDouble();
+	return *this;
+}
+
+// Extract a double. Only binary streams are read; in ASCII mode v is left
+// unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(double &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = mStream.readDouble();
+	return *this;
+}
+
+// Extract a float. Only binary streams are read; in ASCII mode v is left
+// unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(float &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = mStream.readFloat();
+	return *this;
+}
+
+// Extract a signed 32-bit value. Only binary streams are read; in ASCII mode
+// v is left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(int32_t &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = (int32_t)mStream.readDword();
+	return *this;
+}
+
+// Extract an unsigned 16-bit value. Only binary streams are read; in ASCII
+// mode v is left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(uint16_t &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = mStream.readWord();
+	return *this;
+}
+
+// Extract a signed 16-bit value. Only binary streams are read; in ASCII mode
+// v is left unchanged.
+PX_INLINE PsIOStream& PsIOStream::operator>>(int16_t &v)
+{
+	if ( !mBinary )
+		return *this;
+	v = (int16_t)mStream.readWord();
+	return *this;
+}
+
+// Extract a bool, stored as one byte (non-zero means true).
+//
+// Like every other extractor in this file, only binary streams are read; in
+// ASCII mode v is left unchanged. In ASCII mode operator<<(bool) writes a
+// text form, so decoding a single raw byte there would not recover the value.
+PX_INLINE PsIOStream& PsIOStream::operator>>(bool &v)
+{
+	if ( mBinary )
+	{
+		const int8_t iv = (int8_t)mStream.readByte();
+		v = iv ? true : false;
+	}
+	return *this;
+}
+
+#define NX_IOSTREAM_COMMA_SEPARATOR if(!mBinary) *this << ' ';
+
+// Stream the x, y and z components in that order. In ASCII mode a single
+// space is written between components.
+PX_INLINE PsIOStream& PsIOStream::operator<<(const physx::PxVec3 &v)
+{
+	*this << v.x;
+	if ( !mBinary ) *this << ' ';
+	*this << v.y;
+	if ( !mBinary ) *this << ' ';
+	return *this << v.z;
+}
+
+// Stream the x, y, z and w components in that order. In ASCII mode a single
+// space is written between components.
+PX_INLINE PsIOStream& PsIOStream::operator<<(const physx::PxQuat &v)
+{
+	*this << v.x;
+	if ( !mBinary ) *this << ' ';
+	*this << v.y;
+	if ( !mBinary ) *this << ' ';
+	*this << v.z;
+	if ( !mBinary ) *this << ' ';
+	return *this << v.w;
+}
+
+
+// Stream the minimum corner followed by the maximum corner. In ASCII mode a
+// single space is written between the two vectors.
+PX_INLINE PsIOStream& PsIOStream::operator<<(const physx::PxBounds3 &v)
+{
+	*this << v.minimum;
+	if ( !mBinary ) *this << ' ';
+	return *this << v.maximum;
+}
+
+// Extract the x, y and z components in that order using the float extractor.
+PX_INLINE PsIOStream& PsIOStream::operator>>(physx::PxVec3 &v)
+{
+	return *this >> v.x >> v.y >> v.z;
+}
+
+// Extract the x, y, z and w components in that order using the float extractor.
+PX_INLINE PsIOStream& PsIOStream::operator>>(physx::PxQuat &v)
+{
+	return *this >> v.x >> v.y >> v.z >> v.w;
+}
+
+// Extract the minimum corner followed by the maximum corner.
+PX_INLINE PsIOStream& PsIOStream::operator>>(physx::PxBounds3 &v)
+{
+	return *this >> v.minimum >> v.maximum;
+}
+
+// Extract a length-prefixed string from a binary stream.
+//
+// On success str points at mReadString, the stream's internal scratch buffer,
+// so the text is overwritten by the next string extraction. str is set to
+// NULL when the stream is not binary or when the stored length does not fit.
+// NOTE(review): when the length does not fit, the string payload is not
+// consumed, and the return value of mStream.read() is not checked.
+PX_INLINE PsIOStream& PsIOStream::operator>>(const char *&str)
+{
+	str = NULL; // by default no string streamed...
+	if ( !mBinary )
+		return *this;
+
+	uint32_t length = 0;
+	*this >> length;
+
+	const bool fits = length < (MAX_STREAM_STRING-1);
+	PX_ASSERT( length < (MAX_STREAM_STRING-1) );
+	if ( fits )
+	{
+		mStream.read(mReadString,length);
+		mReadString[length] = 0;
+		str = mReadString;
+	}
+	return *this;
+}
+
+
+// Write the characters of c to the stream one byte at a time, without a
+// length prefix. When zeroTerminate is true a trailing 0 byte is appended.
+// A null pointer is treated as an empty string instead of being dereferenced.
+PX_INLINE void  PsIOStream::storeString(const char *c,bool zeroTerminate)
+{
+	if ( c )
+	{
+		for ( ; *c; ++c )
+		{
+			mStream.storeByte((uint8_t)*c);
+		}
+	}
+	if ( zeroTerminate )
+	{
+		mStream.storeByte(0);
+	}
+}
diff --git a/PxShared/src/filebuf/include/PsMemoryBuffer.h b/PxShared/src/filebuf/include/PsMemoryBuffer.h
new file mode 100644
index 0000000..5b59386
--- /dev/null
+++ b/PxShared/src/filebuf/include/PsMemoryBuffer.h
@@ -0,0 +1,449 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PSFILEBUFFER_PSMEMORYBUFFER_H
+#define PSFILEBUFFER_PSMEMORYBUFFER_H
+
+#include "Ps.h"
+#include "PsUserAllocated.h"
+#include "PsAlignedMalloc.h"
+#include "filebuf/PxFileBuf.h"
+#include "foundation/PxAssert.h"
+
+namespace physx
+{
+namespace general_PxIOStream2
+{
+	using namespace shdfnd;
+
+	const uint32_t BUFFER_SIZE_DEFAULT = 4096;
+
+//Use this class if you want to use your own allocator
+template<class Allocator>
+class PxMemoryBufferBase : public PxFileBuf, public Allocator
+{
+	PX_NOCOPY(PxMemoryBufferBase)
+	// Set up a read-only, read-seekable view over readLen bytes at readMem.
+	// No write buffer exists in this mode and the memory is not copied.
+	void init(const void *readMem, uint32_t readLen)
+	{
+		const uint8_t *base = static_cast<const uint8_t *>(readMem);
+
+		mOpenMode = OPEN_READ_ONLY;
+		mSeekType = SEEKABLE_READ;
+		mAllocator = this;
+
+		mReadBuffer = base;
+		mReadLoc    = base;
+		mReadStop   = &base[readLen];
+
+		mWriteBuffer = NULL;
+		mWriteLoc    = NULL;
+		mWriteStop   = NULL;
+		mWriteBufferSize = 0;
+		mDefaultWriteBufferSize = BUFFER_SIZE_DEFAULT;
+	}
+
+	// Set up an empty read/write buffer. Nothing is allocated here; the write
+	// buffer is created later with at least defaultWriteBufferSize bytes.
+	void init(uint32_t defaultWriteBufferSize)
+	{
+		mOpenMode = OPEN_READ_WRITE_NEW;
+		mSeekType = SEEKABLE_READWRITE;
+		mAllocator = this;
+
+		mReadBuffer = NULL;
+		mReadLoc    = NULL;
+		mReadStop   = NULL;
+
+		mWriteBuffer = NULL;
+		mWriteLoc    = NULL;
+		mWriteStop   = NULL;
+		mWriteBufferSize = 0;
+		mDefaultWriteBufferSize = defaultWriteBufferSize;
+	}
+
+public:
+	// Read-only buffer over existing memory, with a default-constructed Allocator base.
+	PxMemoryBufferBase(const void *readMem,uint32_t readLen)
+	{
+		init(readMem, readLen);
+	}
+
+	// Read-only buffer over existing memory; the Allocator base is copied from alloc.
+	PxMemoryBufferBase(const void *readMem,uint32_t readLen, const Allocator &alloc)
+		: Allocator(alloc)
+	{
+		init(readMem, readLen);
+	}
+
+	// Growable read/write buffer, with a default-constructed Allocator base.
+	PxMemoryBufferBase(uint32_t defaultWriteBufferSize = BUFFER_SIZE_DEFAULT)
+	{
+		init(defaultWriteBufferSize);
+	}
+
+	// Growable read/write buffer; the Allocator base is copied from alloc.
+	PxMemoryBufferBase(uint32_t defaultWriteBufferSize, const Allocator &alloc)
+		: Allocator(alloc)
+	{
+		init(defaultWriteBufferSize);
+	}
+
+	// Releases the write buffer (if any) through reset().
+	virtual ~PxMemoryBufferBase(void)
+	{
+		this->reset();
+	}
+
+	// Replace the allocator used for all later allocate/deallocate calls.
+	// NOTE(review): an existing write buffer would then be freed through the
+	// new allocator - presumably this is meant to be called before any write.
+	void setAllocator(Allocator *allocator)
+	{
+		this->mAllocator = allocator;
+	}
+
+	// Allocate the write buffer if none exists yet. The request is raised to
+	// mDefaultWriteBufferSize when smaller. Read and write cursors are placed
+	// at the start of the new block. Does nothing when a buffer already exists.
+	// NOTE(review): mReadStop is set to the end of the capacity here, whereas
+	// write() and growWriteBuffer() keep it at mWriteLoc - confirm this is intended.
+	void initWriteBuffer(uint32_t size)
+	{
+		if ( mWriteBuffer != NULL )
+		{
+			return;
+		}
+		if ( size < mDefaultWriteBufferSize ) size = mDefaultWriteBufferSize;
+
+		mWriteBuffer = static_cast<uint8_t *>(mAllocator->allocate(size));
+		PX_ASSERT( mWriteBuffer );
+
+		uint8_t *const end = &mWriteBuffer[size];
+		mWriteBufferSize = size;
+		mWriteLoc   = mWriteBuffer;
+		mWriteStop  = end;
+		mReadBuffer = mWriteBuffer;
+		mReadLoc    = mWriteBuffer;
+		mReadStop   = end;
+	}
+
+	// Free the write buffer through the allocator and clear every read and
+	// write pointer. deallocate() is called even when mWriteBuffer is NULL.
+	void reset(void)
+	{
+		mAllocator->deallocate(mWriteBuffer);
+
+		mWriteBufferSize = 0;
+		mWriteBuffer = NULL;
+		mWriteLoc    = NULL;
+		mWriteStop   = NULL;
+
+		mReadBuffer = NULL;
+		mReadLoc    = NULL;
+		mReadStop   = NULL;
+	}
+
+	// Open mode chosen by init(): OPEN_READ_ONLY or OPEN_READ_WRITE_NEW.
+	virtual OpenMode	getOpenMode(void) const
+	{
+		return this->mOpenMode;
+	}
+
+
+	// Seek capability chosen by init(): SEEKABLE_READ or SEEKABLE_READWRITE.
+	SeekType isSeekable(void) const
+	{
+		return this->mSeekType;
+	}
+
+	// Copy up to size bytes from the read cursor into buffer and advance the
+	// cursor. Returns the number of bytes actually copied, which is smaller
+	// than size when fewer bytes remain before mReadStop.
+	virtual		uint32_t			read(void* buffer, uint32_t size)
+	{
+		// Clamp against the bytes remaining instead of forming mReadLoc+size:
+		// that sum may point past the buffer or wrap around. A cursor at or
+		// beyond mReadStop means nothing is left to read.
+		const uint32_t remaining = ( mReadLoc < mReadStop ) ? uint32_t(mReadStop - mReadLoc) : 0;
+		if ( size > remaining )
+		{
+			size = remaining;
+		}
+		if ( size != 0 )
+		{
+			memmove(buffer,mReadLoc,size);
+			mReadLoc+=size;
+		}
+		return size;
+	}
+
+	// Copy up to size bytes from the read cursor into buffer without moving
+	// the cursor. Returns the number of bytes actually copied.
+	virtual		uint32_t			peek(void* buffer, uint32_t size)
+	{
+		// Clamp against the bytes remaining instead of forming mReadLoc+size:
+		// that sum may point past the buffer or wrap around. A cursor at or
+		// beyond mReadStop means nothing is left to read.
+		const uint32_t remaining = ( mReadLoc < mReadStop ) ? uint32_t(mReadStop - mReadLoc) : 0;
+		if ( size > remaining )
+		{
+			size = remaining;
+		}
+		if ( size != 0 )
+		{
+			memmove(buffer,mReadLoc,size);
+		}
+		return size;
+	}
+
+	// Append size bytes from buffer at the write cursor, growing the write
+	// buffer when needed. Afterwards the readable range ends at the write
+	// cursor. Returns size, or 0 when the buffer was not opened for writing.
+	virtual		uint32_t		write(const void* buffer, uint32_t size)
+	{
+		PX_ASSERT( mOpenMode ==	OPEN_READ_WRITE_NEW );
+		if ( mOpenMode == OPEN_READ_WRITE_NEW )
+		{
+			// Compare against the free space rather than forming mWriteLoc+size,
+			// which may point past the buffer or wrap around.
+			const uint32_t room = uint32_t(mWriteStop - mWriteLoc);
+			if ( size > room )
+				growWriteBuffer(size);
+			// With size == 0 the buffer may still be unallocated; do not hand a
+			// null destination to memmove.
+			if ( size != 0 )
+				memmove(mWriteLoc,buffer,size);
+			mWriteLoc+=size;
+			mReadStop = mWriteLoc;
+		}
+		else
+		{
+			size = 0;
+		}
+		return size;
+	}
+
+	// Current read cursor (NULL when no read buffer is set).
+	PX_INLINE const uint8_t * getReadLoc(void) const { return this->mReadLoc; }
+	// Move the read cursor forward by len bytes, stopping at mReadStop.
+	// Does nothing (beyond asserting) when there is no read buffer.
+	PX_INLINE void advanceReadLoc(uint32_t len)
+	{
+		PX_ASSERT(mReadBuffer);
+		if ( mReadBuffer )
+		{
+			// Decide from the remaining byte count so the cursor is never moved
+			// past the end of the buffer before being clamped.
+			const uint32_t remaining = ( mReadLoc < mReadStop ) ? uint32_t(mReadStop - mReadLoc) : 0;
+			if ( len >= remaining )
+			{
+				mReadLoc = mReadStop;
+			}
+			else
+			{
+				mReadLoc+=len;
+			}
+		}
+	}
+
+	// Offset of the read cursor from the start of the read buffer, or 0 when
+	// there is no read buffer.
+	virtual uint32_t tellRead(void) const
+	{
+		if ( !mReadBuffer )
+		{
+			return 0;
+		}
+		return uint32_t(mReadLoc-mReadBuffer);
+	}
+
+	// Offset of the write cursor from the start of the write buffer.
+	virtual uint32_t tellWrite(void) const
+	{
+		const uint32_t offset = uint32_t(mWriteLoc-mWriteBuffer);
+		return offset;
+	}
+
+	// Move the read cursor to byte offset loc, clamped to the end of the
+	// readable data. Returns the resulting offset, or 0 when there is no
+	// read buffer.
+	virtual uint32_t seekRead(uint32_t loc)
+	{
+		uint32_t ret = 0;
+		PX_ASSERT(mReadBuffer);
+		if ( mReadBuffer )
+		{
+			// Clamp the offset first so no pointer beyond the buffer is formed.
+			const uint32_t length = uint32_t(mReadStop-mReadBuffer);
+			if ( loc > length )
+			{
+				loc = length;
+			}
+			mReadLoc = mReadBuffer + loc;
+			ret = loc;
+		}
+		return ret;
+	}
+
+	// Move the write cursor to byte offset loc, growing the buffer when loc
+	// lies beyond the current capacity. Returns the resulting offset, or 0
+	// when no write buffer has been allocated yet (no seek is performed then).
+	virtual uint32_t seekWrite(uint32_t loc)
+	{
+		PX_ASSERT( mOpenMode ==	OPEN_READ_WRITE_NEW );
+		if ( mWriteBuffer == NULL )
+		{
+			return 0;
+		}
+		if ( loc > mWriteBufferSize )
+		{
+			// Park the cursor at the end so growWriteBuffer() sees the buffer
+			// as full and sizes the new block accordingly.
+			mWriteLoc = mWriteStop;
+			growWriteBuffer(loc - mWriteBufferSize);
+		}
+		mWriteLoc = mWriteBuffer + loc;
+		return uint32_t(mWriteLoc-mWriteBuffer);
+	}
+
+	// Intentionally empty: this implementation has no work to do on flush.
+	virtual void flush(void)
+	{
+	}
+
+	// Length of the stored data: the readable range when a read buffer is
+	// set, otherwise the number of bytes written, otherwise 0.
+	virtual uint32_t getFileLength(void) const
+	{
+		if ( mReadBuffer )
+		{
+			return uint32_t(mReadStop-mReadBuffer);
+		}
+		if ( mWriteBuffer )
+		{
+			return uint32_t(mWriteLoc-mWriteBuffer);
+		}
+		return 0;
+	}
+
+	// Distance from the start of the write buffer to the write cursor.
+	// Despite the name, this is not the allocated capacity (mWriteBufferSize).
+	uint32_t	getWriteBufferSize(void) const
+	{
+		const uint32_t written = uint32_t(mWriteLoc-mWriteBuffer);
+		return written;
+	}
+
+	// Place the write cursor at writeLoc and end the readable range there.
+	// writeLoc must lie inside the write buffer; this is only asserted.
+	// NOTE(review): the assert rejects writeLoc == mWriteStop (a completely
+	// full buffer) - confirm that is intended.
+	void setWriteLoc(uint8_t *writeLoc)
+	{
+		PX_ASSERT(writeLoc >= mWriteBuffer && writeLoc < mWriteStop );
+		mReadStop = writeLoc;
+		mWriteLoc = writeLoc;
+	}
+
+	// Start of the write buffer (NULL until one has been allocated). The
+	// buffer stays owned by this object.
+	const uint8_t * getWriteBuffer(void) const
+	{
+		return this->mWriteBuffer;
+	}
+
+	/**
+	 * Hand the write buffer over to the caller, who becomes responsible for
+	 * freeing it. dataLen receives the number of bytes up to the write cursor.
+	 *
+	 * Attention: if an aligned allocator is in use, the memory cannot be
+	 * released with the PX_FREE macro; use the allocator's deallocate() instead.
+	 *
+	 * NOTE(review): the read pointers are not cleared here and keep referring
+	 * to the returned block - confirm callers do not read after this call.
+	 */
+	uint8_t * getWriteBufferOwnership(uint32_t &dataLen)
+	{
+		uint8_t *const released = mWriteBuffer;
+		dataLen = uint32_t(mWriteLoc-mWriteBuffer);
+
+		// Forget the block so reset() and the destructor do not free it.
+		mWriteBufferSize = 0;
+		mWriteStop = NULL;
+		mWriteLoc = NULL;
+		mWriteBuffer = NULL;
+		return released;
+	}
+
+
+	// Advance the read cursor to the next multiple of a (measured from the
+	// start of the read buffer). An alignment of 0 or 1 is a no-op; 0 would
+	// otherwise divide by zero.
+	void alignRead(uint32_t a)
+	{
+		if ( a <= 1 )
+		{
+			return;
+		}
+		uint32_t loc = tellRead();
+		uint32_t aloc = ((loc+(a-1))/a)*a;
+		if ( aloc != loc )
+		{
+			seekRead(aloc);
+		}
+	}
+
+	// Advance the write cursor to the next multiple of a (measured from the
+	// start of the write buffer). An alignment of 0 or 1 is a no-op; 0 would
+	// otherwise divide by zero.
+	void alignWrite(uint32_t a)
+	{
+		if ( a <= 1 )
+		{
+			return;
+		}
+		uint32_t loc = tellWrite();
+		uint32_t aloc = ((loc+(a-1))/a)*a;
+		if ( aloc != loc )
+		{
+			seekWrite(aloc);
+		}
+	}
+
+private:
+
+
+	// Make room for at least 'size' more bytes at the write cursor.
+	// With no buffer yet, one is created via initWriteBuffer(). Otherwise the
+	// capacity is doubled (plus 'size' when doubling alone is not enough), the
+	// old contents are copied across, and the read/write cursors keep their
+	// offsets. The readable range ends at the write cursor afterwards.
+	void growWriteBuffer(uint32_t size)
+	{
+		if ( mWriteBuffer == NULL )
+		{
+			// initWriteBuffer() already raises the request to mDefaultWriteBufferSize.
+			initWriteBuffer(size);
+			return;
+		}
+
+		// Capture both cursor offsets while the old block is still alive;
+		// pointers into it must not be used once it has been deallocated.
+		const uint32_t oldWriteIndex = uint32_t(mWriteLoc - mWriteBuffer);
+		const uint32_t oldReadLoc = uint32_t(mReadLoc-mReadBuffer);
+
+		uint32_t newSize =	mWriteBufferSize*2;
+		const uint32_t avail = newSize-oldWriteIndex;
+		if ( size >= avail ) newSize = newSize+size;
+
+		uint8_t *writeBuffer = static_cast<uint8_t *>(mAllocator->allocate(newSize));
+		PX_ASSERT( writeBuffer );
+		memmove(writeBuffer,mWriteBuffer,mWriteBufferSize);
+		mAllocator->deallocate(mWriteBuffer);
+
+		mWriteBuffer = writeBuffer;
+		mWriteBufferSize = newSize;
+		mWriteLoc = &mWriteBuffer[oldWriteIndex];
+		mWriteStop = &mWriteBuffer[mWriteBufferSize];
+
+		mReadBuffer = mWriteBuffer;
+		mReadStop   = mWriteLoc;
+		mReadLoc = &mReadBuffer[oldReadLoc];
+	}
+
+	const	uint8_t	*mReadBuffer;
+	const	uint8_t	*mReadLoc;
+	const	uint8_t	*mReadStop;
+
+			uint8_t	*mWriteBuffer;
+			uint8_t	*mWriteLoc;
+			uint8_t	*mWriteStop;
+
+			uint32_t	mWriteBufferSize;
+			uint32_t	mDefaultWriteBufferSize;
+			Allocator	*mAllocator;
+			OpenMode	mOpenMode;
+			SeekType	mSeekType;
+
+};
+
+// Allocator used by PsMemoryBuffer. An alignment of 0 selects PX_ALLOC /
+// PX_FREE; 16, 32, 64 and 128 select the matching physx::AlignedAllocator.
+// Any other alignment asserts; allocate() then returns NULL and deallocate()
+// does nothing.
+class PxMemoryBufferAllocator
+{
+public:
+	PxMemoryBufferAllocator(uint32_t a = 0) : alignment(a) {}
+
+	// Allocate size bytes with the configured alignment.
+	virtual void * allocate(uint32_t size)
+	{
+		if ( alignment == 0 )
+		{
+			return PX_ALLOC(size, PX_DEBUG_EXP("PxMemoryBufferAllocator"));
+		}
+		if ( alignment == 16 )
+		{
+			return physx::AlignedAllocator<16>().allocate(size, __FILE__, __LINE__);
+		}
+		if ( alignment == 32 )
+		{
+			return physx::AlignedAllocator<32>().allocate(size, __FILE__, __LINE__);
+		}
+		if ( alignment == 64 )
+		{
+			return physx::AlignedAllocator<64>().allocate(size, __FILE__, __LINE__);
+		}
+		if ( alignment == 128 )
+		{
+			return physx::AlignedAllocator<128>().allocate(size, __FILE__, __LINE__);
+		}
+		PX_ASSERT(0);
+		return NULL;
+	}
+
+	// Release memory obtained from allocate() on an allocator with the same alignment.
+	virtual void deallocate(void *mem)
+	{
+		if ( alignment == 0 )
+		{
+			PX_FREE(mem);
+		}
+		else if ( alignment == 16 )
+		{
+			physx::AlignedAllocator<16>().deallocate(mem);
+		}
+		else if ( alignment == 32 )
+		{
+			physx::AlignedAllocator<32>().deallocate(mem);
+		}
+		else if ( alignment == 64 )
+		{
+			physx::AlignedAllocator<64>().deallocate(mem);
+		}
+		else if ( alignment == 128 )
+		{
+			physx::AlignedAllocator<128>().deallocate(mem);
+		}
+		else
+		{
+			PX_ASSERT(0);
+		}
+	}
+
+	virtual ~PxMemoryBufferAllocator(void) {}
+
+private:
+	// Assignment is declared but not defined: alignment is const.
+	PxMemoryBufferAllocator& operator=(const PxMemoryBufferAllocator&);
+
+	const uint32_t alignment;
+};
+
+// Memory buffer that allocates through PxMemoryBufferAllocator.
+// Use this class if you want to use the PhysX memory allocator. The overloads
+// taking 'alignment' forward it to PxMemoryBufferAllocator.
+class PsMemoryBuffer: public PxMemoryBufferBase<PxMemoryBufferAllocator>, public UserAllocated
+{
+	PX_NOCOPY(PsMemoryBuffer)
+	typedef PxMemoryBufferBase<PxMemoryBufferAllocator> BaseClass;
+
+public:
+	// Read-only views over existing memory.
+	PsMemoryBuffer(const void *readMem,uint32_t readLen)
+		: BaseClass(readMem, readLen)
+	{
+	}
+	PsMemoryBuffer(const void *readMem,uint32_t readLen, uint32_t alignment)
+		: BaseClass(readMem, readLen, PxMemoryBufferAllocator(alignment))
+	{
+	}
+
+	// Growable read/write buffers.
+	PsMemoryBuffer(uint32_t defaultWriteBufferSize=BUFFER_SIZE_DEFAULT)
+		: BaseClass(defaultWriteBufferSize)
+	{
+	}
+	PsMemoryBuffer(uint32_t defaultWriteBufferSize,uint32_t alignment)
+		: BaseClass(defaultWriteBufferSize, PxMemoryBufferAllocator(alignment))
+	{
+	}
+};
+
+}
+using namespace general_PxIOStream2;
+}
+
+#endif // PSFILEBUFFER_PSMEMORYBUFFER_H
+
diff --git a/PxShared/src/foundation/doc/PsFoundation.chm b/PxShared/src/foundation/doc/PsFoundation.chm
new file mode 100644
index 0000000..df3c07f
--- /dev/null
+++ b/PxShared/src/foundation/doc/PsFoundation.chm
diff --git a/PxShared/src/foundation/doc/Readme.txt b/PxShared/src/foundation/doc/Readme.txt
new file mode 100644
index 0000000..88e1189
--- /dev/null
+++ b/PxShared/src/foundation/doc/Readme.txt
@@ -0,0 +1,18 @@
+This is the 'NVIDIA Shared' foundation library.
+
+This code should not ever appear in any public headers or interfaces.
+
+This library is primarily a platform abstraction layer.
+
+It contains code to handle mutexes, atomic operations, etc.
+
+It also handles some SIMD data types.
+
+It provides math utility functions.
+
+It implements a number of common container classes.
+
+It manages trapping all memory allocations.
+
+All projects should build on this foundation library to
+perform these common functions.
diff --git a/PxShared/src/foundation/doc/create_docs.cmd b/PxShared/src/foundation/doc/create_docs.cmd
new file mode 100644
index 0000000..4691d7d
--- /dev/null
+++ b/PxShared/src/foundation/doc/create_docs.cmd
@@ -0,0 +1,7 @@
+set DOXYGEN_DIR=..\..\..\..\..\..\..\devrel\GameWorks\BuildTools\doxygen-win\bin
+set HTMLHELP_DIR=..\..\..\..\..\..\..\devrel\GameWorks\BuildTools\HTMLHelpWorkshop
+
+%DOXYGEN_DIR%\doxygen.exe docs.doxyfile
+cd html
+..\%HTMLHELP_DIR%\hhc.exe index.hhp
+cd ..
diff --git a/PxShared/src/foundation/doc/docs.doxyfile b/PxShared/src/foundation/doc/docs.doxyfile
new file mode 100644
index 0000000..624a44f
--- /dev/null
+++ b/PxShared/src/foundation/doc/docs.doxyfile
@@ -0,0 +1,13 @@
+# Doxyfile 1.5.8
+
+PROJECT_NAME           = "NVIDIA(R) PsFoundation Reference"
+#ENABLED_SECTIONS       = PHYSICS_SDK_PAGES
+WARN_LOGFILE           = PsFoundation.err
+INPUT                  = ../include ../include/windows ../src ../src/windows
+EXTRACT_ALL			   = YES
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = YES
+GENERATE_HTMLHELP      = YES
+HTML_OUTPUT            = html/
+CHM_FILE               = ../PsFoundation.chm
+TOC_EXPAND             = YES
diff --git a/PxShared/src/foundation/externals/src/android/cpu-features.c b/PxShared/src/foundation/externals/src/android/cpu-features.c
new file mode 100644
index 0000000..4754c46
--- /dev/null
+++ b/PxShared/src/foundation/externals/src/android/cpu-features.c
@@ -0,0 +1,1082 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ChangeLog for this library:
+ *
+ * NDK r8d: Add android_setCpu().
+ *
+ * NDK r8c: Add new ARM CPU features: VFPv2, VFP_D32, VFP_FP16,
+ *          VFP_FMA, NEON_FMA, IDIV_ARM, IDIV_THUMB2 and iWMMXt.
+ *
+ *          Rewrite the code to parse /proc/self/auxv instead of
+ *          the "Features" field in /proc/cpuinfo.
+ *
+ *          Dynamically allocate the buffer that holds the content
+ *          of /proc/cpuinfo to deal with newer hardware.
+ *
+ * NDK r7c: Fix CPU count computation. The old method only reported the
+ *           number of _active_ CPUs when the library was initialized,
+ *           which could be less than the real total.
+ *
+ * NDK r5: Handle buggy kernels which report a CPU Architecture number of 7
+ *         for an ARMv6 CPU (see below).
+ *
+ *         Handle kernels that only report 'neon', and not 'vfpv3'
+ *         (VFPv3 is mandated by the ARM architecture if Neon is implemented)
+ *
+ *         Handle kernels that only report 'vfpv3d16', and not 'vfpv3'
+ *
+ *         Fix x86 compilation. Report ANDROID_CPU_FAMILY_X86 in
+ *         android_getCpuFamily().
+ *
+ * NDK r4: Initial release
+ */
+
+#if defined(__le32__)
+
+// When users enter this, we should only provide interface and
+// libportable will give the implementations.
+
+#else // !__le32__
+
+#include <sys/system_properties.h>
+#include <pthread.h>
+#include "cpu-features.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+
+static  pthread_once_t     g_once;
+static  int                g_inited;
+static  AndroidCpuFamily   g_cpuFamily;
+static  uint64_t           g_cpuFeatures;
+static  int                g_cpuCount;
+
+#ifdef __arm__
+static  uint32_t           g_cpuIdArm;
+#endif
+
+static const int  android_cpufeatures_debug = 0;
+
+#ifdef __arm__
+#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_ARM
+#elif defined __i386__
+#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_X86
+#else
+#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_UNKNOWN
+#endif
+
+#define  D(...) \
+    do { \
+        if (android_cpufeatures_debug) { \
+            printf(__VA_ARGS__); fflush(stdout); \
+        } \
+    } while (0)
+
+#ifdef __i386__
+/* Execute the CPUID instruction with 'func' loaded into EAX and store the
+ * resulting EAX, EBX, ECX and EDX values in values[0..3], in that order.
+ */
+static __inline__ void x86_cpuid(int func, int values[4])
+{
+    int a, b, c, d;
+    /* We need to preserve ebx since we're compiling PIC code */
+    /* this means we can't use "=b" for the second output register */
+    __asm__ __volatile__ ( \
+      "push %%ebx\n"
+      "cpuid\n" \
+      "mov %%ebx, %1\n"
+      "pop %%ebx\n"
+      : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+      : "a" (func) \
+    );
+    /* 'b' holds the EBX result copied out before ebx was restored. */
+    values[0] = a;
+    values[1] = b;
+    values[2] = c;
+    values[3] = d;
+}
+#endif
+
+/* Get the size of a file by reading it until the end. This is needed
+ * because files under /proc do not always return a valid size when
+ * using fseek(0, SEEK_END) + ftell(). Nor can they be mmap()-ed.
+ *
+ * Returns -1 if the file cannot be opened. If a read fails part-way
+ * (other than EINTR, which is retried), the number of bytes counted
+ * so far is returned.
+ */
+static int
+get_file_size(const char* pathname)
+{
+    int fd, result = 0;
+    char buffer[256];
+
+    fd = open(pathname, O_RDONLY);
+    if (fd < 0) {
+        D("Can't open %s: %s\n", pathname, strerror(errno));
+        return -1;
+    }
+
+    for (;;) {
+        /* The data itself is discarded; only the byte count matters. */
+        int ret = read(fd, buffer, sizeof buffer);
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            D("Error while reading %s: %s\n", pathname, strerror(errno));
+            break;
+        }
+        if (ret == 0)
+            break;
+
+        result += ret;
+    }
+    close(fd);
+    return result;
+}
+
+/* Read the content of the file at 'pathname' (e.g. /proc/cpuinfo) into a
+ * user-provided buffer. Return the length of the data, or -1 on error
+ * (open failure, or a read failure before any byte was read). Does *not*
+ * zero-terminate the content. Will not read more than 'buffsize' bytes.
+ */
+static int
+read_file(const char*  pathname, char*  buffer, size_t  buffsize)
+{
+    int  total = 0;
+    int  fd = open(pathname, O_RDONLY);
+
+    if (fd < 0) {
+        D("Could not open %s: %s\n", pathname, strerror(errno));
+        return -1;
+    }
+
+    while (total < (int)buffsize) {
+        int got = read(fd, buffer + total, buffsize - total);
+        if (got == 0)
+            break;              /* end of file */
+        if (got > 0) {
+            total += got;
+            continue;
+        }
+        if (errno == EINTR)
+            continue;           /* interrupted: retry the same read */
+        D("Error while reading from %s: %s\n", pathname, strerror(errno));
+        if (total == 0)
+            total = -1;
+        break;
+    }
+    close(fd);
+    return total;
+}
+
+/* Extract the content of the first occurrence of a given field in
+ * the content of /proc/cpuinfo and return it as a heap-allocated,
+ * zero-terminated string that must be freed by the caller.
+ *
+ * 'buffer' holds 'buflen' bytes and need not be zero-terminated.
+ *
+ * Return NULL if the field is not found, is not followed by ": ",
+ * or if memory allocation fails.
+ */
+static char*
+extract_cpuinfo_field(const char* buffer, int buflen, const char* field)
+{
+    int  fieldlen = strlen(field);
+    const char* bufend = buffer + buflen;
+    char* result = NULL;
+    int len;
+    const char *p, *q;
+
+    /* Look for the first field occurrence, and ensure it starts the line. */
+    p = buffer;
+    for (;;) {
+        p = memmem(p, bufend-p, field, fieldlen);
+        if (p == NULL)
+            goto EXIT;
+
+        if (p == buffer || p[-1] == '\n')
+            break;
+
+        p += fieldlen;
+    }
+
+    /* Skip to the first colon followed by a space. The buffer is not
+     * zero-terminated, so make sure a byte exists after the colon
+     * before looking at it. */
+    p += fieldlen;
+    p  = memchr(p, ':', bufend-p);
+    if (p == NULL || p + 1 >= bufend || p[1] != ' ')
+        goto EXIT;
+
+    /* Find the end of the line */
+    p += 2;
+    q = memchr(p, '\n', bufend-p);
+    if (q == NULL)
+        q = bufend;
+
+    /* Copy the line into a heap-allocated buffer */
+    len = q-p;
+    result = malloc(len+1);
+    if (result == NULL)
+        goto EXIT;
+
+    memcpy(result, p, len);
+    result[len] = '\0';
+
+EXIT:
+    return result;
+}
+
+/* Checks that a space-separated list of items contains one given 'item'.
+ * Returns 1 if found, 0 otherwise.
+ */
+static int
+has_list_item(const char* list, const char* item)
+{
+    const char*  cursor;
+    int wanted = strlen(item);
+
+    if (list == NULL)
+        return 0;
+
+    for (cursor = list; *cursor != '\0'; ) {
+        const char*  start;
+
+        /* step over the separators (spaces and tabs) before the token */
+        while (*cursor == ' ' || *cursor == '\t')
+            cursor++;
+
+        /* the token runs up to the next separator or the end of the list */
+        start = cursor;
+        while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t')
+            cursor++;
+
+        /* exact match only: same length and same bytes */
+        if (cursor - start == wanted && memcmp(start, item, wanted) == 0)
+            return 1;
+    }
+    return 0;
+}
+
+/* Parse a number starting from 'input', but not going further
+ * than 'limit'. Return the value into '*result'.
+ *
+ * NOTE: Does not skip over leading spaces, or deal with sign characters.
+ * NOTE: Ignores overflows.
+ *
+ * The function returns NULL in case of error (bad format), or the new
+ * position after the decimal number in case of success (which will always
+ * be <= 'limit').
+ */
+static const char*
+parse_number(const char* input, const char* limit, int base, int* result)
+{
+    const char* p = input;
+    int val = 0;
+    while (p < limit) {
+        int d = (*p - '0');  /* 0..9 for a decimal digit */
+        if ((unsigned)d >= 10U) {  /* not a decimal digit (unsigned cast catches d < 0) */
+            d = (*p - 'a');  /* try lowercase hex letter a..f */
+            if ((unsigned)d >= 6U)
+              d = (*p - 'A');  /* try uppercase hex letter A..F */
+            if ((unsigned)d >= 6U)
+              break;  /* neither digit nor hex letter: stop parsing */
+            d += 10;  /* 'a'/'A' stands for 10 */
+        }
+        if (d >= base)
+          break;  /* digit not valid in this base (e.g. 'a' in base 10) */
+        val = val*base + d;  /* overflow is deliberately not checked */
+        p++;
+    }
+    if (p == input)
+        return NULL;  /* no digit consumed: bad format */
+
+    *result = val;
+    return p;
+}
+
+static const char*
+parse_decimal(const char* input, const char* limit, int* result)
+{
+    return parse_number(input, limit, 10, result);
+}
+
+static const char*
+parse_hexadecimal(const char* input, const char* limit, int* result)
+{
+    return parse_number(input, limit, 16, result);
+}
+
+/* This small data type is used to represent a CPU list / mask, as read
+ * from sysfs on Linux. See http://www.kernel.org/doc/Documentation/cputopology.txt
+ *
+ * For now, we don't expect more than 32 cores on mobile devices, so keep
+ * everything simple.
+ */
+typedef struct {
+    uint32_t mask;
+} CpuList;
+
+static __inline__ void
+cpulist_init(CpuList* list) {
+    list->mask = 0;
+}
+
+static __inline__ void
+cpulist_and(CpuList* list1, CpuList* list2) {
+    list1->mask &= list2->mask;
+}
+
+static __inline__ void
+cpulist_set(CpuList* list, int index) {
+    if ((unsigned)index < 32) {  /* silently ignore negative or >= 32 indices */
+        list->mask |= (uint32_t)(1U << index);
+    }
+}
+
+static __inline__ int
+cpulist_count(CpuList* list) {
+    return __builtin_popcount(list->mask);
+}
+
+/* Parse a textual list of cpus and store the result inside a CpuList object.
+ * Input format is the following:
+ * - comma-separated list of items (no spaces)
+ * - each item is either a single decimal number (cpu index), or a range made
+ *   of two numbers separated by a single dash (-). Ranges are inclusive.
+ *
+ * Examples:   0
+ *             2,4-127,128-143
+ *             0-1
+ */
+static void
+cpulist_parse(CpuList* list, const char* line, int line_len)
+{
+    const char* p = line;
+    const char* end = p + line_len;
+    const char* q;
+
+    /* NOTE: the input line coming from sysfs typically contains a
+     * trailing newline, so take care of it in the code below
+     */
+    while (p < end && *p != '\n')
+    {
+        int val, start_value, end_value;
+
+        /* Find the end of current item, and put it into 'q' */
+        q = memchr(p, ',', end-p);
+        if (q == NULL) {
+            q = end;
+        }
+
+        /* Get first value */
+        p = parse_decimal(p, q, &start_value);
+        if (p == NULL)
+            goto BAD_FORMAT;
+
+        end_value = start_value;
+
+        /* If we're not at the end of the item, expect a dash and
+         * an integer; extract end value.
+         */
+        if (p < q && *p == '-') {
+            p = parse_decimal(p+1, q, &end_value);
+            if (p == NULL)
+                goto BAD_FORMAT;
+        }
+
+        /* Set CPU list bits; stop at 32 since cpulist_set() drops larger indices */
+        for (val = start_value; val <= end_value && val < 32; val++) {
+            cpulist_set(list, val);
+        }
+
+        /* Jump to next item */
+        p = q;
+        if (p < end)
+            p++;
+    }
+
+BAD_FORMAT:
+    ;
+}
+
+/* Read a CPU list from one sysfs file */
+static void
+cpulist_read_from(CpuList* list, const char* filename)
+{
+    char   file[64];  /* raw file content; not NUL-terminated, length is in filelen */
+    int    filelen;
+
+    cpulist_init(list);  /* list is left empty on any read failure below */
+
+    filelen = read_file(filename, file, sizeof file);
+    if (filelen < 0) {
+        D("Could not read %s: %s\n", filename, strerror(errno));
+        return;
+    }
+
+    cpulist_parse(list, file, filelen);
+}
+
+// See <asm/hwcap.h> kernel header.
+#define HWCAP_VFP       (1 << 6)
+#define HWCAP_IWMMXT    (1 << 9)
+#define HWCAP_NEON      (1 << 12)
+#define HWCAP_VFPv3     (1 << 13)
+#define HWCAP_VFPv3D16  (1 << 14)
+#define HWCAP_VFPv4     (1 << 16)
+#define HWCAP_IDIVA     (1 << 17)
+#define HWCAP_IDIVT     (1 << 18)
+
+#define AT_HWCAP 16
+
+#if defined(__arm__)
+/* Compute the ELF HWCAP flags.
+ */
+static uint32_t
+get_elf_hwcap(const char* cpuinfo, int cpuinfo_len)
+{
+  /* IMPORTANT:
+   *   Accessing /proc/self/auxv doesn't work anymore on all
+   *   platform versions. More specifically, when running inside
+   *   a regular application process, most of /proc/self/ will be
+   *   non-readable, including /proc/self/auxv. This doesn't
+   *   happen however if the application is debuggable, or when
+   *   running under the "shell" UID, which is why this was not
+   *   detected appropriately.
+   */
+#if 0
+    uint32_t result = 0;
+    const char filepath[] = "/proc/self/auxv";
+    int fd = open(filepath, O_RDONLY);
+    if (fd < 0) {
+        D("Could not open %s: %s\n", filepath, strerror(errno));
+        return 0;
+    }
+
+    struct { uint32_t tag; uint32_t value; } entry;
+
+    for (;;) {
+        int ret = read(fd, (char*)&entry, sizeof entry);
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            D("Error while reading %s: %s\n", filepath, strerror(errno));
+            break;
+        }
+        // Detect end of list.
+        if (ret == 0 || (entry.tag == 0 && entry.value == 0))
+          break;
+        if (entry.tag == AT_HWCAP) {
+          result = entry.value;
+          break;
+        }
+    }
+    close(fd);
+    return result;
+#else
+    // Recreate ELF hwcaps by parsing /proc/cpuinfo Features tag.
+    uint32_t hwcaps = 0;
+
+    char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "Features");
+
+    if (cpuFeatures != NULL) {
+        D("Found cpuFeatures = '%s'\n", cpuFeatures);
+
+        if (has_list_item(cpuFeatures, "vfp"))
+            hwcaps |= HWCAP_VFP;
+        if (has_list_item(cpuFeatures, "vfpv3"))
+            hwcaps |= HWCAP_VFPv3;
+        if (has_list_item(cpuFeatures, "vfpv3d16"))
+            hwcaps |= HWCAP_VFPv3D16;
+        if (has_list_item(cpuFeatures, "vfpv4"))
+            hwcaps |= HWCAP_VFPv4;
+        if (has_list_item(cpuFeatures, "neon"))
+            hwcaps |= HWCAP_NEON;
+        if (has_list_item(cpuFeatures, "idiva"))
+            hwcaps |= HWCAP_IDIVA;
+        if (has_list_item(cpuFeatures, "idivt"))
+            hwcaps |= HWCAP_IDIVT;
+        if (has_list_item(cpuFeatures, "idiv"))
+            hwcaps |= HWCAP_IDIVA | HWCAP_IDIVT;
+        if (has_list_item(cpuFeatures, "iwmmxt"))
+            hwcaps |= HWCAP_IWMMXT;
+
+        free(cpuFeatures);
+    }
+    return hwcaps;
+#endif
+}
+#endif  /* __arm__ */
+
+/* Return the number of cpus present on a given device.
+ *
+ * To handle all weird kernel configurations, we need to compute the
+ * intersection of the 'present' and 'possible' CPU lists and count
+ * the result.
+ */
+static int
+get_cpu_count(void)
+{
+    CpuList cpus_present[1];
+    CpuList cpus_possible[1];
+
+    cpulist_read_from(cpus_present, "/sys/devices/system/cpu/present");
+    cpulist_read_from(cpus_possible, "/sys/devices/system/cpu/possible");
+
+    /* Compute the intersection of both sets to get the actual number of
+     * CPU cores that can be used on this device by the kernel.
+     */
+    cpulist_and(cpus_present, cpus_possible);
+
+    return cpulist_count(cpus_present);
+}
+
+static void
+android_cpuInitFamily(void)
+{
+#if defined(__arm__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_ARM;
+#elif defined(__i386__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_X86;
+#elif defined(__mips__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_MIPS;
+#else
+    g_cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+#endif
+}
+
+static void
+android_cpuInit(void)
+{
+    char* cpuinfo = NULL;
+    int   cpuinfo_len;
+
+    android_cpuInitFamily();
+
+    g_cpuFeatures = 0;
+    g_cpuCount    = 1;
+    g_inited      = 1;
+
+    cpuinfo_len = get_file_size("/proc/cpuinfo");
+    if (cpuinfo_len < 0) {
+      D("cpuinfo_len cannot be computed!");
+      return;
+    }
+    cpuinfo = malloc(cpuinfo_len);
+    if (cpuinfo == NULL) {
+      D("cpuinfo buffer could not be allocated");
+      return;
+    }
+    cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, cpuinfo_len);
+    D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
+      cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
+
+    if (cpuinfo_len < 0)  /* should not happen */ {
+        free(cpuinfo);
+        return;
+    }
+
+    /* Count the CPU cores, the value may be 0 for single-core CPUs */
+    g_cpuCount = get_cpu_count();
+    if (g_cpuCount == 0) {
+        g_cpuCount = 1;
+    }
+
+    D("found cpuCount = %d\n", g_cpuCount);
+
+#ifdef __arm__
+    {
+        char*  features = NULL;
+        char*  architecture = NULL;
+
+        /* Extract architecture from the "CPU Architecture" field.
+         * The list is well-known, unlike the output of
+         * the 'Processor' field which can vary greatly.
+         *
+         * See the definition of the 'proc_arch' array in
+         * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
+         * same file.
+         */
+        char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture");
+
+        if (cpuArch != NULL) {
+            char*  end;
+            long   archNumber;
+            int    hasARMv7 = 0;
+
+            D("found cpuArch = '%s'\n", cpuArch);
+
+            /* read the initial decimal number, ignore the rest */
+            archNumber = strtol(cpuArch, &end, 10);
+
+            /* Here we assume that ARMv8 will be upwards compatible with v7
+             * in the future. Unfortunately, there is no 'Features' field to
+             * indicate that Thumb-2 is supported.
+             */
+            if (end > cpuArch && archNumber >= 7) {
+                hasARMv7 = 1;
+            }
+
+            /* Unfortunately, it seems that certain ARMv6-based CPUs
+             * report an incorrect architecture number of 7!
+             *
+             * See http://code.google.com/p/android/issues/detail?id=10812
+             *
+             * We try to correct this by looking at the 'elf_format'
+             * field reported by the 'Processor' field, which is of the
+             * form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
+             * an ARMv6-one.
+             */
+            if (hasARMv7) {
+                char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                                      "Processor");
+                if (cpuProc != NULL) {
+                    D("found cpuProc = '%s'\n", cpuProc);
+                    if (has_list_item(cpuProc, "(v6l)")) {
+                        D("CPU processor and architecture mismatch!!\n");
+                        hasARMv7 = 0;
+                    }
+                    free(cpuProc);
+                }
+            }
+
+            if (hasARMv7) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7;
+            }
+
+            /* The LDREX / STREX instructions are available from ARMv6 */
+            if (archNumber >= 6) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX;
+            }
+
+            free(cpuArch);
+        }
+
+        /* Extract the list of CPU features from ELF hwcaps */
+        uint32_t hwcaps = get_elf_hwcap(cpuinfo, cpuinfo_len);
+
+        if (hwcaps != 0) {
+            int has_vfp = (hwcaps & HWCAP_VFP);
+            int has_vfpv3 = (hwcaps & HWCAP_VFPv3);
+            int has_vfpv3d16 = (hwcaps & HWCAP_VFPv3D16);
+            int has_vfpv4 = (hwcaps & HWCAP_VFPv4);
+            int has_neon = (hwcaps & HWCAP_NEON);
+            int has_idiva = (hwcaps & HWCAP_IDIVA);
+            int has_idivt = (hwcaps & HWCAP_IDIVT);
+            int has_iwmmxt = (hwcaps & HWCAP_IWMMXT);
+
+            // The kernel does a poor job at ensuring consistency when
+            // describing CPU features. So lots of guessing is needed.
+
+            // 'vfpv4' implies VFPv3|VFP_FMA|FP16
+            if (has_vfpv4)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3    |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_FP16 |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_FMA;
+
+            // 'vfpv3' or 'vfpv3d16' imply VFPv3. Note that unlike GCC,
+            // a value of 'vfpv3' doesn't necessarily mean that the D32
+            // feature is present, so be conservative. All CPUs in the
+            // field that support D32 also support NEON, so this should
+            // not be a problem in practice.
+            if (has_vfpv3 || has_vfpv3d16)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+
+            // 'vfp' is super ambiguous. Depending on the kernel, it can
+            // either mean VFPv2 or VFPv3. Make it depend on ARMv7.
+            if (has_vfp) {
+              if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7)
+                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+              else
+                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2;
+            }
+
+            // Neon implies VFPv3|D32, and if vfpv4 is detected, NEON_FMA
+            if (has_neon) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 |
+                                 ANDROID_CPU_ARM_FEATURE_NEON |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_D32;
+              if (has_vfpv4)
+                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA;
+            }
+
+            // VFPv3 implies VFPv2 and ARMv7
+            if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2 |
+                                 ANDROID_CPU_ARM_FEATURE_ARMv7;
+
+            if (has_idiva)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM;
+            if (has_idivt)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2;
+
+            if (has_iwmmxt)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt;
+        }
+
+        /* Extract the cpuid value from various fields */
+        // The CPUID value is broken up in several entries in /proc/cpuinfo.
+        // This table is used to rebuild it from the entries.
+        static const struct CpuIdEntry {
+            const char* field;
+            char        format;
+            char        bit_lshift;
+            char        bit_length;
+        } cpu_id_entries[] = {
+            { "CPU implementer", 'x', 24, 8 },
+            { "CPU variant", 'x', 20, 4 },
+            { "CPU part", 'x', 4, 12 },
+            { "CPU revision", 'd', 0, 4 },
+        };
+        size_t i;
+        D("Parsing /proc/cpuinfo to recover CPUID\n");
+        for (i = 0;
+             i < sizeof(cpu_id_entries)/sizeof(cpu_id_entries[0]);
+             ++i) {
+            const struct CpuIdEntry* entry = &cpu_id_entries[i];
+            char* value = extract_cpuinfo_field(cpuinfo,
+                                                cpuinfo_len,
+                                                entry->field);
+            if (value == NULL)
+                continue;
+
+            D("field=%s value='%s'\n", entry->field, value);
+            char* value_end = value + strlen(value);
+            int val = 0;
+            const char* start = value;
+            const char* p;
+            if (value[0] == '0' && (value[1] == 'x' || value[1] == 'X')) {
+              start += 2;
+              p = parse_hexadecimal(start, value_end, &val);
+            } else if (entry->format == 'x')
+              p = parse_hexadecimal(value, value_end, &val);
+            else
+              p = parse_decimal(value, value_end, &val);
+
+            if (p > (const char*)start) {
+              val &= ((1 << entry->bit_length)-1);
+              val <<= entry->bit_lshift;
+              g_cpuIdArm |= (uint32_t) val;
+            }
+
+            free(value);
+        }
+
+        // Handle kernel configuration bugs that prevent the correct
+        // reporting of CPU features.
+        static const struct CpuFix {
+            uint32_t  cpuid;
+            uint64_t  or_flags;
+        } cpu_fixes[] = {
+            /* The Nexus 4 (Qualcomm Krait) kernel configuration
+             * forgets to report IDIV support. */
+            { 0x510006f2, ANDROID_CPU_ARM_FEATURE_IDIV_ARM |
+                          ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 },
+            { 0x510006f3, ANDROID_CPU_ARM_FEATURE_IDIV_ARM |
+                          ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 },
+        };
+        size_t n;
+        for (n = 0; n < sizeof(cpu_fixes)/sizeof(cpu_fixes[0]); ++n) {
+            const struct CpuFix* entry = &cpu_fixes[n];
+
+            if (g_cpuIdArm == entry->cpuid)
+                g_cpuFeatures |= entry->or_flags;
+        }
+
+    }
+#endif /* __arm__ */
+
+#ifdef __i386__
+    int regs[4];
+
+/* According to http://en.wikipedia.org/wiki/CPUID */
+#define VENDOR_INTEL_b  0x756e6547
+#define VENDOR_INTEL_c  0x6c65746e
+#define VENDOR_INTEL_d  0x49656e69
+
+    x86_cpuid(0, regs);
+    int vendorIsIntel = (regs[1] == VENDOR_INTEL_b &&
+                         regs[2] == VENDOR_INTEL_c &&
+                         regs[3] == VENDOR_INTEL_d);
+
+    x86_cpuid(1, regs);
+    if ((regs[2] & (1 << 9)) != 0) {
+        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3;
+    }
+    if ((regs[2] & (1 << 23)) != 0) {
+        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT;
+    }
+    if (vendorIsIntel && (regs[2] & (1 << 22)) != 0) {
+        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE;
+    }
+#endif
+
+    free(cpuinfo);
+}
+
+
+AndroidCpuFamily
+android_getCpuFamily(void)
+{
+    pthread_once(&g_once, android_cpuInit);
+    return g_cpuFamily;
+}
+
+
+uint64_t
+android_getCpuFeatures(void)
+{
+    pthread_once(&g_once, android_cpuInit);
+    return g_cpuFeatures;
+}
+
+
+int
+android_getCpuCount(void)
+{
+    pthread_once(&g_once, android_cpuInit);
+    return g_cpuCount;
+}
+
+static void
+android_cpuInitDummy(void)
+{
+    g_inited = 1;
+}
+
+int
+android_setCpu(int cpu_count, uint64_t cpu_features)
+{
+    /* Fail if the library was already initialized. */
+    if (g_inited)
+        return 0;
+
+    android_cpuInitFamily();
+    g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count);
+    g_cpuFeatures = cpu_features;
+    pthread_once(&g_once, android_cpuInitDummy);
+
+    return 1;
+}
+
+#ifdef __arm__
+uint32_t
+android_getCpuIdArm(void)
+{
+    pthread_once(&g_once, android_cpuInit);
+    return g_cpuIdArm;
+}
+
+int
+android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id)
+{
+    if (!android_setCpu(cpu_count, cpu_features))
+        return 0;
+
+    g_cpuIdArm = cpu_id;
+    return 1;
+}
+#endif  /* __arm__ */
+
+/*
+ * Technical note: Making sense of ARM's FPU architecture versions.
+ *
+ * FPA was ARM's first attempt at an FPU architecture. There is no Android
+ * device that actually uses it since this technology was already obsolete
+ * when the project started. If you see references to FPA instructions
+ * somewhere, you can be sure that this doesn't apply to Android at all.
+ *
+ * FPA was followed by "VFP", soon renamed "VFPv1" due to the emergence of
+ * new versions / additions to it. ARM considers this obsolete right now,
+ * and no known Android device implements it either.
+ *
+ * VFPv2 added a few instructions to VFPv1, and is an *optional* extension
+ * supported by some ARMv5TE, ARMv6 and ARMv6T2 CPUs. Note that a device
+ * supporting the 'armeabi' ABI doesn't necessarily support these.
+ *
+ * VFPv3-D16 adds a few instructions on top of VFPv2 and is typically used
+ * on ARMv7-A CPUs which implement a FPU. Note that it is also mandated
+ * by the Android 'armeabi-v7a' ABI. The -D16 suffix in its name means
+ * that it provides 16 double-precision FPU registers (d0-d15) and 32
+ * single-precision ones (s0-s31) which happen to be mapped to the same
+ * register banks.
+ *
+ * VFPv3-D32 is the name of an extension to VFPv3-D16 that provides 16
+ * additional double precision registers (d16-d31). Note that there are
+ * still only 32 single precision registers.
+ *
+ * VFPv3xD is a *subset* of VFPv3-D16 that only provides single-precision
+ * registers. It is only used on ARMv7-M (i.e. on micro-controllers) which
+ * are not supported by Android. Note that it is not compatible with VFPv2.
+ *
+ * NOTE: The term 'VFPv3' usually designates either VFPv3-D16 or VFPv3-D32
+ *       depending on context. For example GCC uses it for VFPv3-D32, but
+ *       the Linux kernel code uses it for VFPv3-D16 (especially in
+ *       /proc/cpuinfo). Always try to use the full designation when
+ *       possible.
+ *
+ * NEON, a.k.a. "ARM Advanced SIMD" is an extension that provides
+ * instructions to perform parallel computations on vectors of 8, 16,
+ * 32, 64 and 128 bit quantities. NEON requires VFPv3-D32 since all
+ * NEON registers are also mapped to the same register banks.
+ *
+ * VFPv4-D16, adds a few instructions on top of VFPv3-D16 in order to
+ * perform fused multiply-accumulate on VFP registers, as well as
+ * half-precision (16-bit) conversion operations.
+ *
+ * VFPv4-D32 is VFPv4-D16 with 32, instead of 16, FPU double precision
+ * registers.
+ *
+ * VFPv4-NEON is VFPv4-D32 with NEON instructions. It also adds fused
+ * multiply-accumulate instructions that work on the NEON registers.
+ *
+ * NOTE: Similarly, "VFPv4" might either reference VFPv4-D16 or VFPv4-D32
+ *       depending on context.
+ *
+ * The following information was determined by scanning the binutils-2.22
+ * sources:
+ *
+ * Basic VFP instruction subsets:
+ *
+ * #define FPU_VFP_EXT_V1xD 0x08000000     // Base VFP instruction set.
+ * #define FPU_VFP_EXT_V1   0x04000000     // Double-precision insns.
+ * #define FPU_VFP_EXT_V2   0x02000000     // ARM10E VFPr1.
+ * #define FPU_VFP_EXT_V3xD 0x01000000     // VFPv3 single-precision.
+ * #define FPU_VFP_EXT_V3   0x00800000     // VFPv3 double-precision.
+ * #define FPU_NEON_EXT_V1  0x00400000     // Neon (SIMD) insns.
+ * #define FPU_VFP_EXT_D32  0x00200000     // Registers D16-D31.
+ * #define FPU_VFP_EXT_FP16 0x00100000     // Half-precision extensions.
+ * #define FPU_NEON_EXT_FMA 0x00080000     // Neon fused multiply-add
+ * #define FPU_VFP_EXT_FMA  0x00040000     // VFP fused multiply-add
+ *
+ * FPU types (excluding NEON)
+ *
+ * FPU_VFP_V1xD (EXT_V1xD)
+ *    |
+ *    +--------------------------+
+ *    |                          |
+ * FPU_VFP_V1 (+EXT_V1)       FPU_VFP_V3xD (+EXT_V2+EXT_V3xD)
+ *    |                          |
+ *    |                          |
+ * FPU_VFP_V2 (+EXT_V2)       FPU_VFP_V4_SP_D16 (+EXT_FP16+EXT_FMA)
+ *    |
+ * FPU_VFP_V3D16 (+EXT_V3xD+EXT_V3)
+ *    |
+ *    +--------------------------+
+ *    |                          |
+ * FPU_VFP_V3 (+EXT_D32)     FPU_VFP_V4D16 (+EXT_FP16+EXT_FMA)
+ *    |                          |
+ *    |                      FPU_VFP_V4 (+EXT_D32)
+ *    |
+ * FPU_VFP_HARD (+EXT_FMA+NEON_EXT_FMA)
+ *
+ * VFP architectures:
+ *
+ * ARCH_VFP_V1xD  (EXT_V1xD)
+ *   |
+ *   +------------------+
+ *   |                  |
+ *   |             ARCH_VFP_V3xD (+EXT_V2+EXT_V3xD)
+ *   |                  |
+ *   |             ARCH_VFP_V3xD_FP16 (+EXT_FP16)
+ *   |                  |
+ *   |             ARCH_VFP_V4_SP_D16 (+EXT_FMA)
+ *   |
+ * ARCH_VFP_V1 (+EXT_V1)
+ *   |
+ * ARCH_VFP_V2 (+EXT_V2)
+ *   |
+ * ARCH_VFP_V3D16 (+EXT_V3xD+EXT_V3)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3D16_FP16  (+EXT_FP16)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
+ *   |                   |
+ *   |         ARCH_VFP_V4 (+EXT_D32)
+ *   |                   |
+ *   |         ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
+ *   |
+ * ARCH_VFP_V3 (+EXT_D32)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3_FP16 (+EXT_FP16)
+ *   |
+ * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
+ *   |
+ * ARCH_NEON_FP16 (+EXT_FP16)
+ *
+ * -fpu=<name> values and their correspondence with FPU architectures above:
+ *
+ *   {"vfp",               FPU_ARCH_VFP_V2},
+ *   {"vfp9",              FPU_ARCH_VFP_V2},
+ *   {"vfp3",              FPU_ARCH_VFP_V3}, // For backwards compatibility.
+ *   {"vfp10",             FPU_ARCH_VFP_V2},
+ *   {"vfp10-r0",          FPU_ARCH_VFP_V1},
+ *   {"vfpxd",             FPU_ARCH_VFP_V1xD},
+ *   {"vfpv2",             FPU_ARCH_VFP_V2},
+ *   {"vfpv3",             FPU_ARCH_VFP_V3},
+ *   {"vfpv3-fp16",        FPU_ARCH_VFP_V3_FP16},
+ *   {"vfpv3-d16",         FPU_ARCH_VFP_V3D16},
+ *   {"vfpv3-d16-fp16",    FPU_ARCH_VFP_V3D16_FP16},
+ *   {"vfpv3xd",           FPU_ARCH_VFP_V3xD},
+ *   {"vfpv3xd-fp16",      FPU_ARCH_VFP_V3xD_FP16},
+ *   {"neon",              FPU_ARCH_VFP_V3_PLUS_NEON_V1},
+ *   {"neon-fp16",         FPU_ARCH_NEON_FP16},
+ *   {"vfpv4",             FPU_ARCH_VFP_V4},
+ *   {"vfpv4-d16",         FPU_ARCH_VFP_V4D16},
+ *   {"fpv4-sp-d16",       FPU_ARCH_VFP_V4_SP_D16},
+ *   {"neon-vfpv4",        FPU_ARCH_NEON_VFP_V4},
+ *
+ *
+ * Simplified diagram that only includes FPUs supported by Android:
+ * Only ARCH_VFP_V3D16 is actually mandated by the armeabi-v7a ABI,
+ * all others are optional and must be probed at runtime.
+ *
+ * ARCH_VFP_V3D16 (EXT_V1xD+EXT_V1+EXT_V2+EXT_V3xD+EXT_V3)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3D16_FP16  (+EXT_FP16)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
+ *   |                   |
+ *   |         ARCH_VFP_V4 (+EXT_D32)
+ *   |                   |
+ *   |         ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
+ *   |
+ * ARCH_VFP_V3 (+EXT_D32)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3_FP16 (+EXT_FP16)
+ *   |
+ * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
+ *   |
+ * ARCH_NEON_FP16 (+EXT_FP16)
+ *
+ */
+
+#endif // defined(__le32__)
diff --git a/PxShared/src/foundation/externals/src/android/cpu-features.h b/PxShared/src/foundation/externals/src/android/cpu-features.h
new file mode 100644
index 0000000..89f7666
--- /dev/null
+++ b/PxShared/src/foundation/externals/src/android/cpu-features.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef CPU_FEATURES_H
+#define CPU_FEATURES_H
+
+#include <sys/cdefs.h>
+#include <stdint.h>
+
+__BEGIN_DECLS
+
+typedef enum
+{
+	ANDROID_CPU_FAMILY_UNKNOWN = 0,
+	ANDROID_CPU_FAMILY_ARM,
+	ANDROID_CPU_FAMILY_X86,
+	ANDROID_CPU_FAMILY_MIPS,
+	ANDROID_CPU_FAMILY_MAX /* do not remove */
+} AndroidCpuFamily;
+
+/* Return family of the device's CPU */
+extern AndroidCpuFamily android_getCpuFamily(void);
+
+/* The list of feature flags for ARM CPUs that can be recognized by the
+ * library. Value details are:
+ *
+ *   VFPv2:
+ *     CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs
+ *     support these instructions. VFPv2 is a subset of VFPv3 so this will
+ *     be set whenever VFPv3 is set too.
+ *
+ *   ARMv7:
+ *     CPU supports the ARMv7-A basic instruction set.
+ *     This feature is mandated by the 'armeabi-v7a' ABI.
+ *
+ *   VFPv3:
+ *     CPU supports the VFPv3-D16 instruction set, providing hardware FPU
+ *     support for single and double precision floating point registers.
+ *     Note that only 16 FPU registers are available by default, unless
+ *     the D32 bit is set too. This feature is also mandated by the
+ *     'armeabi-v7a' ABI.
+ *
+ *   VFP_D32:
+ *     CPU VFP optional extension that provides 32 FPU registers,
+ *     instead of 16. Note that ARM mandates this feature if the 'NEON'
+ *     feature is implemented by the CPU.
+ *
+ *   NEON:
+ *     CPU FPU supports "ARM Advanced SIMD" instructions, also known as
+ *     NEON. Note that this mandates the VFP_D32 feature as well, per the
+ *     ARM Architecture specification.
+ *
+ *   VFP_FP16:
+ *     Half-width floating precision VFP extension. If set, the CPU
+ *     supports instructions to perform floating-point operations on
+ *     16-bit registers. This is part of the VFPv4 specification, but
+ *     not mandated by any Android ABI.
+ *
+ *   VFP_FMA:
+ *     Fused multiply-accumulate VFP instructions extension. Also part of
+ *     the VFPv4 specification, but not mandated by any Android ABI.
+ *
+ *   NEON_FMA:
+ *     Fused multiply-accumulate NEON instructions extension. Optional
+ *     extension from the VFPv4 specification, but not mandated by any
+ *     Android ABI.
+ *
+ *   IDIV_ARM:
+ *     Integer division available in ARM mode. Only available
+ *     on recent CPUs (e.g. Cortex-A15).
+ *
+ *   IDIV_THUMB2:
+ *     Integer division available in Thumb-2 mode. Only available
+ *     on recent CPUs (e.g. Cortex-A15).
+ *
+ *   iWMMXt:
+ *     Optional extension that adds MMX registers and operations to an
+ *     ARM CPU. This is only available on a few XScale-based CPU designs
+ *     sold by Marvell. Pretty rare in practice.
+ *
+ * If you want to tell the compiler to generate code that targets one of
+ * the feature set above, you should probably use one of the following
+ * flags (for more details, see technical note at the end of this file):
+ *
+ *   -mfpu=vfp
+ *   -mfpu=vfpv2
+ *     These are equivalent and tell GCC to use VFPv2 instructions for
+ *     floating-point operations. Use this if you want your code to
+ *     run on *some* ARMv6 devices, and any ARMv7-A device supported
+ *     by Android.
+ *
+ *     Generated code requires VFPv2 feature.
+ *
+ *   -mfpu=vfpv3-d16
+ *     Tell GCC to use VFPv3 instructions (using only 16 FPU registers).
+ *     This should be generic code that runs on any CPU that supports the
+ *     'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this.
+ *
+ *     Generated code requires VFPv3 feature.
+ *
+ *   -mfpu=vfpv3
+ *     Tell GCC to use VFPv3 instructions with 32 FPU registers.
+ *     Generated code requires VFPv3|VFP_D32 features.
+ *
+ *   -mfpu=neon
+ *     Tell GCC to use VFPv3 instructions with 32 FPU registers, and
+ *     also support NEON intrinsics (see <arm_neon.h>).
+ *     Generated code requires VFPv3|VFP_D32|NEON features.
+ *
+ *   -mfpu=vfpv4-d16
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA features.
+ *
+ *   -mfpu=vfpv4
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features.
+ *
+ *   -mfpu=neon-vfpv4
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA
+ *     features.
+ *
+ *   -mcpu=cortex-a7
+ *   -mcpu=cortex-a15
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|
+ *                             NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2
+ *     This flag implies -mfpu=neon-vfpv4.
+ *
+ *   -mcpu=iwmmxt
+ *     Allows the use of iWMMXt intrinsics with GCC.
+ */
+enum
+{
+	ANDROID_CPU_ARM_FEATURE_ARMv7       = (1 << 0),
+	ANDROID_CPU_ARM_FEATURE_VFPv3       = (1 << 1),
+	ANDROID_CPU_ARM_FEATURE_NEON        = (1 << 2),
+	ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3),
+	ANDROID_CPU_ARM_FEATURE_VFPv2       = (1 << 4),
+	ANDROID_CPU_ARM_FEATURE_VFP_D32     = (1 << 5),
+	ANDROID_CPU_ARM_FEATURE_VFP_FP16    = (1 << 6),
+	ANDROID_CPU_ARM_FEATURE_VFP_FMA     = (1 << 7),
+	ANDROID_CPU_ARM_FEATURE_NEON_FMA    = (1 << 8),
+	ANDROID_CPU_ARM_FEATURE_IDIV_ARM    = (1 << 9),
+	ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10),
+	ANDROID_CPU_ARM_FEATURE_iWMMXt      = (1 << 11),
+};
+
+enum
+{
+	ANDROID_CPU_X86_FEATURE_SSSE3  = (1 << 0),
+	ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1),
+	ANDROID_CPU_X86_FEATURE_MOVBE  = (1 << 2),
+};
+
+extern uint64_t android_getCpuFeatures(void);
+
+/* Return the number of CPU cores detected on this device. */
+extern int android_getCpuCount(void);
+
+/* The following is used to force the CPU count and features
+ * mask in sandboxed processes. Under 4.1 and higher, these processes
+ * cannot access /proc, which is the only way to get information from
+ * the kernel about the current hardware (at least on ARM).
+ *
+ * It _must_ be called only once, and before any android_getCpuXXX
+ * function, any other case will fail.
+ *
+ * This function returns 1 on success, and 0 on failure.
+ */
+extern int android_setCpu(int cpu_count, uint64_t cpu_features);
+
+#ifdef __arm__
+/* Retrieve the ARM 32-bit CPUID value from the kernel.
+ * Note that this cannot work on sandboxed processes under 4.1 and
+ * higher, unless you called android_setCpuArm() before.
+ */
+extern uint32_t android_getCpuIdArm(void);
+
+/* An ARM-specific variant of android_setCpu() that also allows you
+ * to set the ARM CPUID field.
+ */
+extern int android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id);
+#endif
+
+__END_DECLS
+
+#endif /* CPU_FEATURES_H */
diff --git a/PxShared/src/foundation/include/Ps.h b/PxShared/src/foundation/include/Ps.h
new file mode 100644
index 0000000..89fc9c7
--- /dev/null
+++ b/PxShared/src/foundation/include/Ps.h
@@ -0,0 +1,70 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PS_H
+#define PSFOUNDATION_PS_H
+
+/*! \file top level include file for shared foundation */
+
+#include "foundation/Px.h"
+
+/**
+Platform specific defines
+*/
+#if PX_WINDOWS_FAMILY || PX_XBOXONE
+#pragma intrinsic(memcmp)
+#pragma intrinsic(memcpy)
+#pragma intrinsic(memset)
+#pragma intrinsic(abs)
+#pragma intrinsic(labs)
+#endif
+
+// An expression that should expand to nothing in non PX_CHECKED builds.
+// We currently use this only for tagging the purpose of containers for memory use tracking.
+#if PX_CHECKED
+#define PX_DEBUG_EXP(x) (x)
+#else
+#define PX_DEBUG_EXP(x)
+#endif
+
+#define PX_SIGN_BITMASK 0x80000000
+
+namespace physx
+{
+namespace shdfnd
+{
+// Int-as-bool type (a plain int; IntFalse == 0, IntTrue == 1) - has some uses for efficiency and with SIMD
+typedef int IntBool;
+static const IntBool IntFalse = 0;
+static const IntBool IntTrue = 1;
+} // namespace shdfnd
+
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PS_H
diff --git a/PxShared/src/foundation/include/PsAlignedMalloc.h b/PxShared/src/foundation/include/PsAlignedMalloc.h
new file mode 100644
index 0000000..4be8409
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAlignedMalloc.h
@@ -0,0 +1,88 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSALIGNEDMALLOC_H
+#define PSFOUNDATION_PSALIGNEDMALLOC_H
+
+#include "PsUserAllocated.h"
+
+/*!
+Allocate aligned memory.
+Alignment must be a power of 2!
+-- should be templated by a base allocator
+*/
+
+namespace physx
+{
+namespace shdfnd
+{
+/**
+Allocator, which is used to access the global PxAllocatorCallback instance
+(used for dynamic data types template instantiation), which can align memory
+*/
+
+// SCS: AlignedMalloc with 3 params not found, seems not used on PC either
+// disabled for now to avoid GCC error
+
+/**
+Allocator adaptor that hands out N-byte aligned memory on top of BaseAllocator.
+
+N must be a power of two. Every request is enlarged by N - 1 + sizeof(size_t) bytes. The
+returned pointer is rounded down to an N-byte boundary inside that slack, and the distance
+back to the start of the underlying block is stored in the size_t slot directly in front of
+it, which is how deallocate() recovers the pointer to give back to BaseAllocator.
+*/
+template <uint32_t N, typename BaseAllocator = NonTrackingAllocator>
+class AlignedAllocator : public BaseAllocator
+{
+  public:
+	AlignedAllocator(const BaseAllocator& base = BaseAllocator()) : BaseAllocator(base)
+	{
+	}
+
+	/**
+	Allocates \c size bytes aligned to N.
+	\param size number of usable bytes requested
+	\param file source file forwarded to BaseAllocator
+	\param line source line forwarded to BaseAllocator
+	\return the aligned pointer, or NULL if BaseAllocator fails or the padded size would overflow
+	*/
+	void* allocate(size_t size, const char* file, int line)
+	{
+		const size_t pad = N - 1 + sizeof(size_t); // alignment slack plus room for the stored offset
+
+		// size + pad would wrap around for huge requests and produce a block that is too small
+		// for the offset bookkeeping below, so reject such requests up front.
+		if(size > ~size_t(0) - pad)
+			return NULL;
+
+		uint8_t* base = reinterpret_cast<uint8_t*>(BaseAllocator::allocate(size + pad, file, line));
+		if(!base)
+			return NULL;
+
+		// Round (base + pad) down to a multiple of N; size_t(N) makes the mask pointer-wide.
+		uint8_t* ptr = reinterpret_cast<uint8_t*>(size_t(base + pad) & ~(size_t(N) - 1));
+		reinterpret_cast<size_t*>(ptr)[-1] = size_t(ptr - base); // store offset for deallocate()
+
+		return ptr;
+	}
+
+	/**
+	Releases memory obtained from allocate(). NULL is ignored.
+	*/
+	void deallocate(void* ptr)
+	{
+		if(ptr == NULL)
+			return;
+
+		// The slot in front of ptr holds the distance back to the block BaseAllocator returned.
+		uint8_t* base = reinterpret_cast<uint8_t*>(ptr) - reinterpret_cast<size_t*>(ptr)[-1];
+		BaseAllocator::deallocate(base);
+	}
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSALIGNEDMALLOC_H
diff --git a/PxShared/src/foundation/include/PsAlloca.h b/PxShared/src/foundation/include/PsAlloca.h
new file mode 100644
index 0000000..add64c4
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAlloca.h
@@ -0,0 +1,76 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSALLOCA_H
+#define PSFOUNDATION_PSALLOCA_H
+
+#include "PsTempAllocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+/**
+Pointer holder used by PX_ALLOCA. Converts implicitly to T* and, when mOwned is set,
+returns mPointer to Alloc in its destructor.
+*/
+template <typename T, typename Alloc = TempAllocator>
+class ScopedPointer : private Alloc
+{
+  public:
+	// Start out empty and non-owning so the destructor never acts on indeterminate
+	// members if the object is created without both fields being filled in afterwards.
+	ScopedPointer() : mPointer(0), mOwned(false)
+	{
+	}
+
+	~ScopedPointer()
+	{
+		if(mOwned)
+			Alloc::deallocate(mPointer);
+	}
+
+	operator T*() const
+	{
+		return mPointer;
+	}
+
+	T* mPointer; // memory this object refers to
+	bool mOwned; // true if mPointer must be handed back to Alloc on destruction
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+/*! Stack allocation for \c count instances of \c type. Falling back to temp allocator if using more than 1kB.
+    Declares \c var in the enclosing scope. Outside of SPU builds \c var is a ScopedPointer that converts to
+    \c type* and releases temp-allocator memory when it goes out of scope.
+    The byte count is kept in a macro-private local (pxAllocaByteCount_) so that a \c count expression which
+    mentions a caller variable named \c size is not captured by the macro. */
+#ifdef __SPU__
+#define PX_ALLOCA(var, type, count) type* var = reinterpret_cast<type*>(PxAlloca(sizeof(type) * (count)))
+#else
+#define PX_ALLOCA(var, type, count)                                                                                    \
+	physx::shdfnd::ScopedPointer<type> var;                                                                            \
+	{                                                                                                                  \
+		uint32_t pxAllocaByteCount_ = sizeof(type) * (count);                                                          \
+		var.mOwned = pxAllocaByteCount_ > 1024;                                                                        \
+		if(var.mOwned)                                                                                                 \
+			var.mPointer = reinterpret_cast<type*>(                                                                    \
+			    physx::shdfnd::TempAllocator().allocate(pxAllocaByteCount_, __FILE__, __LINE__));                      \
+		else                                                                                                           \
+			var.mPointer = reinterpret_cast<type*>(PxAlloca(pxAllocaByteCount_));                                      \
+	}
+#endif
+#endif // #ifndef PSFOUNDATION_PSALLOCA_H
diff --git a/PxShared/src/foundation/include/PsAllocator.h b/PxShared/src/foundation/include/PsAllocator.h
new file mode 100644
index 0000000..cbf32d3
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAllocator.h
@@ -0,0 +1,367 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSALLOCATOR_H
+#define PSFOUNDATION_PSALLOCATOR_H
+
+#include "foundation/PxAllocatorCallback.h"
+#include "foundation/PxFoundation.h"
+#include "Ps.h"
+#include "foundation/PxAssert.h"
+
+#if(PX_WINDOWS_FAMILY || PX_XBOXONE)
+#include <exception>
+#include <typeinfo.h>
+#endif
+#if(PX_APPLE_FAMILY)
+#include <typeinfo>
+#endif
+
+#include <new>
+
+// Allocation macros going through user allocator. The name argument is only used in PX_CHECKED builds; PX_ALLOC_TEMP is an alias of PX_ALLOC.
+#if PX_CHECKED
+#define PX_ALLOC(n, name) physx::shdfnd::NamedAllocator(name).allocate(n, __FILE__, __LINE__)
+#else
+#define PX_ALLOC(n, name) physx::shdfnd::NonTrackingAllocator().allocate(n, __FILE__, __LINE__)
+#endif
+#define PX_ALLOC_TEMP(n, name) PX_ALLOC(n, name)
+#define PX_FREE(x) physx::shdfnd::NonTrackingAllocator().deallocate(x)
+#define PX_FREE_AND_RESET(x)                                                                                           \
+	{                                                                                                                  \
+		PX_FREE(x);                                                                                                    \
+		x = 0;                                                                                                         \
+	}
+
+// The following macros support plain-old-types and classes derived from UserAllocated. PX_DELETE_AND_RESET, PX_DELETE_POD and PX_DELETE_ARRAY expand to a braced block that names x twice (so x must be an lvalue without side effects); PX_DELETE_ARRAY expands to 'delete [] x'.
+#define PX_NEW(T) new (physx::shdfnd::ReflectionAllocator<T>(), __FILE__, __LINE__) T
+#define PX_NEW_TEMP(T) PX_NEW(T)
+#define PX_DELETE(x) delete x
+#define PX_DELETE_AND_RESET(x)                                                                                         \
+	{                                                                                                                  \
+		PX_DELETE(x);                                                                                                  \
+		x = 0;                                                                                                         \
+	}
+#define PX_DELETE_POD(x)                                                                                               \
+	{                                                                                                                  \
+		PX_FREE(x);                                                                                                    \
+		x = 0;                                                                                                         \
+	}
+#define PX_DELETE_ARRAY(x)                                                                                             \
+	{                                                                                                                  \
+		PX_DELETE([] x);                                                                                               \
+		x = 0;                                                                                                         \
+	}
+
+// aligned allocation
+#define PX_ALIGNED16_ALLOC(n) physx::shdfnd::AlignedAllocator<16>().allocate(n, __FILE__, __LINE__)
+#define PX_ALIGNED16_FREE(x) physx::shdfnd::AlignedAllocator<16>().deallocate(x)
+
+//! placement new macro to make it easy to spot bad use of 'new'
+#define PX_PLACEMENT_NEW(p, T) new (p) T
+
+#if PX_DEBUG || PX_CHECKED
+#define PX_USE_NAMED_ALLOCATOR 1
+#else
+#define PX_USE_NAMED_ALLOCATOR 0
+#endif
+
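+// PxAlloca must stay a macro: alloca() memory is released when the function that called it returns, so it cannot be wrapped in a function, inline or not.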
+#if PX_WINDOWS_FAMILY
+#include <malloc.h>
+#define PxAlloca(x) _alloca(x)
+#elif PX_LINUX || PX_ANDROID
+#include <malloc.h>
+#define PxAlloca(x) alloca(x)
+#elif PX_APPLE_FAMILY
+#include <alloca.h>
+#define PxAlloca(x) alloca(x)
+#elif PX_PS4
+#include <memory.h>
+#define PxAlloca(x) alloca(x)
+#elif PX_XBOXONE
+#include <malloc.h>
+#define PxAlloca(x) alloca(x)
+#elif PX_NX
+#include <malloc.h>
+#define PxAlloca(x) alloca(x)
+#endif
+
+// Stack-allocates x bytes plus alignment bytes of slack and yields the first address in that block that is a
+// multiple of alignment (which must be a power of two). The result is a size_t, not a pointer.
+// Both parameters are parenthesized so that expression arguments (e.g. 'n << 1' or 'c ? 16 : 32') expand correctly.
+#define PxAllocaAligned(x, alignment)                                                                                  \
+	((size_t(PxAlloca((x) + (alignment))) + ((alignment) - 1)) & ~size_t((alignment) - 1))
+
+namespace physx
+{
+namespace shdfnd
+{
+
+PX_FOUNDATION_API PxAllocatorCallback& getAllocator();
+
+/**
+Allocator used to access the global PxAllocatorCallback instance without providing additional information.
+*/
+
+class PX_FOUNDATION_API Allocator
+{
+  public:
+	Allocator(const char* = 0) // the optional name argument is accepted but ignored
+	{
+	}
+	void* allocate(size_t size, const char* file, int line); // declared here, defined out of line
+	void deallocate(void* ptr);                              // declared here, defined out of line
+};
+
+/*
+ * Bootstrap allocator using malloc/free.
+ * Don't use unless your objects get allocated before foundation is initialized.
+ */
+class RawAllocator
+{
+  public:
+	RawAllocator(const char* = 0) // the optional name argument is accepted but ignored
+	{
+	}
+	void* allocate(size_t size, const char*, int) // file and line arguments are ignored
+	{
+		// malloc(0) may return either NULL or a unique pointer (implementation-defined); the result is returned unchecked
+		return ::malloc(size);
+	}
+	void deallocate(void* ptr)
+	{
+		// free(0) is guaranteed to have no side effect, no need to check
+		::free(ptr);
+	}
+};
+
+/*
+ * Allocator that simply calls straight back to the application without tracking.
+ * This is used by the heap (Foundation::mNamedAllocMap) that tracks allocations
+ * because it needs to be able to grow as a result of an allocation.
+ * Making the hash table re-entrant to deal with this may not make sense.
+ */
+class NonTrackingAllocator
+{
+  public:
+	// The optional name argument exists for interface compatibility and is ignored.
+	PX_FORCE_INLINE NonTrackingAllocator(const char* = 0)
+	{
+	}
+
+	// Zero-byte requests yield a null pointer without touching the allocator callback;
+	// everything else is forwarded under the fixed tag "NonTrackedAlloc".
+	PX_FORCE_INLINE void* allocate(size_t size, const char* file, int line)
+	{
+		if(size == 0)
+			return 0;
+		return getAllocator().allocate(size, "NonTrackedAlloc", file, line);
+	}
+
+	// Null pointers are ignored; anything else goes straight back to the allocator callback.
+	PX_FORCE_INLINE void deallocate(void* ptr)
+	{
+		if(!ptr)
+			return;
+		getAllocator().deallocate(ptr);
+	}
+};
+
+/*
+\brief	Virtual allocator callback used to provide run-time defined allocators to foundation types like Array or Bitmap.
+        This is used by VirtualAllocator
+*/
+class VirtualAllocatorCallback
+{
+  public:
+	VirtualAllocatorCallback()
+	{
+	}
+	virtual ~VirtualAllocatorCallback() // virtual so implementations can be destroyed through this interface
+	{
+	}
+	virtual void* allocate(const size_t size, const char* file, const int line) = 0; // supplied by the concrete callback
+	virtual void deallocate(void* ptr) = 0;                                          // supplied by the concrete callback
+};
+
+/*
+\brief Virtual allocator to be used by foundation types to provide run-time defined allocators.
+Because Array extends its allocator rather than containing a reference/pointer to it, the VirtualAllocator
+must be a concrete type containing a pointer to a virtual callback. The callback may not be available at
+instantiation time, therefore methods are provided to set the callback later.
+*/
+class VirtualAllocator
+{
+  public:
+	VirtualAllocator(VirtualAllocatorCallback* callback = NULL) : mCallback(callback)
+	{
+	}
+
+	// Forwards a non-empty request to the callback; a zero-byte request yields NULL.
+	// A callback is expected to be set in either case (asserted).
+	void* allocate(const size_t size, const char* file, const int line)
+	{
+		PX_ASSERT(mCallback);
+		if(!size)
+			return NULL;
+		return mCallback->allocate(size, file, line);
+	}
+
+	// Hands a non-NULL pointer back to the callback; NULL is ignored.
+	// A callback is expected to be set in either case (asserted).
+	void deallocate(void* ptr)
+	{
+		PX_ASSERT(mCallback);
+		if(!ptr)
+			return;
+		mCallback->deallocate(ptr);
+	}
+
+	// Installs (or replaces) the callback used by allocate()/deallocate().
+	void setCallback(VirtualAllocatorCallback* callback)
+	{
+		mCallback = callback;
+	}
+
+	// Returns the callback currently in use, which may be NULL.
+	VirtualAllocatorCallback* getCallback()
+	{
+		return mCallback;
+	}
+
+  private:
+	VirtualAllocatorCallback* mCallback;
+	VirtualAllocator& operator=(const VirtualAllocator&); // private and only declared here
+};
+
+#if PX_USE_NAMED_ALLOCATOR // can be slow, so only use in debug/checked
+class PX_FOUNDATION_API NamedAllocator
+{
+  public:
+	NamedAllocator(const PxEMPTY);
+	NamedAllocator(const char* name = 0); // todo: should not have default argument!
+	NamedAllocator(const NamedAllocator&);
+	~NamedAllocator();
+	NamedAllocator& operator=(const NamedAllocator&);
+	void* allocate(size_t size, const char* filename, int line);
+	void deallocate(void* ptr);
+};
+#else
+class NamedAllocator; // forward declaration only; the class is not defined in this configuration
+#endif // PX_USE_NAMED_ALLOCATOR
+
+/**
+Allocator used to access the global PxAllocatorCallback instance using a static name derived from T.
+*/
+
+template <typename T>
+class ReflectionAllocator
+{
+	// Chooses the tag passed to the allocator callback for allocations of T.
+	// The type-derived string is only produced when the foundation reports allocation names.
+	static const char* getName()
+	{
+		const bool reportNames = PxGetFoundation().getReportAllocationNames();
+		if(reportNames)
+		{
+#if PX_GCC_FAMILY
+			return __PRETTY_FUNCTION__;
+#else
+			// name() calls malloc(), raw_name() wouldn't
+			return typeid(T).name();
+#endif
+		}
+		return "<allocation names disabled>";
+	}
+
+  public:
+	ReflectionAllocator(const PxEMPTY)
+	{
+	}
+	ReflectionAllocator(const char* = 0)
+	{
+	}
+	inline ReflectionAllocator(const ReflectionAllocator&)
+	{
+	}
+
+	// Zero-byte requests return a null pointer without consulting the allocator callback.
+	void* allocate(size_t size, const char* filename, int line)
+	{
+		if(!size)
+			return 0;
+		return getAllocator().allocate(size, getName(), filename, line);
+	}
+
+	// Null pointers are ignored.
+	void deallocate(void* ptr)
+	{
+		if(!ptr)
+			return;
+		getAllocator().deallocate(ptr);
+	}
+};
+
+template <typename T>
+struct AllocatorTraits
+{
+#if PX_USE_NAMED_ALLOCATOR
+	typedef NamedAllocator Type; // T is not used in this configuration
+#else
+	typedef ReflectionAllocator<T> Type; // allocator tagged with the element type T
+#endif
+};
+
+// if you get a build error here, you are trying to PX_NEW a class
+// that is neither plain-old-type nor derived from UserAllocated
+template <typename T, typename X>
+union EnableIfPod
+{
+	int i;
+	T t; // NOTE(review): presumably relies on non-POD types being rejected as union members; C++11 relaxes that rule - confirm
+	typedef X Type; // the type callers name; reachable only if the union itself compiles for T
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+// Global placement new for ReflectionAllocator templated by
+// plain-old-type. Allows using PX_NEW for pointers and built-in-types.
+//
+// ATTENTION: You need to use PX_DELETE_POD or PX_FREE to deallocate
+// memory, not PX_DELETE. PX_DELETE_POD redirects to PX_FREE.
+//
+// Rationale: PX_DELETE uses global operator delete(void*), which we don't want to overload.
+// Any other definition of PX_DELETE couldn't support array syntax 'PX_DELETE([]a);'.
+// PX_DELETE_POD was preferred over PX_DELETE_ARRAY because it is used
+// less often and applies to both single instances and arrays.
+template <typename T>
+PX_INLINE void* operator new(size_t size, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
+                             typename physx::shdfnd::EnableIfPod<T, int>::Type line)
+{
+	return alloc.allocate(size, fileName, line); // line is an int, spelled via EnableIfPod<T, int>::Type
+}
+
+template <typename T>
+PX_INLINE void* operator new [](size_t size, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
+                                typename physx::shdfnd::EnableIfPod<T, int>::Type line)
+{ return alloc.allocate(size, fileName, line); } // array form; same forwarding as the single-object overload
+
+// Placement delete matching the ReflectionAllocator placement new: called if construction after that placement new throws.
+template <typename T>
+PX_INLINE void operator delete(void* ptr, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
+                               typename physx::shdfnd::EnableIfPod<T, int>::Type line)
+{
+	PX_UNUSED(fileName); // only present to mirror the placement-new signature
+	PX_UNUSED(line);     // only present to mirror the placement-new signature
+
+	alloc.deallocate(ptr);
+}
+
+// Array placement delete matching the ReflectionAllocator placement new[]: called if construction after that placement new[] throws.
+template <typename T>
+PX_INLINE void operator delete [](void* ptr, physx::shdfnd::ReflectionAllocator<T> alloc, const char* fileName,
+                                  typename physx::shdfnd::EnableIfPod<T, int>::Type line)
+{
+	PX_UNUSED(fileName); // only present to mirror the placement-new[] signature
+	PX_UNUSED(line);     // only present to mirror the placement-new[] signature
+
+	alloc.deallocate(ptr);
+}
+
+#endif // #ifndef PSFOUNDATION_PSALLOCATOR_H
diff --git a/PxShared/src/foundation/include/PsAoS.h b/PxShared/src/foundation/include/PsAoS.h
new file mode 100644
index 0000000..641a40a
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAoS.h
@@ -0,0 +1,45 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSAOS_H
+#define PSFOUNDATION_PSAOS_H
+
+#include "foundation/Px.h"
+
+#if PX_WINDOWS && !PX_NEON
+#include "windows/PsWindowsAoS.h"
+#elif(PX_UNIX_FAMILY || PX_PS4 || PX_NX)
+#include "unix/PsUnixAoS.h"
+#elif PX_XBOXONE
+#include "XboxOne/PsXboxOneAoS.h"
+#else
+#error "Platform not supported!"
+#endif
+
+#endif
diff --git a/PxShared/src/foundation/include/PsArray.h b/PxShared/src/foundation/include/PsArray.h
new file mode 100644
index 0000000..8433fbe
--- /dev/null
+++ b/PxShared/src/foundation/include/PsArray.h
@@ -0,0 +1,806 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSARRAY_H
+#define PSFOUNDATION_PSARRAY_H
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxIntrinsics.h"
+#include "PsAllocator.h"
+#include "PsBasicTemplates.h"
+
+#if PX_LIBCPP
+#include <type_traits>
+#else
+#include <tr1/type_traits>
+#endif
+
+#if PX_VC == 9 || PX_VC == 10
+#pragma warning(push)
+#pragma warning(disable : 4347) // behavior change: 'function template' is called instead of 'function'
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+template <class Serializer>
+void exportArray(Serializer& stream, const void* data, uint32_t size, uint32_t sizeOfElement, uint32_t capacity);
+char* importArray(char* address, void** data, uint32_t size, uint32_t sizeOfElement, uint32_t capacity);
+
+/*!
+An array is a sequential container.
+
+Implementation note
+* entries between 0 and size are valid objects
+* we use inheritance to build this because the array is included inline in a lot
+  of objects and we want the allocator to take no space if it's not stateful, which
+  aggregation doesn't allow. Also, we want the metadata at the front for the inline
+  case where the allocator contains some inline storage space
+*/
+template <class T, class Alloc = typename AllocatorTraits<T>::Type>
+class Array : protected Alloc
+{
+  public:
+	typedef T* Iterator;
+	typedef const T* ConstIterator;
+
+	explicit Array(const PxEMPTY v) : Alloc(v)
+	{
+		if(mData)
+			mCapacity |= PX_SIGN_BITMASK;
+	}
+
+	/*!
+	Default array constructor. Initialize an empty array
+	*/
+	PX_INLINE explicit Array(const Alloc& alloc = Alloc()) : Alloc(alloc), mData(0), mSize(0), mCapacity(0)
+	{
+	}
+
+	/*!
+	Construct an array and resize() it to \c size, passing \c a as the element value
+	*/
+	PX_INLINE explicit Array(uint32_t size, const T& a = T(), const Alloc& alloc = Alloc())
+	: Alloc(alloc), mData(0), mSize(0), mCapacity(0)
+	{
+		resize(size, a);
+	}
+
+	/*!
+	Copy-constructor. Copy all entries from other array
+	*/
+	template <class A>
+	PX_INLINE explicit Array(const Array<T, A>& other, const Alloc& alloc = Alloc())
+	: Alloc(alloc)
+	{
+		copy(other);
+	}
+
+	// This is necessary else the basic default copy constructor is used in the case of both arrays being of the same
+	// template instance
+	// The C++ standard clearly states that a template constructor is never a copy constructor [2]. In other words,
+	// the presence of a template constructor does not suppress the implicit declaration of the copy constructor.
+	// Also never make a copy constructor explicit, or copy-initialization* will no longer work. This is because
+	// 'binding an rvalue to a const reference requires an accessible copy constructor' (http://gcc.gnu.org/bugs/)
+	// *http://stackoverflow.com/questions/1051379/is-there-a-difference-in-c-between-copy-initialization-and-assignment-initializ
+	PX_INLINE Array(const Array& other, const Alloc& alloc = Alloc()) : Alloc(alloc)
+	{
+		copy(other);
+	}
+
+	/*!
+	Initialize array from the range [first, last); an inverted range (last < first) gives an empty array
+	*/
+	PX_INLINE explicit Array(const T* first, const T* last, const Alloc& alloc = Alloc())
+	: Alloc(alloc), mSize(last < first ? 0 : uint32_t(last - first)), mCapacity(mSize)
+	{
+		mData = allocate(mSize);
+		copy(mData, mData + mSize, first);
+	}
+
+	/*!
+	Destructor
+	*/
+	PX_INLINE ~Array()
+	{
+		destroy(mData, mData + mSize);
+
+		if(capacity() && !isInUserMemory())
+			deallocate(mData);
+	}
+
+	/*!
+	Assignment operator. Copy content (deep-copy)
+	*/
+	template <class A>
+	PX_INLINE Array& operator=(const Array<T, A>& rhs)
+	{
+		if(&rhs == this)
+			return *this;
+
+		clear();
+		reserve(rhs.mSize);
+		copy(mData, mData + rhs.mSize, rhs.mData);
+
+		mSize = rhs.mSize;
+		return *this;
+	}
+
+	PX_INLINE Array& operator=(const Array& t) // Needs to be declared, see comment at copy-constructor
+	{
+		return operator=<Alloc>(t);
+	}
+
+	PX_FORCE_INLINE static bool isArrayOfPOD()
+	{
+#if PX_LIBCPP
+		return std::is_trivially_copyable<T>::value;
+#else
+		return std::tr1::is_pod<T>::value;
+#endif
+	}
+
+	/*!
+	Array indexing operator.
+	\param i
+	The index of the element that will be returned.
+	\return
+	The element i in the array.
+	*/
+	PX_FORCE_INLINE const T& operator[](uint32_t i) const
+	{
+		PX_ASSERT(i < mSize);
+		return mData[i];
+	}
+
+	/*!
+	Array indexing operator.
+	\param i
+	The index of the element that will be returned.
+	\return
+	The element i in the array.
+	*/
+	PX_FORCE_INLINE T& operator[](uint32_t i)
+	{
+		PX_ASSERT(i < mSize);
+		return mData[i];
+	}
+
+	/*!
+	Returns a pointer to the initial element of the array.
+	\return
+	a pointer to the initial element of the array.
+	*/
+	PX_FORCE_INLINE ConstIterator begin() const
+	{
+		return mData;
+	}
+
+	PX_FORCE_INLINE Iterator begin()
+	{
+		return mData;
+	}
+
+	/*!
+	Returns an iterator beyond the last element of the array. Do not dereference.
+	\return
+	a pointer to the element beyond the last element of the array.
+	*/
+
+	PX_FORCE_INLINE ConstIterator end() const
+	{
+		return mData + mSize; // one past the last element; based on size, not capacity
+	}
+
+	PX_FORCE_INLINE Iterator end()
+	{
+		return mData + mSize; // one past the last element; based on size, not capacity
+	}
+
+	/*!
+	Returns a reference to the first element of the array. Undefined if the array is empty.
+	\return a reference to the first element of the array
+	*/
+
+	PX_FORCE_INLINE const T& front() const
+	{
+		PX_ASSERT(mSize); // an empty array has no first element
+		return mData[0];
+	}
+
+	PX_FORCE_INLINE T& front()
+	{
+		PX_ASSERT(mSize); // an empty array has no first element
+		return mData[0];
+	}
+
+	/*!
+	Returns a reference to the last element of the array. Undefined if the array is empty
+	\return a reference to the last element of the array
+	*/
+
+	PX_FORCE_INLINE const T& back() const
+	{
+		PX_ASSERT(mSize); // guards the mSize - 1 index below against wrapping on an empty array
+		return mData[mSize - 1];
+	}
+
+	PX_FORCE_INLINE T& back()
+	{
+		PX_ASSERT(mSize); // guards the mSize - 1 index below against wrapping on an empty array
+		return mData[mSize - 1];
+	}
+
+	/*!
+	Returns the number of entries in the array. This can, and probably will,
+	differ from the array capacity.
+	\return
+	The number of entries in the array.
+	*/
+	PX_FORCE_INLINE uint32_t size() const
+	{
+		return mSize; // element count, not the allocated capacity
+	}
+
+	/*!
+	Clears the array.
+	*/
+	PX_INLINE void clear()
+	{
+		// destroys the elements only; mData and mCapacity are left untouched, so the
+		// storage stays available for reuse
+		destroy(mData, mData + mSize);
+		mSize = 0;
+	}
+
+	/*!
+	Returns whether the array is empty (i.e. whether its size is 0).
+	\return
+	true if the array is empty
+	*/
+	PX_FORCE_INLINE bool empty() const
+	{
+		return mSize == 0; // says nothing about capacity; storage may still be allocated
+	}
+
+	/*!
+	Finds the first occurrence of an element in the array.
+	\param a
+	The element to find.
+	*/
+
+	PX_INLINE Iterator find(const T& a)
+	{
+		// linear scan using operator!=; stops at the first element that does not differ
+		// from a, or at mSize (which makes the result equal to end()) when none is found
+		uint32_t pos = 0;
+		while(pos < mSize && mData[pos] != a)
+			++pos;
+		return mData + pos;
+	}
+
+	PX_INLINE ConstIterator find(const T& a) const
+	{
+		// same scan as the non-const overload
+		uint32_t pos = 0;
+		while(pos < mSize && mData[pos] != a)
+			++pos;
+		return mData + pos;
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	/*!
+	Adds one element to the end of the array. Operation is O(1).
+	\param a
+	The element that will be added to this array.
+	*/
+	/////////////////////////////////////////////////////////////////////////
+
+	PX_FORCE_INLINE T& pushBack(const T& a)
+	{
+		if(mSize < capacity())
+		{
+			// spare capacity available: copy-construct a in the first free slot
+			T* slot = mData + mSize;
+			PX_PLACEMENT_NEW(reinterpret_cast<void*>(slot), T)(a);
+			++mSize;
+			return *slot;
+		}
+
+		// storage exhausted: the out-of-line slow path reallocates and appends
+		return growAndPushBack(a);
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	/*!
+	Removes the element at the end of the array and returns a copy of it. Only legal if the array is non-empty.
+	*/
+	/////////////////////////////////////////////////////////////////////////
+	PX_INLINE T popBack()
+	{
+		PX_ASSERT(mSize);
+
+		// take the copy that is returned before the stored element goes away
+		T t = mData[mSize - 1];
+		--mSize;
+
+		// POD element types need no destructor call
+		if(!isArrayOfPOD())
+			mData[mSize].~T();
+
+		return t;
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	/*!
+	Construct one element at the end of the array. Operation is O(1).
+	*/
+	/////////////////////////////////////////////////////////////////////////
+	PX_INLINE T& insert()
+	{
+		if(mSize >= capacity())
+			grow(capacityIncrement());
+
+		T* slot = mData + mSize;
+		++mSize;
+		// 'new T' rather than 'new T()': PODs are deliberately left uninitialized
+		new (slot) T;
+		return *slot;
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	/*!
+	Subtracts the element on position i from the array and replace it with
+	the last element.
+	Operation is O(1)
+	\param i
+	The position of the element that will be subtracted from this array.
+	*/
+	/////////////////////////////////////////////////////////////////////////
+	PX_INLINE void replaceWithLast(uint32_t i)
+	{
+		PX_ASSERT(i < mSize);
+
+		// shrink first; 'last' is then the index of the element that fills the hole
+		const uint32_t last = --mSize;
+		mData[i] = mData[last];
+
+		// the vacated tail slot still holds a live object for non-POD types
+		if(!isArrayOfPOD())
+			mData[last].~T();
+	}
+
+	PX_INLINE void replaceWithLast(Iterator i)
+	{
+		// translate the iterator into an index and reuse the overload above
+		const uint32_t index = static_cast<uint32_t>(i - mData);
+		replaceWithLast(index);
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	/*!
+	Replaces the first occurrence of the element a with the last element
+	Operation is O(n)
+	\param a
+	The element to search for; its first occurrence is removed from this array.
+	\return true if the element has been removed.
+	*/
+	/////////////////////////////////////////////////////////////////////////
+
+	PX_INLINE bool findAndReplaceWithLast(const T& a)
+	{
+		// linear scan; like find(), only operator!= is used for the comparison
+		for(uint32_t index = 0; index < mSize; ++index)
+		{
+			if(mData[index] != a)
+				continue;
+
+			// first match: overwrite it with the last element and shrink by one
+			replaceWithLast(index);
+			return true;
+		}
+		return false;
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	/*!
+	Subtracts the element on position i from the array. Shift the entire
+	array one step.
+	Operation is O(n)
+	\param i
+	The position of the element that will be subtracted from this array.
+	*/
+	/////////////////////////////////////////////////////////////////////////
+	PX_INLINE void remove(uint32_t i)
+	{
+		PX_ASSERT(i < mSize);
+
+		if(isArrayOfPOD())
+		{
+			// nothing to shift when the element being removed is the last one
+			if(i + 1 != mSize)
+			{
+				// slide the tail down by one slot; source and destination overlap, hence memMove
+				physx::intrinsics::memMove(mData + i, mData + i + 1, (mSize - i - 1) * sizeof(T));
+			}
+		}
+		else
+		{
+			// Invariant: 'it' always points at a slot whose object has been destroyed.
+			// Each iteration copy-constructs the next element into that slot and then
+			// destroys the element's old location, which becomes the new empty slot.
+			T* it = mData + i;
+			it->~T();
+			while (++i < mSize)
+			{
+				new (it) T(mData[i]);
+				++it;
+				it->~T();
+			}
+		}
+		--mSize;
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	/*!
+	Removes a range from the array.  Shifts the array so order is maintained.
+	Operation is O(n)
+	\param begin
+	The starting position of the element that will be subtracted from this array.
+	\param count
+	The number of elements that will be subtracted from this array.
+	*/
+	/////////////////////////////////////////////////////////////////////////
+	PX_INLINE void removeRange(uint32_t begin, uint32_t count)
+	{
+		PX_ASSERT(begin < mSize);
+		PX_ASSERT((begin + count) <= mSize);
+
+		T* hole = mData + begin;                            // first slot of the removed range
+		T* tail = hole + count;                             // first element behind the removed range
+		const uint32_t tailCount = mSize - (begin + count); // elements that have to move down
+
+		if(isArrayOfPOD())
+		{
+			// PODs need no destructor calls: one overlapping block move closes the gap
+			physx::intrinsics::memMove(hole, tail, tailCount * sizeof(T));
+		}
+		else
+		{
+			// destroy the removed elements first ...
+			destroy(hole, tail);
+
+			// ... then relocate every tail element: copy-construct it at its new
+			// position and destroy the object left behind at the old one
+			for(uint32_t i = 0; i < tailCount; ++i, ++hole, ++tail)
+			{
+				new (hole) T(*tail);
+				tail->~T();
+			}
+		}
+		mSize -= count;
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Resize array
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_NOINLINE void resize(const uint32_t size, const T& a = T());
+
+	PX_NOINLINE void resizeUninitialized(const uint32_t size);
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Resize array such that only as much memory is allocated to hold the
+	existing elements
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_INLINE void shrink()
+	{
+		// recreate() is called unconditionally, i.e. also when capacity already equals size
+		recreate(mSize);
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Deletes all array elements and frees memory.
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_INLINE void reset()
+	{
+		resize(0); // destroys every element; size becomes 0
+		shrink();  // recreate(0): frees the block unless it is user memory, mData becomes null
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Ensure that the array has at least size capacity.
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_INLINE void reserve(const uint32_t capacity)
+	{
+		// never shrinks: a request at or below the current capacity is a no-op
+		if(capacity > this->capacity())
+			grow(capacity);
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Query the capacity(allocated mem) for the array.
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_FORCE_INLINE uint32_t capacity() const
+	{
+		// mask out the user-memory flag bit that shares the mCapacity member
+		return mCapacity & ~PX_SIGN_BITMASK;
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Unsafe function to force the size of the array
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_FORCE_INLINE void forceSize_Unsafe(uint32_t size)
+	{
+		// Sets the element count directly; no constructors or destructors are run.
+		// The check uses capacity() rather than the raw mCapacity member: the raw value
+		// also carries the user-memory flag bit, which would let any size pass the
+		// assertion for arrays living in user-provided memory.
+		PX_ASSERT(size <= capacity());
+		mSize = size;
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Swap contents of an array without allocating temporary storage
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_INLINE void swap(Array<T, Alloc>& other)
+	{
+		// Exchanges the three data members only; the user-memory flag travels with mCapacity.
+		// NOTE(review): the Alloc base sub-objects are not exchanged - presumably fine for
+		// stateless allocators; confirm for allocators that carry state.
+		shdfnd::swap(mData, other.mData);
+		shdfnd::swap(mSize, other.mSize);
+		shdfnd::swap(mCapacity, other.mCapacity);
+	}
+
+	//////////////////////////////////////////////////////////////////////////
+	/*!
+	Assign a range of values to this vector (resizes to length of range)
+	*/
+	//////////////////////////////////////////////////////////////////////////
+	PX_INLINE void assign(const T* first, const T* last)
+	{
+		// Replaces the contents with a copy of [first, last). The range must not point
+		// into this array's own storage.
+
+		// A reversed range is treated as empty, matching the range constructor, instead
+		// of wrapping around to a huge unsigned element count.
+		const uint32_t count = last < first ? 0 : uint32_t(last - first);
+
+		// Destroy the current elements before their slots are reused: copy() placement-
+		// constructs, so writing over live objects (or simply dropping the ones beyond
+		// the new size) would skip their destructors for non-POD element types.
+		clear();
+		reserve(count);
+		copy(mData, mData + count, first);
+		mSize = count;
+	}
+
+	// We need one bit to mark arrays that have been deserialized from a user-provided memory block.
+	// For alignment & memory saving purpose we store that bit in the rarely used capacity member.
+	PX_FORCE_INLINE uint32_t isInUserMemory() const
+	{
+		// returns the masked flag bit itself (non-zero when set), not a normalized 0/1 value
+		return mCapacity & PX_SIGN_BITMASK;
+	}
+
+	/// return reference to allocator
+	PX_INLINE Alloc& getAllocator()
+	{
+		return *this; // Alloc is a base class of Array, so the array object is its own allocator
+	}
+
+  protected:
+	// constructor for where we don't own the memory
+	Array(T* memory, uint32_t size, uint32_t capacity, const Alloc& alloc = Alloc())
+	: Alloc(alloc), mData(memory), mSize(size), mCapacity(capacity | PX_SIGN_BITMASK)
+	{
+		// PX_SIGN_BITMASK tags the buffer as user memory, so the destructor and the
+		// reallocation paths (which test isInUserMemory()) never pass it to deallocate()
+	}
+
+	template <class A>
+	PX_NOINLINE void copy(const Array<T, A>& other);
+
+	PX_INLINE T* allocate(uint32_t size)
+	{
+		// a zero-element request performs no allocation at all
+		if(size == 0)
+			return 0;
+
+		T* p = reinterpret_cast<T*>(Alloc::allocate(sizeof(T) * size, __FILE__, __LINE__));
+#if PX_CHECKED
+		// Checked builds mark the fresh block with the 0xcd pattern. This is used to check
+		// that the meta data definition for serialized classes is complete.
+		if(p)
+		{
+			uint8_t* bytes = reinterpret_cast<uint8_t*>(p);
+			for(uint32_t i = 0; i < (sizeof(T) * size); ++i)
+				bytes[i] = 0xcd;
+		}
+#endif
+		return p;
+	}
+
+	PX_INLINE void deallocate(void* mem)
+	{
+		Alloc::deallocate(mem); // forwarded unchanged to the allocator base class
+	}
+
+	// Returns true only when T is a POD type (per isArrayOfPOD()) and every byte of
+	// object is zero; create() uses this to replace per-element construction by a zero fill.
+	static PX_INLINE bool isZeroInit(const T& object)
+	{
+		if (!isArrayOfPOD())
+			return false;
+		char ZeroBuffOnStack[sizeof(object)] = {};
+		// bgaldrikian - casting to void* to avoid compiler error:
+		// error : first operand of this 'memcmp' call is a pointer to dynamic class [...]; vtable pointer will be compared [-Werror,-Wdynamic-class-memaccess]
+		// even though POD check prevents memcmp from being used on a dynamic class
+		return memcmp(reinterpret_cast<const void*>(&object), ZeroBuffOnStack, sizeof(object)) == 0;
+	}
+
+	static PX_INLINE void create(T* first, T* last, const T& a)
+	{
+		if(!isZeroInit(a))
+		{
+			// general case: copy-construct a into every slot of [first, last)
+			while(first < last)
+			{
+				::new (first) T(a);
+				++first;
+			}
+			return;
+		}
+
+		// a is an all-zero POD, so zero-filling the range yields the same result
+		if(last > first)
+			physx::intrinsics::memZero(first, uint32_t((last - first) * sizeof(T)));
+	}
+
+	static PX_INLINE void copy(T* first, T* last, const T* src)
+	{
+		// Fills the raw destination range [first, last) with copies of the elements
+		// starting at src. An empty or reversed destination range is a no-op.
+		if(first < last)
+		{
+			if(!isArrayOfPOD())
+			{
+				// non-POD: copy-construct element by element into the raw slots
+				while(first < last)
+				{
+					::new (first) T(*src);
+					++first;
+					++src;
+				}
+			}
+			else
+			{
+				// POD: one raw block copy
+				physx::intrinsics::memCopy(first, src, uint32_t((last - first) * sizeof(T)));
+			}
+		}
+	}
+
+	static PX_INLINE void destroy(T* first, T* last)
+	{
+		// POD element types have nothing to tear down
+		if(isArrayOfPOD())
+			return;
+
+		// run the destructor of every element in [first, last); storage is not freed here
+		while(first < last)
+		{
+			first->~T();
+			++first;
+		}
+	}
+
+	/*!
+	Called when pushBack() needs to grow the array.
+	\param a The element that will be added to this array.
+	*/
+	PX_NOINLINE T& growAndPushBack(const T& a);
+
+	/*!
+	Resizes the available memory for the array.
+
+	\param capacity
+	The number of entries that the set should be able to hold.
+	*/
+	PX_INLINE void grow(uint32_t capacity)
+	{
+		PX_ASSERT(this->capacity() < capacity); // callers must request a strictly larger capacity
+		recreate(capacity);
+	}
+
+	/*!
+	Creates a new memory block, copies all entries to the new block and destroys old entries.
+
+	\param capacity
+	The number of entries that the set should be able to hold.
+	*/
+	PX_NOINLINE void recreate(uint32_t capacity);
+
+	// The idea here is to prevent accidental bugs with pushBack or insert. Unfortunately
+	// it interacts badly with InlineArrays with smaller inline allocations.
+	// TODO(dsequeira): policy template arg, this is exactly what they're for.
+	PX_INLINE uint32_t capacityIncrement() const
+	{
+		// doubling growth policy; an array without storage starts at one element
+		const uint32_t current = this->capacity();
+		if(current == 0)
+			return 1;
+		return current * 2;
+	}
+
+	T* mData;
+	uint32_t mSize;
+	uint32_t mCapacity;
+};
+
+template <class T, class Alloc>
+PX_NOINLINE void Array<T, Alloc>::resize(const uint32_t size, const T& a)
+{
+	reserve(size);
+	// At most one of the next two calls does any work: create() fills [mSize, size) with
+	// copies of a when growing, destroy() tears down [size, mSize) when shrinking; the
+	// other call sees an empty (reversed) range and does nothing.
+	// NOTE(review): if a refers to an element of this array, reserve() may already have
+	// reallocated the storage it lives in - confirm callers never pass such a reference.
+	create(mData + mSize, mData + size, a);
+	destroy(mData + size, mData + mSize);
+	mSize = size;
+}
+
+template <class T, class Alloc>
+template <class A>
+PX_NOINLINE void Array<T, Alloc>::copy(const Array<T, A>& other)
+{
+	// Initializes the data members from other. Only used during construction, so any
+	// previous member values are overwritten without being released.
+	const uint32_t count = other.size();
+	if(count == 0)
+	{
+		// an empty source needs no allocation at all
+		mData = NULL;
+		mSize = 0;
+		mCapacity = 0;
+		return;
+	}
+
+	// capacity is sized to fit the copied elements exactly
+	mCapacity = count;
+	mSize = count;
+	mData = allocate(count);
+	copy(mData, mData + count, other.begin());
+}
+
+template <class T, class Alloc>
+PX_NOINLINE void Array<T, Alloc>::resizeUninitialized(const uint32_t size)
+{
+	reserve(size);
+	// no constructors or destructors run here: elements gained are left uninitialized
+	// and elements dropped are not destroyed
+	mSize = size;
+}
+
+template <class T, class Alloc>
+PX_NOINLINE T& Array<T, Alloc>::growAndPushBack(const T& a)
+{
+	// Slow path of pushBack(): move to a larger block, then append a copy of a.
+	uint32_t capacity = capacityIncrement();
+
+	T* newData = allocate(capacity);
+	PX_ASSERT((!capacity) || (newData && (newData != mData)));
+	// copy-construct the existing elements into the new block
+	copy(newData, newData + mSize, mData);
+
+	// inserting element before destroying old array
+	// avoids referencing destroyed object when duplicating array element.
+	PX_PLACEMENT_NEW(reinterpret_cast<void*>(newData + mSize), T)(a);
+
+	destroy(mData, mData + mSize);
+	// user-provided memory is never freed by the array
+	if(!isInUserMemory())
+		deallocate(mData);
+
+	mData = newData;
+	mCapacity = capacity; // stored as computed, without OR-ing in the user-memory flag
+
+	return mData[mSize++];
+}
+
+template <class T, class Alloc>
+PX_NOINLINE void Array<T, Alloc>::recreate(uint32_t capacity)
+{
+	// allocate() yields 0 for capacity == 0, in which case the array ends up without storage
+	T* newData = allocate(capacity);
+	PX_ASSERT((!capacity) || (newData && (newData != mData)));
+
+	// copy-construct the live elements into the new block, then destroy the originals
+	copy(newData, newData + mSize, mData);
+	destroy(mData, mData + mSize);
+	// user-provided memory is never freed by the array
+	if(!isInUserMemory())
+		deallocate(mData);
+
+	mData = newData;
+	mCapacity = capacity; // stored as passed in, without OR-ing in the user-memory flag
+}
+
+// Free-function overload that forwards to the member Array::swap().
+template <class T, class Alloc>
+PX_INLINE void swap(Array<T, Alloc>& x, Array<T, Alloc>& y)
+{
+	x.swap(y);
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_VC == 9 || PX_VC == 10
+#pragma warning(pop)
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSARRAY_H
diff --git a/PxShared/src/foundation/include/PsAtomic.h b/PxShared/src/foundation/include/PsAtomic.h
new file mode 100644
index 0000000..23df190
--- /dev/null
+++ b/PxShared/src/foundation/include/PsAtomic.h
@@ -0,0 +1,63 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSATOMIC_H
+#define PSFOUNDATION_PSATOMIC_H
+
+#include "Ps.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+/* set *dest equal to val. Return the old value of *dest */
+PX_FOUNDATION_API int32_t atomicExchange(volatile int32_t* dest, int32_t val);
+
+/* if *dest == comp, replace with exch. Return original value of *dest */
+PX_FOUNDATION_API int32_t atomicCompareExchange(volatile int32_t* dest, int32_t exch, int32_t comp);
+
+/* if *dest == comp, replace with exch. Return original value of *dest */
+PX_FOUNDATION_API void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp);
+
+/* increment the specified location. Return the incremented value */
+PX_FOUNDATION_API int32_t atomicIncrement(volatile int32_t* val);
+
+/* decrement the specified location. Return the decremented value */
+PX_FOUNDATION_API int32_t atomicDecrement(volatile int32_t* val);
+
+/* add delta to *val. Return the new value */
+PX_FOUNDATION_API int32_t atomicAdd(volatile int32_t* val, int32_t delta);
+
+/* compute the maximum of *val and val2. Return the new value */
+PX_FOUNDATION_API int32_t atomicMax(volatile int32_t* val, int32_t val2);
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSATOMIC_H
diff --git a/PxShared/src/foundation/include/PsBasicTemplates.h b/PxShared/src/foundation/include/PsBasicTemplates.h
new file mode 100644
index 0000000..514da05
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBasicTemplates.h
@@ -0,0 +1,146 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSBASICTEMPLATES_H
+#define PSFOUNDATION_PSBASICTEMPLATES_H
+
+#include "Ps.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+// Function object that compares two values of type A with operator==.
+template <typename A>
+struct Equal
+{
+	bool operator()(const A& a, const A& b) const
+	{
+		return a == b;
+	}
+};
+
+// Function object that compares two values of type A with operator<.
+template <typename A>
+struct Less
+{
+	bool operator()(const A& a, const A& b) const
+	{
+		return a < b;
+	}
+};
+
+// Function object that compares two values of type A with operator>.
+template <typename A>
+struct Greater
+{
+	bool operator()(const A& a, const A& b) const
+	{
+		return a > b;
+	}
+};
+
+// Simple two-member aggregate with equality and lexicographic ordering.
+template <class F, class S>
+class Pair
+{
+  public:
+	F first;
+	S second;
+
+	// both members are initialized from value-initialized temporaries
+	Pair() : first(F()), second(S())
+	{
+	}
+
+	Pair(const F& f, const S& s) : first(f), second(s)
+	{
+	}
+
+	Pair(const Pair& p) : first(p.first), second(p.second)
+	{
+	}
+
+	// CN - fix for /.../PsBasicTemplates.h(61) : warning C4512: 'physx::shdfnd::Pair<F,S>' : assignment operator could
+	// not be generated
+	Pair& operator=(const Pair& p)
+	{
+		first = p.first;
+		second = p.second;
+		return *this;
+	}
+
+	// member-wise equality
+	bool operator==(const Pair& p) const
+	{
+		return first == p.first && second == p.second;
+	}
+
+	// lexicographic order expressed through operator< only: 'first' decides unless
+	// neither side's 'first' is less than the other's, then 'second' decides
+	bool operator<(const Pair& p) const
+	{
+		if(first < p.first)
+			return true;
+		if(p.first < first)
+			return false;
+		return second < p.second;
+	}
+};
+
+// Compile-time floor(log2(A)): each instantiation shifts A right by one bit and adds 1
+// until the LogTwo<1> specialization ends the recursion with 0.
+// A must be non-zero: there is no LogTwo<0> specialization to terminate the recursion.
+template <unsigned int A>
+struct LogTwo
+{
+	static const unsigned int value = LogTwo<(A >> 1)>::value + 1;
+};
+template <>
+struct LogTwo<1>
+{
+	static const unsigned int value = 0;
+};
+
+// Type trait: UnConst<T>::Type is T with a top-level const qualifier removed
+// (the partial specialization matches 'const T'; every other type maps to itself).
+template <typename T>
+struct UnConst
+{
+	typedef T Type;
+};
+template <typename T>
+struct UnConst<const T>
+{
+	typedef T Type;
+};
+
+// Returns p advanced by 'offset' bytes (the arithmetic is done on a char pointer),
+// reinterpreted as T. T has to be named explicitly by the caller.
+template <typename T>
+T pointerOffset(void* p, ptrdiff_t offset)
+{
+	return reinterpret_cast<T>(reinterpret_cast<char*>(p) + offset);
+}
+// Overload for pointers to const data; the byte arithmetic is the same.
+template <typename T>
+T pointerOffset(const void* p, ptrdiff_t offset)
+{
+	return reinterpret_cast<T>(reinterpret_cast<const char*>(p) + offset);
+}
+
+// Exchanges x and y through a temporary: one copy construction and two copy assignments.
+template <class T>
+PX_CUDA_CALLABLE PX_INLINE void swap(T& x, T& y)
+{
+	const T tmp = x;
+	x = y;
+	y = tmp;
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSBASICTEMPLATES_H
diff --git a/PxShared/src/foundation/include/PsBitUtils.h b/PxShared/src/foundation/include/PsBitUtils.h
new file mode 100644
index 0000000..f69f47a
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBitUtils.h
@@ -0,0 +1,109 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSBITUTILS_H
+#define PSFOUNDATION_PSBITUTILS_H
+
+#include "foundation/PxIntrinsics.h"
+#include "foundation/PxAssert.h"
+#include "PsIntrinsics.h"
+#include "Ps.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+// Returns the number of set bits in v, using the parallel bit-counting method
+// referenced below.
+PX_INLINE uint32_t bitCount(uint32_t v)
+{
+	// from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+	uint32_t const w = v - ((v >> 1) & 0x55555555);                // per 2-bit field: count of set bits
+	uint32_t const x = (w & 0x33333333) + ((w >> 2) & 0x33333333); // per 4-bit field
+	return (((x + (x >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;       // per byte, summed into the top byte
+}
+
+// True when exactly one bit of x is set; zero is explicitly rejected.
+PX_INLINE bool isPowerOfTwo(uint32_t x)
+{
+	return x != 0 && (x & (x - 1)) == 0; // x & (x - 1) clears the lowest set bit
+}
+
+// "Next Largest Power of 2
+// Given a binary integer value x, the next largest power of 2 can be computed by a SWAR algorithm
+// that recursively "folds" the upper bits into the lower bits. This process yields a bit vector with
+// the same most significant 1 as x, but all 1's below it. Adding 1 to that value yields the next
+// largest power of 2. For a 32-bit value:"
+PX_INLINE uint32_t nextPowerOfTwo(uint32_t x)
+{
+	// The result is strictly greater than x: an exact power of two maps to the next one
+	// (e.g. 4 -> 8) and 0 maps to 1. For x >= 0x80000000 the fold yields 0xFFFFFFFF and
+	// the final + 1 wraps around to 0.
+	x |= (x >> 1);
+	x |= (x >> 2);
+	x |= (x >> 4);
+	x |= (x >> 8);
+	x |= (x >> 16);
+	return x + 1;
+}
+
+/*!
+Return the index of the lowest set bit. Not valid for zero arg.
+*/
+
+PX_INLINE uint32_t lowestSetBit(uint32_t x)
+{
+	PX_ASSERT(x); // the unchecked helper below is not valid for a zero argument
+	return lowestSetBitUnsafe(x);
+}
+
+/*!
+Return the index of the highest set bit. Not valid for zero arg.
+*/
+
+PX_INLINE uint32_t highestSetBit(uint32_t x)
+{
+	PX_ASSERT(x); // the unchecked helper below is not valid for a zero argument
+	return highestSetBitUnsafe(x);
+}
+
+// Helper function to approximate log2 of an integer value
+// assumes that the input is actually power of two.
+// todo: replace 2 usages with 'highestSetBit'
+PX_INLINE uint32_t ilog2(uint32_t num)
+{
+	// Counts how many additional right shifts it takes, after the first one, until num
+	// becomes zero. That is the index of the highest set bit for non-zero input, and 0
+	// for an input of 0. A 32-bit value is always exhausted after at most 32 shifts.
+	uint32_t result = 0;
+	for(num >>= 1; num != 0; num >>= 1)
+		++result;
+	return result;
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSBITUTILS_H
diff --git a/PxShared/src/foundation/include/PsBroadcast.h b/PxShared/src/foundation/include/PsBroadcast.h
new file mode 100644
index 0000000..3eb1cba
--- /dev/null
+++ b/PxShared/src/foundation/include/PsBroadcast.h
@@ -0,0 +1,277 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXBROADCAST_H
+#define PXPVDSDK_PXBROADCAST_H
+
+#include "Ps.h"
+#include "PsInlineArray.h"
+
+#include "foundation/PxSimpleTypes.h"
+#include "foundation/PxErrorCallback.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+/**
+\brief Abstract listener class that listens to allocation and deallocation events from the
+    foundation memory system.
+
+<b>Threading:</b> All methods of this class should be thread safe as it can be called from the user thread
+or the physics processing thread(s).
+*/
+class AllocationListener
+{
+  public:
+	/**
+	\brief callback when memory is allocated.
+	\param size Size of the allocation in bytes.
+	\param typeName Type this data is being allocated for.
+	\param filename File the allocation came from.
+	\param line Line the allocation came from.
+	\param allocatedMemory memory that will be returned from the allocation.
+	*/
+	virtual void onAllocation(size_t size, const char* typeName, const char* filename, int line,
+	                          void* allocatedMemory) = 0;
+
+	/**
+	\brief callback when memory is deallocated.
+	\param allocatedMemory memory that is about to be deallocated.
+	*/
+	virtual void onDeallocation(void* allocatedMemory) = 0;
+
+  protected:
+	// protected destructor: listeners cannot be deleted through this interface
+	virtual ~AllocationListener()
+	{
+	}
+};
+
+/**
+\brief Broadcast class implementation, registering listeners.
+
+<b>Threading:</b> All methods of this class should be thread safe as it can be called from the user thread
+or the physics processing thread(s). There is no internal locking.
+*/
+template <class Listener, class Base>
+class Broadcast : public Base
+{
+  public:
+	/// Upper bound on the number of listeners that can be registered at the same time.
+	static const uint32_t MAX_NB_LISTENERS = 16;
+
+	/**
+	\brief The default constructor.
+	*/
+	Broadcast()
+	{
+	}
+
+	/**
+	\brief Register new listener.
+
+	\note It is NOT SAFE to register and deregister listeners while allocations may be taking place.
+	moreover, there is no thread safety to registration/deregistration.
+
+	\note The request is silently ignored once MAX_NB_LISTENERS listeners are registered.
+
+	\param listener Listener to register.
+	*/
+	void registerListener(Listener& listener)
+	{
+		if(mListeners.size() < MAX_NB_LISTENERS)
+			mListeners.pushBack(&listener);
+	}
+
+	/**
+	\brief Deregister an existing listener.
+
+	\note It is NOT SAFE to register and deregister listeners while allocations may be taking place.
+	moreover, there is no thread safety to registration/deregistration.
+
+	\note The removed entry is replaced by the last one, so listener order is not preserved.
+
+	\param listener Listener to deregister.
+	*/
+	void deregisterListener(Listener& listener)
+	{
+		mListeners.findAndReplaceWithLast(&listener);
+	}
+
+	/**
+	\brief Get number of registered listeners.
+
+	\return Number of listeners.
+	*/
+	uint32_t getNbListeners() const
+	{
+		return mListeners.size();
+	}
+
+	/**
+	\brief Get an existing listener from given index.
+
+	\param index Index of the listener; must be smaller than getNbListeners().
+	\return Listener on given index.
+	*/
+	Listener& getListener(uint32_t index)
+	{
+		// valid indices are [0, size): index == size() is one past the last listener
+		PX_ASSERT(index < mListeners.size());
+		return *mListeners[index];
+	}
+
+  protected:
+	virtual ~Broadcast()
+	{
+	}
+
+	// registered listeners, stored as non-owning pointers
+	physx::shdfnd::InlineArray<Listener*, MAX_NB_LISTENERS, physx::shdfnd::NonTrackingAllocator> mListeners;
+};
+
+/**
+\brief Abstract base class for an application defined memory allocator that allows an external listener
+to audit the memory allocations.
+*/
+class BroadcastingAllocator : public Broadcast<AllocationListener, PxAllocatorCallback>
+{
+	PX_NOCOPY(BroadcastingAllocator)
+
+  public:
+	/**
+	\brief Constructor.
+
+	\param allocator Allocator that performs the actual allocations and deallocations.
+	\param error Error callback used to report invalid allocator results.
+	*/
+	BroadcastingAllocator(PxAllocatorCallback& allocator, PxErrorCallback& error) : mAllocator(allocator), mError(error)
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief The destructor. Clears the listener list; the listeners themselves are not destroyed.
+	*/
+	virtual ~BroadcastingAllocator()
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief Allocates size bytes of memory, which must be 16-byte aligned.
+
+	This method should never return NULL.  If you run out of memory, then
+	you should terminate the app or take some other appropriate action.
+
+	<b>Threading:</b> This function should be thread safe as it can be called in the context of the user thread
+	and physics processing thread(s).
+
+	\param size			Number of bytes to allocate.
+	\param typeName		Name of the datatype that is being allocated
+	\param filename		The source file which allocated the memory
+	\param line			The source line which allocated the memory
+	\return				The allocated block of memory.
+	*/
+	void* allocate(size_t size, const char* typeName, const char* filename, int line)
+	{
+		void* mem = mAllocator.allocate(size, typeName, filename, line);
+
+		if(!mem)
+		{
+			mError.reportError(PxErrorCode::eABORT, "User allocator returned NULL.", __FILE__, __LINE__);
+			return NULL;
+		}
+
+		// the low four address bits must be clear for a 16-byte aligned block
+		// NOTE(review): on this path the misaligned block is not handed back to mAllocator
+		// before NULL is returned - confirm whether eABORT is expected to terminate.
+		if((reinterpret_cast<size_t>(mem) & 15))
+		{
+			mError.reportError(PxErrorCode::eABORT, "Allocations must be 16-byte aligned.", __FILE__, __LINE__);
+			return NULL;
+		}
+
+		// listeners are only notified about allocations that passed both checks
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+			mListeners[i]->onAllocation(size, typeName, filename, line, mem);
+
+		return mem;
+	}
+
+	/**
+	\brief Frees memory previously allocated by allocate().
+
+	<b>Threading:</b> This function should be thread safe as it can be called in the context of the user thread
+	and physics processing thread(s).
+
+	\param ptr Memory to free.
+	*/
+	void deallocate(void* ptr)
+	{
+		// listeners are notified first, while ptr has not been released yet
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+		{
+			mListeners[i]->onDeallocation(ptr);
+		}
+		mAllocator.deallocate(ptr);
+	}
+
+  private:
+	PxAllocatorCallback& mAllocator; // performs the actual allocations
+	PxErrorCallback& mError;         // receives the eABORT reports from allocate()
+};
+
+/**
+\brief Abstract base class for an application defined error callback that allows an external listener
+to report errors.
+*/
+class BroadcastingErrorCallback : public Broadcast<PxErrorCallback, PxErrorCallback>
+{
+	PX_NOCOPY(BroadcastingErrorCallback)
+  public:
+	/**
+	\brief Constructor. Registers errorCallback as the first listener.
+	*/
+	BroadcastingErrorCallback(PxErrorCallback& errorCallback)
+	{
+		registerListener(errorCallback);
+	}
+
+	/**
+	\brief The destructor. Clears the listener list; the listeners themselves are not destroyed.
+	*/
+	virtual ~BroadcastingErrorCallback()
+	{
+		mListeners.clear();
+	}
+
+	/**
+	\brief Reports an error code to every registered listener, in list order.
+	\param code Error code, see #PxErrorCode
+	\param message Message to display.
+	\param file File error occurred in.
+	\param line Line number error occurred on.
+	*/
+	void reportError(PxErrorCode::Enum code, const char* message, const char* file, int line)
+	{
+		// size() is re-read on every iteration rather than cached
+		for(uint32_t i = 0; i < mListeners.size(); i++)
+			mListeners[i]->reportError(code, message, file, line);
+	}
+};
+}
+} // namespace physx
+
+#endif // PXPVDSDK_PXBROADCAST_H
diff --git a/PxShared/src/foundation/include/PsCpu.h b/PxShared/src/foundation/include/PsCpu.h
new file mode 100644
index 0000000..2dcc5c1
--- /dev/null
+++ b/PxShared/src/foundation/include/PsCpu.h
@@ -0,0 +1,47 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSCPU_H
+#define PSFOUNDATION_PSCPU_H
+
+#include "Ps.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+class Cpu
+{
+  public:
+	static uint8_t getCpuId(); // NOTE(review): declaration only; presumably the id of the CPU/core running the caller - confirm in the platform implementation
+};
+}
+}
+
+#endif // #ifndef PSFOUNDATION_PSCPU_H
diff --git a/PxShared/src/foundation/include/PsFPU.h b/PxShared/src/foundation/include/PsFPU.h
new file mode 100644
index 0000000..fd990a8
--- /dev/null
+++ b/PxShared/src/foundation/include/PsFPU.h
@@ -0,0 +1,103 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSFPU_H
+#define PSFOUNDATION_PSFPU_H
+
+#include "Ps.h"
+#include "PsIntrinsics.h"
+
+#define PX_IR(x) ((uint32_t&)(x))
+#define PX_SIR(x) ((int32_t&)(x))
+#define PX_FR(x) ((float&)(x))
+
+// PX_IR / PX_SIR: unsigned / signed 32-bit integer view of the storage of a floating-point value.
+
+// PX_FR: floating-point view of the storage of an integer value.
+
+#define PX_SIGN_BITMASK 0x80000000
+
+#define PX_FPU_GUARD shdfnd::FPUGuard scopedFpGuard;
+#define PX_SIMD_GUARD shdfnd::SIMDGuard scopedFpGuard;
+
+#define PX_SUPPORT_GUARDS (PX_WINDOWS_FAMILY || PX_XBOXONE || (PX_LINUX && (PX_X86 || PX_X64)) || PX_PS4 || PX_OSX)
+
+namespace physx
+{
+namespace shdfnd
+{
+// Scoped guard (see PX_FPU_GUARD): sets the default SDK state for scalar and SIMD units, restores it on destruction
+class PX_FOUNDATION_API FPUGuard
+{
+  public:
+	FPUGuard();  // set fpu control word for PhysX
+	~FPUGuard(); // restore fpu control word
+  private:
+	uint32_t mControlWords[8]; // NOTE(review): presumably the saved control state that ~FPUGuard() restores - confirm in the platform .cpp
+};
+
+// Scoped guard (see PX_SIMD_GUARD): sets default SDK state for simd unit only, lighter weight than FPUGuard
+class SIMDGuard
+{
+  public:
+	PX_INLINE SIMDGuard();  // set simd control word for PhysX
+	PX_INLINE ~SIMDGuard(); // restore simd control word
+  private:
+#if PX_SUPPORT_GUARDS
+	uint32_t mControlWord; // only present where PX_SUPPORT_GUARDS is non-zero; elsewhere the guard has no data members
+#endif
+};
+
+/**
+\brief Enables floating point exceptions for the scalar and SIMD unit
+*/
+PX_FOUNDATION_API void enableFPExceptions();
+
+/**
+\brief Disables floating point exceptions for the scalar and SIMD unit
+*/
+PX_FOUNDATION_API void disableFPExceptions();
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_WINDOWS_FAMILY || PX_XBOXONE
+#include "windows/PsWindowsFPU.h"
+#elif (PX_LINUX && PX_SSE2) || PX_PS4 || PX_OSX
+#include "unix/PsUnixFPU.h"
+#else
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{ // fallback for platforms that include neither PsWindowsFPU.h nor PsUnixFPU.h: the guard does nothing
+}
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{ // fallback: nothing was changed by the constructor, so there is nothing to restore
+}
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSFPU_H
diff --git a/PxShared/src/foundation/include/PsFoundation.h b/PxShared/src/foundation/include/PsFoundation.h
new file mode 100644
index 0000000..68f2cc7
--- /dev/null
+++ b/PxShared/src/foundation/include/PsFoundation.h
@@ -0,0 +1,216 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PX_FOUNDATION_PSFOUNDATION_H
+#define PX_FOUNDATION_PSFOUNDATION_H
+
+#include "foundation/PxFoundation.h"
+#include "foundation/PxErrors.h"
+#include "foundation/PxProfiler.h"
+
+#include "PsBroadcast.h"
+#include "PsAllocator.h"
+#include "PsTempAllocator.h"
+#include "PsMutex.h"
+#include "PsHashMap.h"
+#include "PsUserAllocated.h"
+
+#include <stdarg.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4251) // class needs to have dll-interface to be used by clients of class
+#endif
+
+class PX_FOUNDATION_API Foundation : public PxFoundation, public UserAllocated
+{ // PxFoundation implementation: holds the user's allocator/error callbacks plus broadcasting wrappers around them
+	PX_NOCOPY(Foundation)
+
+  public:
+	typedef MutexT<Allocator> Mutex;
+
+	typedef HashMap<const NamedAllocator*, const char*, Hash<const NamedAllocator*>, NonTrackingAllocator> AllocNameMap;
+	typedef Array<TempAllocatorChunk*, Allocator> AllocFreeTable;
+
+  public:
+	// factory
+	// note, you MUST eventually call release() once createInstance has handed out an instance (it returns a pointer, not a bool)!
+	static Foundation* createInstance(PxU32 version, PxErrorCallback& errc, PxAllocatorCallback& alloc);
+	static Foundation& getInstance();
+	void release();
+	static void incRefCount(); // this call requires a foundation object to exist already
+	static void decRefCount(); // this call requires a foundation object to exist already
+
+	// Begin Errors
+	virtual PxErrorCallback& getErrorCallback()
+	{
+		return mErrorCallback;
+	} // Return the user's error callback
+	PxErrorCallback& getInternalErrorCallback()
+	{
+		return mBroadcastingError;
+	} // Return the broadcasting error callback
+
+	void registerErrorCallback(PxErrorCallback& listener);
+	void deregisterErrorCallback(PxErrorCallback& listener);
+
+	virtual void setErrorLevel(PxErrorCode::Enum mask)
+	{
+		mErrorMask = mask;
+	}
+	virtual PxErrorCode::Enum getErrorLevel() const
+	{
+		return mErrorMask;
+	}
+
+	// Report errors with the broadcasting error callback: error() takes variadic arguments, errorImpl() a va_list
+	void error(PxErrorCode::Enum, const char* file, int line, const char* messageFmt, ...);
+	void errorImpl(PxErrorCode::Enum, const char* file, int line, const char* messageFmt, va_list);
+	static PxU32 getWarnOnceTimestamp();
+
+	// End errors
+
+	// Begin Allocations
+	virtual PxAllocatorCallback& getAllocatorCallback()
+	{
+		return mAllocatorCallback;
+	} // Return the user's allocator callback
+	PxAllocatorCallback& getAllocator()
+	{
+		return mBroadcastingAllocator;
+	} // Return the broadcasting allocator
+
+	void registerAllocationListener(physx::shdfnd::AllocationListener& listener);
+	void deregisterAllocationListener(physx::shdfnd::AllocationListener& listener);
+
+	virtual bool getReportAllocationNames() const
+	{
+		return mReportAllocationNames;
+	}
+	virtual void setReportAllocationNames(bool value)
+	{
+		mReportAllocationNames = value;
+	}
+
+	PX_INLINE AllocNameMap& getNamedAllocMap()
+	{
+		return mNamedAllocMap;
+	}
+	PX_INLINE Mutex& getNamedAllocMutex()
+	{
+		return mNamedAllocMutex;
+	}
+
+	PX_INLINE AllocFreeTable& getTempAllocFreeTable()
+	{
+		return mTempAllocFreeTable;
+	}
+	PX_INLINE Mutex& getTempAllocMutex()
+	{
+		return mTempAllocMutex;
+	}
+	// End allocations
+
+  private:
+	static void destroyInstance();
+
+	Foundation(PxErrorCallback& errc, PxAllocatorCallback& alloc);
+	~Foundation();
+
+	// init order is tricky here: the mutexes require the allocator, the allocator may require the error stream
+	PxAllocatorCallback& mAllocatorCallback;
+	PxErrorCallback& mErrorCallback;
+
+	BroadcastingAllocator mBroadcastingAllocator;
+	BroadcastingErrorCallback mBroadcastingError;
+
+	bool mReportAllocationNames;
+
+	PxErrorCode::Enum mErrorMask;
+	Mutex mErrorMutex;
+
+	AllocNameMap mNamedAllocMap;
+	Mutex mNamedAllocMutex;
+
+	AllocFreeTable mTempAllocFreeTable;
+	Mutex mTempAllocMutex;
+
+	Mutex mListenerMutex; // NOTE(review): presumably guards listener (de)registration - confirm in the implementation file
+
+	static Foundation* mInstance;
+	static PxU32 mRefCount;
+	static PxU32 mWarnOnceTimestap; // sic ("Timestamp"); name left as-is because a rename must also touch every use outside this header
+};
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+PX_INLINE Foundation& getFoundation()
+{ // shorthand for Foundation::getInstance(); used by the PX_WARN_ONCE macros in this header
+	return Foundation::getInstance();
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+// shortcut macros:
+// usage: Ps::getFoundation().error(PX_WARN, "static friction %f is lower than dynamic friction %f", sfr, dfr);
+#define PX_WARN ::physx::PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__
+#define PX_INFO ::physx::PxErrorCode::eDEBUG_INFO, __FILE__, __LINE__
+
+#if PX_DEBUG || PX_CHECKED
+#define PX_WARN_ONCE(string)                                                                                           \
+	{                                                                                                                  \
+		static PxU32 timestamp = 0;                                                                                    \
+		if(timestamp != Ps::getFoundation().getWarnOnceTimestamp())                                                    \
+		{                                                                                                              \
+			timestamp = Ps::getFoundation().getWarnOnceTimestamp();                                                    \
+			Ps::getFoundation().error(PX_WARN, string);                                                                \
+		}                                                                                                              \
+	\
+}
+#define PX_WARN_ONCE_IF(condition, string)                                                                             \
+	{                                                                                                                  \
+		if(condition)                                                                                                  \
+		{                                                                                                              \
+			PX_WARN_ONCE(string)                                                                                       \
+		}                                                                                                              \
+	\
+}
+#else
+#define PX_WARN_ONCE(string) ((void)0)
+#define PX_WARN_ONCE_IF(condition, string) ((void)0)
+#endif
+
+#endif
diff --git a/PxShared/src/foundation/include/PsHash.h b/PxShared/src/foundation/include/PsHash.h
new file mode 100644
index 0000000..6b74fb2
--- /dev/null
+++ b/PxShared/src/foundation/include/PsHash.h
@@ -0,0 +1,162 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSHASH_H
+#define PSFOUNDATION_PSHASH_H
+
+#include "Ps.h"
+#include "PsBasicTemplates.h"
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4302)
+#endif
+
+#if PX_LINUX
+#include "foundation/PxSimpleTypes.h"
+#endif
+
+/*!
+Central definition of hash functions
+*/
+
+namespace physx
+{
+namespace shdfnd
+{
+// Hash functions
+
+// Thomas Wang's 32 bit mix
+// http://www.cris.com/~Ttwang/tech/inthash.htm
+PX_FORCE_INLINE uint32_t hash(const uint32_t key)
+{
+	uint32_t k = key;
+	k += ~(k << 15);
+	k ^= (k >> 10);
+	k += (k << 3);
+	k ^= (k >> 6);
+	k += ~(k << 11);
+	k ^= (k >> 16);
+	return uint32_t(k);
+}
+
+PX_FORCE_INLINE uint32_t hash(const int32_t key)
+{
+	return hash(uint32_t(key));
+}
+
+// Thomas Wang's 64 bit mix
+// http://www.cris.com/~Ttwang/tech/inthash.htm
+PX_FORCE_INLINE uint32_t hash(const uint64_t key)
+{
+	uint64_t k = key;
+	k += ~(k << 32);
+	k ^= (k >> 22);
+	k += ~(k << 13);
+	k ^= (k >> 8);
+	k += (k << 3);
+	k ^= (k >> 15);
+	k += ~(k << 27);
+	k ^= (k >> 31);
+	return uint32_t(UINT32_MAX & k);
+}
+
+#if PX_APPLE_FAMILY
+// hash for size_t, to make gcc happy
+PX_INLINE uint32_t hash(const size_t key)
+{
+#if PX_P64_FAMILY
+	return hash(uint64_t(key));
+#else
+	return hash(uint32_t(key));
+#endif
+}
+#endif
+
+// Hash function for pointers
+PX_INLINE uint32_t hash(const void* ptr)
+{
+#if PX_P64_FAMILY
+	return hash(uint64_t(ptr));
+#else
+	return hash(uint32_t(UINT32_MAX & size_t(ptr)));
+#endif
+}
+
+// Combines the hashes of both pair members into a single 32-bit value
+template <typename F, typename S>
+PX_INLINE uint32_t hash(const Pair<F, S>& p)
+{
+	const uint32_t multiplier = 1000007;
+	const uint32_t firstMixed = hash(p.first) ^ (multiplier * uint32_t(0x876543));
+	return hash(p.second) ^ (multiplier * firstMixed);
+}
+
+// hash object for hash map template parameter
+template <class Key>
+struct Hash
+{
+	uint32_t operator()(const Key& k) const
+	{
+		return hash(k);
+	}
+	bool equal(const Key& k0, const Key& k1) const
+	{
+		return k0 == k1;
+	}
+};
+
+// Hash functor specialization for NUL-terminated C strings (hashes and compares contents, not addresses)
+template <>
+struct Hash<const char*>
+{
+  public:
+	uint32_t operator()(const char* _string) const
+	{
+		// XOR flavour of the "DJB" string hash: digest = (digest * 33) ^ byte, seeded with 5381
+		uint32_t digest = 5381;
+		const uint8_t* cursor = reinterpret_cast<const uint8_t*>(_string);
+		while(*cursor)
+			digest = (digest * 33) ^ uint32_t(*cursor++);
+		return digest;
+	}
+	bool equal(const char* string0, const char* string1) const
+	{
+		return strcmp(string0, string1) == 0;
+	}
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSHASH_H
diff --git a/PxShared/src/foundation/include/PsHashInternals.h b/PxShared/src/foundation/include/PsHashInternals.h
new file mode 100644
index 0000000..809baa3
--- /dev/null
+++ b/PxShared/src/foundation/include/PsHashInternals.h
@@ -0,0 +1,795 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSHASHINTERNALS_H
+#define PSFOUNDATION_PSHASHINTERNALS_H
+
+#include "PsBasicTemplates.h"
+#include "PsArray.h"
+#include "PsBitUtils.h"
+#include "PsHash.h"
+#include "foundation/PxIntrinsics.h"
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4127) // conditional expression is constant
+#endif
+namespace physx
+{
+namespace shdfnd
+{
+namespace internal
+{
+template <class Entry, class Key, class HashFn, class GetKey, class Allocator, bool compacting>
+class HashBase : private Allocator
+{
+	void init(uint32_t initialTableSize, float loadFactor)
+	{
+		mBuffer = NULL;
+		mEntries = NULL;
+		mEntriesNext = NULL;
+		mHash = NULL;
+		mEntriesCapacity = 0;
+		mHashSize = 0;
+		mLoadFactor = loadFactor;
+		mFreeList = uint32_t(EOL);
+		mTimestamp = 0;
+		mEntriesCount = 0;
+
+		if(initialTableSize)
+			reserveInternal(initialTableSize);
+	}
+
+  public:
+	typedef Entry EntryType;
+
+	HashBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : Allocator(PX_DEBUG_EXP("hashBase"))
+	{
+		init(initialTableSize, loadFactor);
+	}
+
+	HashBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc) : Allocator(alloc)
+	{
+		init(initialTableSize, loadFactor);
+	}
+
+	HashBase(const Allocator& alloc) : Allocator(alloc)
+	{
+		init(64, 0.75f);
+	}
+
+	~HashBase()
+	{
+		destroy(); // No need to clear()
+
+		if(mBuffer)
+			Allocator::deallocate(mBuffer);
+	}
+
+	static const uint32_t EOL = 0xffffffff;
+
+	PX_INLINE Entry* create(const Key& k, bool& exists)
+	{ // Find-or-insert: returns the slot for k and sets 'exists'; a newly linked slot is NOT constructed here
+		uint32_t h = 0;
+		if(mHashSize)
+		{
+			h = hash(k);
+			uint32_t index = mHash[h];
+			while(index != EOL && !HashFn().equal(GetKey()(mEntries[index]), k)) // walk bucket h's chain
+				index = mEntriesNext[index];
+			exists = index != EOL;
+			if(exists)
+				return mEntries + index; // key already stored: hand back the existing entry
+		}
+		else
+			exists = false; // no hash table allocated yet, so nothing can match
+
+		if(freeListEmpty())
+		{
+			grow();
+			h = hash(k); // hash(k) masks with mHashSize, so recompute the bucket after growing
+		}
+
+		uint32_t entryIndex = freeListGetNext();
+
+		mEntriesNext[entryIndex] = mHash[h]; // link the new slot in at the head of bucket h
+		mHash[h] = entryIndex;
+
+		mEntriesCount++;
+		mTimestamp++; // Iter::check() asserts on this counter, flagging iterators created before the insert
+
+		return mEntries + entryIndex;
+	}
+
+	PX_INLINE const Entry* find(const Key& k) const
+	{
+		if(!mEntriesCount)
+			return NULL;
+
+		const uint32_t h = hash(k);
+		uint32_t index = mHash[h];
+		while(index != EOL && !HashFn().equal(GetKey()(mEntries[index]), k))
+			index = mEntriesNext[index];
+		return index != EOL ? mEntries + index : NULL;
+	}
+
+	PX_INLINE bool erase(const Key& k, Entry& e)
+	{
+		if(!mEntriesCount)
+			return false;
+
+		const uint32_t h = hash(k);
+		uint32_t* ptr = mHash + h;
+		while(*ptr != EOL && !HashFn().equal(GetKey()(mEntries[*ptr]), k))
+			ptr = mEntriesNext + *ptr;
+
+		if(*ptr == EOL)
+			return false;
+
+		PX_PLACEMENT_NEW(&e, Entry)(mEntries[*ptr]);		
+
+		return eraseInternal(ptr);
+	}
+
+	PX_INLINE bool erase(const Key& k)
+	{
+		if(!mEntriesCount)
+			return false;
+
+		const uint32_t h = hash(k);
+		uint32_t* ptr = mHash + h;
+		while(*ptr != EOL && !HashFn().equal(GetKey()(mEntries[*ptr]), k))
+			ptr = mEntriesNext + *ptr;
+
+		if(*ptr == EOL)
+			return false;		
+
+		return eraseInternal(ptr);
+	}
+
+	PX_INLINE uint32_t size() const
+	{
+		return mEntriesCount;
+	}
+
+	PX_INLINE uint32_t capacity() const
+	{
+		return mHashSize;
+	}
+
+	void clear()
+	{ // Destroys all entries and resets buckets and free list; the buffer is kept and mTimestamp is not changed
+		if(!mHashSize || mEntriesCount == 0)
+			return; // nothing allocated or nothing stored
+
+		destroy();
+
+		intrinsics::memSet(mHash, EOL, mHashSize * sizeof(uint32_t)); // mark every bucket as empty (EOL)
+
+		const uint32_t sizeMinus1 = mEntriesCapacity - 1;
+		for(uint32_t i = 0; i < sizeMinus1; i++) // rebuild the chain 0 -> 1 -> ... -> capacity-1
+		{
+			prefetchLine(mEntriesNext + i, 128);
+			mEntriesNext[i] = i + 1;
+		}
+		mEntriesNext[mEntriesCapacity - 1] = uint32_t(EOL); // last slot terminates the chain
+		mFreeList = 0;
+		mEntriesCount = 0;
+	}
+
+	void reserve(uint32_t size)
+	{
+		if(size > mHashSize)
+			reserveInternal(size);
+	}
+
+	PX_INLINE const Entry* getEntries() const
+	{
+		return mEntries;
+	}
+
+	PX_INLINE Entry* insertUnique(const Key& k)
+	{
+		// Links a slot for k without searching for a duplicate: the caller guarantees k is absent
+		// (checked only by the assert). The returned Entry is raw storage the caller must construct.
+		PX_ASSERT(find(k) == NULL);
+		if(freeListEmpty())
+			grow(); // table full or never reserved (mHash == NULL): allocate before touching the tables
+		const uint32_t h = hash(k); // computed after grow(), which changes mHashSize and thus the bucket mask
+		const uint32_t entryIndex = freeListGetNext();
+		mEntriesNext[entryIndex] = mHash[h]; // link the new slot in at the head of bucket h
+		mHash[h] = entryIndex;
+		mEntriesCount++;
+		mTimestamp++; // Iter::check() asserts on this counter, flagging iterators created before the insert
+		return mEntries + entryIndex;
+	}
+
+  private:
+	void destroy()
+	{ // Runs ~Entry() on every entry reachable from a bucket chain; tables, counters and the buffer are left untouched
+		for(uint32_t i = 0; i < mHashSize; i++)
+		{
+			for(uint32_t j = mHash[i]; j != EOL; j = mEntriesNext[j])
+				mEntries[j].~Entry();
+		}
+	}
+
+	template <typename HK, typename GK, class A, bool comp>
+	PX_NOINLINE void copy(const HashBase<Entry, Key, HK, GK, A, comp>& other);
+
+	// free list management - if we're coalescing, then we use mFreeList to hold
+	// the top of the free list and it should always be equal to size(). Otherwise,
+	// we build a free list in the next() pointers.
+
+	PX_INLINE void freeListAdd(uint32_t index)
+	{ // Returns one slot to the free list; 'index' is only used by the non-compacting variant
+		if(compacting)
+		{
+			mFreeList--; // compacting: the free "list" is just the first unused index
+			PX_ASSERT(mFreeList == mEntriesCount);
+		}
+		else
+		{
+			mEntriesNext[index] = mFreeList; // push the slot onto the head of the linked free list
+			mFreeList = index;
+		}
+	}
+
+	PX_INLINE void freeListAdd(uint32_t start, uint32_t end)
+	{
+		if(!compacting)
+		{
+			for(uint32_t i = start; i < end - 1; i++) // add the new entries to the free list
+				mEntriesNext[i] = i + 1;
+
+			// link in old free list
+			mEntriesNext[end - 1] = mFreeList;
+			PX_ASSERT(mFreeList != end - 1);
+			mFreeList = start;
+		}
+		else if(mFreeList == EOL) // don't reset the free ptr for the compacting hash unless it's empty
+			mFreeList = start;
+	}
+
+	PX_INLINE uint32_t freeListGetNext()
+	{
+		PX_ASSERT(!freeListEmpty());
+		if(compacting)
+		{
+			PX_ASSERT(mFreeList == mEntriesCount);
+			return mFreeList++;
+		}
+		else
+		{
+			uint32_t entryIndex = mFreeList;
+			mFreeList = mEntriesNext[mFreeList];
+			return entryIndex;
+		}
+	}
+
+	PX_INLINE bool freeListEmpty() const
+	{
+		if(compacting)
+			return mEntriesCount == mEntriesCapacity;
+		else
+			return mFreeList == EOL;
+	}
+
+	PX_INLINE void replaceWithLast(uint32_t index)
+	{ // Moves the entry at slot mEntriesCount into slot 'index' and re-points its chain link (called by eraseInternal after the decrement)
+		PX_PLACEMENT_NEW(mEntries + index, Entry)(mEntries[mEntriesCount]); // copy-construct into the vacated slot
+		mEntries[mEntriesCount].~Entry();
+		mEntriesNext[index] = mEntriesNext[mEntriesCount]; // the moved entry keeps its successor
+
+		uint32_t h = hash(GetKey()(mEntries[index]));
+		uint32_t* ptr;
+		for(ptr = mHash + h; *ptr != mEntriesCount; ptr = mEntriesNext + *ptr) // find the link that still names the old slot
+			PX_ASSERT(*ptr != EOL);
+		*ptr = index;
+	}
+
+	PX_INLINE uint32_t hash(const Key& k, uint32_t hashSize) const
+	{
+		return HashFn()(k) & (hashSize - 1);
+	}
+
+	PX_INLINE uint32_t hash(const Key& k) const
+	{
+		return hash(k, mHashSize);
+	}
+
+	PX_INLINE bool eraseInternal(uint32_t* ptr)
+	{ // Unlinks and destroys the entry whose index is stored at *ptr (a bucket head or a next-link); always returns true
+		const uint32_t index = *ptr;
+
+		*ptr = mEntriesNext[index]; // bypass the entry in its chain
+
+		mEntries[index].~Entry();
+
+		mEntriesCount--;
+		mTimestamp++; // Iter::check() asserts on this counter, flagging iterators created before the erase
+
+		if (compacting && index != mEntriesCount) // keep live entries contiguous in [0, mEntriesCount)
+			replaceWithLast(index);
+
+		freeListAdd(index);
+		return true;
+	}
+
+	void reserveInternal(uint32_t size)
+	{
+		if(!isPowerOfTwo(size))
+			size = nextPowerOfTwo(size);
+
+		PX_ASSERT(!(size & (size - 1)));
+
+		// decide whether iteration can be done on the entries directly
+		bool resizeCompact = compacting || freeListEmpty();
+
+		// define new table sizes
+		uint32_t oldEntriesCapacity = mEntriesCapacity;
+		uint32_t newEntriesCapacity = uint32_t(float(size) * mLoadFactor);
+		uint32_t newHashSize = size;
+
+		// allocate new common buffer and setup pointers to new tables
+		uint8_t* newBuffer;
+		uint32_t* newHash;
+		uint32_t* newEntriesNext;
+		Entry* newEntries;
+		{
+			uint32_t newHashByteOffset = 0;
+			uint32_t newEntriesNextBytesOffset = newHashByteOffset + newHashSize * sizeof(uint32_t);
+			uint32_t newEntriesByteOffset = newEntriesNextBytesOffset + newEntriesCapacity * sizeof(uint32_t);
+			newEntriesByteOffset += (16 - (newEntriesByteOffset & 15)) & 15;
+			uint32_t newBufferByteSize = newEntriesByteOffset + newEntriesCapacity * sizeof(Entry);
+
+			newBuffer = reinterpret_cast<uint8_t*>(Allocator::allocate(newBufferByteSize, __FILE__, __LINE__));
+			PX_ASSERT(newBuffer);
+
+			newHash = reinterpret_cast<uint32_t*>(newBuffer + newHashByteOffset);
+			newEntriesNext = reinterpret_cast<uint32_t*>(newBuffer + newEntriesNextBytesOffset);
+			newEntries = reinterpret_cast<Entry*>(newBuffer + newEntriesByteOffset);
+		}
+
+		// initialize new hash table
+		intrinsics::memSet(newHash, uint32_t(EOL), newHashSize * sizeof(uint32_t));
+
+		// iterate over old entries, re-hash and create new entries
+		if(resizeCompact)
+		{
+			// check that old free list is empty - we don't need to copy the next entries
+			PX_ASSERT(compacting || mFreeList == EOL);
+
+			for(uint32_t index = 0; index < mEntriesCount; ++index)
+			{
+				uint32_t h = hash(GetKey()(mEntries[index]), newHashSize);
+				newEntriesNext[index] = newHash[h];
+				newHash[h] = index;
+
+				PX_PLACEMENT_NEW(newEntries + index, Entry)(mEntries[index]);
+				mEntries[index].~Entry();
+			}
+		}
+		else
+		{
+			// copy old free list, only required for non compact resizing
+			intrinsics::memCopy(newEntriesNext, mEntriesNext, mEntriesCapacity * sizeof(uint32_t));
+
+			for(uint32_t bucket = 0; bucket < mHashSize; bucket++)
+			{
+				uint32_t index = mHash[bucket];
+				while(index != EOL)
+				{
+					uint32_t h = hash(GetKey()(mEntries[index]), newHashSize);
+					newEntriesNext[index] = newHash[h];
+					PX_ASSERT(index != newHash[h]);
+
+					newHash[h] = index;
+
+					PX_PLACEMENT_NEW(newEntries + index, Entry)(mEntries[index]);
+					mEntries[index].~Entry();
+
+					index = mEntriesNext[index];
+				}
+			}
+		}
+
+		// swap buffer and pointers
+		Allocator::deallocate(mBuffer);
+		mBuffer = newBuffer;
+		mHash = newHash;
+		mHashSize = newHashSize;
+		mEntriesNext = newEntriesNext;
+		mEntries = newEntries;
+		mEntriesCapacity = newEntriesCapacity;
+
+		freeListAdd(oldEntriesCapacity, newEntriesCapacity);
+	}
+
+	void grow()
+	{ // Doubles the hash size (16 when nothing is allocated yet); expected to be called only when no free slot is left
+		PX_ASSERT((mFreeList == EOL) || (compacting && (mEntriesCount == mEntriesCapacity)));
+
+		uint32_t size = mHashSize == 0 ? 16 : mHashSize * 2;
+		reserve(size); // reserve() reallocates only when size > mHashSize, which always holds here
+	}
+
+	uint8_t* mBuffer;
+	Entry* mEntries;
+	uint32_t* mEntriesNext; // same size as mEntries
+	uint32_t* mHash;
+	uint32_t mEntriesCapacity;
+	uint32_t mHashSize;
+	float mLoadFactor;
+	uint32_t mFreeList;
+	uint32_t mTimestamp;
+	uint32_t mEntriesCount; // number of entries
+
+  public:
+	// Forward iterator over all entries of a HashBase. It walks bucket by bucket,
+	// following each bucket's chain through mEntriesNext. The table's timestamp is
+	// captured at construction and asserted unchanged on every access.
+	class Iter
+	{
+	  public:
+		// Positions the iterator on the first entry found, or on EOL if there is none.
+		PX_INLINE Iter(HashBase& b) : mBucket(0), mEntry(uint32_t(b.EOL)), mTimestamp(b.mTimestamp), mBase(b)
+		{
+			if(mBase.mEntriesCapacity == 0)
+				return; // no entry capacity: mEntry stays EOL, so done() reports true
+
+			mEntry = mBase.mHash[0];
+			skip();
+		}
+
+		// Asserts that the table still carries the timestamp seen at construction.
+		PX_INLINE void check() const
+		{
+			PX_ASSERT(mBase.mTimestamp == mTimestamp);
+		}
+
+		// Dereference: the entry the iterator currently refers to.
+		PX_INLINE const Entry& operator*() const
+		{
+			check();
+			return *(mBase.mEntries + mEntry);
+		}
+		PX_INLINE Entry& operator*()
+		{
+			check();
+			return *(mBase.mEntries + mEntry);
+		}
+
+		// Member access on the current entry.
+		PX_INLINE const Entry* operator->() const
+		{
+			check();
+			const Entry* const entries = mBase.mEntries;
+			return entries + mEntry;
+		}
+		PX_INLINE Entry* operator->()
+		{
+			check();
+			Entry* const entries = mBase.mEntries;
+			return entries + mEntry;
+		}
+
+		// Pre-increment: step to the next entry and return the iterator (by value).
+		PX_INLINE Iter operator++()
+		{
+			check();
+			advance();
+			return *this;
+		}
+
+		// Post-increment: step to the next entry, return the position held before.
+		PX_INLINE Iter operator++(int)
+		{
+			check();
+			Iter previous(*this);
+			advance();
+			return previous;
+		}
+
+		// True once every bucket has been exhausted.
+		PX_INLINE bool done() const
+		{
+			check();
+			return mBase.EOL == mEntry;
+		}
+
+	  private:
+		// Follow the chain link of the current entry, then move on to later buckets if the chain ended.
+		PX_INLINE void advance()
+		{
+			const uint32_t next = mBase.mEntriesNext[mEntry];
+			mEntry = next;
+			skip();
+		}
+
+		// While the current position is EOL, load the head of the next bucket;
+		// stops with mEntry == EOL once mBucket reaches mHashSize.
+		PX_INLINE void skip()
+		{
+			while(mEntry == mBase.EOL && ++mBucket != mBase.mHashSize)
+				mEntry = mBase.mHash[mBucket];
+		}
+
+		// assignment is declared but not defined here (the class holds a reference member)
+		Iter& operator=(const Iter&);
+
+		uint32_t mBucket;
+		uint32_t mEntry;
+		uint32_t mTimestamp;
+		HashBase& mBase;
+	};
+
+	/*!
+	Iterates over the entries of a hash base and allows the current entry to be
+	erased while iterating.
+
+	The iterator remembers the address of the link (a bucket head in mHash or a
+	slot in mEntriesNext) that holds the index of the current entry, so that the
+	entry can be handed to eraseInternal() without a lookup.
+	*/
+	class EraseIterator
+	{
+	public:
+		PX_INLINE EraseIterator(HashBase& b): mBase(b)
+		{
+			reset();
+		}
+
+		/*!
+		Moves to the next entry, optionally erasing the current one first.
+		\param eraseCurrent erase the entry returned by the previous call (ignored when there is no current entry)
+		\return the next entry, or NULL when all buckets have been visited
+		*/
+		PX_INLINE Entry* eraseCurrentGetNext(bool eraseCurrent)
+		{
+			// no current entry (fresh/reset iterator or previous call returned NULL):
+			// continue with the bucket scan
+			if(mCurrentEntryIndexPtr == NULL)
+				return traverseHashEntries();
+
+			if(eraseCurrent)
+			{
+				mBase.eraseInternal(mCurrentEntryIndexPtr);
+				// if next was EOL search a new hash entry, otherwise the same link now
+				// refers to the next entry
+				if(*mCurrentEntryIndexPtr == mBase.EOL)
+					return traverseHashEntries();
+				return mBase.mEntries + *mCurrentEntryIndexPtr;
+			}
+
+			// keep the current entry: step along its chain, or leave the bucket if the chain ends
+			const uint32_t current = *mCurrentEntryIndexPtr;
+			uint32_t* const nextLink = mBase.mEntriesNext + current;
+			if(*nextLink == mBase.EOL)
+				return traverseHashEntries();
+
+			mCurrentEntryIndexPtr = nextLink;
+			return mBase.mEntries + *nextLink;
+		}
+
+		// Restart: the next call to eraseCurrentGetNext scans from bucket 0 again.
+		PX_INLINE void reset()
+		{
+			mCurrentHashIndex = 0;
+			mCurrentEntryIndexPtr = NULL;
+		}
+
+	private:
+		// Scans forward from mCurrentHashIndex for a non-empty bucket. On success the
+		// bucket head becomes the current link and its first entry is returned;
+		// otherwise the current link is cleared and NULL is returned.
+		PX_INLINE Entry* traverseHashEntries()
+		{
+			mCurrentEntryIndexPtr = NULL;
+			while(mCurrentHashIndex < mBase.mHashSize)
+			{
+				uint32_t* const bucketHead = mBase.mHash + mCurrentHashIndex;
+				++mCurrentHashIndex;
+				if(*bucketHead != mBase.EOL)
+				{
+					mCurrentEntryIndexPtr = bucketHead;
+					return mBase.mEntries + *bucketHead;
+				}
+			}
+			return NULL;
+		}
+
+		// assignment is declared but not defined here (the class holds a reference member)
+		EraseIterator& operator=(const EraseIterator&);
+
+		uint32_t*	mCurrentEntryIndexPtr; // link holding the current entry's index, NULL if none
+		uint32_t	mCurrentHashIndex;     // next bucket the scan will look at
+		HashBase&	mBase;
+	};
+};
+
+// Inserts a copy of every entry of 'other' into this table.
+// Capacity for other.mEntriesCount entries is reserved up front. Each key of
+// 'other' is expected not to be present in this table yet (asserted), so the
+// function is meant to be used on an empty destination.
+template <class Entry, class Key, class HashFn, class GetKey, class Allocator, bool compacting>
+template <typename HK, typename GK, class A, bool comp>
+PX_NOINLINE void
+HashBase<Entry, Key, HashFn, GetKey, Allocator, compacting>::copy(const HashBase<Entry, Key, HK, GK, A, comp>& other)
+{
+	reserve(other.mEntriesCount);
+
+	// Visit every bucket of the source. mHash has mHashSize chain heads; bounding
+	// this loop by the entry count would skip entries that live in higher buckets
+	// (and could read past mHash if there are more entries than buckets).
+	for(uint32_t i = 0; i < other.mHashSize; i++)
+	{
+		// walk the chain hanging off bucket i
+		for(uint32_t j = other.mHash[i]; j != EOL; j = other.mEntriesNext[j])
+		{
+			const Entry& otherEntry = other.mEntries[j];
+
+			bool exists;
+			Entry* newEntry = create(GK()(otherEntry), exists);
+			PX_ASSERT(!exists);
+
+			// create() only hands out the slot; copy-construct the entry into it
+			PX_PLACEMENT_NEW(newEntry, Entry)(otherEntry);
+		}
+	}
+}
+
+// Common implementation behind the hash set containers: a thin, non-copyable
+// wrapper around a HashBase whose entries are the keys themselves.
+template <class Key, class HashFn, class Allocator = typename AllocatorTraits<Key>::Type, bool Coalesced = false>
+class HashSetBase
+{
+	PX_NOCOPY(HashSetBase)
+  public:
+	// key extractor for the underlying table: in a set the entry is its own key
+	struct GetKey
+	{
+		PX_INLINE const Key& operator()(const Key& e)
+		{
+			return e;
+		}
+	};
+
+	typedef HashBase<Key, Key, HashFn, GetKey, Allocator, Coalesced> BaseMap;
+	typedef typename BaseMap::Iter Iterator;
+
+	HashSetBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : mBase(initialTableSize, loadFactor)
+	{
+	}
+
+	// same defaults as above (64, 0.75), but with a caller supplied allocator
+	HashSetBase(const Allocator& alloc) : mBase(64, 0.75f, alloc)
+	{
+	}
+
+	HashSetBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc)
+	: mBase(initialTableSize, loadFactor, alloc)
+	{
+	}
+
+	// Adds k to the set. Returns true if it was added, false if it was already present.
+	bool insert(const Key& k)
+	{
+		bool alreadyPresent;
+		Key* slot = mBase.create(k, alreadyPresent);
+		if(alreadyPresent)
+			return false;
+
+		// create() returned a slot for a new key; construct the key in it
+		PX_PLACEMENT_NEW(slot, Key)(k);
+		return true;
+	}
+
+	// True if find() on the underlying table yields an entry for k.
+	PX_INLINE bool contains(const Key& k) const
+	{
+		return 0 != mBase.find(k);
+	}
+
+	// The remaining members forward directly to the underlying table.
+	PX_INLINE bool erase(const Key& k)
+	{
+		return mBase.erase(k);
+	}
+
+	PX_INLINE uint32_t size() const
+	{
+		return mBase.size();
+	}
+
+	PX_INLINE uint32_t capacity() const
+	{
+		return mBase.capacity();
+	}
+
+	PX_INLINE void reserve(uint32_t size)
+	{
+		mBase.reserve(size);
+	}
+
+	PX_INLINE void clear()
+	{
+		mBase.clear();
+	}
+
+  protected:
+	BaseMap mBase;
+};
+
+// Common implementation behind the hash map containers: a thin, non-copyable
+// wrapper around a HashBase whose entries are Pair<const Key, Value>.
+template <class Key, class Value, class HashFn, class Allocator = typename AllocatorTraits<Pair<const Key, Value> >::Type>
+class HashMapBase
+{
+	PX_NOCOPY(HashMapBase)
+  public:
+	typedef Pair<const Key, Value> Entry;
+
+	// key extractor for the underlying table: the key is the pair's first member
+	struct GetKey
+	{
+		PX_INLINE const Key& operator()(const Entry& e)
+		{
+			return e.first;
+		}
+	};
+
+	typedef HashBase<Entry, Key, HashFn, GetKey, Allocator, true> BaseMap;
+	typedef typename BaseMap::Iter Iterator;
+	typedef typename BaseMap::EraseIterator EraseIterator;
+
+	HashMapBase(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : mBase(initialTableSize, loadFactor)
+	{
+	}
+
+	// same defaults as above (64, 0.75), but with a caller supplied allocator
+	HashMapBase(const Allocator& alloc) : mBase(64, 0.75f, alloc)
+	{
+	}
+
+	HashMapBase(uint32_t initialTableSize, float loadFactor, const Allocator& alloc)
+	: mBase(initialTableSize, loadFactor, alloc)
+	{
+	}
+
+	// Adds the pair (k, v). Returns true if it was added; returns false and leaves
+	// the stored value untouched if the key was already present.
+	// Note that both arguments are taken by value.
+	bool insert(const Key /*&*/ k, const Value /*&*/ v)
+	{
+		bool alreadyPresent;
+		Entry* slot = mBase.create(k, alreadyPresent);
+		if(alreadyPresent)
+			return false;
+
+		// create() returned a slot for a new key; construct the pair in it
+		PX_PLACEMENT_NEW(slot, Entry)(k, v);
+		return true;
+	}
+
+	// Returns the value stored for k, first creating an entry with a
+	// default-constructed Value if the key is not present.
+	Value& operator[](const Key& k)
+	{
+		bool alreadyPresent;
+		Entry* slot = mBase.create(k, alreadyPresent);
+		if(!alreadyPresent)
+			PX_PLACEMENT_NEW(slot, Entry)(k, Value());
+
+		return slot->second;
+	}
+
+	// The remaining members forward directly to the underlying table.
+	PX_INLINE const Entry* find(const Key& k) const
+	{
+		return mBase.find(k);
+	}
+
+	PX_INLINE bool erase(const Key& k)
+	{
+		return mBase.erase(k);
+	}
+
+	PX_INLINE bool erase(const Key& k, Entry& e)
+	{
+		return mBase.erase(k, e);
+	}
+
+	PX_INLINE uint32_t size() const
+	{
+		return mBase.size();
+	}
+
+	PX_INLINE uint32_t capacity() const
+	{
+		return mBase.capacity();
+	}
+
+	PX_INLINE Iterator getIterator()
+	{
+		return Iterator(mBase);
+	}
+
+	PX_INLINE EraseIterator getEraseIterator()
+	{
+		return EraseIterator(mBase);
+	}
+
+	PX_INLINE void reserve(uint32_t size)
+	{
+		mBase.reserve(size);
+	}
+
+	PX_INLINE void clear()
+	{
+		mBase.clear();
+	}
+
+  protected:
+	BaseMap mBase;
+};
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_VC
+#pragma warning(pop)
+#endif
+#endif // #ifndef PSFOUNDATION_PSHASHINTERNALS_H
diff --git a/PxShared/src/foundation/include/PsHashMap.h b/PxShared/src/foundation/include/PsHashMap.h
new file mode 100644
index 0000000..5091dee
--- /dev/null
+++ b/PxShared/src/foundation/include/PsHashMap.h
@@ -0,0 +1,118 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSHASHMAP_H
+#define PSFOUNDATION_PSHASHMAP_H
+
+#include "PsHashInternals.h"
+
+// TODO: make this doxy-format
+//
+// This header defines two hash maps. Hash maps
+// * support custom initial table sizes (rounded up internally to power-of-2)
+// * support custom static allocator objects
+// * auto-resize, based on a load factor (i.e. a 64-entry .75 load factor hash will resize
+//                                        when the 49th element is inserted)
+// * are based on open hashing
+// * have O(1) contains, erase
+//
+// Maps have STL-like copying semantics, and properly initialize and destruct copies of objects
+//
+// There are two forms of map: coalesced and uncoalesced. Coalesced maps keep the entries in the
+// initial segment of an array, so are fast to iterate over; however deletion is approximately
+// twice as expensive.
+//
+// HashMap<T>:
+//		bool			insert(const Key& k, const Value& v)	O(1) amortized (exponential resize policy)
+//		Value &			operator[](const Key& k)				O(1) for existing objects, else O(1) amortized
+//		const Entry *	find(const Key& k);						O(1)
+//		bool			erase(const T& k);						O(1)
+//		uint32_t			size();									constant
+//		void			reserve(uint32_t size);					O(MAX(currentOccupancy,size))
+//		void			clear();								O(currentOccupancy) (with zero constant for
+//																objects without destructors)
+//      Iterator		getIterator();
+//
+// operator[] creates an entry if one does not exist, initializing with the default constructor.
+// CoalescedHashMap<T> is meant to be walked through its entry array rather than getIterator; it supports
+// 		const Pair<const Key, Value>* getEntries() const;
+//
+// Use of iterators:
+//
+// for(HashMap::Iterator iter = test.getIterator(); !iter.done(); ++iter)
+//			myFunction(iter->first, iter->second);
+
+namespace physx
+{
+namespace shdfnd
+{
+// General purpose hash map. All container operations come from
+// internal::HashMapBase; this class supplies the constructors and getIterator().
+template <class Key, class Value, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator>
+class HashMap : public internal::HashMapBase<Key, Value, HashFn, Allocator>
+{
+  public:
+	typedef internal::HashMapBase<Key, Value, HashFn, Allocator> HashMapBase;
+	typedef typename HashMapBase::Iterator Iterator;
+
+	// default: 64 initial table size, load factor 0.75
+	HashMap(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : HashMapBase(initialTableSize, loadFactor)
+	{
+	}
+
+	// default sizing with a caller supplied allocator
+	HashMap(const Allocator& alloc) : HashMapBase(64, 0.75f, alloc)
+	{
+	}
+
+	// explicit sizing with a caller supplied allocator
+	HashMap(uint32_t initialTableSize, float loadFactor, const Allocator& alloc)
+	: HashMapBase(initialTableSize, loadFactor, alloc)
+	{
+	}
+
+	// Iterator over all (key, value) pairs; use done() to detect the end.
+	Iterator getIterator()
+	{
+		return Iterator(HashMapBase::mBase);
+	}
+};
+
+// Hash map variant that gives direct access to its entry array through
+// getEntries(). All other operations come from internal::HashMapBase.
+template <class Key, class Value, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator>
+class CoalescedHashMap : public internal::HashMapBase<Key, Value, HashFn, Allocator>
+{
+  public:
+	typedef internal::HashMapBase<Key, Value, HashFn, Allocator> HashMapBase;
+
+	// default: 64 initial table size, load factor 0.75
+	CoalescedHashMap(uint32_t initialTableSize = 64, float loadFactor = 0.75f)
+	: HashMapBase(initialTableSize, loadFactor)
+	{
+	}
+
+	// Allocator-aware constructors, mirroring the overloads HashMap offers, so
+	// that a stateful allocator can be supplied to this variant as well.
+	CoalescedHashMap(uint32_t initialTableSize, float loadFactor, const Allocator& alloc)
+	: HashMapBase(initialTableSize, loadFactor, alloc)
+	{
+	}
+	CoalescedHashMap(const Allocator& alloc) : HashMapBase(64, 0.75f, alloc)
+	{
+	}
+
+	// Pointer to the first stored (key, value) pair, as reported by the underlying table.
+	const Pair<const Key, Value>* getEntries() const
+	{
+		return HashMapBase::mBase.getEntries();
+	}
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSHASHMAP_H
diff --git a/PxShared/src/foundation/include/PsHashSet.h b/PxShared/src/foundation/include/PsHashSet.h
new file mode 100644
index 0000000..195f01d
--- /dev/null
+++ b/PxShared/src/foundation/include/PsHashSet.h
@@ -0,0 +1,127 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSHASHSET_H
+#define PSFOUNDATION_PSHASHSET_H
+
+#include "PsHashInternals.h"
+
+// TODO: make this doxy-format
+
+// This header defines two hash sets. Hash sets
+// * support custom initial table sizes (rounded up internally to power-of-2)
+// * support custom static allocator objects
+// * auto-resize, based on a load factor (i.e. a 64-entry .75 load factor hash will resize
+//                                        when the 49th element is inserted)
+// * are based on open hashing
+//
+// Sets have STL-like copying semantics, and properly initialize and destruct copies of objects
+//
+// There are two forms of set: coalesced and uncoalesced. Coalesced sets keep the entries in the
+// initial segment of an array, so are fast to iterate over; however deletion is approximately
+// twice as expensive.
+//
+// HashSet<T>:
+//		bool		insert(const T& k)						amortized O(1) (exponential resize policy)
+// 		bool		contains(const T& k)	const;			O(1)
+//		bool		erase(const T& k);						O(1)
+//		uint32_t		size()					const;			constant
+//		void		reserve(uint32_t size);					O(MAX(size, currentOccupancy))
+//		void		clear();								O(currentOccupancy) (with zero constant for
+//															objects without destructors)
+//      Iterator    getIterator();
+//
+// Use of iterators:
+//
+// for(HashSet::Iterator iter = test.getIterator(); !iter.done(); ++iter)
+//			myFunction(*iter);
+//
+// CoalescedHashSet<T> does not support getIterator, but instead supports
+// 		const Key *getEntries();
+//
+// insertion into a set already containing the element fails returning false, as does
+// erasure of an element not in the set
+//
+
+namespace physx
+{
+namespace shdfnd
+{
+// General purpose (uncoalesced) hash set. All container operations come from
+// internal::HashSetBase; this class supplies the constructors and getIterator().
+template <class Key, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator>
+class HashSet : public internal::HashSetBase<Key, HashFn, Allocator, false>
+{
+  public:
+	typedef internal::HashSetBase<Key, HashFn, Allocator, false> HashSetBase;
+	typedef typename HashSetBase::Iterator Iterator;
+
+	// default: 64 initial table size, load factor 0.75
+	HashSet(uint32_t initialTableSize = 64, float loadFactor = 0.75f) : HashSetBase(initialTableSize, loadFactor)
+	{
+	}
+
+	// default sizing with a caller supplied allocator
+	HashSet(const Allocator& alloc) : HashSetBase(64, 0.75f, alloc)
+	{
+	}
+
+	// explicit sizing with a caller supplied allocator
+	HashSet(uint32_t initialTableSize, float loadFactor, const Allocator& alloc)
+	: HashSetBase(initialTableSize, loadFactor, alloc)
+	{
+	}
+
+	// Iterator over all keys; use done() to detect the end.
+	Iterator getIterator()
+	{
+		return Iterator(HashSetBase::mBase);
+	}
+};
+
+// Coalesced hash set: instantiates internal::HashSetBase with Coalesced = true
+// and gives direct access to the key array through getEntries().
+template <class Key, class HashFn = Hash<Key>, class Allocator = NonTrackingAllocator>
+class CoalescedHashSet : public internal::HashSetBase<Key, HashFn, Allocator, true>
+{
+  public:
+	typedef typename internal::HashSetBase<Key, HashFn, Allocator, true> HashSetBase;
+
+	// default: 64 initial table size, load factor 0.75
+	CoalescedHashSet(uint32_t initialTableSize = 64, float loadFactor = 0.75f)
+	: HashSetBase(initialTableSize, loadFactor)
+	{
+	}
+
+	// default sizing with a caller supplied allocator
+	CoalescedHashSet(const Allocator& alloc) : HashSetBase(64, 0.75f, alloc)
+	{
+	}
+
+	// explicit sizing with a caller supplied allocator
+	CoalescedHashSet(uint32_t initialTableSize, float loadFactor, const Allocator& alloc)
+	: HashSetBase(initialTableSize, loadFactor, alloc)
+	{
+	}
+
+	// Pointer to the first stored key, as reported by the underlying table.
+	const Key* getEntries() const
+	{
+		return HashSetBase::mBase.getEntries();
+	}
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSHASHSET_H
diff --git a/PxShared/src/foundation/include/PsInlineAllocator.h b/PxShared/src/foundation/include/PsInlineAllocator.h
new file mode 100644
index 0000000..f87abd6
--- /dev/null
+++ b/PxShared/src/foundation/include/PsInlineAllocator.h
@@ -0,0 +1,91 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSINLINEALLOCATOR_H
+#define PSFOUNDATION_PSINLINEALLOCATOR_H
+
+#include "PsUserAllocated.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+// Allocator adaptor used by the array class: it embeds an N byte buffer and
+// serves one request of at most N bytes from it, so that a small number of
+// objects (along with the metadata) needs no allocation from BaseAllocator.
+// Every other request is forwarded to BaseAllocator.
+template <uint32_t N, typename BaseAllocator>
+class InlineAllocator : private BaseAllocator
+{
+  public:
+	// PxEMPTY construction: forwards to the base and leaves mBuffer/mBufferUsed as they are.
+	InlineAllocator(const PxEMPTY v) : BaseAllocator(v)
+	{
+	}
+
+	InlineAllocator(const BaseAllocator& alloc = BaseAllocator()) : BaseAllocator(alloc), mBufferUsed(false)
+	{
+	}
+
+	// Copies only the base allocator; the new object starts with its own buffer unused.
+	InlineAllocator(const InlineAllocator& aloc) : BaseAllocator(aloc), mBufferUsed(false)
+	{
+	}
+
+	// Returns the inline buffer if it is free and large enough, otherwise
+	// whatever BaseAllocator::allocate returns for the same arguments.
+	void* allocate(uint32_t size, const char* filename, int line)
+	{
+		const bool fitsInline = size <= N;
+		if(fitsInline && !mBufferUsed)
+		{
+			mBufferUsed = true;
+			return mBuffer;
+		}
+
+		return BaseAllocator::allocate(size, filename, line);
+	}
+
+	// Releases ptr: the inline buffer is simply marked free again, any other
+	// pointer is handed to BaseAllocator::deallocate.
+	void deallocate(void* ptr)
+	{
+		if(ptr != mBuffer)
+		{
+			BaseAllocator::deallocate(ptr);
+			return;
+		}
+
+		mBufferUsed = false;
+	}
+
+	PX_FORCE_INLINE uint8_t* getInlineBuffer()
+	{
+		return mBuffer;
+	}
+
+	// True while the inline buffer is handed out.
+	PX_FORCE_INLINE bool isBufferUsed() const
+	{
+		return mBufferUsed;
+	}
+
+  protected:
+	uint8_t mBuffer[N];
+	bool mBufferUsed;
+};
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSINLINEALLOCATOR_H
diff --git a/PxShared/src/foundation/include/PsInlineAoS.h b/PxShared/src/foundation/include/PsInlineAoS.h
new file mode 100644
index 0000000..6d43607
--- /dev/null
+++ b/PxShared/src/foundation/include/PsInlineAoS.h
@@ -0,0 +1,48 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSINLINEAOS_H
+#define PSFOUNDATION_PSINLINEAOS_H
+
+#include "foundation/PxPreprocessor.h"
+
+#if PX_WINDOWS
+#include "windows/PsWindowsTrigConstants.h"
+#include "windows/PsWindowsInlineAoS.h"
+#elif(PX_UNIX_FAMILY || PX_PS4 || PX_NX)
+#include "unix/PsUnixTrigConstants.h"
+#include "unix/PsUnixInlineAoS.h"
+#elif PX_XBOXONE
+#include "XboxOne/PsXboxOneTrigConstants.h"
+#include "XboxOne/PsXboxOneInlineAoS.h"
+#else
+#error "Platform not supported!"
+#endif
+
+#endif
diff --git a/PxShared/src/foundation/include/PsInlineArray.h b/PxShared/src/foundation/include/PsInlineArray.h
new file mode 100644
index 0000000..e9ea939
--- /dev/null
+++ b/PxShared/src/foundation/include/PsInlineArray.h
@@ -0,0 +1,68 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSINLINEARRAY_H
+#define PSFOUNDATION_PSINLINEARRAY_H
+
+#include "PsArray.h"
+#include "PsInlineAllocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Array that pre-allocates room for N elements: it is an Array whose allocator
+// is an InlineAllocator with an N * sizeof(T) byte inline buffer.
+template <typename T, uint32_t N, typename Alloc = typename AllocatorTraits<T>::Type>
+class InlineArray : public Array<T, InlineAllocator<N * sizeof(T), Alloc> >
+{
+	typedef InlineAllocator<N * sizeof(T), Alloc> Allocator;
+
+  public:
+	// True while the allocator's inline buffer is in use.
+	PX_INLINE bool isInlined() const
+	{
+		return Allocator::isBufferUsed();
+	}
+
+	// PxEMPTY construction: if the inline buffer is flagged as used, point the
+	// data pointer at this object's own inline buffer.
+	InlineArray(const PxEMPTY v) : Array<T, Allocator>(v)
+	{
+		if(!isInlined())
+			return;
+
+		this->mData = reinterpret_cast<T*>(Array<T, Allocator>::getInlineBuffer());
+	}
+
+	// Regular construction: immediately allocates storage for N elements and
+	// records N as the capacity.
+	PX_INLINE explicit InlineArray(const Alloc& alloc = Alloc()) : Array<T, Allocator>(alloc)
+	{
+		this->mData = this->allocate(N);
+		this->mCapacity = N;
+	}
+};
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSINLINEARRAY_H
diff --git a/PxShared/src/foundation/include/PsIntrinsics.h b/PxShared/src/foundation/include/PsIntrinsics.h
new file mode 100644
index 0000000..1e1b9d1
--- /dev/null
+++ b/PxShared/src/foundation/include/PsIntrinsics.h
@@ -0,0 +1,47 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSINTRINSICS_H
+#define PSFOUNDATION_PSINTRINSICS_H
+
+#include "foundation/PxPreprocessor.h"
+
+#if PX_WINDOWS_FAMILY
+#include "windows/PsWindowsIntrinsics.h"
+#elif(PX_LINUX || PX_ANDROID || PX_APPLE_FAMILY || PX_PS4)
+#include "unix/PsUnixIntrinsics.h"
+#elif PX_XBOXONE
+#include "XboxOne/PsXboxOneIntrinsics.h"
+#elif PX_NX
+#include "nx/PsNXIntrinsics.h"
+#else
+#error "Platform not supported!"
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSINTRINSICS_H
diff --git a/PxShared/src/foundation/include/PsMathUtils.h b/PxShared/src/foundation/include/PsMathUtils.h
new file mode 100644
index 0000000..794419b
--- /dev/null
+++ b/PxShared/src/foundation/include/PsMathUtils.h
@@ -0,0 +1,697 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSMATHUTILS_H
+#define PSFOUNDATION_PSMATHUTILS_H
+
+#include "foundation/PxPreprocessor.h"
+#include "foundation/PxTransform.h"
+#include "foundation/PxMat33.h"
+#include "Ps.h"
+#include "PsIntrinsics.h"
+
+// General guideline is: if it's an abstract math function, it belongs here.
+// If it's a math function where the inputs have specific semantics (e.g.
+// separateSwingTwist) it doesn't.
+
+namespace physx
+{
+namespace shdfnd
+{
+/**
+\brief Returns the sign of a single-precision value by forwarding to intrinsics::sign.
+
+The result for zero is decided by the platform intrinsic and should not be relied upon.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 sign(const PxF32 a)
+{
+	return intrinsics::sign(a);
+}
+
+/**
+\brief Returns 1.0 when a >= 0.0 and -1.0 otherwise.
+
+With this implementation zero yields 1.0, and a NaN (which fails the comparison) yields -1.0.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 sign(const PxF64 a)
+{
+	if(a >= 0.0)
+		return 1.0;
+	return -1.0;
+}
+
+/**
+\brief Returns 1 when a >= 0 and -1 otherwise, so zero yields 1.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxI32 sign(const PxI32 a)
+{
+	if(a >= 0)
+		return 1;
+	return -1;
+}
+
+/**
+\brief Returns true if |a - b| is strictly smaller than eps.
+
+A distance of exactly eps compares as not equal, and eps <= 0 never matches.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE bool equals(const PxF32 a, const PxF32 b, const PxF32 eps)
+{
+	const PxF32 distance = PxAbs(a - b);
+	return distance < eps;
+}
+
+/**
+\brief Double-precision overload: returns true if |a - b| is strictly smaller than eps.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE bool equals(const PxF64 a, const PxF64 b, const PxF64 eps)
+{
+	const PxF64 distance = PxAbs(a - b);
+	return distance < eps;
+}
+
+/**
+\brief The floor function returns a floating-point value representing the largest integer that is less than or equal to
+a. The single-precision overload forwards to floatFloor.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 floor(const PxF32 a)
+{
+	return floatFloor(a);
+}
+
+/**
+\brief The floor function returns a floating-point value representing the largest integer that is less than or equal to
+a. The double-precision overload forwards to ::floor.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 floor(const PxF64 a)
+{
+	return ::floor(a);
+}
+
+/**
+\brief The ceil function returns a single-precision value representing the smallest integer that is greater than or
+equal to a (forwards to ::ceilf).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 ceil(const PxF32 a)
+{
+	return ::ceilf(a);
+}
+
+/**
+\brief The ceil function returns a double value representing the smallest integer that is greater than or equal to a
+(forwards to ::ceil).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 ceil(const PxF64 a)
+{
+	return ::ceil(a);
+}
+
+/**
+\brief mod returns the floating-point remainder of x / y, as computed by ::fmodf.
+
+If the value of y is 0.0, mod returns a quiet NaN.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 mod(const PxF32 x, const PxF32 y)
+{
+	return PxF32(::fmodf(x, y));
+}
+
+/**
+\brief mod returns the floating-point remainder of x / y, as computed by ::fmod.
+
+If the value of y is 0.0, mod returns a quiet NaN.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 mod(const PxF64 x, const PxF64 y)
+{
+	return ::fmod(x, y);
+}
+
+/**
+\brief Square: returns a * a in single precision.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 sqr(const PxF32 a)
+{
+	return a * a;
+}
+
+/**
+\brief Square: returns a * a in double precision.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 sqr(const PxF64 a)
+{
+	return a * a;
+}
+
+/**
+\brief Calculates x raised to the power of y in single precision (forwards to ::powf).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 pow(const PxF32 x, const PxF32 y)
+{
+	return ::powf(x, y);
+}
+
+/**
+\brief Calculates x raised to the power of y in double precision (forwards to ::pow).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 pow(const PxF64 x, const PxF64 y)
+{
+	return ::pow(x, y);
+}
+
+/**
+\brief Calculates e^a in single precision (forwards to ::expf).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 exp(const PxF32 a)
+{
+	return ::expf(a);
+}
+/**
+
+\brief Calculates e^a in double precision (forwards to ::exp).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 exp(const PxF64 a)
+{
+	return ::exp(a);
+}
+
+/**
+\brief Calculates 2^a in single precision, evaluated as e^(a * ln 2); the literal below is ln 2.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 exp2(const PxF32 a)
+{
+	return ::expf(a * 0.693147180559945309417f);
+}
+/**
+
+\brief Calculates 2^a in double precision, evaluated as e^(a * ln 2); the literal below is ln 2.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 exp2(const PxF64 a)
+{
+	return ::exp(a * 0.693147180559945309417);
+}
+
+/**
+\brief Calculates the natural (base e) logarithm of a in single precision (forwards to ::logf).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 logE(const PxF32 a)
+{
+	return ::logf(a);
+}
+
+/**
+\brief Calculates the natural (base e) logarithm of a in double precision (forwards to ::log).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 logE(const PxF64 a)
+{
+	return ::log(a);
+}
+
+/**
+\brief Calculates the base 2 logarithm of a in single precision as ln(a) / ln(2); the literal below is ln 2.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 log2(const PxF32 a)
+{
+	return ::logf(a) / 0.693147180559945309417f;
+}
+
+/**
+\brief Calculates the base 2 logarithm of a in double precision as ln(a) / ln(2); the literal below is ln 2.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 log2(const PxF64 a)
+{
+	return ::log(a) / 0.693147180559945309417;
+}
+
+/**
+\brief Calculates the base 10 logarithm of a in single precision (forwards to ::log10f).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 log10(const PxF32 a)
+{
+	return ::log10f(a);
+}
+
+/**
+\brief Calculates the base 10 logarithm of a in double precision (forwards to ::log10).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 log10(const PxF64 a)
+{
+	return ::log10(a);
+}
+
+/**
+\brief Converts degrees to radians by multiplying with pi / 180 (single precision).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 degToRad(const PxF32 a)
+{
+	return 0.01745329251994329547f * a;
+}
+
+/**
+\brief Converts degrees to radians by multiplying with pi / 180 (double precision).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 degToRad(const PxF64 a)
+{
+	return 0.01745329251994329547 * a;
+}
+
+/**
+\brief Converts radians to degrees by multiplying with 180 / pi (single precision).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 radToDeg(const PxF32 a)
+{
+	return 57.29577951308232286465f * a;
+}
+
+/**
+\brief Converts radians to degrees by multiplying with 180 / pi (double precision).
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF64 radToDeg(const PxF64 a)
+{
+	return 57.29577951308232286465 * a;
+}
+
+//! \brief Computes the sine and cosine of 'radians' and stores them in 'sin' and 'cos' (currently two separate calls).
+PX_CUDA_CALLABLE PX_FORCE_INLINE void sincos(const PxF32 radians, PxF32& sin, PxF32& cos)
+{
+	/* There is an 'fsincos' instruction on PC that could compute both at once, something like:
+	_asm fld  Local
+	_asm fsincos
+	_asm fstp LocalCos
+	_asm fstp LocalSin
+	*/
+	sin = PxSin(radians);
+	cos = PxCos(radians);
+}
+
+/**
+\brief Random integer in [a,b] (both inclusive), derived from ::rand().
+
+Requires a <= b. The value is ::rand() reduced modulo the range size, so it is only as uniform as that
+reduction allows.
+*/
+PX_FORCE_INLINE PxI32 rand(const PxI32 a, const PxI32 b)
+{
+	const PxI32 rangeSize = b - a + 1;
+	return a + PxI32(::rand() % rangeSize);
+}
+
+/**
+\brief Random float in [a,b], obtained by scaling ::rand() / RAND_MAX onto the interval.
+*/
+PX_FORCE_INLINE PxF32 rand(const PxF32 a, const PxF32 b)
+{
+	const PxF32 offset = (b - a) * ::rand() / RAND_MAX;
+	return a + offset;
+}
+
+//! \brief Returns the angle between two vectors in radians.
+//! Both atan2 arguments carry the same |v0|*|v1| factor, so the inputs are used as given without normalizing.
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxF32 angle(const PxVec3& v0, const PxVec3& v1)
+{
+	const PxF32 scaledCos = v0.dot(v1);                 // |v0|*|v1|*Cos(Angle)
+	const PxF32 scaledSin = (v0.cross(v1)).magnitude(); // |v0|*|v1|*Sin(Angle)
+	return PxAtan2(scaledSin, scaledCos);
+}
+
+//! Returns true when d and p do not point into opposite half-spaces, i.e. when d.dot(p) >= 0.
+//! This is the readable, user-friendly form; where available, an fsel on the dot product is preferable
+//! /*fsel(d.dot(p),onething,anotherthing);*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE bool sameDirection(const PxVec3& d, const PxVec3& p)
+{
+	const PxF32 projection = d.dot(p);
+	return projection >= 0.0f;
+}
+
+//! Checks whether two values have different signs.
+//! The default path compares the raw sign bits, so +0 and -0 count as different; the Emscripten path tests
+//! the sign of the product instead, so a zero operand never counts as different.
+PX_CUDA_CALLABLE PX_FORCE_INLINE IntBool differentSign(PxReal f0, PxReal f1)
+{
+#if !PX_EMSCRIPTEN
+	// View each float as its bit pattern; the XOR keeps the sign bit set only when the signs disagree.
+	union FloatBits
+	{
+		PxU32 u;
+		PxReal f;
+	};
+	FloatBits bits0, bits1;
+	bits0.f = f0;
+	bits1.f = f1;
+	const PxU32 signDifference = (bits0.u ^ bits1.u) & PX_SIGN_BITMASK;
+	return IntBool(signDifference);
+#else
+	// javascript floats are 64-bits...
+	return IntBool( (f0*f1) < 0.0f );
+#endif
+}
+
+//! Builds the skew-symmetric matrix of v from the three vectors below.
+//! NOTE(review): if the three-vector PxMat33 constructor takes columns, star(v) * w == v.cross(w) - confirm.
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxMat33 star(const PxVec3& v)
+{
+	const PxVec3 first(0, v.z, -v.y);
+	const PxVec3 second(-v.z, 0, v.x);
+	const PxVec3 third(v.y, -v.x, 0);
+	return PxMat33(first, second, third);
+}
+
+//! Quaternion logarithm: returns the normalized imaginary part of q scaled by twice the half-angle, or the zero
+//! vector when the imaginary part is (almost) zero.
+PX_CUDA_CALLABLE PX_INLINE PxVec3 log(const PxQuat& q)
+{
+	const PxVec3 imaginary = q.getImaginaryPart();
+	const PxReal s = imaginary.magnitude();
+	if(s < 1e-12f)
+		return PxVec3(0.0f);
+
+	// force the half-angle to have magnitude <= pi/2
+	PxReal halfAngle;
+	if(q.w < 0)
+		halfAngle = PxAtan2(-s, -q.w);
+	else
+		halfAngle = PxAtan2(s, q.w);
+	PX_ASSERT(halfAngle >= -PxPi / 2 && halfAngle <= PxPi / 2);
+
+	return imaginary.getNormalized() * 2.f * halfAngle;
+}
+
+//! Quaternion exponential: builds a quaternion from the magnitude of v and the direction of v, or returns the
+//! identity when v is (almost) zero.
+//! NOTE(review): presumably uses the PxQuat(angleRadians, unitAxis) constructor - confirm.
+PX_CUDA_CALLABLE PX_INLINE PxQuat exp(const PxVec3& v)
+{
+	const PxReal m = v.magnitudeSquared();
+	if(m < 1e-24f)
+		return PxQuat(PxIdentity);
+
+	return PxQuat(PxSqrt(m), v * PxRecipSqrt(m));
+}
+
+// Quaternion that rotates v0 to v1.
+// NOTE(review): the formulas look like they assume unit-length inputs - confirm against callers.
+PX_CUDA_CALLABLE PX_INLINE PxQuat rotationArc(const PxVec3& v0, const PxVec3& v1)
+{
+	const PxReal d = v0.dot(v1);
+
+	// (Almost) opposite vectors: build the result from a vector perpendicular to v0 instead of the cross product.
+	if(d <= -0.99999f)
+	{
+		const PxQuat halfTurn = PxAbs(v0.x) < 0.1f ? PxQuat(0.0f, v0.z, -v0.y, 0.0f) : PxQuat(v0.y, -v0.x, 0.0f, 0.0f);
+		return halfTurn.getNormalized();
+	}
+
+	const PxVec3 axis = v0.cross(v1);
+	const PxReal s = PxSqrt((1 + d) * 2);
+	const PxReal r = 1 / s;
+
+	return PxQuat(axis.x * r, axis.y * r, axis.z * r, s * 0.5f).getNormalized();
+}
+
+/**
+\brief Returns the index (0 = x, 1 = y, 2 = z) of the largest component of v. Ties go to the lower index.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 largestAxis(const PxVec3& v)
+{
+	PxU32 best = 0;
+	if(v.y > v.x)
+		best = 1;
+	if(v.z > v[best])
+		return 2;
+	return best;
+}
+
+/**
+\brief Returns the index of the largest component of v and writes the two remaining axis indices, in ascending
+order, to other1 and other2.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 largestAxis(const PxVec3& v, PxU32& other1, PxU32& other2)
+{
+	if(v.x >= PxMax(v.y, v.z))
+	{
+		other1 = 1;
+		other2 = 2;
+		return 0;
+	}
+
+	// x lost, so x is always one of the remaining axes; y and z compete for the largest
+	other1 = 0;
+	if(v.y >= v.z)
+	{
+		other2 = 2;
+		return 1;
+	}
+	other2 = 1;
+	return 2;
+}
+
+/**
+\brief returns the index (0 = x, 1 = y, 2 = z) of the component of v with the largest absolute value, i.e. the
+coordinate axis closest to the direction of v. Ties go to the lower index.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 closestAxis(const PxVec3& v)
+{
+	PxU32 m = PxU32(PxAbs(v.y) > PxAbs(v.x) ? 1 : 0);
+	return PxAbs(v.z) > PxAbs(v[m]) ? 2 : m;
+}
+
+//! Returns the index m of the component of v with the largest absolute value and writes the two other axes to
+//! j and k such that (m, j, k) is a cyclic permutation of (0, 1, 2).
+PX_CUDA_CALLABLE PX_INLINE PxU32 closestAxis(const PxVec3& v, PxU32& j, PxU32& k)
+{
+	// find largest 2D plane projection
+	const PxF32 absX = PxAbs(v.x);
+	const PxF32 absY = PxAbs(v.y);
+	const PxF32 absZ = PxAbs(v.z);
+
+	if(absY > absX && absY > absZ)
+	{
+		// y biggest
+		j = 2;
+		k = 0;
+		return 1;
+	}
+
+	if(absZ > absX)
+	{
+		// z biggest
+		j = 0;
+		k = 1;
+		return 2;
+	}
+
+	// x biggest axis
+	j = 1;
+	k = 2;
+	return 0;
+}
+
+/*!
+Extend an edge along its length: p0 and p1 are each moved outwards by the distance fatCoeff along the edge
+direction. A zero-length edge is left untouched.
+*/
+PX_CUDA_CALLABLE PX_FORCE_INLINE void makeFatEdge(PxVec3& p0, PxVec3& p1, PxReal fatCoeff)
+{
+	PxVec3 offset = p1 - p0;
+
+	const PxReal length = offset.magnitude();
+	if(!(length > 0.0f))
+		return;
+
+	// rescale the edge vector to length fatCoeff
+	offset *= fatCoeff / length;
+	p0 -= offset;
+	p1 += offset;
+}
+
+//! Compute the point (1 - u - v) * p0 + u * p1 + v * p2 from barycentric coordinates (u, v)
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3
+computeBarycentricPoint(const PxVec3& p0, const PxVec3& p1, const PxVec3& p2, PxReal u, PxReal v)
+{
+	// Written out per component because the vector expression
+	// (1.0f - u - v)*p0 + u*p1 + v*p2 seems to confuse the compiler...
+	const PxF32 w = 1.0f - u - v;
+	const PxF32 x = w * p0.x + u * p1.x + v * p2.x;
+	const PxF32 y = w * p0.y + u * p1.y + v * p2.y;
+	const PxF32 z = w * p0.z + u * p1.z + v * p2.z;
+	return PxVec3(x, y, z);
+}
+
+// Splits q into a pair of quaternions (swing, twist) such that q = swing * twist, with
+// swing.x = 0
+// twist.y = twist.z = 0, and twist is a unit quat.
+// When q.x is exactly zero there is no twist component and twist is the identity.
+PX_FORCE_INLINE void separateSwingTwist(const PxQuat& q, PxQuat& swing, PxQuat& twist)
+{
+	if(q.x != 0.0f)
+		twist = PxQuat(q.x, 0, 0, q.w).getNormalized();
+	else
+		twist = PxQuat(PxIdentity);
+
+	swing = q * twist.getConjugate();
+}
+
+// Generates two tangent vectors for a given normal: tangent0 is a normalized vector perpendicular to the normal,
+// tangent1 = normal x tangent0 (not re-normalized).
+PX_FORCE_INLINE void normalToTangents(const PxVec3& normal, PxVec3& tangent0, PxVec3& tangent1)
+{
+	// pick the perpendicular that does not degenerate for the dominant component of the normal
+	if(PxAbs(normal.x) < 0.70710678f)
+		tangent0 = PxVec3(0, -normal.z, normal.y);
+	else
+		tangent0 = PxVec3(-normal.y, normal.x, 0);
+
+	tangent0.normalize();
+	tangent1 = normal.cross(tangent0);
+}
+
+// todo: what is this function doing? (defined out of line; its behaviour cannot be documented from this header)
+PX_FOUNDATION_API PxQuat computeQuatFromNormal(const PxVec3& n);
+
+/**
+\brief computes an oriented bounding box around the scaled basis.
+\param basis Input = skewed basis, Output = (normalized) orthogonal basis.
+\return Bounding box extent.
+*/
+PX_FOUNDATION_API PxVec3 optimizeBoundingBox(PxMat33& basis);
+
+PX_FOUNDATION_API PxQuat slerp(const PxReal t, const PxQuat& left, const PxQuat& right); // defined out of line; presumably spherical interpolation between left and right by t - confirm
+
+PX_CUDA_CALLABLE PX_INLINE PxVec3 ellipseClamp(const PxVec3& point, const PxVec3& radii)
+{
+	// This function needs to be implemented in the header file because
+	// it is included in an SPU shader program.
+
+	// finds the closest point on the ellipse to a given point; only y and z are used, the result has x = 0
+
+	// (p.y, p.z) is the input point
+	// (e.y, e.z) are the radii of the ellipse
+
+	// lagrange multiplier method with Newton/Halley hybrid root-finder.
+	// see http://www.geometrictools.com/Documentation/DistancePointToEllipse2.pdf
+	// for proof of Newton step robustness and initial estimate.
+	// Halley converges much faster but sometimes overshoots - when that happens we take
+	// a newton step instead
+	// NOTE(review): the loop below only ever takes the Newton step (t -= f / df); no Halley step is present.
+
+	// converges in 1-2 iterations where D&C works well, and it's good with 4 iterations
+	// with any ellipse that isn't completely crazy
+
+	const PxU32 MAX_ITERATIONS = 20;
+	const PxReal convergenceThreshold = 1e-4f;
+
+	// iteration requires first quadrant but we recover generality later
+
+	PxVec3 q(0, PxAbs(point.y), PxAbs(point.z));
+	const PxReal tinyEps = 1e-6f; // very close to minor axis is numerically problematic but trivial
+	if(radii.y >= radii.z)
+	{
+		if(q.z < tinyEps)
+			return PxVec3(0, point.y > 0 ? radii.y : -radii.y, 0);
+	}
+	else
+	{
+		if(q.y < tinyEps)
+			return PxVec3(0, 0, point.z > 0 ? radii.z : -radii.z);
+	}
+
+	PxVec3 denom, e2 = radii.multiply(radii), eq = radii.multiply(q);
+
+	// we can use any initial guess which is > maximum(-e.y^2,-e.z^2) and for which f(t) is > 0.
+	// this guess works well near the axes, but is weak along the diagonals.
+
+	PxReal t = PxMax(eq.y - e2.y, eq.z - e2.z);
+
+	for(PxU32 i = 0; i < MAX_ITERATIONS; i++)
+	{
+		denom = PxVec3(0, 1 / (t + e2.y), 1 / (t + e2.z));
+		PxVec3 denom2 = eq.multiply(denom);
+
+		PxVec3 fv = denom2.multiply(denom2);
+		PxReal f = fv.y + fv.z - 1;
+
+		// although in exact arithmetic we are guaranteed f>0, we can get here
+		// on the first iteration via catastrophic cancellation if the point is
+		// very close to the origin. In that case we just behave as if f=0
+
+		if(f < convergenceThreshold)
+			return e2.multiply(point).multiply(denom);
+
+		PxReal df = fv.dot(denom) * -2.0f;
+		t = t - f / df;
+	}
+
+	// we didn't converge, so rescale the current estimate so that it lies on the ellipse
+	PxVec3 r = e2.multiply(point).multiply(denom);
+	return r * PxRecipSqrt(sqr(r.y / radii.y) + sqr(r.z / radii.z));
+}
+
+//! Returns sin / (1 + cos), which equals tan(angle / 2) when sin and cos are the sine and cosine of the same angle.
+//! The quotient is not guarded against cos == -1.
+PX_CUDA_CALLABLE PX_INLINE PxReal tanHalf(PxReal sin, PxReal cos)
+{
+	const PxReal onePlusCos = 1 + cos;
+	return sin / onePlusCos;
+}
+
+//! Converts a "tan quarter-angle" vector into a quaternion: for v = axis * tan(angle / 4) with a unit axis the
+//! result is the unit quaternion (axis * sin(angle / 2), cos(angle / 2)). (Almost) zero vectors give the identity.
+PX_INLINE PxQuat quatFromTanQVector(const PxVec3& v)
+{
+	const PxReal v2 = v.dot(v);
+	if(v2 < 1e-12f)
+		return PxQuat(PxIdentity);
+
+	const PxReal d = 1 / (1 + v2);
+	const PxQuat unscaled(v.x * 2, v.y * 2, v.z * 2, 1 - v2);
+	return unscaled * d;
+}
+
+PX_FORCE_INLINE PxVec3 cross100(const PxVec3& b) // cross product (1, 0, 0) x b with the zero terms removed
+{
+	return PxVec3(0.0f, -b.z, b.y);
+}
+PX_FORCE_INLINE PxVec3 cross010(const PxVec3& b) // cross product (0, 1, 0) x b with the zero terms removed
+{
+	return PxVec3(b.z, 0.0f, -b.x);
+}
+PX_FORCE_INLINE PxVec3 cross001(const PxVec3& b) // cross product (0, 0, 1) x b with the zero terms removed
+{
+	return PxVec3(-b.y, b.x, 0.0f);
+}
+
+//! Splits outwardDir into its projection onto outwardNormal (normalCompo) and the remainder (tangentCompo).
+//! The projection is only an orthogonal projection when outwardNormal has unit length.
+PX_INLINE void decomposeVector(PxVec3& normalCompo, PxVec3& tangentCompo, const PxVec3& outwardDir,
+                               const PxVec3& outwardNormal)
+{
+	const PxReal alongNormal = outwardDir.dot(outwardNormal);
+	normalCompo = outwardNormal * alongNormal;
+	tangentCompo = outwardDir - normalCompo;
+}
+
+//! \brief Return (i+1)%3 for i in {0, 1, 2}, computed without a division: 0 -> 1, 1 -> 2, 2 -> 0
+// Avoid variable shift for XBox, i.e. do not use the equivalent:
+// PX_INLINE PxU32 Ps::getNextIndex3(PxU32 i)			{	return (1<<i) & 3;			}
+PX_INLINE PxU32 getNextIndex3(PxU32 i)
+{
+	return (i + 1 + (i >> 1)) & 3;
+}
+
+/**
+\brief Builds the rotation matrix that rotates the unit vector 'from' onto the unit vector 'to'.
+
+Both inputs are expected to be normalized. The rotation is about the axis from x to by the angle between the
+two vectors; (almost) identical inputs return the identity and (almost) opposite inputs return a half turn
+about an axis perpendicular to 'from'.
+*/
+PX_INLINE PxMat33 rotFrom2Vectors(const PxVec3& from, const PxVec3& to)
+{
+	// See bottom of http://www.euclideanspace.com/maths/algebra/matrix/orthogonal/rotation/index.htm
+
+	// Early exit if to = from
+	if((from - to).magnitudeSquared() < 1e-4f)
+		return PxMat33(PxIdentity);
+
+	// Early exit if to = -from: any half turn about a unit axis a perpendicular to 'from' maps from to -from.
+	// Such a rotation is R = 2 * a * a^T - I. The axis is the X axis made perpendicular to 'from' (or the Y axis
+	// when 'from' is too close to X), so a 'from' in the YZ plane still yields diag(1, -1, -1).
+	if((from + to).magnitudeSquared() < 1e-4f)
+	{
+		const PxVec3 reference = PxAbs(from.x) < 0.70710678f ? PxVec3(1.0f, 0.0f, 0.0f) : PxVec3(0.0f, 1.0f, 0.0f);
+		const PxVec3 a = (reference - from * from.dot(reference)).getNormalized();
+
+		// R is symmetric, so its columns are 2 * a * a[i] - e_i
+		return PxMat33(a * (2.0f * a.x) - PxVec3(1.0f, 0.0f, 0.0f), a * (2.0f * a.y) - PxVec3(0.0f, 1.0f, 0.0f),
+		               a * (2.0f * a.z) - PxVec3(0.0f, 0.0f, 1.0f));
+	}
+
+	// The axis-angle matrix below requires a UNIT axis. from x to has length sin(angle), which the two early
+	// exits above keep away from zero, so normalize it and reuse its length as the sine.
+	PxVec3 n = from.cross(to);
+	const PxReal S = n.magnitude();
+	n *= 1.0f / S;
+
+	const PxReal C = from.dot(to), CC = 1 - C;
+
+	const PxReal xx = n.x * n.x, yy = n.y * n.y, zz = n.z * n.z, xy = n.x * n.y, yz = n.y * n.z, xz = n.x * n.z;
+
+	PxMat33 R;
+
+	R(0, 0) = 1 + CC * (xx - 1);
+	R(0, 1) = -n.z * S + CC * xy;
+	R(0, 2) = n.y * S + CC * xz;
+
+	R(1, 0) = n.z * S + CC * xy;
+	R(1, 1) = 1 + CC * (yy - 1);
+	R(1, 2) = -n.x * S + CC * yz;
+
+	R(2, 0) = -n.y * S + CC * xz;
+	R(2, 1) = n.x * S + CC * yz;
+	R(2, 2) = 1 + CC * (zz - 1);
+
+	return R;
+}
+
+PX_FOUNDATION_API void integrateTransform(const PxTransform& curTrans, const PxVec3& linvel, const PxVec3& angvel,
+                                          PxReal timeStep, PxTransform& result); // defined out of line; presumably advances curTrans by the velocities over timeStep and writes the pose to result - confirm
+
+//! Completes the unit vector dir to a basis (dir, right, up).
+//! 'right' is derived from the Y axis unless dir is (almost) parallel to it, in which case the X axis is used.
+PX_INLINE void computeBasis(const PxVec3& dir, PxVec3& right, PxVec3& up)
+{
+	// Derive two remaining vectors
+	const bool usableYAxis = PxAbs(dir.y) <= 0.9999f;
+	if(!usableYAxis)
+	{
+		right = PxVec3(1.0f, 0.0f, 0.0f);
+
+		// up = dir x right with right = (1, 0, 0)
+		up = PxVec3(0.0f, dir.z, -dir.y);
+		up.normalize();
+		return;
+	}
+
+	// right = Y x dir
+	right = PxVec3(dir.z, 0.0f, -dir.x);
+	right.normalize();
+
+	// PT: normalize not needed for 'up' because dir & right are unit vectors,
+	// and by construction the angle between them is 90 degrees (i.e. sin(angle)=1)
+	up = PxVec3(dir.y * right.z, dir.z * right.x - dir.x * right.z, -dir.y * right.x);
+}
+
+//! Computes the normalized direction from p0 to p1 and completes it to a basis (dir, right, up).
+PX_INLINE void computeBasis(const PxVec3& p0, const PxVec3& p1, PxVec3& dir, PxVec3& right, PxVec3& up)
+{
+	// Direction of the segment p0 -> p1
+	dir = p1 - p0;
+	dir.normalize();
+
+	// The single-direction overload supplies the other two vectors
+	computeBasis(dir, right, up);
+}
+
+//! Returns true when no component of v exceeds 1e-6 in absolute value.
+PX_FORCE_INLINE bool isAlmostZero(const PxVec3& v)
+{
+	const bool anyComponentLarge = PxAbs(v.x) > 1e-6f || PxAbs(v.y) > 1e-6f || PxAbs(v.z) > 1e-6f;
+	return !anyComponentLarge;
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif
diff --git a/PxShared/src/foundation/include/PsMutex.h b/PxShared/src/foundation/include/PsMutex.h
new file mode 100644
index 0000000..7c93796
--- /dev/null
+++ b/PxShared/src/foundation/include/PsMutex.h
@@ -0,0 +1,330 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSMUTEX_H
+#define PSFOUNDATION_PSMUTEX_H
+
+#include "PsAllocator.h"
+
+/*
+ * This <new> inclusion is a best known fix for gcc 4.4.1 error:
+ * Creating object file for apex/src/PsAllocator.cpp ...
+ * In file included from apex/include/PsFoundation.h:30,
+ *                from apex/src/PsAllocator.cpp:26:
+ * apex/include/PsMutex.h: In constructor  'physx::shdfnd::MutexT<Alloc>::MutexT(const Alloc&)':
+ * apex/include/PsMutex.h:92: error: no matching function for call to 'operator new(unsigned int,
+ * physx::shdfnd::MutexImpl*&)'
+ * <built-in>:0: note: candidates are: void* operator new(unsigned int)
+ */
+#include <new>
+
+namespace physx
+{
+namespace shdfnd
+{
+class PX_FOUNDATION_API MutexImpl
+{
+  public:
+	/**
+	The constructor for Mutex creates a mutex. It is initially unlocked.
+	*/
+	MutexImpl();
+
+	/**
+	The destructor for Mutex deletes the mutex.
+	*/
+	~MutexImpl();
+
+	/**
+	Acquire (lock) the mutex. If the mutex is already locked
+	by another thread, this method blocks until the mutex is
+	unlocked.
+	*/
+	void lock();
+
+	/**
+	Try to acquire (lock) the mutex. If the mutex is already locked
+	by another thread, this method returns false without blocking.
+	*/
+	bool trylock();
+
+	/**
+	Release (unlock) the mutex.
+	*/
+	void unlock();
+
+	/**
+	Size in bytes to allocate for a MutexImpl. MutexT allocates this many bytes
+	before placement-constructing the implementation object.
+	*/
+	static const uint32_t& getSize();
+};
+
+template <typename Alloc = ReflectionAllocator<MutexImpl> >
+class MutexT : protected Alloc // mutex front end: owns a MutexImpl that is allocated through Alloc
+{
+	PX_NOCOPY(MutexT)
+  public:
+	class ScopedLock // scope guard: locks the mutex on construction and unlocks it on destruction
+	{
+		MutexT<Alloc>& mMutex;
+		PX_NOCOPY(ScopedLock)
+	  public:
+		PX_INLINE ScopedLock(MutexT<Alloc>& mutex) : mMutex(mutex)
+		{
+			mMutex.lock();
+		}
+		PX_INLINE ~ScopedLock()
+		{
+			mMutex.unlock();
+		}
+	};
+
+	/**
+	The constructor for Mutex creates a mutex. It is initially unlocked.
+	Storage of MutexImpl::getSize() bytes is taken from the allocator and the
+	implementation is placement-constructed into it.
+	*/
+	MutexT(const Alloc& alloc = Alloc()) : Alloc(alloc)
+	{
+		mImpl = reinterpret_cast<MutexImpl*>(Alloc::allocate(MutexImpl::getSize(), __FILE__, __LINE__));
+		PX_PLACEMENT_NEW(mImpl, MutexImpl)();
+	}
+
+	/**
+	The destructor for Mutex deletes the mutex: the implementation is destroyed
+	explicitly and its storage is returned to the allocator.
+	*/
+	~MutexT()
+	{
+		mImpl->~MutexImpl();
+		Alloc::deallocate(mImpl);
+	}
+
+	/**
+	Acquire (lock) the mutex. If the mutex is already locked
+	by another thread, this method blocks until the mutex is
+	unlocked. Const so that a const MutexT can still be locked.
+	*/
+	void lock() const
+	{
+		mImpl->lock();
+	}
+
+	/**
+	Try to acquire (lock) the mutex. If the mutex is already locked
+	by another thread, this method returns false without blocking,
+	returns true if lock is successfully acquired
+	*/
+	bool trylock() const
+	{
+		return mImpl->trylock();
+	}
+
+	/**
+	Release (unlock) the mutex, the calling thread must have
+	previously called lock() or method will error
+	*/
+	void unlock() const
+	{
+		mImpl->unlock();
+	}
+
+  private:
+	MutexImpl* mImpl; // implementation object; allocated in the constructor, freed in the destructor
+};
+
+class PX_FOUNDATION_API ReadWriteLock // reader/writer lock; all state lives in an out-of-line implementation object
+{
+	PX_NOCOPY(ReadWriteLock)
+  public:
+	ReadWriteLock();
+	~ReadWriteLock();
+
+	void lockReader(); // acquire for reading; pair with unlockReader()
+	void lockWriter(); // acquire for writing; pair with unlockWriter()
+
+	void unlockReader(); // release a lock taken with lockReader()
+	void unlockWriter(); // release a lock taken with lockWriter()
+
+  private:
+	class ReadWriteLockImpl* mImpl; // implementation object; its type is only forward declared in this header
+};
+
+class ScopedReadLock // scope guard: calls lockReader() on construction and unlockReader() on destruction
+{
+	PX_NOCOPY(ScopedReadLock)
+  public:
+	PX_INLINE ScopedReadLock(ReadWriteLock& lock) : mLock(lock)
+	{
+		mLock.lockReader();
+	}
+	PX_INLINE ~ScopedReadLock()
+	{
+		mLock.unlockReader();
+	}
+
+  private:
+	ReadWriteLock& mLock; // guarded lock; must outlive this object
+};
+
+class ScopedWriteLock // scope guard: calls lockWriter() on construction and unlockWriter() on destruction
+{
+	PX_NOCOPY(ScopedWriteLock)
+  public:
+	PX_INLINE ScopedWriteLock(ReadWriteLock& lock) : mLock(lock)
+	{
+		mLock.lockWriter();
+	}
+	PX_INLINE ~ScopedWriteLock()
+	{
+		mLock.unlockWriter();
+	}
+
+  private:
+	ReadWriteLock& mLock; // guarded lock; must outlive this object
+};
+
+typedef MutexT<> Mutex;
+
+/*
+ * Use this type of lock for mutex behaviour that must operate on SPU and PPU
+ * On non-PS3 platforms, it is implemented using Mutex
+ *
+ * In this implementation lock() and unlock() forward to the wrapped Mutex and always
+ * return true; trylock() returns the result of Mutex::trylock().
+ */
+class AtomicLock
+{
+	Mutex mMutex;
+	PX_NOCOPY(AtomicLock)
+
+  public:
+	AtomicLock()
+	{
+	}
+
+	bool lock()
+	{
+		mMutex.lock();
+		return true;
+	}
+
+	bool trylock()
+	{
+		return mMutex.trylock();
+	}
+
+	bool unlock()
+	{
+		mMutex.unlock();
+		return true;
+	}
+};
+
+class AtomicLockCopy // copyable, non-owning handle that forwards every call to an AtomicLock
+{
+	AtomicLock* pLock; // target lock; NULL until operator= has been called
+
+  public:
+	AtomicLockCopy() : pLock(NULL)
+	{
+	}
+
+	AtomicLockCopy& operator=(AtomicLock& lock) // binds this handle to 'lock' (the lock itself is not copied)
+	{
+		pLock = &lock;
+		return *this;
+	}
+
+	bool lock() // no NULL check: a lock must have been assigned first
+	{
+		return pLock->lock();
+	}
+
+	bool trylock() // no NULL check: a lock must have been assigned first
+	{
+		return pLock->trylock();
+	}
+
+	bool unlock() // no NULL check: a lock must have been assigned first
+	{
+		return pLock->unlock();
+	}
+};
+
+class AtomicRwLock // thin wrapper that forwards to an owned ReadWriteLock
+{
+	ReadWriteLock m_Lock;
+	PX_NOCOPY(AtomicRwLock)
+
+  public:
+	AtomicRwLock()
+	{
+	}
+
+	void lockReader()
+	{
+		m_Lock.lockReader();
+	}
+	void lockWriter()
+	{
+		m_Lock.lockWriter();
+	}
+
+	bool tryLockReader()
+	{
+		// Todo - implement this: for now this calls lockReader() (no non-blocking attempt) and always returns true
+		m_Lock.lockReader();
+		return true;
+	}
+
+	void unlockReader()
+	{
+		m_Lock.unlockReader();
+	}
+	void unlockWriter()
+	{
+		m_Lock.unlockWriter();
+	}
+};
+
+/*
+ * Scope guard for AtomicLock: locks on construction and unlocks on destruction.
+ * The constructor and destructor must be public, otherwise no guard object can be created.
+ */
+class ScopedAtomicLock
+{
+	PX_NOCOPY(ScopedAtomicLock)
+  public:
+	PX_INLINE ScopedAtomicLock(AtomicLock& lock) : mLock(lock)
+	{
+		mLock.lock();
+	}
+	PX_INLINE ~ScopedAtomicLock()
+	{
+		mLock.unlock();
+	}
+
+  private:
+	AtomicLock& mLock; // guarded lock; must outlive this object
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSMUTEX_H
diff --git a/PxShared/src/foundation/include/PsPool.h b/PxShared/src/foundation/include/PsPool.h
new file mode 100644
index 0000000..796251a
--- /dev/null
+++ b/PxShared/src/foundation/include/PsPool.h
@@ -0,0 +1,298 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSPOOL_H
+#define PSFOUNDATION_PSPOOL_H
+
+#include "PsArray.h"
+#include "PsSort.h"
+#include "PsBasicTemplates.h"
+#include "PsInlineArray.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+/*!
+Simple allocation pool.
+
+Storage is requested from Alloc in slabs of mSlabSize bytes, each providing mElementsPerSlab elements of T.
+Unused elements are chained into an intrusive free list (FreeList) stored inside the element memory itself,
+hence the compile-time requirement sizeof(T) >= sizeof(size_t).
+*/
+template <class T, class Alloc = typename AllocatorTraits<T>::Type>
+class PoolBase : public UserAllocated, public Alloc
+{
+	PX_NOCOPY(PoolBase)
+  protected:
+	// alloc:           allocator used for the slabs and for the slab bookkeeping array
+	// elementsPerSlab: number of T elements carved out of every slab
+	// slabSize:        number of bytes requested from the allocator for every slab
+	PoolBase(const Alloc& alloc, uint32_t elementsPerSlab, uint32_t slabSize)
+	: Alloc(alloc), mSlabs(alloc), mElementsPerSlab(elementsPerSlab), mUsed(0), mSlabSize(slabSize), mFreeElement(0)
+	{
+		PX_COMPILE_TIME_ASSERT(sizeof(T) >= sizeof(size_t));
+	}
+
+  public:
+	// Destroys every element that is still live, then returns all slabs to the allocator.
+	~PoolBase()
+	{
+		if(mUsed)
+			disposeElements();
+
+		for(void** slabIt = mSlabs.begin(), **slabEnd = mSlabs.end(); slabIt != slabEnd; ++slabIt)
+			Alloc::deallocate(*slabIt);
+	}
+
+	// Allocate space for single object. Returns 0 if no element could be obtained, i.e. the
+	// slab allocation failed or a slab holds no elements.
+	PX_INLINE T* allocate()
+	{
+		if(mFreeElement == 0)
+			allocateSlab();
+		if(mFreeElement == 0)
+			return 0;
+
+		T* p = reinterpret_cast<T*>(mFreeElement);
+		mFreeElement = mFreeElement->mNext;
+		mUsed++;
+/**
+Mark the returned element with the 0xcd pattern. This is used to check that the meta data
+definition for serialized classes is complete in checked builds.
+*/
+#if PX_CHECKED
+		for(uint32_t i = 0; i < sizeof(T); ++i)
+			reinterpret_cast<uint8_t*>(p)[i] = 0xcd;
+#endif
+		return p;
+	}
+
+	// Put space for a single element back in the lists. The element is NOT destroyed; use destroy() for that.
+	PX_INLINE void deallocate(T* p)
+	{
+		if(p)
+		{
+			PX_ASSERT(mUsed);
+			mUsed--;
+			push(reinterpret_cast<FreeList*>(p));
+		}
+	}
+
+	// construct(): allocate an element and placement-construct a T in it, forwarding the given
+	// arguments to the constructor. Each overload returns 0 if allocate() returned 0.
+	PX_INLINE T* construct()
+	{
+		T* t = allocate();
+		return t ? new (t) T() : 0;
+	}
+
+	template <class A1>
+	PX_INLINE T* construct(A1& a)
+	{
+		T* t = allocate();
+		return t ? new (t) T(a) : 0;
+	}
+
+	template <class A1, class A2>
+	PX_INLINE T* construct(A1& a, A2& b)
+	{
+		T* t = allocate();
+		return t ? new (t) T(a, b) : 0;
+	}
+
+	template <class A1, class A2, class A3>
+	PX_INLINE T* construct(A1& a, A2& b, A3& c)
+	{
+		T* t = allocate();
+		return t ? new (t) T(a, b, c) : 0;
+	}
+
+	template <class A1, class A2, class A3>
+	PX_INLINE T* construct(A1* a, A2& b, A3& c)
+	{
+		T* t = allocate();
+		return t ? new (t) T(a, b, c) : 0;
+	}
+
+	template <class A1, class A2, class A3, class A4>
+	PX_INLINE T* construct(A1& a, A2& b, A3& c, A4& d)
+	{
+		T* t = allocate();
+		return t ? new (t) T(a, b, c, d) : 0;
+	}
+
+	template <class A1, class A2, class A3, class A4, class A5>
+	PX_INLINE T* construct(A1& a, A2& b, A3& c, A4& d, A5& e)
+	{
+		T* t = allocate();
+		return t ? new (t) T(a, b, c, d, e) : 0;
+	}
+
+	// Run the destructor of *p and hand its storage back to the pool. Null pointers are ignored.
+	PX_INLINE void destroy(T* const p)
+	{
+		if(p)
+		{
+			p->~T();
+			deallocate(p);
+		}
+	}
+
+  protected:
+	// Node of the intrusive free list; overlays the storage of an unused element.
+	struct FreeList
+	{
+		FreeList* mNext;
+	};
+
+	// All the allocated slabs, in allocation order until disposeElements()/releaseEmptySlabs() sort them by pointer
+	InlineArray<void*, 64, Alloc> mSlabs;
+
+	uint32_t mElementsPerSlab; // number of T elements per slab
+	uint32_t mUsed;            // number of elements currently handed out
+	uint32_t mSlabSize;        // bytes requested from the allocator per slab
+
+	FreeList* mFreeElement; // Head of free-list
+
+	// Push an unused element onto the head of the free list
+	void push(FreeList* p)
+	{
+		p->mNext = mFreeElement;
+		mFreeElement = p;
+	}
+
+	// Allocate a slab and segregate it into the freelist
+	void allocateSlab()
+	{
+		T* slab = reinterpret_cast<T*>(Alloc::allocate(mSlabSize, __FILE__, __LINE__));
+		if(!slab)
+			return; // allocation failed: leave the free list empty so that allocate() can report it
+
+		mSlabs.pushBack(slab);
+
+		// Build a chain of nodes for the freelist, back to front so that slab[0] ends up at the head.
+		// Decrementing inside the loop body never forms a pointer in front of the slab.
+		for(T* it = slab + mElementsPerSlab; it != slab;)
+			push(reinterpret_cast<FreeList*>(--it));
+	}
+
+	/*
+	Cleanup method. Go through all active slabs and call destructor for live objects.
+	The slabs themselves are freed by the caller (the destructor). The free list is consumed by this call.
+	*/
+	void disposeElements()
+	{
+		Array<void*, Alloc> freeNodes(*this);
+		while(mFreeElement)
+		{
+			freeNodes.pushBack(mFreeElement);
+			mFreeElement = mFreeElement->mNext;
+		}
+		Alloc& alloc(*this);
+		sort(freeNodes.begin(), freeNodes.size(), Less<void*>(), alloc);
+		sort(mSlabs.begin(), mSlabs.size(), Less<void*>(), alloc);
+
+		// Slabs and free nodes are both sorted by address, so one forward pass can tell the free elements
+		// (skipped) from the live ones (destroyed).
+		typename Array<void*, Alloc>::Iterator slabIt = mSlabs.begin(), slabEnd = mSlabs.end();
+		for(typename Array<void*, Alloc>::Iterator freeIt = freeNodes.begin(); slabIt != slabEnd; ++slabIt)
+		{
+			for(T* tIt = reinterpret_cast<T*>(*slabIt), *tEnd = tIt + mElementsPerSlab; tIt != tEnd; ++tIt)
+			{
+				if(freeIt != freeNodes.end() && *freeIt == tIt)
+					++freeIt;
+				else
+					tIt->~T();
+			}
+		}
+	}
+
+	/*
+	Go through all slabs and return those whose elements are all unused to the allocator.
+	Nothing is released unless more than one slab's worth of elements is free.
+	*/
+	void releaseEmptySlabs()
+	{
+		Array<void*, Alloc> freeNodes(*this);
+		Array<void*, Alloc> slabNodes(mSlabs, *this);
+		while(mFreeElement)
+		{
+			freeNodes.pushBack(mFreeElement);
+			mFreeElement = mFreeElement->mNext;
+		}
+
+		typename Array<void*, Alloc>::Iterator freeIt = freeNodes.begin(), freeEnd = freeNodes.end();
+
+		if(freeNodes.size() > mElementsPerSlab)
+		{
+			// Last position from which a complete slab's worth of free nodes is still available.
+			// Computed only here, where the size check guarantees it lies inside the array.
+			const typename Array<void*, Alloc>::Iterator lastCheck = freeEnd - mElementsPerSlab;
+
+			Alloc& alloc(*this);
+			sort(freeNodes.begin(), freeNodes.size(), Less<void*>(), alloc);
+			sort(slabNodes.begin(), slabNodes.size(), Less<void*>(), alloc);
+
+			mSlabs.clear();
+			for(void** slabIt = slabNodes.begin(), **slabEnd = slabNodes.end(); slabIt != slabEnd; ++slabIt)
+			{
+				// free nodes below this slab belong to slabs that are kept: return them to the free list
+				while((freeIt < lastCheck) && (*slabIt > (*freeIt)))
+				{
+					push(reinterpret_cast<FreeList*>(*freeIt));
+					freeIt++;
+				}
+
+				// Only look at the free nodes while a full slab's worth of them remains; releasing a slab
+				// can move freeIt past lastCheck (up to freeEnd), where reading would run off the array.
+				if((freeIt <= lastCheck) && (*slabIt == (*freeIt))) // the slab's first element in freeList
+				{
+					// end of the element area; mSlabSize may include unused tail bytes
+					const size_t endSlabAddress = size_t(*slabIt) + size_t(mElementsPerSlab) * sizeof(T);
+					const size_t endFreeAddress = size_t(*(freeIt + mElementsPerSlab - 1));
+					if(endFreeAddress + sizeof(T) == endSlabAddress)
+					{ // all slab's element in freeList
+						Alloc::deallocate(*slabIt);
+						freeIt += mElementsPerSlab;
+						continue;
+					}
+				}
+
+				mSlabs.pushBack(*slabIt);
+			}
+		}
+
+		// whatever is left belongs to slabs that were kept
+		while(freeIt != freeEnd)
+		{
+			push(reinterpret_cast<FreeList*>(*freeIt));
+			++freeIt;
+		}
+	}
+};
+
+// original pool implementation: sized by occupancy, every slab holds elementsPerSlab (default 32) elements of T
+template <class T, class Alloc = typename AllocatorTraits<T>::Type>
+class Pool : public PoolBase<T, Alloc>
+{
+  public:
+	Pool(const Alloc& alloc = Alloc(), uint32_t elementsPerSlab = 32)
+	: PoolBase<T, Alloc>(alloc, elementsPerSlab, elementsPerSlab * sizeof(T))
+	{
+	}
+};
+
+// allows specification of the slab size (in bytes) instead of the occupancy; a slab holds slabSize / sizeof(T) elements
+template <class T, uint32_t slabSize, class Alloc = typename AllocatorTraits<T>::Type>
+class Pool2 : public PoolBase<T, Alloc>
+{
+  public:
+	Pool2(const Alloc& alloc = Alloc()) : PoolBase<T, Alloc>(alloc, slabSize / sizeof(T), slabSize)
+	{
+	}
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSPOOL_H
diff --git a/PxShared/src/foundation/include/PsSList.h b/PxShared/src/foundation/include/PsSList.h
new file mode 100644
index 0000000..f811c37
--- /dev/null
+++ b/PxShared/src/foundation/include/PsSList.h
@@ -0,0 +1,140 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSLIST_H
+#define PSFOUNDATION_PSSLIST_H
+
+#include "foundation/Px.h"
+#include "foundation/PxAssert.h"
+#include "PsAlignedMalloc.h"
+
+#if PX_P64_FAMILY
+#define PX_SLIST_ALIGNMENT 16
+#else
+#define PX_SLIST_ALIGNMENT 8
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4324) // Padding was added at the end of a structure because of a __declspec(align) value.
+#endif
+
+#if !PX_GCC_FAMILY
+__declspec(align(PX_SLIST_ALIGNMENT)) // non-GCC compilers: the alignment specifier precedes the class keyword
+#endif
+    class SListEntry // list node handled by SListT/SListImpl; its only data is the link to the next entry
+{
+	friend struct SListImpl; // gives SListImpl access to the private mNext link
+
+  public:
+	SListEntry() : mNext(NULL) // a new entry starts with a NULL link
+	{
+		PX_ASSERT((size_t(this) & (PX_SLIST_ALIGNMENT - 1)) == 0); // address must be a multiple of PX_SLIST_ALIGNMENT
+	}
+
+	// Only use on elements returned by SList::flush()
+	// because the operation is not atomic.
+	SListEntry* next()
+	{
+		return mNext;
+	}
+
+  private:
+	SListEntry* mNext; // following entry, or NULL
+}
+#if PX_GCC_FAMILY
+__attribute__((aligned(PX_SLIST_ALIGNMENT))); // GCC family: the alignment attribute trails the class body
+#else
+;
+#endif
+
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+// template-less implementation: only declared in this header; SListT constructs it in storage it allocates
+struct PX_FOUNDATION_API SListImpl
+{
+	SListImpl();
+	~SListImpl();
+	void push(SListEntry* entry);
+	SListEntry* pop();
+	SListEntry* flush();
+	static const uint32_t& getSize(); // byte count SListT requests from its allocator for one SListImpl
+};
+
+template <typename Alloc = ReflectionAllocator<SListImpl> >
+class SListT : protected Alloc // owns one SListImpl that lives in storage obtained from Alloc
+{
+  public:
+	SListT(const Alloc& alloc = Alloc()) : Alloc(alloc)
+	{
+		mImpl = reinterpret_cast<SListImpl*>(Alloc::allocate(SListImpl::getSize(), __FILE__, __LINE__));
+		PX_ASSERT((size_t(mImpl) & (PX_SLIST_ALIGNMENT - 1)) == 0); // storage must be PX_SLIST_ALIGNMENT-aligned
+		PX_PLACEMENT_NEW(mImpl, SListImpl)();
+	}
+	~SListT()
+	{
+		mImpl->~SListImpl(); // built with placement new: destroy explicitly, then release the raw storage
+		Alloc::deallocate(mImpl);
+	}
+	// pushes a new element to the list
+	void push(SListEntry& entry)
+	{
+		mImpl->push(&entry);
+	}
+	// pops an element from the list
+	SListEntry* pop()
+	{
+		return mImpl->pop();
+	}
+	// removes all items from list, returns pointer to first element
+	SListEntry* flush()
+	{
+		return mImpl->flush();
+	}
+
+  private:
+	// Not copyable (declared, never defined): a copy would share mImpl and both destructors would destroy and free it.
+	SListT(const SListT&);
+	SListT& operator=(const SListT&);
+	SListImpl* mImpl;
+};
+
+typedef SListT<> SList;
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSSLIST_H
diff --git a/PxShared/src/foundation/include/PsSocket.h b/PxShared/src/foundation/include/PsSocket.h
new file mode 100644
index 0000000..0d8bf55
--- /dev/null
+++ b/PxShared/src/foundation/include/PsSocket.h
@@ -0,0 +1,186 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSOCKET_H
+#define PSFOUNDATION_PSSOCKET_H
+
+#include "PsUserAllocated.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+/**
+Socket abstraction API
+*/
+
+class PX_FOUNDATION_API Socket : public UserAllocated
+{
+  public:
+	static const uint32_t DEFAULT_BUFFER_SIZE; // declared only; the value is defined outside this header
+
+	Socket(bool inEnableBuffering = true, bool blocking = true); // both flags default to true
+
+	virtual ~Socket();
+
+	/*!
+	Opens a network socket for input and/or output
+
+	\param host
+	Name of the host to connect to. This can be an IP, URL, etc
+
+	\param port
+	The port to connect to on the remote host
+
+	\param timeout
+	Timeout in ms until the connection must be established.
+
+	\return
+	True if the connection was successful, false otherwise
+	*/
+	bool connect(const char* host, uint16_t port, uint32_t timeout = 1000);
+
+	/*!
+	Opens a network socket for input and/or output as a server and puts it in listening mode.
+
+	\param port
+	The port on which the socket listens
+	*/
+	bool listen(uint16_t port);
+
+	/*!
+	Accept a connection on a socket that is in listening mode
+
+	\note
+	This method only supports a single client connection.  Additional clients
+	that connect to the listening port will overwrite the existing socket handle.
+
+	\param block
+	whether or not the call should block
+
+	\return whether a connection was established
+	*/
+	bool accept(bool block);
+
+	/*!
+	Disconnects an open socket
+	*/
+	void disconnect();
+
+	/*!
+	Returns whether the socket is currently open (connected) or not.
+
+	\return
+	True if the socket is connected, false otherwise
+	*/
+	bool isConnected() const;
+
+	/*!
+	Returns the name of the connected host. This is the same as the string
+	that was supplied to the connect call.
+
+	\return
+	The name of the connected host
+	*/
+	const char* getHost() const;
+
+	/*!
+	Returns the port of the connected host. This is the same as the port
+	that was supplied to the connect call.
+
+	\return
+	The port of the connected host
+	*/
+	uint16_t getPort() const;
+
+	/*!
+	Flushes the output stream. Until the stream is flushed, there is no
+	guarantee that the written data has actually reached the destination
+	storage. Flush forces all buffered data to be sent to the output.
+
+	\note flush always blocks. If the socket is in non-blocking mode, this will result
+	in the thread spinning.
+
+	\return
+	True if the flush was successful, false otherwise
+	*/
+	bool flush();
+
+	/*!
+	Writes data to the output stream.
+
+	\param data
+	Pointer to a block of data to write to the stream
+
+	\param length
+	Amount of data to write, in bytes
+
+	\return
+	Number of bytes actually written. This could be lower than length if the socket is non-blocking.
+	*/
+
+	uint32_t write(const uint8_t* data, uint32_t length);
+
+	/*!
+	Reads data from the input stream.
+
+	\param data
+	Pointer to a buffer where the read data will be stored.
+
+	\param length
+	Amount of data to read, in bytes.
+
+	\return
+	Number of bytes actually read. This could be lower than length if the stream end is
+	encountered or the socket is non-blocking.
+	*/
+	uint32_t read(uint8_t* data, uint32_t length);
+
+	/*!
+	Sets blocking mode of the socket.
+	The socket must be connected, otherwise calling this method has no effect.
+	*/
+	void setBlocking(bool blocking);
+
+	/*!
+	Returns whether read/write/flush calls to the socket are blocking.
+
+	\return
+	True if the socket is blocking.
+	*/
+	bool isBlocking() const;
+
+  private:
+	class SocketImpl* mImpl; // opaque implementation object; SocketImpl is not defined in this header
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // PSFOUNDATION_PSSOCKET_H
diff --git a/PxShared/src/foundation/include/PsSort.h b/PxShared/src/foundation/include/PsSort.h
new file mode 100644
index 0000000..30808ae
--- /dev/null
+++ b/PxShared/src/foundation/include/PsSort.h
@@ -0,0 +1,130 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSORT_H
+#define PSFOUNDATION_PSSORT_H
+
+/** \addtogroup foundation
+@{
+*/
+
+#include "PsSortInternals.h"
+#include "PsAlloca.h"
+
+#define PX_SORT_PARANOIA PX_DEBUG
+
+/**
+\brief Sorts an array of objects in ascending order, assuming
+that the predicate implements the < operator.
+
+\see Less, Greater
+*/
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable : 4706) // disable the warning that we did an assignment within a conditional expression, as
+// this was intentional.
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+// Sorts elements[0, count) in place, leaving no i >= 1 for which compare(elements[i], elements[i - 1]) is true.
+// Quicksort driven by an explicit stack of index ranges (alloca'd at first, grown through inAllocator if needed).
+template <class T, class Predicate, class Allocator>
+void sort(T* elements, uint32_t count, const Predicate& compare, const Allocator& inAllocator,
+          const uint32_t initialStackSize = 32)
+{
+	static const uint32_t SMALL_SORT_CUTOFF = 5; // must be >= 3 since we need 3 for median
+	PX_ALLOCA(stackMem, int32_t, initialStackSize);
+	internal::Stack<Allocator> stack(stackMem, initialStackSize, inAllocator);
+	int32_t first = 0, last = int32_t(count - 1); // count == 0 gives last == -1, so the guard below skips all work
+	if(last > first)
+	{
+		for(;;)
+		{
+			while(last > first)
+			{
+				PX_ASSERT(first >= 0 && last < int32_t(count));
+				if(uint32_t(last - first) < SMALL_SORT_CUTOFF)
+				{
+					internal::smallSort(elements, first, last, compare);
+					break;
+				}
+
+				const int32_t partIndex = internal::partition(elements, first, last, compare);
+
+				// Defer the LARGER side and keep working on the smaller one: the range being worked on then
+				// at least halves with every push, which keeps the stack depth at O(log count). Deferring the
+				// smaller side instead lets the stack grow linearly on badly balanced input.
+				if((partIndex - first) < (last - partIndex))
+				{
+					stack.push(partIndex + 1, last);
+					last = partIndex - 1;
+				}
+				else
+				{
+					stack.push(first, partIndex - 1);
+					first = partIndex + 1;
+				}
+			}
+
+			if(stack.empty())
+				break;
+
+			stack.pop(first, last);
+		}
+	}
+#if PX_SORT_PARANOIA
+	for(uint32_t i = 1; i < count; i++)
+		PX_ASSERT(!compare(elements[i], elements[i - 1]));
+#endif
+}
+// Convenience overload: same sort, with a default-constructed AllocatorTraits<T>::Type backing the range stack.
+template <class T, class Predicate>
+void sort(T* elements, uint32_t count, const Predicate& compare)
+{
+	sort(elements, count, compare, typename shdfnd::AllocatorTraits<T>::Type());
+}
+// Convenience overload: orders with Less<T> and uses a default-constructed AllocatorTraits<T>::Type.
+template <class T>
+void sort(T* elements, uint32_t count)
+{
+	sort(elements, count, shdfnd::Less<T>(), typename shdfnd::AllocatorTraits<T>::Type());
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSSORT_H
diff --git a/PxShared/src/foundation/include/PsSortInternals.h b/PxShared/src/foundation/include/PsSortInternals.h
new file mode 100644
index 0000000..3aa0f7f
--- /dev/null
+++ b/PxShared/src/foundation/include/PsSortInternals.h
@@ -0,0 +1,188 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSORTINTERNALS_H
+#define PSFOUNDATION_PSSORTINTERNALS_H
+
+/** \addtogroup foundation
+@{
+*/
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxIntrinsics.h"
+#include "PsBasicTemplates.h"
+#include "PsUserAllocated.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace internal
+{
+template <class T, class Predicate>
+PX_INLINE void median3(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+	// Median-of-three pivot selection. The three compare-and-swap steps below leave elements[first],
+	// elements[middle] and elements[last] ordered by compare, so the range begins with an element that
+	// is less than or equal to the pivot and ends with one that is greater than or equal to it (the
+	// sentinels for the scan in partition). Sampling three elements also reduces the chance of
+	// degenerate behaviour.
+
+	const int32_t middle = (first + last) / 2;
+
+	if(compare(elements[middle], elements[first])) // put first and middle in order
+		swap(elements[first], elements[middle]);
+
+	if(compare(elements[last], elements[first])) // afterwards the lowest of the three sits at first
+		swap(elements[first], elements[last]);
+
+	if(compare(elements[last], elements[middle])) // afterwards the highest of the three sits at last
+		swap(elements[middle], elements[last]);
+
+	// the median is the pivot; keep it at last-1
+	swap(elements[middle], elements[last - 1]);
+}
+
+template <class T, class Predicate>
+PX_INLINE int32_t partition(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+	median3(elements, first, last, compare); // leaves the pivot at last-1 and sentinels at first and last
+	// Rearranges [first, last] around that pivot and returns the index where the pivot ends up.
+	/*
+	WARNING: using the line:
+
+	T partValue = elements[last-1];
+
+	and changing the scan loops to:
+
+	while(comparator.greater(partValue, elements[++i]));
+	while(comparator.greater(elements[--j], partValue));
+
+	triggers a compiler optimizer bug on xenon where it stores a double to the stack for partValue
+	then loads it as a single...:-(
+	*/
+
+	int32_t i = first;    // elements[first] is not greater than the pivot (i gets pre incremented before use)
+	int32_t j = last - 1; // pivot is in last-1 (but j gets pre decremented)
+
+	for(;;)
+	{
+		while(compare(elements[++i], elements[last - 1])) // advance i past elements that compare below the pivot
+			;
+		while(compare(elements[last - 1], elements[--j])) // move j back past elements the pivot compares below
+			;
+
+		if(i >= j) // the two scans have met or crossed
+			break;
+
+		PX_ASSERT(i <= last && j >= first);
+		swap(elements[i], elements[j]);
+	}
+	// put the pivot in place
+
+	PX_ASSERT(i <= last && first <= (last - 1));
+	swap(elements[i], elements[last - 1]);
+
+	return i; // index at which the pivot now sits
+}
+
+template <class T, class Predicate>
+PX_INLINE void smallSort(T* elements, int32_t first, int32_t last, Predicate& compare)
+{
+	// Selection sort of the inclusive range [first, last] - could reduce to fsel on 360 with floats.
+	for(int32_t slot = first; slot < last; ++slot)
+	{
+		int32_t lowest = slot; // index of the element that belongs at `slot`, found by scanning the rest
+		for(int32_t probe = slot + 1; probe <= last; ++probe)
+		{
+			if(compare(elements[probe], elements[lowest]))
+				lowest = probe;
+		}
+		if(lowest != slot)
+			swap(elements[lowest], elements[slot]);
+	}
+}
+
+template <class Allocator>
+class Stack // LIFO of int32_t range bounds used by sort(); starts in caller-provided memory, grows via the allocator
+{
+	Allocator mAllocator;
+	uint32_t mSize, mCapacity; // both counted in int32_t slots; every pushed range occupies two slots
+	int32_t* mMemory;
+	bool mRealloc; // true once mMemory came from mAllocator and therefore has to be freed by this object
+
+  public:
+	Stack(int32_t* memory, uint32_t capacity, const Allocator& inAllocator)
+	: mAllocator(inAllocator), mSize(0), mCapacity(capacity), mMemory(memory), mRealloc(false)
+	{
+	}
+	~Stack()
+	{
+		if(mRealloc)
+			mAllocator.deallocate(mMemory);
+	}
+	// Moves the contents into an allocator-owned buffer of twice the capacity (2 slots if the capacity was 0).
+	void grow()
+	{
+		mCapacity = mCapacity ? mCapacity * 2 : 2; // doubling 0 would stay 0; 2 slots hold exactly one range
+		int32_t* newMem =
+		    reinterpret_cast<int32_t*>(mAllocator.allocate(sizeof(int32_t) * mCapacity, __FILE__, __LINE__));
+		intrinsics::memCopy(newMem, mMemory, mSize * sizeof(int32_t));
+		if(mRealloc)
+			mAllocator.deallocate(mMemory);
+		mRealloc = true;
+		mMemory = newMem;
+	}
+	// Stores the pair (start, end), growing first if fewer than two free slots remain.
+	PX_INLINE void push(int32_t start, int32_t end)
+	{
+		if(mSize + 2 > mCapacity) // written as an addition: "mCapacity - 1" would wrap around for a zero capacity
+			grow();
+		mMemory[mSize++] = start;
+		mMemory[mSize++] = end;
+	}
+	// Retrieves the most recently pushed pair; the stack must not be empty.
+	PX_INLINE void pop(int32_t& start, int32_t& end)
+	{
+		PX_ASSERT(!empty());
+		end = mMemory[--mSize];
+		start = mMemory[--mSize];
+	}
+
+	PX_INLINE bool empty()
+	{
+		return mSize == 0;
+	}
+};
+} // namespace internal
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSSORTINTERNALS_H
diff --git a/PxShared/src/foundation/include/PsString.h b/PxShared/src/foundation/include/PsString.h
new file mode 100644
index 0000000..17d25c8
--- /dev/null
+++ b/PxShared/src/foundation/include/PsString.h
@@ -0,0 +1,90 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSTRING_H
+#define PSFOUNDATION_PSSTRING_H
+
+#include "foundation/PxPreprocessor.h"
+#include "foundation/PxSimpleTypes.h"
+#include <stdarg.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// the following functions have C99 semantics. Note that C99 requires for snprintf and vsnprintf:
+// * the resulting string is always NULL-terminated regardless of truncation.
+// * in the case of truncation the return value is the number of characters that would have been created.
+
+PX_FOUNDATION_API int32_t sscanf(const char* buffer, const char* format, ...);
+PX_FOUNDATION_API int32_t strcmp(const char* str1, const char* str2);
+PX_FOUNDATION_API int32_t strncmp(const char* str1, const char* str2, size_t count);
+PX_FOUNDATION_API int32_t snprintf(char* dst, size_t dstSize, const char* format, ...);
+PX_FOUNDATION_API int32_t vsnprintf(char* dst, size_t dstSize, const char* src, va_list arg); // src is presumably the format string, as in C99
+
+// strlcat and strlcpy have BSD semantics:
+// * dstSize is always the size of the destination buffer
+// * the resulting string is always NULL-terminated regardless of truncation
+// * in the case of truncation the return value is the length of the string that would have been created
+// * note the parameter order: dstSize comes before src here, whereas the BSD originals take (dst, src, size)
+PX_FOUNDATION_API size_t strlcat(char* dst, size_t dstSize, const char* src);
+PX_FOUNDATION_API size_t strlcpy(char* dst, size_t dstSize, const char* src);
+
+// case-insensitive string comparison
+PX_FOUNDATION_API int32_t stricmp(const char* str1, const char* str2);
+PX_FOUNDATION_API int32_t strnicmp(const char* str1, const char* str2, size_t count);
+
+// in-place string case conversion
+PX_FOUNDATION_API void strlwr(char* str);
+PX_FOUNDATION_API void strupr(char* str);
+
+/**
+\brief The maximum supported formatted output string length
+(number of characters after replacement).
+
+@see printFormatted()
+*/
+static const size_t MAX_PRINTFORMATTED_LENGTH = 1024;
+
+/**
+\brief Prints the formatted data, trying to make sure it's visible to the app programmer
+
+@see MAX_PRINTFORMATTED_LENGTH
+*/
+PX_FOUNDATION_API void printFormatted(const char*, ...);
+
+/**
+\brief Prints the string literally (does not consume % specifier), trying to make sure it's visible to the app
+programmer
+*/
+PX_FOUNDATION_API void printString(const char*);
+}
+}
+#endif // #ifndef PSFOUNDATION_PSSTRING_H
diff --git a/PxShared/src/foundation/include/PsSync.h b/PxShared/src/foundation/include/PsSync.h
new file mode 100644
index 0000000..8b99731
--- /dev/null
+++ b/PxShared/src/foundation/include/PsSync.h
@@ -0,0 +1,138 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSSYNC_H
+#define PSFOUNDATION_PSSYNC_H
+
+#include "PsAllocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+/*!
+Implementation notes:
+* - Calling set() on an already signaled Sync does not change its state.
+* - Calling reset() on an already reset Sync does not change its state.
+* - Calling set() on a reset Sync wakes all waiting threads (potential for thread contention).
+* - Calling wait() on an already signaled Sync will return true immediately.
+* - NOTE: be careful when pulsing an event with set() followed by reset(), because a
+*   thread that is not waiting on the event will miss the signal.
+*/
+class PX_FOUNDATION_API SyncImpl
+{
+  public:
+	static const uint32_t waitForever = 0xffffffff; // timeout value for wait() that means "no time limit"
+
+	SyncImpl();
+
+	~SyncImpl();
+
+	/** Wait on the object for at most the given number of ms. Returns
+	*  true if the object is signaled. SyncImpl::waitForever will block forever
+	*  or until the object is signaled.
+	*/
+
+	bool wait(uint32_t milliseconds = waitForever);
+
+	/** Signal the synchronization object, waking all threads waiting on it */
+
+	void set();
+
+	/** Reset the synchronization object */
+
+	void reset();
+
+	/**
+	Number of bytes of storage one SyncImpl needs; SyncT passes this value to its allocator.
+	*/
+	static const uint32_t& getSize();
+};
+
+/*!
+Implementation notes:
+* - Calling set() on an already signaled Sync does not change its state.
+* - Calling reset() on an already reset Sync does not change its state.
+* - Calling set() on a reset Sync wakes all waiting threads (potential for thread contention).
+* - Calling wait() on an already signaled Sync will return true immediately.
+* - NOTE: be careful when pulsing an event with set() followed by reset(), because a
+*   thread that is not waiting on the event will miss the signal.
+*/
+template <typename Alloc = ReflectionAllocator<SyncImpl> >
+class SyncT : protected Alloc // owns one SyncImpl that lives in storage obtained from Alloc
+{
+  public:
+	static const uint32_t waitForever = SyncImpl::waitForever;
+
+	SyncT(const Alloc& alloc = Alloc()) : Alloc(alloc)
+	{
+		mImpl = reinterpret_cast<SyncImpl*>(Alloc::allocate(SyncImpl::getSize(), __FILE__, __LINE__));
+		PX_PLACEMENT_NEW(mImpl, SyncImpl)();
+	}
+
+	~SyncT()
+	{
+		mImpl->~SyncImpl(); // built with placement new: destroy explicitly, then release the raw storage
+		Alloc::deallocate(mImpl);
+	}
+
+	/** Wait on the object for at most the given number of ms. Returns
+	*  true if the object is signaled. Sync::waitForever will block forever
+	*  or until the object is signaled.
+	*/
+	bool wait(uint32_t milliseconds = SyncImpl::waitForever)
+	{
+		return mImpl->wait(milliseconds);
+	}
+
+	/** Signal the synchronization object, waking all threads waiting on it */
+	void set()
+	{
+		mImpl->set();
+	}
+
+	/** Reset the synchronization object */
+	void reset()
+	{
+		mImpl->reset();
+	}
+
+  private:
+	// Not copyable (declared, never defined): a copy would share mImpl and both destructors would destroy and free it.
+	SyncT(const SyncT&);
+	SyncT& operator=(const SyncT&);
+	class SyncImpl* mImpl;
+};
+
+typedef SyncT<> Sync;
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSSYNC_H
diff --git a/PxShared/src/foundation/include/PsTempAllocator.h b/PxShared/src/foundation/include/PsTempAllocator.h
new file mode 100644
index 0000000..7a063dc
--- /dev/null
+++ b/PxShared/src/foundation/include/PsTempAllocator.h
@@ -0,0 +1,62 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSTEMPALLOCATOR_H
+#define PSFOUNDATION_PSTEMPALLOCATOR_H
+
+#include "PsAllocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+union TempAllocatorChunk
+{
+	TempAllocatorChunk* mNext; // link to another chunk; used while chunk is free
+	uint32_t mIndex;           // used while chunk is allocated
+	uint8_t mPad[16];          // makes the union at least 16 bytes large (16 byte aligned allocations)
+	TempAllocatorChunk() : mNext(0) // a new chunk starts with a null link
+	{
+	}
+};
+
+class TempAllocator
+{
+  public:
+	// The const char* argument is accepted and ignored: it is unnamed and the constructor body is empty.
+	PX_FORCE_INLINE TempAllocator(const char* = 0) {}
+	// allocate/deallocate are only declared here; their definitions are not part of this header.
+	PX_FOUNDATION_API void* allocate(size_t size, const char* file, int line);
+	PX_FOUNDATION_API void deallocate(void* ptr);
+};
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSTEMPALLOCATOR_H
diff --git a/PxShared/src/foundation/include/PsThread.h b/PxShared/src/foundation/include/PsThread.h
new file mode 100644
index 0000000..8ba553a
--- /dev/null
+++ b/PxShared/src/foundation/include/PsThread.h
@@ -0,0 +1,382 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSTHREAD_H
+#define PSFOUNDATION_PSTHREAD_H
+
+#include "PsUserAllocated.h"
+
+// dsequeira: according to existing comment here (David Black would be my guess)
+// "This is useful to reduce bus contention on tight spin locks. And it needs
+// to be a macro as the xenon compiler often ignores even __forceinline." What's not
+// clear is why a pause function needs inlining...? (TODO: check with XBox team)
+
+// todo: these need to go somewhere else
+
+#if PX_WINDOWS_FAMILY || PX_XBOXONE
+#define PxSpinLockPause() __asm pause // MSVC-style inline assembly: the x86 pause instruction
+#elif PX_LINUX || PX_ANDROID || PX_PS4 || PX_APPLE_FAMILY || PX_NX
+#define PxSpinLockPause() asm("nop") // GCC-style inline assembly: a single nop on every one of these targets
+#else
+#error "Platform not supported!"
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+struct ThreadPriority // Scheduling priority levels, listed from highest (0) to lowest (4). todo: put in some other header file
+{
+	enum Enum
+	{
+		/**
+		\brief High priority
+		*/
+		eHIGH         = 0,
+
+		/**
+		\brief Above Normal priority
+		*/
+		eABOVE_NORMAL = 1,
+
+		/**
+		\brief Normal/default priority
+		*/
+		eNORMAL       = 2,
+
+		/**
+		\brief Below Normal priority
+		*/
+		eBELOW_NORMAL = 3,
+
+		/**
+		\brief Low priority.
+		*/
+		eLOW          = 4,
+		eFORCE_DWORD  = 0xffFFffFF // NOTE(review): not a real priority; presumably only there to force a 32-bit enum - confirm
+	};
+};
+
+// Minimal task interface: a subclass overrides execute() to supply the work.
+// Every member of the base class has an empty body.
+class Runnable
+{
+  public:
+	Runnable() {}
+
+	// Virtual, so destroying a derived object through a Runnable* runs the derived destructor.
+	virtual ~Runnable() {}
+
+	// Default implementation does nothing.
+	virtual void execute(void) {}
+};
+
+class PX_FOUNDATION_API ThreadImpl
+{
+  public:
+	typedef size_t Id; // space for a pointer or an integer
+	typedef void* (*ExecuteFn)(void*); // pthread-style entry point: takes a void* and returns a void*
+
+	static uint32_t getDefaultStackSize(); // used by ThreadT::start() as its default stackSize argument
+	static Id getId(); // NOTE(review): presumably the id of the calling thread - confirm against the implementation
+
+	/**
+	Construct (but do not start) the thread object. The OS thread object will not be created
+	until start() is called. Executes in the context
+	of the spawning thread.
+	*/
+
+	ThreadImpl();
+
+	/**
+	Construct and start the thread, passing the given arg to the given fn. (pthread style)
+	*/
+
+	ThreadImpl(ExecuteFn fn, void* arg);
+
+	/**
+	Deallocate all resources associated with the thread. Should be called in the
+	context of the spawning thread.
+	*/
+
+	~ThreadImpl();
+
+	/**
+	Create the OS thread and start it running. Called in the context of the spawning thread.
+	If an affinity mask has previously been set then it will be applied after the
+	thread has been created.
+	*/
+
+	void start(uint32_t stackSize, Runnable* r);
+
+	/**
+	Violently kill the current thread. Blunt instrument, not recommended since
+	it can leave all kinds of things unreleased (stack, memory, mutexes...) Should
+	be called in the context of the spawning thread.
+	*/
+
+	void kill();
+
+	/**
+	Stop the thread. Signals the spawned thread that it should stop, so the
+	thread should check regularly
+	*/
+
+	void signalQuit();
+
+	/**
+	Wait for a thread to stop. Should be called in the context of the spawning
+	thread. Returns false if the thread has not been started.
+	*/
+
+	bool waitForQuit();
+
+	/**
+	check whether the thread is signalled to quit. Called in the context of the
+	spawned thread.
+	*/
+
+	bool quitIsSignalled();
+
+	/**
+	Cleanly shut down this thread. Called in the context of the spawned thread.
+	*/
+	void quit();
+
+	/**
+	Change the affinity mask for this thread. The mask is a platform
+	specific value.
+
+	On Windows, Linux, PS4, XboxOne and NX platforms, each set mask bit represents
+	the index of a logical processor that the OS may schedule thread execution on.
+	Bits outside the range of valid logical processors may be ignored or cause
+	the function to return an error.
+
+	On Apple platforms, this function has no effect.
+
+	If the thread has not yet been started then the mask is stored
+	and applied when the thread is started.
+
+	If the thread has already been started then this method returns the
+	previous affinity mask on success, otherwise it returns zero.
+	*/
+	uint32_t setAffinityMask(uint32_t mask);
+
+	static ThreadPriority::Enum getPriority(Id threadId); // static: the thread is named by threadId, not by this object
+
+	/** Set thread priority. */
+	void setPriority(ThreadPriority::Enum prio);
+
+	/** set the thread's name */
+	void setName(const char* name);
+
+	/** Put the current thread to sleep for the given number of milliseconds */
+	static void sleep(uint32_t ms);
+
+	/** Yield the current thread's slot on the CPU */
+	static void yield();
+
+	/** Return the number of physical cores (does not include hyper-threaded cores), returns 0 on failure */
+	static uint32_t getNbPhysicalCores();
+
+	/**
+	Size of this class; ThreadT passes it to its allocator before placement-constructing a ThreadImpl in the result.
+	*/
+	static const uint32_t& getSize();
+};
+
+/**
+Thread abstraction API: owns a ThreadImpl built in storage obtained from Alloc; override execute() to supply the thread body.
+*/
+template <typename Alloc = ReflectionAllocator<ThreadImpl> >
+class ThreadT : protected Alloc, public UserAllocated, public Runnable
+{
+  public:
+	typedef ThreadImpl::Id Id; // space for a pointer or an integer
+
+	/**
+	Construct (but do not start) the thread object. Executes in the context
+	of the spawning thread
+	*/
+	ThreadT(const Alloc& alloc = Alloc()) : Alloc(alloc)
+	{
+		mImpl = reinterpret_cast<ThreadImpl*>(Alloc::allocate(ThreadImpl::getSize(), __FILE__, __LINE__));
+		PX_PLACEMENT_NEW(mImpl, ThreadImpl)(); // construct in the storage allocated on the previous line
+	}
+
+	/**
+	Construct and start the thread, passing the given arg to the given fn. (pthread style)
+	*/
+	ThreadT(ThreadImpl::ExecuteFn fn, void* arg, const Alloc& alloc = Alloc()) : Alloc(alloc)
+	{
+		mImpl = reinterpret_cast<ThreadImpl*>(Alloc::allocate(ThreadImpl::getSize(), __FILE__, __LINE__));
+		PX_PLACEMENT_NEW(mImpl, ThreadImpl)(fn, arg); // construct in the storage allocated on the previous line
+	}
+
+	/**
+	Deallocate all resources associated with the thread. Should be called in the
+	context of the spawning thread.
+	*/
+	virtual ~ThreadT()
+	{
+		mImpl->~ThreadImpl(); // explicit destructor call: the object was placement-constructed, so it is torn down by hand
+		Alloc::deallocate(mImpl);
+	}
+
+	/**
+	start the thread running. Called in the context of the spawning thread.
+	*/
+
+	void start(uint32_t stackSize = ThreadImpl::getDefaultStackSize())
+	{
+		mImpl->start(stackSize, this); // this object is the Runnable handed to the implementation
+	}
+
+	/**
+	Violently kill the current thread. Blunt instrument, not recommended since
+	it can leave all kinds of things unreleased (stack, memory, mutexes...) Should
+	be called in the context of the spawning thread.
+	*/
+
+	void kill()
+	{
+		mImpl->kill();
+	}
+
+	/**
+	The virtual execute() method is the user defined function that will
+	run in the new thread. Called in the context of the spawned thread.
+	*/
+
+	virtual void execute(void)
+	{
+	}
+
+	/**
+	stop the thread. Signals the spawned thread that it should stop, so the
+	thread should check regularly
+	*/
+
+	void signalQuit()
+	{
+		mImpl->signalQuit();
+	}
+
+	/**
+	Wait for a thread to stop. Should be called in the context of the spawning
+	thread. Returns false if the thread has not been started.
+	*/
+
+	bool waitForQuit()
+	{
+		return mImpl->waitForQuit();
+	}
+
+	/**
+	check whether the thread is signalled to quit. Called in the context of the
+	spawned thread.
+	*/
+
+	bool quitIsSignalled()
+	{
+		return mImpl->quitIsSignalled();
+	}
+
+	/**
+	Cleanly shut down this thread. Called in the context of the spawned thread.
+	*/
+	void quit()
+	{
+		mImpl->quit();
+	}
+
+	uint32_t setAffinityMask(uint32_t mask) // forwards to ThreadImpl::setAffinityMask and returns its result
+	{
+		return mImpl->setAffinityMask(mask);
+	}
+
+	static ThreadPriority::Enum getPriority(ThreadImpl::Id threadId) // forwards to ThreadImpl::getPriority
+	{
+		return ThreadImpl::getPriority(threadId);
+	}
+
+	/** Set thread priority. */
+	void setPriority(ThreadPriority::Enum prio)
+	{
+		mImpl->setPriority(prio);
+	}
+
+	/** set the thread's name */
+	void setName(const char* name)
+	{
+		mImpl->setName(name);
+	}
+
+	/** Put the current thread to sleep for the given number of milliseconds */
+	static void sleep(uint32_t ms)
+	{
+		ThreadImpl::sleep(ms);
+	}
+
+	/** Yield the current thread's slot on the CPU */
+	static void yield()
+	{
+		ThreadImpl::yield();
+	}
+
+	static uint32_t getDefaultStackSize() // forwards to ThreadImpl::getDefaultStackSize
+	{
+		return ThreadImpl::getDefaultStackSize();
+	}
+
+	static ThreadImpl::Id getId() // forwards to ThreadImpl::getId
+	{
+		return ThreadImpl::getId();
+	}
+
+	static uint32_t getNbPhysicalCores() // forwards to ThreadImpl::getNbPhysicalCores
+	{
+		return ThreadImpl::getNbPhysicalCores();
+	}
+
+  private:
+	class ThreadImpl* mImpl; // owned; NOTE(review): copying a ThreadT is not disabled and would destroy this twice - confirm no caller copies
+};
+
+typedef ThreadT<> Thread; // ThreadT with its default allocator argument, ReflectionAllocator<ThreadImpl>
+
+PX_FOUNDATION_API uint32_t TlsAlloc();
+PX_FOUNDATION_API void TlsFree(uint32_t index);
+PX_FOUNDATION_API void* TlsGet(uint32_t index);
+PX_FOUNDATION_API uint32_t TlsSet(uint32_t index, void* value);
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSTHREAD_H
diff --git a/PxShared/src/foundation/include/PsTime.h b/PxShared/src/foundation/include/PsTime.h
new file mode 100644
index 0000000..b9c7031
--- /dev/null
+++ b/PxShared/src/foundation/include/PsTime.h
@@ -0,0 +1,95 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSTIME_H
+#define PSFOUNDATION_PSTIME_H
+
+#include "Ps.h"
+
+#if PX_LINUX || PX_ANDROID
+#include <time.h>
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+struct CounterFrequencyToTensOfNanos // rational factor mNumerator / mDenominator applied to a raw counter value
+{
+	uint64_t mNumerator;
+	uint64_t mDenominator; // must be non-zero: toTensOfNanos() divides by it
+	CounterFrequencyToTensOfNanos(uint64_t inNum, uint64_t inDenom) : mNumerator(inNum), mDenominator(inDenom)
+	{
+	}
+	// Returns floor(inCounter * mNumerator / mDenominator); quite slow (64-bit divide and modulo). Quotient and
+	// remainder are scaled separately so the product inCounter * mNumerator is never formed and cannot wrap.
+	uint64_t toTensOfNanos(uint64_t inCounter) const
+	{
+		return (inCounter / mDenominator) * mNumerator + ((inCounter % mDenominator) * mNumerator) / mDenominator;
+	}
+};
+
+class PX_FOUNDATION_API Time
+{
+  public:
+	typedef double Second;
+	static const uint64_t sNumTensOfNanoSecondsInASecond = 100000000; // 10^8 units of 10 ns make up one second
+	// This is supposedly guaranteed to not change after system boot
+	// regardless of processors, speedstep, etc.
+	static const CounterFrequencyToTensOfNanos& getBootCounterFrequency();
+
+	static CounterFrequencyToTensOfNanos getCounterFrequency();
+
+	static uint64_t getCurrentCounterValue();
+
+	// SLOW!! Reads the current counter value and converts it with the boot counter frequency;
+	// the conversion (CounterFrequencyToTensOfNanos::toTensOfNanos) contains a 64 bit divide.
+	static uint64_t getCurrentTimeInTensOfNanoSeconds()
+	{
+		uint64_t ticks = getCurrentCounterValue();
+		return getBootCounterFrequency().toTensOfNanos(ticks);
+	}
+
+	Time();
+	Second getElapsedSeconds();
+	Second peekElapsedSeconds();
+	Second getLastTime() const;
+
+  private:
+#if PX_LINUX || PX_ANDROID || PX_APPLE_FAMILY || PX_PS4
+	Second mLastTime; // these platforms keep the state as a Second (double)
+#else
+	int64_t mTickCount; // every other platform keeps it as a signed 64-bit tick count
+#endif
+};
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSTIME_H
diff --git a/PxShared/src/foundation/include/PsUserAllocated.h b/PxShared/src/foundation/include/PsUserAllocated.h
new file mode 100644
index 0000000..f41d29e
--- /dev/null
+++ b/PxShared/src/foundation/include/PsUserAllocated.h
@@ -0,0 +1,92 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUSERALLOCATED_H
+#define PSFOUNDATION_PSUSERALLOCATED_H
+
+#include "PsAllocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+/**
+Base class that routes new and delete for derived types through allocator objects.
+Plain 'delete x;' and 'delete[] x;' release their memory through NonTrackingAllocator.
+*/
+class UserAllocated
+{
+  public:
+	// PX_SERIALIZATION
+	PX_INLINE void* operator new(size_t, void* address)
+	{
+		return address;
+	}
+	//~PX_SERIALIZATION
+	// Placement-delete twin of the placement new above; the storage was supplied by the caller, so nothing is released.
+	PX_INLINE void operator delete(void*, void*)
+	{
+	}
+
+	template <typename Alloc>
+	PX_INLINE void* operator new(size_t size, Alloc alloc, const char* fileName, int line)
+	{
+		return alloc.allocate(size, fileName, line);
+	}
+	template <typename Alloc>
+	PX_INLINE void* operator new [](size_t size, Alloc alloc, const char* fileName, int line)
+	{
+		return alloc.allocate(size, fileName, line);
+	}
+
+	// placement delete: mirrors the allocator-taking new; the file name and line arguments are not used
+	template <typename Alloc>
+	PX_INLINE void operator delete(void* ptr, Alloc alloc, const char* /*fileName*/, int /*line*/)
+	{
+		alloc.deallocate(ptr);
+	}
+	template <typename Alloc>
+	PX_INLINE void operator delete [](void* ptr, Alloc alloc, const char* /*fileName*/, int /*line*/)
+	{
+		alloc.deallocate(ptr);
+	}
+	// ordinary (non-placement) delete and delete[]
+	PX_INLINE void operator delete(void* ptr)
+	{
+		NonTrackingAllocator().deallocate(ptr);
+	}
+	PX_INLINE void operator delete [](void* ptr)
+	{
+		NonTrackingAllocator().deallocate(ptr);
+	}
+};
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSUSERALLOCATED_H
diff --git a/PxShared/src/foundation/include/PsUtilities.h b/PxShared/src/foundation/include/PsUtilities.h
new file mode 100644
index 0000000..32fe4ec
--- /dev/null
+++ b/PxShared/src/foundation/include/PsUtilities.h
@@ -0,0 +1,165 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUTILITIES_H
+#define PSFOUNDATION_PSUTILITIES_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxAssert.h"
+#include "Ps.h"
+#include "PsIntrinsics.h"
+#include "PsBasicTemplates.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+PX_INLINE char littleEndian() // first byte in memory of an int holding 1: 1 when the low-order byte is stored first
+{
+	const int probe = 1;
+	return reinterpret_cast<const char*>(&probe)[0];
+}
+
+// PT: checked casts - narrow an integer after a PX_ASSERT that the value fits the destination type
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 to32(PxU64 value)
+{
+	PX_ASSERT(value <= 0xffffffff);
+	return static_cast<PxU32>(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU16 to16(PxU32 value)
+{
+	PX_ASSERT(value <= 0xffff);
+	return static_cast<PxU16>(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU16 value)
+{
+	PX_ASSERT(value <= 0xff);
+	return static_cast<PxU8>(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxU32 value)
+{
+	PX_ASSERT(value <= 0xff);
+	return static_cast<PxU8>(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8 to8(PxI32 value)
+{
+	PX_ASSERT(value <= 0xff); // signed source: the upper bound is checked first,
+	PX_ASSERT(value >= 0);    // then the lower one
+	return static_cast<PxU8>(value);
+}
+PX_CUDA_CALLABLE PX_FORCE_INLINE PxI8 toI8(PxU32 value)
+{
+	PX_ASSERT(value <= 0x7f); // largest value a signed 8-bit integer can hold
+	return static_cast<PxI8>(value);
+}
+
+/*!
+Get number of elements in array: ArraySizeHelper is only declared, returning a reference to char[N] for a T[N] argument, so the sizeof in PX_ARRAY_SIZE evaluates to N at compile time
+*/
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define PX_ARRAY_SIZE(_array) (sizeof(physx::shdfnd::ArraySizeHelper(_array)))
+
+/*!
+Put two values in ascending order, comparing with operator< only.
+On return x is the smaller of the two; the pair is exchanged only when y < x.
+*/
+template <class T>
+PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y)
+{
+	if(!(y < x))
+		return; // already in order (or equivalent): leave both untouched
+	swap(x, y);
+}
+
+// PxReal overload of order(): built from PxMin/PxMax rather than a conditional swap.
+// (most architectures can do predication on real comparisons, and on VMX, it matters)
+PX_CUDA_CALLABLE PX_FORCE_INLINE void order(PxReal& x, PxReal& y)
+{
+	const PxReal lower = PxMin(x, y);
+	const PxReal upper = PxMax(x, y);
+	x = lower; // both results are computed before either input is overwritten
+	y = upper;
+}
+
+/*!
+Sort two elements using operator< and carry one piece of extra data along
+with each: whenever x and y are exchanged, xe1 and ye1 are exchanged too,
+so each element keeps its own extra data.
+*/
+template <class T, class E1>
+PX_CUDA_CALLABLE PX_FORCE_INLINE void order(T& x, T& y, E1& xe1, E1& ye1)
+{
+	if(!(y < x))
+		return;
+	swap(x, y);
+	swap(xe1, ye1);
+}
+
+#if PX_GCC_FAMILY && !PX_EMSCRIPTEN
+__attribute__((noreturn)) // NOTE(review): the raise() and int $3 paths below can return once a debugger resumes - confirm this is intended
+#endif
+    PX_INLINE void debugBreak() // stops in the debugger / traps, using whichever mechanism the platform offers
+{
+#if PX_WINDOWS || PX_XBOXONE
+	__debugbreak();
+#elif PX_ANDROID
+	raise(SIGTRAP); // works better than __builtin_trap. Proper call stack and can be continued.
+#elif PX_LINUX && PX_INTEL_FAMILY
+	asm("int $3"); // x86 breakpoint instruction; Linux on other CPUs falls through to __builtin_trap() below
+#elif PX_GCC_FAMILY
+	__builtin_trap();
+#else
+	PX_ASSERT(false);
+#endif
+}
+
+bool checkValid(const float&);
+bool checkValid(const PxVec3&);
+bool checkValid(const PxQuat&);
+bool checkValid(const PxMat33&);
+bool checkValid(const PxTransform&);
+bool checkValid(const char*);
+
+// equivalent to std::max_element: first largest element of [first, last) by operator<, or last if the range is empty
+template <typename T>
+inline const T* maxElement(const T* first, const T* last)
+{
+	const T* m = first;
+	// the scan starts at first itself, so an empty range never forms first + 1 (past-the-end or null arithmetic)
+	for(const T* it = first; it < last; ++it)
+		if(*m < *it)
+			m = it;
+	return m;
+}
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif
diff --git a/PxShared/src/foundation/include/PsVecMath.h b/PxShared/src/foundation/include/PsVecMath.h
new file mode 100644
index 0000000..4e891d8
--- /dev/null
+++ b/PxShared/src/foundation/include/PsVecMath.h
@@ -0,0 +1,1335 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECMATH_H
+#define PSFOUNDATION_PSVECMATH_H
+
+#include "Ps.h"
+#include "PsIntrinsics.h"
+#include "foundation/PxVec3.h"
+#include "foundation/PxVec4.h"
+#include "foundation/PxMat33.h"
+#include "foundation/PxUnionCast.h"
+
+// We can opt to use the scalar version of vectorised functions.
+// This can catch type safety issues and might even work out more optimal on pc.
+// It will also be useful for benchmarking and testing.
+// NEVER submit with vector intrinsics deactivated without good reason.
+// AM: deactivating SIMD for debug win64 just so autobuild will also exercise
+// non-SIMD path, until a dedicated non-SIMD platform such as Arm comes online.
+// TODO: dima: reference all platforms with SIMD support here,
+// all unknown/experimental cases should better default to NO SIMD.
+
+// enable/disable SIMD
+// COMPILE_VECTOR_INTRINSICS is always defined after this block, as either
+//   1 - one of the platform tests below matched, or
+//   0 - PX_SIMD_DISABLED is defined, or no platform test matched
+//       (unknown platforms therefore default to NO SIMD).
+#if defined(PX_SIMD_DISABLED)
+#define COMPILE_VECTOR_INTRINSICS 0
+#elif PX_INTEL_FAMILY && (!defined(__EMSCRIPTEN__) || defined(__SSE2__))
+#define COMPILE_VECTOR_INTRINSICS 1
+#elif (PX_ANDROID || PX_IOS) && PX_NEON
+#define COMPILE_VECTOR_INTRINSICS 1
+#elif PX_NX
+#define COMPILE_VECTOR_INTRINSICS 1
+#else
+#define COMPILE_VECTOR_INTRINSICS 0
+#endif
+
+#if COMPILE_VECTOR_INTRINSICS && PX_INTEL_FAMILY&&(PX_UNIX_FAMILY || PX_PS4)
+// only SSE2 compatible platforms should reach this
+#if PX_EMSCRIPTEN
+#include <emmintrin.h>
+#endif
+#include <xmmintrin.h>
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace aos
+{
+
+// Basic AoS types are
+// FloatV	- 16-byte aligned representation of float.
+// Vec3V		- 16-byte aligned representation of PxVec3 stored as (x y z 0).
+// Vec4V		- 16-byte aligned representation of vector of 4 floats stored as (x y z w).
+// BoolV		- 16-byte aligned representation of vector of 4 bools stored as (x y z w).
+// VecU32V	- 16-byte aligned representation of 4 unsigned ints stored as (x y z w).
+// VecI32V	- 16-byte aligned representation of 4 signed ints stored as (x y z w).
+// Mat33V	- 16-byte aligned representation of any 3x3 matrix.
+// Mat34V	- 16-byte aligned representation of transformation matrix (rotation in col1,col2,col3 and translation in
+// col4).
+// Mat44V	- 16-byte aligned representation of any 4x4 matrix.
+
+#if COMPILE_VECTOR_INTRINSICS
+#include "PsAoS.h"
+#else
+#include "PsVecMathAoSScalar.h"
+#endif
+
+//////////////////////////////////////////
+// Construct a simd type from a scalar type
+//////////////////////////////////////////
+
+// FloatV
+//(f,f,f,f)
+PX_FORCE_INLINE FloatV FLoad(const PxF32 f);
+
+// Vec3V
+//(f,f,f,0)
+PX_FORCE_INLINE Vec3V V3Load(const PxF32 f);
+//(f.x,f.y,f.z,0)
+PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f);
+//(f.x,f.y,f.z,0), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f);
+//(f.x,f.y,f.z,w_undefined), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f);
+//(f.x,f.y,f.z,0)
+PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* f);
+//(f.x,f.y,f.z,0), f must be 16-byte aligned
+PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* f);
+
+// Vec4V
+//(f,f,f,f)
+PX_FORCE_INLINE Vec4V V4Load(const PxF32 f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f);
+//(f[0],f[1],f[2],f[3]), f must be 16-byte aligned
+PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f);
+//(x,y,z,w)
+PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w);
+
+// BoolV
+//(f,f,f,f)
+PX_FORCE_INLINE BoolV BLoad(const bool f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE BoolV BLoad(const bool* const f);
+
+// VecU32V
+//(f,f,f,f)
+PX_FORCE_INLINE VecU32V U4Load(const PxU32 f);
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* f);
+//(f[0],f[1],f[2],f[3]), f must be 16-byte aligned
+PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* f);
+//((U32)x, (U32)y, (U32)z, (U32)w)
+PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w);
+
+// VecI32V
+//(i,i,i,i)
+PX_FORCE_INLINE VecI32V I4Load(const PxI32 i);
+//(i[0],i[1],i[2],i[3]) presumably, as for U4LoadU - TODO confirm
+PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i);
+//(i[0],i[1],i[2],i[3]) presumably, with i 16-byte aligned as for U4LoadA - TODO confirm
+PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i);
+
+// QuatV
+//(x = v[0], y = v[1], z = v[2], w = v[3]); the array does not need to be aligned
+PX_FORCE_INLINE QuatV QuatVLoadU(const PxF32* v);
+//(x = v[0], y = v[1], z = v[2], w = v[3]); the array needs to be aligned, fast load
+PX_FORCE_INLINE QuatV QuatVLoadA(const PxF32* v);
+//(x, y, z, w)
+PX_FORCE_INLINE QuatV QuatVLoadXYZW(const PxF32 x, const PxF32 y, const PxF32 z, const PxF32 w);
+
+// not added to public api
+Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& v);
+
+///////////////////////////////////////////////////
+// Construct a simd type from a different simd type
+///////////////////////////////////////////////////
+
+// Vec3V
+//(v.x,v.y,v.z,0)
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v);
+//(v.x,v.y,v.z,undefined) - be very careful with w!=0 because many functions require w==0 for correct operation eg V3Dot, V3Length, V3Cross etc etc.
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v);
+
+// Vec4V
+//(f.x,f.y,f.z,f.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f);
+//((PxF32)a.x, (PxF32)a.y, (PxF32)a.z, (PxF32)a.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a);
+//((PxF32)a.x, (PxF32)a.y, (PxF32)a.z, (PxF32)a.w)
+PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a);
+//(*reinterpret_cast<PxF32*>(&a.x), *reinterpret_cast<PxF32*>(&a.y), *reinterpret_cast<PxF32*>(&a.z),
+//*reinterpret_cast<PxF32*>(&a.w))
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a);
+//(*reinterpret_cast<PxF32*>(&a.x), *reinterpret_cast<PxF32*>(&a.y), *reinterpret_cast<PxF32*>(&a.z),
+//*reinterpret_cast<PxF32*>(&a.w))
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a);
+
+// VecU32V
+//(*reinterpret_cast<PxU32*>(&a.x), *reinterpret_cast<PxU32*>(&a.y), *reinterpret_cast<PxU32*>(&a.z),
+//*reinterpret_cast<PxU32*>(&a.w))
+PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a);
+//(b[0], b[1], b[2], b[3])
+PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg b);
+
+// VecI32V
+//(*reinterpret_cast<PxI32*>(&a.x), *reinterpret_cast<PxI32*>(&a.y), *reinterpret_cast<PxI32*>(&a.z),
+//*reinterpret_cast<PxI32*>(&a.w))
+PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a);
+//((I32)a.x, (I32)a.y, (I32)a.z, (I32)a.w)
+PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a);
+//((I32)b.x, (I32)b.y, (I32)b.z, (I32)b.w)
+PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg b);
+
+///////////////////////////////////////////////////
+// Convert from a simd type back to a scalar type
+///////////////////////////////////////////////////
+
+// FloatV
+// a.x
+PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f);
+
+// Vec3V
+//(a.x,a.y,a.z)
+PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f);
+//(a.x,a.y,a.z)
+PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f);
+
+// Vec4V
+PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f);
+PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f);
+
+// BoolV
+PX_FORCE_INLINE void BStoreA(const BoolV b, PxU32* f);
+
+// VecU32V
+PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u);
+
+// VecI32V
+PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i);
+
+//////////////////////////////////////////////////////////////////
+// Test that simd types have elements in the floating point range
+//////////////////////////////////////////////////////////////////
+
+// checks that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a);
+// checks that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a);
+// checks that each component is valid, i.e. in floating point range
+PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a);
+
+// Check that w-component is zero.
+PX_FORCE_INLINE bool isValidVec3V(const Vec3V a);
+
+//////////////////////////////////////////////////////////////////
+// Tests that all elements of two 16-byte types are completely equivalent.
+// Use these tests for unit testing and asserts only.
+//////////////////////////////////////////////////////////////////
+
+namespace _VecMathTests
+{
+// Per-type comparison primitives. They are only declared here; their definitions are not visible in this
+// part of the file.
+PX_FORCE_INLINE Vec3V getInvalidVec3V();
+PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b);
+PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b);
+PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b);
+PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b);
+PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b);
+PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b);
+
+// True only if col0, col1 and col2 of a and b each satisfy allElementsEqualVec3V.
+// Columns are tested in order and the first mismatch ends the comparison.
+PX_FORCE_INLINE bool allElementsEqualMat33V(const Mat33V& a, const Mat33V& b)
+{
+	if(!allElementsEqualVec3V(a.col0, b.col0))
+		return false;
+	if(!allElementsEqualVec3V(a.col1, b.col1))
+		return false;
+	return allElementsEqualVec3V(a.col2, b.col2);
+}
+// True only if col0..col3 of a and b each satisfy allElementsEqualVec3V.
+PX_FORCE_INLINE bool allElementsEqualMat34V(const Mat34V& a, const Mat34V& b)
+{
+	if(!allElementsEqualVec3V(a.col0, b.col0))
+		return false;
+	if(!allElementsEqualVec3V(a.col1, b.col1))
+		return false;
+	if(!allElementsEqualVec3V(a.col2, b.col2))
+		return false;
+	return allElementsEqualVec3V(a.col3, b.col3);
+}
+// True only if col0..col3 of a and b each satisfy allElementsEqualVec4V.
+PX_FORCE_INLINE bool allElementsEqualMat44V(const Mat44V& a, const Mat44V& b)
+{
+	if(!allElementsEqualVec4V(a.col0, b.col0))
+		return false;
+	if(!allElementsEqualVec4V(a.col1, b.col1))
+		return false;
+	if(!allElementsEqualVec4V(a.col2, b.col2))
+		return false;
+	return allElementsEqualVec4V(a.col3, b.col3);
+}
+
+// "Near equal" counterparts of the primitives above. The tolerance is chosen by their definitions, which
+// are not visible in this part of the file.
+PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b);
+PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b);
+PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b);
+// True only if col0, col1 and col2 of a and b each satisfy allElementsNearEqualVec3V.
+PX_FORCE_INLINE bool allElementsNearEqualMat33V(const Mat33V& a, const Mat33V& b)
+{
+	if(!allElementsNearEqualVec3V(a.col0, b.col0))
+		return false;
+	if(!allElementsNearEqualVec3V(a.col1, b.col1))
+		return false;
+	return allElementsNearEqualVec3V(a.col2, b.col2);
+}
+// True only if col0..col3 of a and b each satisfy allElementsNearEqualVec3V.
+PX_FORCE_INLINE bool allElementsNearEqualMat34V(const Mat34V& a, const Mat34V& b)
+{
+	if(!allElementsNearEqualVec3V(a.col0, b.col0))
+		return false;
+	if(!allElementsNearEqualVec3V(a.col1, b.col1))
+		return false;
+	if(!allElementsNearEqualVec3V(a.col2, b.col2))
+		return false;
+	return allElementsNearEqualVec3V(a.col3, b.col3);
+}
+// True only if col0..col3 of a and b each satisfy allElementsNearEqualVec4V.
+PX_FORCE_INLINE bool allElementsNearEqualMat44V(const Mat44V& a, const Mat44V& b)
+{
+	if(!allElementsNearEqualVec4V(a.col0, b.col0))
+		return false;
+	if(!allElementsNearEqualVec4V(a.col1, b.col1))
+		return false;
+	if(!allElementsNearEqualVec4V(a.col2, b.col2))
+		return false;
+	return allElementsNearEqualVec4V(a.col3, b.col3);
+}
+}
+
+//////////////////////////////////////////////////////////////////
+// Math operations on FloatV
+//////////////////////////////////////////////////////////////////
+
+//(0,0,0,0)
+PX_FORCE_INLINE FloatV FZero();
+//(1,1,1,1)
+PX_FORCE_INLINE FloatV FOne();
+//(0.5,0.5,0.5,0.5)
+PX_FORCE_INLINE FloatV FHalf();
+//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL)
+PX_FORCE_INLINE FloatV FEps();
+//(PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL)
+PX_FORCE_INLINE FloatV FMax();
+//(-PX_MAX_REAL, -PX_MAX_REAL, -PX_MAX_REAL, -PX_MAX_REAL)
+PX_FORCE_INLINE FloatV FNegMax();
+//(1e-6f, 1e-6f, 1e-6f, 1e-6f)
+PX_FORCE_INLINE FloatV FEps6();
+//((PxF32*)&1, (PxF32*)&1, (PxF32*)&1, (PxF32*)&1)
+
+//-f (per component)
+PX_FORCE_INLINE FloatV FNeg(const FloatV f);
+// a+b (per component)
+PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b);
+// a-b (per component)
+PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b);
+// a*b (per component)
+PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b);
+// a/b (per component)
+PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b);
+// a/b (per component)
+PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b);
+// 1.0f/a
+PX_FORCE_INLINE FloatV FRecip(const FloatV a);
+// 1.0f/a
+PX_FORCE_INLINE FloatV FRecipFast(const FloatV a);
+// 1.0f/sqrt(a)
+PX_FORCE_INLINE FloatV FRsqrt(const FloatV a);
+// 1.0f/sqrt(a)
+PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a);
+// sqrt(a)
+PX_FORCE_INLINE FloatV FSqrt(const FloatV a);
+// a*b+c
+PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c);
+// c-a*b
+PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c);
+// fabs(a)
+PX_FORCE_INLINE FloatV FAbs(const FloatV a);
+// c ? a : b (per component)
+PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b);
+// a>b (per component)
+PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b);
+// a>=b (per component)
+PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b);
+// a==b (per component)
+PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b);
+// Max(a,b) (per component)
+PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b);
+// Min(a,b) (per component)
+PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b);
+// Clamp(a,minV,maxV) (per component)
+PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV);
+
+// a.x>b.x
+PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b);
+// a.x>=b.x
+PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b);
+// a.x==b.x
+PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b);
+// a<min || a>max
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max);
+// a>=min && a<=max
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max);
+// a<-bounds || a>bounds
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds);
+// a>=-bounds && a<=bounds
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds);
+
+// round float a to the nearest int
+PX_FORCE_INLINE FloatV FRound(const FloatV a);
+// calculate the sin of float a
+PX_FORCE_INLINE FloatV FSin(const FloatV a);
+// calculate the cos of float a
+PX_FORCE_INLINE FloatV FCos(const FloatV a);
+
+//////////////////////////////////////////////////////////////////
+// Math operations on Vec3V
+//////////////////////////////////////////////////////////////////
+
+//(f,f,f,f)
+PX_FORCE_INLINE Vec3V V3Splat(const FloatV f);
+
+//(x,y,z)
+PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z);
+
+//(1,0,0,0)
+PX_FORCE_INLINE Vec3V V3UnitX();
+//(0,1,0,0)
+PX_FORCE_INLINE Vec3V V3UnitY();
+//(0,0,1,0)
+PX_FORCE_INLINE Vec3V V3UnitZ();
+
+//(f.x,f.x,f.x,f.x)
+PX_FORCE_INLINE FloatV V3GetX(const Vec3V f);
+//(f.y,f.y,f.y,f.y)
+PX_FORCE_INLINE FloatV V3GetY(const Vec3V f);
+//(f.z,f.z,f.z,f.z)
+PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f);
+
+//(f,v.y,v.z,v.w)
+PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f);
+//(v.x,f,v.z,v.w)
+PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f);
+//(v.x,v.y,f,v.w)
+PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f);
+
+// v.x=f
+PX_FORCE_INLINE void V3WriteX(Vec3V& v, const PxF32 f);
+// v.y=f
+PX_FORCE_INLINE void V3WriteY(Vec3V& v, const PxF32 f);
+// v.z=f
+PX_FORCE_INLINE void V3WriteZ(Vec3V& v, const PxF32 f);
+// v.x=f.x, v.y=f.y, v.z=f.z
+PX_FORCE_INLINE void V3WriteXYZ(Vec3V& v, const PxVec3& f);
+// return v.x
+PX_FORCE_INLINE PxF32 V3ReadX(const Vec3V& v);
+// return v.y
+PX_FORCE_INLINE PxF32 V3ReadY(const Vec3V& v);
+// return v.z
+PX_FORCE_INLINE PxF32 V3ReadZ(const Vec3V& v);
+// return (v.x,v.y,v.z)
+PX_FORCE_INLINE const PxVec3& V3ReadXYZ(const Vec3V& v);
+
+//(a.x, b.x, c.x)
+PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c);
+//(a.y, b.y, c.y)
+PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c);
+//(a.z, b.z, c.z)
+PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c);
+
+//(0,0,0,0)
+PX_FORCE_INLINE Vec3V V3Zero();
+//(1,1,1,1)
+PX_FORCE_INLINE Vec3V V3One();
+//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL)
+PX_FORCE_INLINE Vec3V V3Eps();
+//-c (per component)
+PX_FORCE_INLINE Vec3V V3Neg(const Vec3V c);
+// a+b (per component)
+PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b);
+// a-b (per component)
+PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b);
+// a*b (per component)
+PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b);
+// a*b (per component)
+PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b);
+// a/b (per component)
+PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b);
+// a/b (per component)
+PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b);
+// a/b (per component)
+PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b);
+// a/b (per component)
+PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b);
+// 1.0f/a
+PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a);
+// 1.0f/a
+PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a);
+// 1.0f/sqrt(a)
+PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a);
+// 1.0f/sqrt(a)
+PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a);
+// a*b+c
+PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c);
+// c-a*b
+PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c);
+// a*b+c
+PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c);
+// c-a*b
+PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c);
+// fabs(a)
+PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a);
+
+// a.b 
+// Note: a.w and b.w must have value zero
+PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b);
+// aXb
+// Note: a.w and b.w must have value zero
+PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b);
+// |a.a|^1/2
+// Note: a.w must have value zero
+PX_FORCE_INLINE FloatV V3Length(const Vec3V a);
+// a.a
+// Note: a.w must have value zero
+PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a);
+// a*|a.a|^-1/2
+// Note: a.w must have value zero
+PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a);
+// Safe normalize: a.a>0 ? a*|a.a|^-1/2 : unsafeReturnValue
+// unsafeReturnValue is the result handed back when a has zero squared length.
+// Note: a.w must have value zero
+PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue);
+// a.x + a.y + a.z
+// Note: a.w must have value zero
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a);
+
+// c ? a : b (per component)
+PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b);
+// a>b (per component)
+PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b);
+// a>=b (per component)
+PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b);
+// a==b (per component)
+PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b);
+// Max(a,b) (per component)
+PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b);
+// Min(a,b) (per component)
+PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b);
+
+// Extract the maximum value from a
+// Note: a.w must have value zero
+PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a);
+
+// Extract the minimum value from a
+// Note: a.w must have value zero
+PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a);
+
+// Clamp(a,minV,maxV) (per component)
+PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV);
+
+// Extract the sign for each component
+PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a);
+
+// Test all components.
+// (a.x>b.x && a.y>b.y && a.z>b.z)
+// Note: a.w and b.w must have value zero
+PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b);
+// (a.x>=b.x && a.y>=b.y && a.z>=b.z)
+// Note: a.w and b.w must have value zero
+PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b);
+// (a.x==b.x && a.y==b.y && a.z==b.z)
+// Note: a.w and b.w must have value zero
+PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b);
+// a.x<min.x || a.y<min.y || a.z<min.z || a.x>max.x || a.y>max.y || a.z>max.z
+// Note: a.w and min.w and max.w must have value zero
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max);
+// a.x>=min.x && a.y>=min.y && a.z>=min.z && a.x<=max.x && a.y<=max.y && a.z<=max.z
+// Note: a.w and min.w and max.w must have value zero
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max);
+// a.x<-bounds.x || a.y<-bounds.y || a.z<-bounds.z || a.x>bounds.x || a.y>bounds.y || a.z>bounds.z
+// Note: a.w and bounds.w must have value zero
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds);
+// a.x>=-bounds.x && a.y>=-bounds.y && a.z>=-bounds.z && a.x<=bounds.x && a.y<=bounds.y && a.z<=bounds.z
+// Note: a.w and bounds.w must have value zero
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds);
+
+//(floor(a.x + 0.5f), floor(a.y + 0.5f), floor(a.z + 0.5f))
+PX_FORCE_INLINE Vec3V V3Round(const Vec3V a);
+
+//(sinf(a.x), sinf(a.y), sinf(a.z))
+PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a);
+//(cosf(a.x), cosf(a.y), cosf(a.z))
+PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a);
+
+//(a.y,a.z,a.z)
+PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a);
+//(a.x,a.y,a.x)
+PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a);
+//(a.y,a.z,a.x)
+PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a);
+//(a.z, a.x, a.y)
+PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a);
+//(a.z,a.z,a.y)
+PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a);
+//(a.y,a.x,a.x)
+PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a);
+//(0, v1.z, v0.y)
+PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1);
+//(v0.z, 0, v1.x)
+PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1);
+//(v1.y, v0.x, 0)
+PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1);
+
+// Transpose 3 Vec3Vs inplace. Sets the w component to zero
+// [ x0, y0, z0, w0] [ x1, y1, z1, w1]  [ x2, y2, z2, w2]  -> [x0 x1 x2 0] [y0 y1 y2 0] [z0 z1 z2 0]
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2);
+
+//////////////////////////////////////////////////////////////////
+// Math operations on Vec4V
+//////////////////////////////////////////////////////////////////
+
+//(f,f,f,f)
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f);
+
+//(f[0],f[1],f[2],f[3])
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const f);
+//(x,y,z,w)
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w);
+//(x.w, y.w, z.w, w.w)
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w);
+//(x.z, y.z, z.z, w.z)
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w);
+//(x.y, y.y, z.y, w.y)
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w);
+//(x.x, y.x, z.x, w.x)
+PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w);
+
+//(a.x, b.x, a.y, b.y)
+PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b);
+//(a.z, b.z, a.w, b.w)
+PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b);
+
+// Unit basis vectors of Vec4V, one declaration per axis.
+//(1,0,0,0)
+PX_FORCE_INLINE Vec4V V4UnitX();
+//(0,1,0,0)
+PX_FORCE_INLINE Vec4V V4UnitY();
+//(0,0,1,0)
+PX_FORCE_INLINE Vec4V V4UnitZ();
+//(0,0,0,1)
+PX_FORCE_INLINE Vec4V V4UnitW();
+
+//(f.x,f.x,f.x,f.x)
+PX_FORCE_INLINE FloatV V4GetX(const Vec4V f);
+//(f.y,f.y,f.y,f.y)
+PX_FORCE_INLINE FloatV V4GetY(const Vec4V f);
+//(f.z,f.z,f.z,f.z)
+PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f);
+//(f.w,f.w,f.w,f.w)
+PX_FORCE_INLINE FloatV V4GetW(const Vec4V f);
+
+//(f,v.y,v.z,v.w)
+PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f);
+//(v.x,f,v.z,v.w)
+PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f);
+//(v.x,v.y,f,v.w)
+PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f);
+//(v.x,v.y,v.z,f)
+PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f);
+
+//(v.x,v.y,v.z,0)
+PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v);
+
+//(a[elementIndex], a[elementIndex], a[elementIndex], a[elementIndex])
+template <int elementIndex>
+PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a);
+
+// v.x=f
+PX_FORCE_INLINE void V4WriteX(Vec4V& v, const PxF32 f);
+// v.y=f
+PX_FORCE_INLINE void V4WriteY(Vec4V& v, const PxF32 f);
+// v.z=f
+PX_FORCE_INLINE void V4WriteZ(Vec4V& v, const PxF32 f);
+// v.w=f
+PX_FORCE_INLINE void V4WriteW(Vec4V& v, const PxF32 f);
+// v.x=f.x, v.y=f.y, v.z=f.z
+PX_FORCE_INLINE void V4WriteXYZ(Vec4V& v, const PxVec3& f);
+// return v.x
+PX_FORCE_INLINE PxF32 V4ReadX(const Vec4V& v);
+// return v.y
+PX_FORCE_INLINE PxF32 V4ReadY(const Vec4V& v);
+// return v.z
+PX_FORCE_INLINE PxF32 V4ReadZ(const Vec4V& v);
+// return v.w
+PX_FORCE_INLINE PxF32 V4ReadW(const Vec4V& v);
+// return (v.x,v.y,v.z)
+PX_FORCE_INLINE const PxVec3& V4ReadXYZ(const Vec4V& v);
+
+//(0,0,0,0)
+PX_FORCE_INLINE Vec4V V4Zero();
+//(1,1,1,1)
+PX_FORCE_INLINE Vec4V V4One();
+//(PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL,PX_EPS_REAL)
+PX_FORCE_INLINE Vec4V V4Eps();
+
+//-c (per component)
+PX_FORCE_INLINE Vec4V V4Neg(const Vec4V c);
+// a+b (per component)
+PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b);
+// a-b (per component)
+PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b);
+// a*b (per component)
+PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b);
+// a*b (per component)
+PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b);
+// a/b (per component)
+PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b);
+// a/b (per component)
+PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b);
+// a/b (per component)
+PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b);
+// a/b (per component)
+PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b);
+// 1.0f/a
+PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a);
+// 1.0f/a
+PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a);
+// 1.0f/sqrt(a)
+PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a);
+// 1.0f/sqrt(a)
+PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a);
+// a*b+c
+PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c);
+// c-a*b
+PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c);
+// a*b+c
+PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c);
+// c-a*b
+PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c);
+
+// fabs(a)
+PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a);
+// bitwise a & ~b
+PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b);
+
+// a.b (W is taken into account)
+PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b);
+// a.b (same computation as V3Dot. W is ignored in input)
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b);
+// aXb (same computation as V3Cross. W is ignored in input and undefined in output)
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b);
+
+//|a.a|^1/2
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a);
+// a.a
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a);
+
+// a*|a.a|^-1/2
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a);
+// a.a>0 ? a*|a.a|^-1/2 : unsafeReturnValue 
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue);
+// a*|a.a|^-1/2
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a);
+
+// c ? a : b (per component)
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b);
+// a>b (per component)
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b);
+// a>=b (per component)
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b);
+// a==b (per component)
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b);
+// Max(a,b) (per component)
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b);
+// Min(a,b) (per component)
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b);
+// Get the maximum component from a
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a);
+// Get the minimum component from a
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a);
+
+// Clamp(a,minV,maxV) (per component)
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV);
+
+// return 1 if all components of a are greater than all components of b.
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b);
+// return 1 if all components of a are greater than or equal to all components of b
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b);
+// return 1 if XYZ components of a are greater than or equal to XYZ components of b. W is ignored.
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b);
+// return 1 if all components of a are equal to all components of b
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b);
+// return 1 if any XYZ component of a is greater than the corresponding component of b. W is ignored.
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b);
+
+// round(a)(per component)
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a);
+// sin(a) (per component)
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a);
+// cos(a) (per component)
+PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a);
+
+// Permute v into a new vec4v with YXWZ format
+PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V v);
+// Permute v into a new vec4v with XZXZ format
+PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V v);
+// Permute v into a new vec4v with YWYW format
+PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V v);
+// Permute v into a new vec4v with YZXW format
+PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V v);
+
+// Permute v into a new vec4v with format {a[x], a[y], a[z], a[w]}
+// V4Perm<1,3,1,3> is equal to V4PermYWYW
+// V4Perm<0,2,0,2> is equal to V4PermXZXZ
+// V4Perm<1,0,3,2> is equal to V4PermYXWZ
+template <PxU8 x, PxU8 y, PxU8 z, PxU8 w>
+PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a);
+
+// Transpose 4 Vec4Vs inplace: the four arguments are read as the columns of a 4x4 matrix and overwritten
+// with the columns of its transpose.
+// [ x0, y0, z0, w0] [ x1, y1, z1, w1] [ x2, y2, z2, w2] [ x3, y3, z3, w3] ->
+// [ x0, x1, x2, x3] [ y0, y1, y2, y3] [ z0, z1, z2, z3] [ w0, w1, w2, w3]
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3);
+
+// q = cos(a/2) + u*sin(a/2)
+PX_FORCE_INLINE QuatV QuatV_From_RotationAxisAngle(const Vec3V u, const FloatV a);
+// convert q to a unit quaternion
+PX_FORCE_INLINE QuatV QuatNormalize(const QuatV q);
+//|q.q|^1/2
+PX_FORCE_INLINE FloatV QuatLength(const QuatV q);
+// q.q
+PX_FORCE_INLINE FloatV QuatLengthSq(const QuatV q);
+// a.b
+PX_FORCE_INLINE FloatV QuatDot(const QuatV a, const QuatV b);
+//(-q.x, -q.y, -q.z, q.w)
+PX_FORCE_INLINE QuatV QuatConjugate(const QuatV q);
+//(q.x, q.y, q.z)
+PX_FORCE_INLINE Vec3V QuatGetImaginaryPart(const QuatV q);
+// convert quaternion to matrix 33
+PX_FORCE_INLINE Mat33V QuatGetMat33V(const QuatVArg q);
+// convert quaternion to matrix 33
+PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2);
+// convert matrix 33 to quaternion
+PX_FORCE_INLINE QuatV Mat33GetQuatV(const Mat33V& a);
+// brief computes rotation of x-axis
+PX_FORCE_INLINE Vec3V QuatGetBasisVector0(const QuatV q);
+// brief computes rotation of y-axis
+PX_FORCE_INLINE Vec3V QuatGetBasisVector1(const QuatV q);
+// brief computes rotation of z-axis
+PX_FORCE_INLINE Vec3V QuatGetBasisVector2(const QuatV q);
+// calculate the rotation vector from q and v
+PX_FORCE_INLINE Vec3V QuatRotate(const QuatV q, const Vec3V v);
+// calculate the rotation vector from the conjugate quaternion and v
+PX_FORCE_INLINE Vec3V QuatRotateInv(const QuatV q, const Vec3V v);
+// quaternion multiplication
+PX_FORCE_INLINE QuatV QuatMul(const QuatV a, const QuatV b);
+// quaternion add
+PX_FORCE_INLINE QuatV QuatAdd(const QuatV a, const QuatV b);
+// (-q.x, -q.y, -q.z, -q.w)
+PX_FORCE_INLINE QuatV QuatNeg(const QuatV q);
+// (a.x - b.x, a.y-b.y, a.z-b.z, a.w-b.w )
+PX_FORCE_INLINE QuatV QuatSub(const QuatV a, const QuatV b);
+// (a.x*b, a.y*b, a.z*b, a.w*b)
+PX_FORCE_INLINE QuatV QuatScale(const QuatV a, const FloatV b);
+// (x = v[0], y = v[1], z = v[2], w =v[3])
+PX_FORCE_INLINE QuatV QuatMerge(const FloatV* const v);
+// (x, y, z, w) built from four separate FloatV arguments
+PX_FORCE_INLINE QuatV QuatMerge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w);
+// (x = 0.f, y = 0.f, z = 0.f, w = 1.f)
+PX_FORCE_INLINE QuatV QuatIdentity();
+// check for each component is valid
+PX_FORCE_INLINE bool isFiniteQuatV(const QuatV q);
+// check for each component is valid
+PX_FORCE_INLINE bool isValidQuatV(const QuatV q);
+// check for each component is valid
+PX_FORCE_INLINE bool isSaneQuatV(const QuatV q);
+
+// Math operations on 16-byte aligned booleans.
+// x=false	y=false		z=false		w=false
+PX_FORCE_INLINE BoolV BFFFF();
+// x=false	y=false		z=false		w=true
+PX_FORCE_INLINE BoolV BFFFT();
+// x=false	y=false		z=true		w=false
+PX_FORCE_INLINE BoolV BFFTF();
+// x=false	y=false		z=true		w=true
+PX_FORCE_INLINE BoolV BFFTT();
+// x=false	y=true		z=false		w=false
+PX_FORCE_INLINE BoolV BFTFF();
+// x=false	y=true		z=false		w=true
+PX_FORCE_INLINE BoolV BFTFT();
+// x=false	y=true		z=true		w=false
+PX_FORCE_INLINE BoolV BFTTF();
+// x=false	y=true		z=true		w=true
+PX_FORCE_INLINE BoolV BFTTT();
+// x=true	y=false		z=false		w=false
+PX_FORCE_INLINE BoolV BTFFF();
+// x=true	y=false		z=false		w=true
+PX_FORCE_INLINE BoolV BTFFT();
+// x=true	y=false		z=true		w=false
+PX_FORCE_INLINE BoolV BTFTF();
+// x=true	y=false		z=true		w=true
+PX_FORCE_INLINE BoolV BTFTT();
+// x=true	y=true		z=false		w=false
+PX_FORCE_INLINE BoolV BTTFF();
+// x=true	y=true		z=false		w=true
+PX_FORCE_INLINE BoolV BTTFT();
+// x=true	y=true		z=true		w=false
+PX_FORCE_INLINE BoolV BTTTF();
+// x=true	y=true		z=true		w=true
+PX_FORCE_INLINE BoolV BTTTT();
+
+// x=false	y=false		z=false		w=true
+PX_FORCE_INLINE BoolV BWMask();
+// x=true	y=false		z=false		w=false
+PX_FORCE_INLINE BoolV BXMask();
+// x=false	y=true		z=false		w=false
+PX_FORCE_INLINE BoolV BYMask();
+// x=false	y=false		z=true		w=false
+PX_FORCE_INLINE BoolV BZMask();
+
+// get x component
+PX_FORCE_INLINE BoolV BGetX(const BoolV f);
+// get y component
+PX_FORCE_INLINE BoolV BGetY(const BoolV f);
+// get z component
+PX_FORCE_INLINE BoolV BGetZ(const BoolV f);
+// get w component
+PX_FORCE_INLINE BoolV BGetW(const BoolV f);
+
+// Use elementIndex to splat xxxx or yyyy or zzzz or wwww
+template <int elementIndex>
+PX_FORCE_INLINE BoolV BSplatElement(Vec4V a);
+
+// component-wise && (AND)
+PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b);
+// component-wise || (OR)
+PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b);
+// component-wise not
+PX_FORCE_INLINE BoolV BNot(const BoolV a);
+
+// if all four components are true, return true, otherwise return false
+PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a);
+
+// if any of the four components is true, return true, otherwise return false
+PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a);
+
+// if all three(0, 1, 2) components are true, return true, otherwise return false
+PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a);
+
+// if any of the first three (0, 1, 2) components is true, return true, otherwise return false
+PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a);
+
+// Return 1 if all components equal, zero otherwise.
+PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b);
+
+// Specialized/faster BAllEq function for b==TTTT
+PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a);
+// Specialized/faster BAllEq function for b==FFFF
+PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a);
+
+/// Get BoolV as bits set in a PxU32. A bit in the output is set if the element is 'true' in the input.
+/// There is a bit for each element in a, with element 0's value held in bit 0, element 1's in bit 1 and so forth.
+/// If nothing is true in the input it will return 0, and if all are true it will return 0xf.
+/// NOTE: the performance of this function varies considerably by platform, so it is recommended to use it only
+/// where your algorithm really needs a BoolV in an integer variable.
+PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a);
+
+// VecI32V stuff
+
+PX_FORCE_INLINE VecI32V VecI32V_Zero();
+
+PX_FORCE_INLINE VecI32V VecI32V_One();
+
+PX_FORCE_INLINE VecI32V VecI32V_Two();
+
+PX_FORCE_INLINE VecI32V VecI32V_MinusOne();
+
+// Compute a shift parameter for VecI32V_LeftShift and VecI32V_RightShift
+// Each element of shift must be identical ie the vector must have form {count, count, count, count} with count>=0
+PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift);
+
+// Shift each element of a leftwards by the same amount
+// Compute shift with VecI32V_PrepareShift
+//{a.x<<shift[0], a.y<<shift[0], a.z<<shift[0], a.w<<shift[0]}
+PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg shift);
+
+// Shift each element of a rightwards by the same amount
+// Compute shift with VecI32V_PrepareShift
+//{a.x>>shift[0], a.y>>shift[0], a.z>>shift[0], a.w>>shift[0]}
+PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg shift);
+
+PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b);
+
+PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b);
+
+PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a);
+
+PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a);
+
+PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a);
+
+PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a);
+
+PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b);
+
+PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b);
+
+PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b);
+
+PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b);
+
+// VecU32V stuff
+
+PX_FORCE_INLINE VecU32V U4Zero();
+
+PX_FORCE_INLINE VecU32V U4One();
+
+PX_FORCE_INLINE VecU32V U4Two();
+
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b);
+
+PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b);
+
+PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b);
+
+PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b);
+
+PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b);
+
+PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b);
+
+// VecU32 - why does this not return a bool?
+PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b);
+
+// Math operations on 16-byte aligned Mat33s (represents any 3x3 matrix)
+// a*b
+PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b);
+// A*x + b
+PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c);
+// transpose(a) * b
+PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b);
+// a*b
+PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b);
+// a+b
+PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b);
+// a-b
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b);
+//-a
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a);
+// absolute value of the matrix
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a);
+// inverse mat
+PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a);
+// transpose(a)
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a);
+// create an identity matrix
+PX_FORCE_INLINE Mat33V M33Identity();
+
+// create a Mat33V from a vec3 holding the diagonal elements
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg);
+
+// Not implemented
+// return 1 if all components of a are equal to all components of b
+// PX_FORCE_INLINE PxU32 V4U32AllEq(const VecU32V a, const VecU32V b);
+// v.w=f
+// PX_FORCE_INLINE void V3WriteW(Vec3V& v, const PxF32 f);
+// PX_FORCE_INLINE PxF32 V3ReadW(const Vec3V& v);
+
+// Not used
+// PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr);
+// PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr);
+// floor(a)(per component)
+// PX_FORCE_INLINE Vec4V V4Floor(Vec4V a);
+// ceil(a) (per component)
+// PX_FORCE_INLINE Vec4V V4Ceil(Vec4V a);
+// PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V a, PxU32 power);
+
+// Math operations on 16-byte aligned Mat34s (represents transformation matrix - rotation and translation).
+// namespace _Mat34V
+//{
+//	//a*b
+//	PX_FORCE_INLINE Vec3V multiplyV(const Mat34V& a, const Vec3V b);
+//	//a_rotation * b
+//	PX_FORCE_INLINE Vec3V multiply3X3V(const Mat34V& a, const Vec3V b);
+//	//transpose(a_rotation)*b
+//	PX_FORCE_INLINE Vec3V multiplyTranspose3X3V(const Mat34V& a, const Vec3V b);
+//	//a*b
+//	PX_FORCE_INLINE Mat34V multiplyV(const Mat34V& a, const Mat34V& b);
+//	//a_rotation*b
+//	PX_FORCE_INLINE Mat33V multiply3X3V(const Mat34V& a, const Mat33V& b);
+//	//a_rotation*b_rotation
+//	PX_FORCE_INLINE Mat33V multiply3X3V(const Mat34V& a, const Mat34V& b);
+//	//a+b
+//	PX_FORCE_INLINE Mat34V addV(const Mat34V& a, const Mat34V& b);
+//	//a^-1
+//	PX_FORCE_INLINE Mat34V getInverseV(const Mat34V& a);
+//	//transpose(a_rotation)
+//	PX_FORCE_INLINE Mat33V getTranspose3X3(const Mat34V& a);
+//}; //namespace _Mat34V
+
+// a*b
+//#define M34MulV3(a,b)			(M34MulV3(a,b))
+////a_rotation * b
+//#define M34Mul33V3(a,b)			(M34Mul33V3(a,b))
+////transpose(a_rotation)*b
+//#define M34TrnspsMul33V3(a,b)	(M34TrnspsMul33V3(a,b))
+////a*b
+//#define M34MulM34(a,b)			(_Mat34V::multiplyV(a,b))
+// a_rotation*b
+//#define M34MulM33(a,b)			(M34MulM33(a,b))
+// a_rotation*b_rotation
+//#define M34Mul33MM34(a,b)		(M34MulM33(a,b))
+// a+b
+//#define M34Add(a,b)				(M34Add(a,b))
+////a^-1
+//#define M34Inverse(a,b)			(M34Inverse(a))
+// transpose(a_rotation)
+//#define M34Trnsps33(a)			(M33Trnsps3X3(a))
+
+// Math operations on 16-byte aligned Mat44s (represents any 4x4 matrix)
+// namespace _Mat44V
+//{
+//	//a*b
+//	PX_FORCE_INLINE Vec4V multiplyV(const Mat44V& a, const Vec4V b);
+//	//transpose(a)*b
+//	PX_FORCE_INLINE Vec4V multiplyTransposeV(const Mat44V& a, const Vec4V b);
+//	//a*b
+//	PX_FORCE_INLINE Mat44V multiplyV(const Mat44V& a, const Mat44V& b);
+//	//a+b
+//	PX_FORCE_INLINE Mat44V addV(const Mat44V& a, const Mat44V& b);
+//	//a^-1
+//	PX_FORCE_INLINE Mat44V getInverseV(const Mat44V& a);
+//	//transpose(a)
+//	PX_FORCE_INLINE Mat44V getTransposeV(const Mat44V& a);
+//}; //namespace _Mat44V
+
+// namespace _VecU32V
+//{
+//	// pack 8 U32s to 8 U16s with saturation
+//	PX_FORCE_INLINE VecU16V pack2U32VToU16VSaturate(VecU32V a, VecU32V b);
+//	PX_FORCE_INLINE VecU32V orV(VecU32V a, VecU32V b);
+//	PX_FORCE_INLINE VecU32V andV(VecU32V a, VecU32V b);
+//	PX_FORCE_INLINE VecU32V andcV(VecU32V a, VecU32V b);
+//	// conversion from integer to float
+//	PX_FORCE_INLINE Vec4V convertToVec4V(VecU32V a);
+//	// splat a[elementIndex] into all fields of a
+//	template<int elementIndex>
+//	PX_FORCE_INLINE VecU32V splatElement(VecU32V a);
+//	PX_FORCE_INLINE void storeAligned(VecU32V a, VecU32V* address);
+//};
+
+// namespace _VecI32V
+//{
+//	template<int a> PX_FORCE_INLINE VecI32V splatI32();
+//};
+//
+// namespace _VecU16V
+//{
+//	PX_FORCE_INLINE VecU16V orV(VecU16V a, VecU16V b);
+//	PX_FORCE_INLINE VecU16V andV(VecU16V a, VecU16V b);
+//	PX_FORCE_INLINE VecU16V andcV(VecU16V a, VecU16V b);
+//	PX_FORCE_INLINE void storeAligned(VecU16V val, VecU16V *address);
+//	PX_FORCE_INLINE VecU16V loadAligned(VecU16V* addr);
+//	PX_FORCE_INLINE VecU16V loadUnaligned(VecU16V* addr);
+//	PX_FORCE_INLINE VecU16V compareGt(VecU16V a, VecU16V b);
+//	template<int elementIndex>
+//	PX_FORCE_INLINE VecU16V splatElement(VecU16V a);
+//	PX_FORCE_INLINE VecU16V subtractModulo(VecU16V a, VecU16V b);
+//	PX_FORCE_INLINE VecU16V addModulo(VecU16V a, VecU16V b);
+//	PX_FORCE_INLINE VecU32V getLo16(VecU16V a); // [0,2,4,6] 16-bit values to [0,1,2,3] 32-bit vector
+//	PX_FORCE_INLINE VecU32V getHi16(VecU16V a); // [1,3,5,7] 16-bit values to [0,1,2,3] 32-bit vector
+//};
+//
+// namespace _VecI16V
+//{
+//	template <int val> PX_FORCE_INLINE VecI16V splatImmediate();
+//};
+//
+// namespace _VecU8V
+//{
+//};
+
+// a*b
+//#define M44MulV4(a,b)		(M44MulV4(a,b))
+////transpose(a)*b
+//#define M44TrnspsMulV4(a,b) (M44TrnspsMulV4(a,b))
+////a*b
+//#define M44MulM44(a,b)		(M44MulM44(a,b))
+////a+b
+//#define M44Add(a,b)			(M44Add(a,b))
+////a^-1
+//#define M44Inverse(a)		(M44Inverse(a))
+////transpose(a)
+//#define M44Trnsps(a)		(M44Trnsps(a))
+
+// dsequeira: these used to be assert'd out in SIMD builds, but they're necessary if
+// we want to be able to write some scalar functions which run using SIMD data structures
+
+// Overwrites the x component of v with f by viewing v as a PxVec3.
+PX_FORCE_INLINE void V3WriteX(Vec3V& v, const PxF32 f)
+{
+	PxVec3& scalarView = reinterpret_cast<PxVec3&>(v);
+	scalarView.x = f;
+}
+
+// Overwrites the y component of v with f by viewing v as a PxVec3.
+PX_FORCE_INLINE void V3WriteY(Vec3V& v, const PxF32 f)
+{
+	PxVec3& scalarView = reinterpret_cast<PxVec3&>(v);
+	scalarView.y = f;
+}
+
+// Overwrites the z component of v with f by viewing v as a PxVec3.
+PX_FORCE_INLINE void V3WriteZ(Vec3V& v, const PxF32 f)
+{
+	PxVec3& scalarView = reinterpret_cast<PxVec3&>(v);
+	scalarView.z = f;
+}
+
+// Copies f over the leading PxVec3-sized part of v.
+PX_FORCE_INLINE void V3WriteXYZ(Vec3V& v, const PxVec3& f)
+{
+	PxVec3& scalarView = reinterpret_cast<PxVec3&>(v);
+	scalarView = f;
+}
+
+// Reads the x component of v by viewing v as a PxVec3.
+PX_FORCE_INLINE PxF32 V3ReadX(const Vec3V& v)
+{
+	const PxVec3& scalarView = reinterpret_cast<const PxVec3&>(v);
+	return scalarView.x;
+}
+
+// Reads the y component of v by viewing v as a PxVec3.
+PX_FORCE_INLINE PxF32 V3ReadY(const Vec3V& v)
+{
+	const PxVec3& scalarView = reinterpret_cast<const PxVec3&>(v);
+	return scalarView.y;
+}
+
+// Reads the z component of v by viewing v as a PxVec3.
+PX_FORCE_INLINE PxF32 V3ReadZ(const Vec3V& v)
+{
+	const PxVec3& scalarView = reinterpret_cast<const PxVec3&>(v);
+	return scalarView.z;
+}
+
+// Returns a PxVec3 reference that aliases the storage of v (no copy is made).
+PX_FORCE_INLINE const PxVec3& V3ReadXYZ(const Vec3V& v)
+{
+	const PxVec3& scalarView = reinterpret_cast<const PxVec3&>(v);
+	return scalarView;
+}
+
+// Overwrites the x component of v with f by viewing v as a PxVec4.
+PX_FORCE_INLINE void V4WriteX(Vec4V& v, const PxF32 f)
+{
+	PxVec4& scalarView = reinterpret_cast<PxVec4&>(v);
+	scalarView.x = f;
+}
+
+// Overwrites the y component of v with f by viewing v as a PxVec4.
+PX_FORCE_INLINE void V4WriteY(Vec4V& v, const PxF32 f)
+{
+	PxVec4& scalarView = reinterpret_cast<PxVec4&>(v);
+	scalarView.y = f;
+}
+
+// Overwrites the z component of v with f by viewing v as a PxVec4.
+PX_FORCE_INLINE void V4WriteZ(Vec4V& v, const PxF32 f)
+{
+	PxVec4& scalarView = reinterpret_cast<PxVec4&>(v);
+	scalarView.z = f;
+}
+
+// Overwrites the w component of v with f by viewing v as a PxVec4.
+PX_FORCE_INLINE void V4WriteW(Vec4V& v, const PxF32 f)
+{
+	PxVec4& scalarView = reinterpret_cast<PxVec4&>(v);
+	scalarView.w = f;
+}
+
+// Copies f over the leading PxVec3-sized part of v; the rest of v is not assigned.
+PX_FORCE_INLINE void V4WriteXYZ(Vec4V& v, const PxVec3& f)
+{
+	PxVec3& xyzView = reinterpret_cast<PxVec3&>(v);
+	xyzView = f;
+}
+
+// Reads the x component of v by viewing v as a PxVec4.
+PX_FORCE_INLINE PxF32 V4ReadX(const Vec4V& v)
+{
+	const PxVec4& scalarView = reinterpret_cast<const PxVec4&>(v);
+	return scalarView.x;
+}
+
+// Reads the y component of v by viewing v as a PxVec4.
+PX_FORCE_INLINE PxF32 V4ReadY(const Vec4V& v)
+{
+	const PxVec4& scalarView = reinterpret_cast<const PxVec4&>(v);
+	return scalarView.y;
+}
+
+// Reads the z component of v by viewing v as a PxVec4.
+PX_FORCE_INLINE PxF32 V4ReadZ(const Vec4V& v)
+{
+	const PxVec4& scalarView = reinterpret_cast<const PxVec4&>(v);
+	return scalarView.z;
+}
+
+// Reads the w component of v by viewing v as a PxVec4.
+PX_FORCE_INLINE PxF32 V4ReadW(const Vec4V& v)
+{
+	const PxVec4& scalarView = reinterpret_cast<const PxVec4&>(v);
+	return scalarView.w;
+}
+
+// Returns a PxVec3 reference that aliases the leading part of v (no copy is made).
+PX_FORCE_INLINE const PxVec3& V4ReadXYZ(const Vec4V& v)
+{
+	const PxVec3& xyzView = reinterpret_cast<const PxVec3&>(v);
+	return xyzView;
+}
+
+// Transposes 4 Vec4V into 3 Vec4V (assuming that the W component can be ignored).
+// inA, inB and inC are overwritten as scratch storage, so every argument except inD
+// must be an assignable lvalue. The expansion is a sequence of statements and
+// already ends with a semicolon.
+#define PX_TRANSPOSE_44_34(inA, inB, inC, inD, outA, outB, outC) \
+	outA = V4UnpackXY(inA, inC);                                 \
+	inA = V4UnpackZW(inA, inC);                                  \
+	inC = V4UnpackXY(inB, inD);                                  \
+	inB = V4UnpackZW(inB, inD);                                  \
+	outB = V4UnpackZW(outA, inC);                                \
+	outA = V4UnpackXY(outA, inC);                                \
+	outC = V4UnpackXY(inA, inB);
+
+// this macro transposes 3 Vec4V into 4 Vec4V (with W components as garbage!)
+// inA and inC are overwritten as scratch storage and outC is assigned twice (first
+// as a temporary, then with its final value), so the statement order matters.
+// The expansion is a sequence of statements and already ends with a semicolon.
+#define PX_TRANSPOSE_34_44(inA, inB, inC, outA, outB, outC, outD)                                                      \
+	outA = V4UnpackXY(inA, inC);                                                                                       \
+	inA = V4UnpackZW(inA, inC);                                                                                        \
+	outC = V4UnpackXY(inB, inB);                                                                                       \
+	inC = V4UnpackZW(inB, inB);                                                                                        \
+	outB = V4UnpackZW(outA, outC);                                                                                     \
+	outA = V4UnpackXY(outA, outC);                                                                                     \
+	outC = V4UnpackXY(inA, inC);                                                                                       \
+	outD = V4UnpackZW(inA, inC);
+
+// this macro transposes 4 Vec4V into 4 Vec4V: the same unpack sequence as
+// PX_TRANSPOSE_44_34 plus a fourth output, outD.
+// inA, inB and inC are overwritten as scratch storage. The expansion is a sequence
+// of statements and already ends with a semicolon.
+#define PX_TRANSPOSE_44(inA, inB, inC, inD, outA, outB, outC, outD)                                                    \
+	outA = V4UnpackXY(inA, inC);                                                                                       \
+	inA = V4UnpackZW(inA, inC);                                                                                        \
+	inC = V4UnpackXY(inB, inD);                                                                                        \
+	inB = V4UnpackZW(inB, inD);                                                                                        \
+	outB = V4UnpackZW(outA, inC);                                                                                      \
+	outA = V4UnpackXY(outA, inC);                                                                                      \
+	outC = V4UnpackXY(inA, inB);                                                                                       \
+	outD = V4UnpackZW(inA, inB);
+
+// Returns a Vec4V in which each element is the dot product of one (aN, bN) pair of Vec3Vs.
+// On PC each element should be identical to the result of calling V3Dot on that pair.
+// On other platforms the result might diverge by a small margin due to differences in FP
+// rounding, e.g. if _mm_dp_ps, some other approximate dot product, or fused madd
+// operations were used.
+// Where no hw-accelerated dot-product operation exists, this approach should be the
+// fastest way to compute the dot product of 4 vectors.
+PX_FORCE_INLINE Vec4V V3Dot4(const Vec3VArg a0, const Vec3VArg b0, const Vec3VArg a1, const Vec3VArg b1,
+                             const Vec3VArg a2, const Vec3VArg b2, const Vec3VArg a3, const Vec3VArg b3)
+{
+	// Per-pair component-wise products, widened to Vec4V.
+	// They are non-const because the transpose macro uses them as scratch.
+	Vec4V prod0 = Vec4V_From_Vec3V(V3Mul(a0, b0));
+	Vec4V prod1 = Vec4V_From_Vec3V(V3Mul(a1, b1));
+	Vec4V prod2 = Vec4V_From_Vec3V(V3Mul(a2, b2));
+	Vec4V prod3 = Vec4V_From_Vec3V(V3Mul(a3, b3));
+
+	Vec4V t0, t1, t2;
+
+	PX_TRANSPOSE_44_34(prod0, prod1, prod2, prod3, t0, t1, t2);
+
+	// Sum the three transposed vectors lane by lane.
+	const Vec4V partialSum = V4Add(t0, t1);
+	return V4Add(partialSum, t2);
+}
+
+// (f.x,f.y,f.z,0) - Alternative/faster V3LoadU implementation for when it is safe to
+// read "W", i.e. the 32 bits after the PxVec3. Loads four floats starting at &f.x and
+// converts the result to a Vec3V.
+PX_FORCE_INLINE Vec3V V3LoadU_SafeReadW(const PxVec3& f)
+{
+	const Vec4V xyzw = V4LoadU(&f.x);
+	return Vec3V_From_Vec4V(xyzw);
+}
+
+// Now for the cross-platform implementations of the 16-byte aligned maths functions (win32/360/ppu/spu etc).
+#if COMPILE_VECTOR_INTRINSICS
+#include "PsInlineAoS.h"
+#else // #if COMPILE_VECTOR_INTRINSICS
+#include "PsVecMathAoSScalarInline.h"
+#endif // #if !COMPILE_VECTOR_INTRINSICS
+#include "PsVecQuat.h"
+
+} // namespace aos
+} // namespace shdfnd
+} // namespace physx
+
+#endif // PSFOUNDATION_PSVECMATH_H
diff --git a/PxShared/src/foundation/include/PsVecMathAoSScalar.h b/PxShared/src/foundation/include/PsVecMathAoSScalar.h
new file mode 100644
index 0000000..beb6cdc
--- /dev/null
+++ b/PxShared/src/foundation/include/PsVecMathAoSScalar.h
@@ -0,0 +1,242 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECMATHAOSSCALAR_H
+#define PSFOUNDATION_PSVECMATHAOSSCALAR_H
+
+#if COMPILE_VECTOR_INTRINSICS
+#error Scalar version should not be included when using vector intrinsics.
+#endif
+
+// Remove this define when all platforms use simd solver.
+#define PX_SUPPORT_SIMD
+
+struct VecI16V;
+struct VecU16V;
+struct VecI32V;
+struct VecU32V;
+struct Vec4V;
+typedef Vec4V QuatV;
+
+// Scalar fallback for a single float value: the value lives in x and three padding
+// floats fill the struct out to four PxF32s. Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct FloatV
+{
+	PxF32 x;
+	PxF32 pad[3]; // padding only; neither constructor writes it
+	// Leaves all members uninitialized.
+	FloatV()
+	{
+	}
+	// Sets x only. Not explicit, so a PxF32 converts implicitly to FloatV.
+	FloatV(const PxF32 _x) : x(_x)
+	{
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// Scalar fallback for a 4-component float vector (x, y, z, w).
+// Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct Vec4V
+{
+	PxF32 x, y, z, w;
+	// Leaves all members uninitialized.
+	Vec4V()
+	{
+	}
+	// Component-wise initialization.
+	Vec4V(const PxF32 _x, const PxF32 _y, const PxF32 _z, const PxF32 _w) : x(_x), y(_y), z(_z), w(_w)
+	{
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// Scalar fallback for a 3-component float vector stored in four PxF32s: x, y, z plus
+// one padding float. Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct Vec3V
+{
+	PxF32 x, y, z;
+	PxF32 pad; // fourth lane; the 3-argument constructor sets it to 0.0f
+	// Leaves all members (including pad) uninitialized.
+	Vec3V()
+	{
+	}
+	// Component-wise initialization; pad is zeroed.
+	Vec3V(const PxF32 _x, const PxF32 _y, const PxF32 _z) : x(_x), y(_y), z(_z), pad(0.0f)
+	{
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// Scalar fallback for a 4-lane boolean mask; each lane is a PxU32.
+// Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct BoolV
+{
+	PxU32 ux, uy, uz, uw;
+	// Leaves all lanes uninitialized.
+	BoolV()
+	{
+	}
+	// Lane-wise initialization from raw PxU32 values.
+	BoolV(const PxU32 _x, const PxU32 _y, const PxU32 _z, const PxU32 _w) : ux(_x), uy(_y), uz(_z), uw(_w)
+	{
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// 3x3 matrix stored as three Vec3V columns (col0, col1, col2).
+struct Mat33V
+{
+	// Leaves the columns default-constructed (their components are not initialized).
+	Mat33V()
+	{
+	}
+	// Builds the matrix from its three columns.
+	Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2)
+	{
+	}
+	Vec3V col0;
+	Vec3V col1;
+	Vec3V col2;
+};
+
+// Matrix stored as four Vec3V columns (col0..col3).
+struct Mat34V
+{
+	// Leaves the columns default-constructed (their components are not initialized).
+	Mat34V()
+	{
+	}
+	// Builds the matrix from its four columns.
+	Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3)
+	{
+	}
+	Vec3V col0;
+	Vec3V col1;
+	Vec3V col2;
+	Vec3V col3;
+};
+
+// Matrix stored as three Vec4V columns (col0, col1, col2).
+struct Mat43V
+{
+	// Leaves the columns default-constructed (their components are not initialized).
+	Mat43V()
+	{
+	}
+	// Builds the matrix from its three columns.
+	Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2)
+	{
+	}
+	Vec4V col0;
+	Vec4V col1;
+	Vec4V col2;
+};
+
+// 4x4 matrix stored as four Vec4V columns (col0..col3).
+struct Mat44V
+{
+	// Leaves the columns default-constructed (their components are not initialized).
+	Mat44V()
+	{
+	}
+	// Builds the matrix from its four columns.
+	Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3)
+	{
+	}
+	Vec4V col0;
+	Vec4V col1;
+	Vec4V col2;
+	Vec4V col3;
+};
+
+// Scalar fallback for a vector of four unsigned 32-bit integers.
+// Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct VecU32V
+{
+	PxU32 u32[4];
+	// Leaves the lanes uninitialized.
+	PX_FORCE_INLINE VecU32V()
+	{
+	}
+	// Lane-wise initialization: a -> u32[0] ... d -> u32[3].
+	PX_FORCE_INLINE VecU32V(PxU32 a, PxU32 b, PxU32 c, PxU32 d)
+	{
+		u32[0] = a;
+		u32[1] = b;
+		u32[2] = c;
+		u32[3] = d;
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// Scalar fallback for a vector of four signed 32-bit integers.
+// Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct VecI32V
+{
+	PxI32 i32[4];
+	// Leaves the lanes uninitialized.
+	PX_FORCE_INLINE VecI32V()
+	{
+	}
+	// Lane-wise initialization: a -> i32[0] ... d -> i32[3].
+	PX_FORCE_INLINE VecI32V(PxI32 a, PxI32 b, PxI32 c, PxI32 d)
+	{
+		i32[0] = a;
+		i32[1] = b;
+		i32[2] = c;
+		i32[3] = d;
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// Scalar fallback for a vector of eight signed 16-bit integers.
+// Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct VecI16V
+{
+	PxI16 i16[8];
+	// Leaves the lanes uninitialized.
+	PX_FORCE_INLINE VecI16V()
+	{
+	}
+	// Lane-wise initialization: a -> i16[0] ... h -> i16[7].
+	PX_FORCE_INLINE VecI16V(PxI16 a, PxI16 b, PxI16 c, PxI16 d, PxI16 e, PxI16 f, PxI16 g, PxI16 h)
+	{
+		i16[0] = a;
+		i16[1] = b;
+		i16[2] = c;
+		i16[3] = d;
+		i16[4] = e;
+		i16[5] = f;
+		i16[6] = g;
+		i16[7] = h;
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// Scalar fallback for a vector of eight unsigned 16-bit integers.
+// Declared with PX_ALIGN_PREFIX/SUFFIX(16).
+PX_ALIGN_PREFIX(16)
+struct VecU16V
+{
+	// The same eight lanes are reachable as unsigned (u16) or signed (i16) values.
+	union
+	{
+		PxU16 u16[8];
+		PxI16 i16[8];
+	};
+	// Leaves the lanes uninitialized.
+	PX_FORCE_INLINE VecU16V()
+	{
+	}
+	// Lane-wise initialization through the unsigned view: a -> u16[0] ... h -> u16[7].
+	PX_FORCE_INLINE VecU16V(PxU16 a, PxU16 b, PxU16 c, PxU16 d, PxU16 e, PxU16 f, PxU16 g, PxU16 h)
+	{
+		u16[0] = a;
+		u16[1] = b;
+		u16[2] = c;
+		u16[3] = d;
+		u16[4] = e;
+		u16[5] = f;
+		u16[6] = g;
+		u16[7] = h;
+	}
+} PX_ALIGN_SUFFIX(16);
+
+// Argument-passing aliases for the scalar build. Each *Arg macro expands to a
+// reference to the corresponding type, so "const Vec3VArg a" declares "const Vec3V & a".
+#define FloatVArg FloatV &
+#define Vec3VArg Vec3V &
+#define Vec4VArg Vec4V &
+#define BoolVArg BoolV &
+#define VecU32VArg VecU32V &
+#define VecI32VArg VecI32V &
+#define VecU16VArg VecU16V &
+#define VecI16VArg VecI16V &
+#define QuatVArg QuatV &
+
+// In the scalar build VecCrossV is simply another name for Vec3V.
+#define VecCrossV Vec3V
+
+// VecShiftV is a typedef for VecI32V, with its own reference-type Arg alias.
+typedef VecI32V VecShiftV;
+#define VecShiftVArg VecShiftV &
+
+#endif // PSFOUNDATION_PSVECMATHAOSSCALAR_H
diff --git a/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h b/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h
new file mode 100644
index 0000000..9bef465
--- /dev/null
+++ b/PxShared/src/foundation/include/PsVecMathAoSScalarInline.h
@@ -0,0 +1,2254 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H
+#define PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H
+
+#if COMPILE_VECTOR_INTRINSICS
+#error Scalar version should not be included when using vector intrinsics.
+#endif
+
+// Converts a bool to a PxU32 lane mask by negating PxI32(b) and casting the result:
+// true becomes (PxU32)(-1), i.e. all bits set, and false becomes 0.
+#define BOOL_TO_U32(b) (PxU32)(- PxI32(b))
+// Constant lane masks matching the two possible results of BOOL_TO_U32.
+#define TRUE_TO_U32 (PxU32)(-1)
+#define FALSE_TO_U32 (PxU32)(0)
+
+// Same negate-and-cast conversion, truncated to a 16-bit lane.
+#define BOOL_TO_U16(b) (PxU16)(- PxI32(b))
+
+
+// Assertion used by the scalar vector-math code; forwards to PX_ASSERT inside a braced block.
+#define VECMATHAOS_ASSERT(x) { PX_ASSERT(x); }
+
+/////////////////////////////////////////////////////////////////////
+////INTERNAL USE ONLY AND TESTS
+/////////////////////////////////////////////////////////////////////
+
+namespace internalScalarSimd
+{
+// Returns the scalar stored in a.
+PX_FORCE_INLINE PxF32 FStore(const FloatV a)
+{
+	const PxF32 value = a.x;
+	return value;
+}
+
+// True when a.x compares equal to zero.
+PX_FORCE_INLINE bool hasZeroElementInFloatV(const FloatV a)
+{
+	return (a.x == 0);
+}
+
+// True when at least one of x, y, z compares equal to zero (pad is not inspected).
+PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a)
+{
+	if(a.x == 0)
+		return true;
+	if(a.y == 0)
+		return true;
+	return (a.z == 0);
+}
+
+// True when at least one of x, y, z, w compares equal to zero.
+PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a)
+{
+	if(a.x == 0)
+		return true;
+	if(a.y == 0)
+		return true;
+	if(a.z == 0)
+		return true;
+	return (a.w == 0);
+}
+}
+
+namespace _VecMathTests
+{
+// PT: returns a deliberately invalid Vec3V (pad/W != 0.0f), used only to unit-test 'isValidVec3V'.
+PX_FORCE_INLINE Vec3V getInvalidVec3V()
+{
+	Vec3V invalid(0.0f, 0.0f, 0.0f);
+	invalid.pad = 1.0f;
+	return invalid;
+}
+
+// Exact comparison of the single float lane.
+PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b)
+{
+	const bool sameX = (a.x == b.x);
+	return sameX;
+}
+
+// Exact comparison of x, y and z (pad is ignored).
+PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b)
+{
+	if(!(a.x == b.x))
+		return false;
+	if(!(a.y == b.y))
+		return false;
+	return (a.z == b.z);
+}
+
+// Exact comparison of x, y, z and w.
+PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b)
+{
+	if(!(a.x == b.x))
+		return false;
+	if(!(a.y == b.y))
+		return false;
+	if(!(a.z == b.z))
+		return false;
+	return (a.w == b.w);
+}
+
+// Exact comparison of all four mask lanes.
+PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b)
+{
+	const bool sameXY = (a.ux == b.ux) && (a.uy == b.uy);
+	const bool sameZW = (a.uz == b.uz) && (a.uw == b.uw);
+	return sameXY && sameZW;
+}
+
+// Exact comparison of all four unsigned lanes.
+PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b)
+{
+	for(int lane = 0; lane < 4; ++lane)
+	{
+		if(a.u32[lane] != b.u32[lane])
+			return false;
+	}
+	return true;
+}
+
+// Exact comparison of all four signed lanes.
+PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b)
+{
+	for(int lane = 0; lane < 4; ++lane)
+	{
+		if(a.i32[lane] != b.i32[lane])
+			return false;
+	}
+	return true;
+}
+
+// Tolerance used by the allElementsNearEqual* helpers (open interval: |diff| must be strictly smaller).
+#define VECMATH_AOS_EPSILON (1e-3f)
+
+// True when a.x and b.x differ by strictly less than VECMATH_AOS_EPSILON.
+PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b)
+{
+	const PxF32 dx = a.x - b.x;
+	return (dx < VECMATH_AOS_EPSILON && dx > -VECMATH_AOS_EPSILON);
+}
+
+// True when x, y and z each differ by strictly less than VECMATH_AOS_EPSILON.
+PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b)
+{
+	const PxF32 dx = a.x - b.x;
+	const PxF32 dy = a.y - b.y;
+	const PxF32 dz = a.z - b.z;
+	const bool nearX = (dx > -VECMATH_AOS_EPSILON) && (dx < VECMATH_AOS_EPSILON);
+	const bool nearY = (dy > -VECMATH_AOS_EPSILON) && (dy < VECMATH_AOS_EPSILON);
+	const bool nearZ = (dz > -VECMATH_AOS_EPSILON) && (dz < VECMATH_AOS_EPSILON);
+	return nearX && nearY && nearZ;
+}
+
+// True when x, y, z and w each differ by strictly less than VECMATH_AOS_EPSILON.
+PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b)
+{
+	const PxF32 dx = a.x - b.x;
+	const PxF32 dy = a.y - b.y;
+	const PxF32 dz = a.z - b.z;
+	const PxF32 dw = a.w - b.w;
+	const bool nearX = (dx > -VECMATH_AOS_EPSILON) && (dx < VECMATH_AOS_EPSILON);
+	const bool nearY = (dy > -VECMATH_AOS_EPSILON) && (dy < VECMATH_AOS_EPSILON);
+	const bool nearZ = (dz > -VECMATH_AOS_EPSILON) && (dz < VECMATH_AOS_EPSILON);
+	const bool nearW = (dw > -VECMATH_AOS_EPSILON) && (dw < VECMATH_AOS_EPSILON);
+	return nearX && nearY && nearZ && nearW;
+}
+}
+
+///////////////////////////////////////////////////////
+
+// A Vec3V counts as valid when its padding lane is exactly zero.
+PX_FORCE_INLINE bool isValidVec3V(const Vec3V a)
+{
+	const bool padIsZero = (a.pad == 0.f);
+	return padIsZero;
+}
+
+// True when PxIsFinite accepts a.x.
+PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a)
+{
+	const bool finite = PxIsFinite(a.x);
+	return finite;
+}
+
+// True when PxIsFinite accepts x, y and z (pad is not checked).
+PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a)
+{
+	if(!PxIsFinite(a.x))
+		return false;
+	if(!PxIsFinite(a.y))
+		return false;
+	return PxIsFinite(a.z);
+}
+
+// True when PxIsFinite accepts x, y, z and w.
+PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a)
+{
+	if(!PxIsFinite(a.x))
+		return false;
+	if(!PxIsFinite(a.y))
+		return false;
+	if(!PxIsFinite(a.z))
+		return false;
+	return PxIsFinite(a.w);
+}
+
+/////////////////////////////////////////////////////////////////////
+////VECTORISED FUNCTION IMPLEMENTATIONS
+/////////////////////////////////////////////////////////////////////
+
+// Wraps a scalar in a FloatV.
+PX_FORCE_INLINE FloatV FLoad(const PxF32 f)
+{
+	const FloatV loaded(f);
+	return loaded;
+}
+
+// Broadcasts f into x, y and z.
+PX_FORCE_INLINE Vec3V V3Load(const PxF32 f)
+{
+	const Vec3V splat(f, f, f);
+	return splat;
+}
+
+// Broadcasts f into x, y, z and w.
+PX_FORCE_INLINE Vec4V V4Load(const PxF32 f)
+{
+	const Vec4V splat(f, f, f, f);
+	return splat;
+}
+
+// Broadcasts a bool into all four lanes of a BoolV mask.
+// The two preprocessor branches are intentionally different; see the SD note below.
+PX_FORCE_INLINE BoolV BLoad(const bool f)
+{
+#if PX_ARM
+	// SD: Android ARM builds fail if this is done with a cast.
+	// Might also fail because of something else but the select
+	// operator here seems to fix everything that failed in release builds.
+	return f ? BTTTT() : BFFFF();
+#else
+	// BOOL_TO_U32 maps true to all bits set and false to 0.
+	return BoolV(BOOL_TO_U32(f), BOOL_TO_U32(f), BOOL_TO_U32(f), BOOL_TO_U32(f));
+#endif
+}
+
+// Copies f.x, f.y, f.z into a Vec3V (scalar build: same code as V3LoadU).
+PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f)
+{
+	const Vec3V loaded(f.x, f.y, f.z);
+	return loaded;
+}
+
+// Copies f.x, f.y, f.z into a Vec3V.
+PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f)
+{
+	const Vec3V loaded(f.x, f.y, f.z);
+	return loaded;
+}
+
+// Copies f.x, f.y, f.z into a Vec3V (scalar build: same code as V3LoadA).
+PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f)
+{
+	const Vec3V loaded(f.x, f.y, f.z);
+	return loaded;
+}
+
+// Builds a Vec3V from the first three floats at f.
+PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f)
+{
+	const Vec3V loaded(f[0], f[1], f[2]);
+	return loaded;
+}
+
+// Builds a Vec3V from the first three floats at f.
+PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const f)
+{
+	const Vec3V loaded(f[0], f[1], f[2]);
+	return loaded;
+}
+
+// Keeps x, y, z of f; w is dropped (the Vec3V constructor zeroes pad).
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V f)
+{
+	const Vec3V xyz(f.x, f.y, f.z);
+	return xyz;
+}
+
+// Keeps x, y, z of v (scalar build: same code as Vec3V_From_Vec4V).
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v)
+{
+	const Vec3V xyz(v.x, v.y, v.z);
+	return xyz;
+}
+
+// Widens f to a Vec4V with w = 0.
+PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f)
+{
+	const Vec4V widened(f.x, f.y, f.z, 0.0f);
+	return widened;
+}
+
+// Broadcasts f.x into all four lanes.
+PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f)
+{
+	const PxF32 s = f.x;
+	return Vec4V(s, s, s, s);
+}
+
+// Broadcasts f.x into x, y and z.
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f)
+{
+	const PxF32 s = f.x;
+	return Vec3V(s, s, s);
+}
+
+// Broadcasts f.x into x, y and z (scalar build: same code as Vec3V_From_FloatV).
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f)
+{
+	const PxF32 s = f.x;
+	return Vec3V(s, s, s);
+}
+
+// Builds a Vec4V from the four floats at f.
+PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f)
+{
+	const Vec4V loaded(f[0], f[1], f[2], f[3]);
+	return loaded;
+}
+
+// Stores a to f by assigning through a Vec4V pointer.
+// NOTE(review): presumably f must satisfy Vec4V's alignment - confirm with callers.
+PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f)
+{
+	Vec4V* dst = reinterpret_cast<Vec4V*>(f);
+	*dst = a;
+}
+
+// Stores a to f by copying it as a PxVec4.
+PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f)
+{
+	const PxVec4* src = reinterpret_cast<const PxVec4*>(&a.x);
+	PxVec4* dst = reinterpret_cast<PxVec4*>(f);
+	*dst = *src;
+}
+
+// Stores all four mask lanes to f by assigning through a BoolV pointer.
+// NOTE(review): presumably f must satisfy BoolV's alignment - confirm with callers.
+PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f)
+{
+	BoolV* dst = reinterpret_cast<BoolV*>(f);
+	*dst = a;
+}
+
+// Stores all four unsigned lanes to u by assigning through a VecU32V pointer.
+// NOTE(review): presumably u must satisfy VecU32V's alignment - confirm with callers.
+PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u)
+{
+	VecU32V* dst = reinterpret_cast<VecU32V*>(u);
+	*dst = uv;
+}
+
+// Stores all four signed lanes to i by assigning through a VecI32V pointer.
+// NOTE(review): presumably i must satisfy VecI32V's alignment - confirm with callers.
+PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i)
+{
+	VecI32V* dst = reinterpret_cast<VecI32V*>(i);
+	*dst = iv;
+}
+
+// Builds a Vec4V from the four floats at f.
+PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f)
+{
+	const Vec4V loaded(f[0], f[1], f[2], f[3]);
+	return loaded;
+}
+
+// Widens a PxVec3 to a Vec4V; in this scalar build w is set to 0.
+PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f)
+{
+	const Vec4V widened(f[0], f[1], f[2], 0.0f);
+	return widened;
+}
+
+// Converts four bools at f into a BoolV mask, one lane per bool.
+PX_FORCE_INLINE BoolV BLoad(const bool* const f)
+{
+	const PxU32 laneX = BOOL_TO_U32(f[0]);
+	const PxU32 laneY = BOOL_TO_U32(f[1]);
+	const PxU32 laneZ = BOOL_TO_U32(f[2]);
+	const PxU32 laneW = BOOL_TO_U32(f[3]);
+	return BoolV(laneX, laneY, laneZ, laneW);
+}
+
+// Writes the scalar held in a to *f.
+PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f)
+{
+	const PxF32 value = a.x;
+	*f = value;
+}
+
+// Writes x, y, z of a into f (scalar build: same code as V3StoreU).
+PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f)
+{
+	const PxVec3 xyz(a.x, a.y, a.z);
+	f = xyz;
+}
+
+// Writes x, y, z of a into f.
+PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f)
+{
+	const PxVec3 xyz(a.x, a.y, a.z);
+	f = xyz;
+}
+
+// Writes only the first lane (ux) of b to *b2.
+PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2)
+{
+	const PxU32 firstLane = b.ux;
+	*b2 = firstLane;
+}
+
+//////////////////////////
+// FLOATV
+//////////////////////////
+
+// FloatV constant 0.
+PX_FORCE_INLINE FloatV FZero()
+{
+	const FloatV zero = FLoad(0.0f);
+	return zero;
+}
+
+// FloatV constant 1.
+PX_FORCE_INLINE FloatV FOne()
+{
+	const FloatV one = FLoad(1.0f);
+	return one;
+}
+
+// FloatV constant 0.5.
+PX_FORCE_INLINE FloatV FHalf()
+{
+	const FloatV half = FLoad(0.5f);
+	return half;
+}
+
+// FloatV constant PX_EPS_REAL.
+PX_FORCE_INLINE FloatV FEps()
+{
+	const FloatV eps = FLoad(PX_EPS_REAL);
+	return eps;
+}
+
+// FloatV constant 1e-6.
+PX_FORCE_INLINE FloatV FEps6()
+{
+	const FloatV eps6 = FLoad(1e-6f);
+	return eps6;
+}
+
+// FloatV constant PX_MAX_REAL.
+PX_FORCE_INLINE FloatV FMax()
+{
+	const FloatV largest = FLoad(PX_MAX_REAL);
+	return largest;
+}
+
+// FloatV constant -PX_MAX_REAL.
+PX_FORCE_INLINE FloatV FNegMax()
+{
+	const FloatV mostNegative = FLoad(-PX_MAX_REAL);
+	return mostNegative;
+}
+
+// -f
+PX_FORCE_INLINE FloatV FNeg(const FloatV f)
+{
+	const PxF32 negated = -f.x;
+	return FloatV(negated);
+}
+
+// a + b
+PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b)
+{
+	const PxF32 sum = a.x + b.x;
+	return FloatV(sum);
+}
+
+// a - b
+PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b)
+{
+	const PxF32 difference = a.x - b.x;
+	return FloatV(difference);
+}
+
+// a * b
+PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b)
+{
+	const PxF32 product = a.x * b.x;
+	return FloatV(product);
+}
+
+// a / b; asserts that b is non-zero.
+PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b)
+{
+	VECMATHAOS_ASSERT(b.x != 0.0f);
+	const PxF32 quotient = a.x / b.x;
+	return FloatV(quotient);
+}
+
+// a / b; asserts that b is non-zero (scalar build: same code as FDiv).
+PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b)
+{
+	VECMATHAOS_ASSERT(b.x != 0.0f);
+	const PxF32 quotient = a.x / b.x;
+	return FloatV(quotient);
+}
+
+// 1 / a; asserts that a is non-zero.
+PX_FORCE_INLINE FloatV FRecip(const FloatV a)
+{
+	VECMATHAOS_ASSERT(a.x != 0.0f);
+	const PxF32 reciprocal = 1.0f / a.x;
+	return FloatV(reciprocal);
+}
+
+// 1 / a; asserts that a is non-zero (scalar build: same code as FRecip).
+PX_FORCE_INLINE FloatV FRecipFast(const FloatV a)
+{
+	VECMATHAOS_ASSERT(a.x != 0.0f);
+	const PxF32 reciprocal = 1.0f / a.x;
+	return FloatV(reciprocal);
+}
+
+// PxRecipSqrt(a); asserts that a is non-zero.
+PX_FORCE_INLINE FloatV FRsqrt(const FloatV a)
+{
+	VECMATHAOS_ASSERT(a.x != 0.0f);
+	const PxF32 recipRoot = PxRecipSqrt(a.x);
+	return FloatV(recipRoot);
+}
+
+// PxSqrt(a)
+PX_FORCE_INLINE FloatV FSqrt(const FloatV a)
+{
+	const PxF32 root = PxSqrt(a.x);
+	return FloatV(root);
+}
+
+// PxRecipSqrt(a); asserts that a is non-zero (scalar build: same code as FRsqrt).
+PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a)
+{
+	VECMATHAOS_ASSERT(a.x != 0.0f);
+	const PxF32 recipRoot = PxRecipSqrt(a.x);
+	return FloatV(recipRoot);
+}
+
+// a * b + c
+PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c)
+{
+	const FloatV scaled = FMul(a, b);
+	return FAdd(scaled, c);
+}
+
+// c - a * b
+PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c)
+{
+	const FloatV scaled = FMul(a, b);
+	return FSub(c, scaled);
+}
+
+// PxAbs(a)
+PX_FORCE_INLINE FloatV FAbs(const FloatV a)
+{
+	const PxF32 magnitude = PxAbs(a.x);
+	return FloatV(magnitude);
+}
+
+// Selects a when the first lane of c is non-zero, otherwise b.
+PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b)
+{
+	if(c.ux)
+		return FloatV(a.x);
+	return FloatV(b.x);
+}
+
+// Mask with every lane set from (a > b).
+PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b)
+{
+	const bool isGreater = a.x > b.x;
+	return BLoad(isGreater);
+}
+
+// Mask with every lane set from (a >= b).
+PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b)
+{
+	const bool isGreaterOrEqual = a.x >= b.x;
+	return BLoad(isGreaterOrEqual);
+}
+
+// Mask with every lane set from (a == b).
+PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b)
+{
+	const bool isEqual = a.x == b.x;
+	return BLoad(isEqual);
+}
+
+// Returns a when a > b, otherwise b.
+PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b)
+{
+	if(a.x > b.x)
+		return FloatV(a.x);
+	return FloatV(b.x);
+}
+
+// Returns b when a > b, otherwise a.
+PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b)
+{
+	if(a.x > b.x)
+		return FloatV(b.x);
+	return FloatV(a.x);
+}
+
+// Clamps a to [minV, maxV]: the upper bound is applied first, then the lower bound.
+PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV)
+{
+	const FloatV cappedAbove = FMin(a, maxV);
+	return FMax(cappedAbove, minV);
+}
+
+// BOOL_TO_U32 mask of (a > b): all bits set when true, 0 otherwise.
+PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b)
+{
+	const bool isGreater = a.x > b.x;
+	return BOOL_TO_U32(isGreater);
+}
+
+// BOOL_TO_U32 mask of (a >= b): all bits set when true, 0 otherwise.
+PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b)
+{
+	const bool isGreaterOrEqual = a.x >= b.x;
+	return BOOL_TO_U32(isGreaterOrEqual);
+}
+// BOOL_TO_U32 mask of (a == b): all bits set when true, 0 otherwise.
+PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b)
+{
+	const bool isEqual = a.x == b.x;
+	return BOOL_TO_U32(isEqual);
+}
+
+// Rounds by computing floorf(a + 0.5f).
+PX_FORCE_INLINE FloatV FRound(const FloatV a)
+{
+	const PxF32 rounded = floorf(a.x + 0.5f);
+	return FloatV(rounded);
+}
+
+// sinf(a)
+PX_FORCE_INLINE FloatV FSin(const FloatV a)
+{
+	const PxF32 sine = sinf(a.x);
+	return FloatV(sine);
+}
+
+// cosf(a)
+PX_FORCE_INLINE FloatV FCos(const FloatV a)
+{
+	const PxF32 cosine = cosf(a.x);
+	return FloatV(cosine);
+}
+
+// BOOL_TO_U32 mask that is set when a > max or a < min.
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	const bool aboveMax = a.x > max.x;
+	const bool belowMin = a.x < min.x;
+	return BOOL_TO_U32(aboveMax || belowMin);
+}
+
+// BOOL_TO_U32 mask that is set when min <= a <= max (both bounds inclusive).
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	const bool atLeastMin = a.x >= min.x;
+	const bool atMostMax = a.x <= max.x;
+	return BOOL_TO_U32(atLeastMin && atMostMax);
+}
+
+// Symmetric form: tests a against the range [-bounds, bounds].
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds)
+{
+	const FloatV lower = FNeg(bounds);
+	return FOutOfBounds(a, lower, bounds);
+}
+
+// Symmetric form: tests a against the range [-bounds, bounds].
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds)
+{
+	const FloatV lower = FNeg(bounds);
+	return FInBounds(a, lower, bounds);
+}
+
+/////////////////////
+// VEC3V
+/////////////////////
+
+// Broadcasts f into x, y and z.
+PX_FORCE_INLINE Vec3V V3Splat(const FloatV f)
+{
+	const PxF32 s = f.x;
+	return Vec3V(s, s, s);
+}
+
+// Builds (x, y, z) from three separate FloatV values.
+PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z)
+{
+	const Vec3V merged(x.x, y.x, z.x);
+	return merged;
+}
+
+// (1, 0, 0)
+PX_FORCE_INLINE Vec3V V3UnitX()
+{
+	const Vec3V unitX(1.0f, 0.0f, 0.0f);
+	return unitX;
+}
+
+// (0, 1, 0)
+PX_FORCE_INLINE Vec3V V3UnitY()
+{
+	const Vec3V unitY(0.0f, 1.0f, 0.0f);
+	return unitY;
+}
+
+// (0, 0, 1)
+PX_FORCE_INLINE Vec3V V3UnitZ()
+{
+	const Vec3V unitZ(0.0f, 0.0f, 1.0f);
+	return unitZ;
+}
+
+// Extracts the x component as a FloatV.
+PX_FORCE_INLINE FloatV V3GetX(const Vec3V f)
+{
+	const FloatV xLane(f.x);
+	return xLane;
+}
+
+// Extracts the y component as a FloatV.
+PX_FORCE_INLINE FloatV V3GetY(const Vec3V f)
+{
+	const FloatV yLane(f.y);
+	return yLane;
+}
+
+// Extracts the z component as a FloatV.
+PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f)
+{
+	const FloatV zLane(f.z);
+	return zLane;
+}
+
+// Returns a copy of v with x replaced by f.
+PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f)
+{
+	const Vec3V updated(f.x, v.y, v.z);
+	return updated;
+}
+
+// Returns a copy of v with y replaced by f.
+PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f)
+{
+	const Vec3V updated(v.x, f.x, v.z);
+	return updated;
+}
+
+// Returns a copy of v with z replaced by f.
+PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f)
+{
+	const Vec3V updated(v.x, v.y, f.x);
+	return updated;
+}
+
+// Gathers the x components of a, b and c into one vector: (a.x, b.x, c.x).
+PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	const Vec3V gathered(a.x, b.x, c.x);
+	return gathered;
+}
+
+// Gathers the y components of a, b and c into one vector: (a.y, b.y, c.y).
+PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	const Vec3V gathered(a.y, b.y, c.y);
+	return gathered;
+}
+
+// Gathers the z components of a, b and c into one vector: (a.z, b.z, c.z).
+PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	const Vec3V gathered(a.z, b.z, c.z);
+	return gathered;
+}
+
+// (0, 0, 0)
+PX_FORCE_INLINE Vec3V V3Zero()
+{
+	const Vec3V zero = V3Load(0.0f);
+	return zero;
+}
+
+// (1, 1, 1)
+PX_FORCE_INLINE Vec3V V3One()
+{
+	const Vec3V one = V3Load(1.0f);
+	return one;
+}
+
+// PX_EPS_REAL broadcast to x, y and z.
+PX_FORCE_INLINE Vec3V V3Eps()
+{
+	const Vec3V eps = V3Load(PX_EPS_REAL);
+	return eps;
+}
+
+// -c (per component)
+PX_FORCE_INLINE Vec3V V3Neg(const Vec3V c)
+{
+	const Vec3V negated(-c.x, -c.y, -c.z);
+	return negated;
+}
+
+// a + b (per component)
+PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b)
+{
+	const PxF32 sx = a.x + b.x;
+	const PxF32 sy = a.y + b.y;
+	const PxF32 sz = a.z + b.z;
+	return Vec3V(sx, sy, sz);
+}
+
+// a - b (per component)
+PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b)
+{
+	const PxF32 dx = a.x - b.x;
+	const PxF32 dy = a.y - b.y;
+	const PxF32 dz = a.z - b.z;
+	return Vec3V(dx, dy, dz);
+}
+
+// a * b, where b is a scalar applied to every component.
+PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b)
+{
+	const PxF32 scale = b.x;
+	return Vec3V(a.x * scale, a.y * scale, a.z * scale);
+}
+
+// a * b (per component)
+PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b)
+{
+	const PxF32 px = a.x * b.x;
+	const PxF32 py = a.y * b.y;
+	const PxF32 pz = a.z * b.z;
+	return Vec3V(px, py, pz);
+}
+
+// a / b for scalar b, computed as a * (1 / b). No zero check is performed on b.
+PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b)
+{
+	const PxF32 invScale = 1.0f / b.x;
+	const Vec3V scaled(a.x * invScale, a.y * invScale, a.z * invScale);
+	return scaled;
+}
+
+// a / b (per component). No zero check is performed on b.
+PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b)
+{
+	const PxF32 qx = a.x / b.x;
+	const PxF32 qy = a.y / b.y;
+	const PxF32 qz = a.z / b.z;
+	return Vec3V(qx, qy, qz);
+}
+
+// a / b for scalar b, computed as a * (1 / b) (scalar build: same code as V3ScaleInv).
+PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b)
+{
+	const PxF32 invScale = 1.0f / b.x;
+	const Vec3V scaled(a.x * invScale, a.y * invScale, a.z * invScale);
+	return scaled;
+}
+
+PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b)
+{
+	return Vec3V(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+
+PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a)
+{
+	return Vec3V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
+}
+
+PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a)
+{
+	return Vec3V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
+}
+
+PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a)
+{
+	return Vec3V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z));
+}
+
+PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a)
+{
+	return Vec3V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z));
+}
+
+PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	return V3Add(V3Scale(a, b), c);
+}
+
+PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	return V3Sub(c, V3Scale(a, b));
+}
+
+PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	return V3Add(V3Mul(a, b), c);
+}
+
+PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	return V3Sub(c, V3Mul(a, b));
+}
+
+PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b)
+{
+	return FloatV(a.x * b.x + a.y * b.y + a.z * b.z);
+}
+
+PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3VArg normal)
+{
+	return normal;
+}
+
+PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b)
+{
+	return Vec3V(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+}
+
+PX_FORCE_INLINE FloatV V3Length(const Vec3V a)
+{
+	return FloatV(PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z));
+}
+
+PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a)
+{
+	return FloatV(a.x * a.x + a.y * a.y + a.z * a.z);
+}
+
+// Returns a scaled by the reciprocal of its length. Asserts (VECMATHAOS_ASSERT) that a is not the zero vector.
+PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a)
+{
+	VECMATHAOS_ASSERT(a.x != 0 || a.y != 0 || a.z != 0);
+	const PxF32 lenSq = a.x * a.x + a.y * a.y + a.z * a.z;
+	const PxF32 scale = 1.0f / PxSqrt(lenSq);
+	return Vec3V(a.x * scale, a.y * scale, a.z * scale);
+}
+
+// Normalizes a, or returns unsafeReturnValue when the length of a does not exceed PX_EPS_REAL.
+PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue)
+{
+	const PxF32 len = PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z);
+
+	// Too short to normalize reliably: hand back the caller-supplied fallback.
+	if(PX_EPS_REAL >= len)
+		return unsafeReturnValue;
+
+	const PxF32 scale = 1.0f / len;
+	return Vec3V(a.x * scale, a.y * scale, a.z * scale);
+}
+
+// "Fast" normalize. In this scalar implementation it performs the same full-precision computation as V3Normalize:
+// a is scaled by 1 / sqrt(dot(a, a)), and a zero vector trips VECMATHAOS_ASSERT.
+PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a)
+{
+	VECMATHAOS_ASSERT(a.x != 0 || a.y != 0 || a.z != 0);
+	const PxF32 lenSq = a.x * a.x + a.y * a.y + a.z * a.z;
+	const PxF32 scale = 1.0f / PxSqrt(lenSq);
+	return Vec3V(a.x * scale, a.y * scale, a.z * scale);
+}
+
+// Per-component select: takes the component from a where the matching lane of c is non-zero, otherwise from b.
+PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b)
+{
+	const PxF32 px = c.ux ? a.x : b.x;
+	const PxF32 py = c.uy ? a.y : b.y;
+	const PxF32 pz = c.uz ? a.z : b.z;
+	return Vec3V(px, py, pz);
+}
+
+PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b)
+{
+	return BoolV(BOOL_TO_U32(a.x > b.x), BOOL_TO_U32(a.y > b.y), BOOL_TO_U32(a.z > b.z), FALSE_TO_U32);
+}
+
+PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	return BoolV(BOOL_TO_U32(a.x >= b.x), BOOL_TO_U32(a.y >= b.y), BOOL_TO_U32(a.z >= b.z), TRUE_TO_U32);
+}
+
+PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b)
+{
+	return BoolV(BOOL_TO_U32(a.x == b.x), BOOL_TO_U32(a.y == b.y), BOOL_TO_U32(a.z == b.z), TRUE_TO_U32);
+}
+
+// Component-wise maximum. A component of a is chosen only when it is strictly greater than that of b.
+PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b)
+{
+	const PxF32 mx = a.x > b.x ? a.x : b.x;
+	const PxF32 my = a.y > b.y ? a.y : b.y;
+	const PxF32 mz = a.z > b.z ? a.z : b.z;
+	return Vec3V(mx, my, mz);
+}
+
+// Component-wise minimum. A component of a is chosen only when it is strictly less than that of b.
+PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b)
+{
+	const PxF32 mx = a.x < b.x ? a.x : b.x;
+	const PxF32 my = a.y < b.y ? a.y : b.y;
+	const PxF32 mz = a.z < b.z ? a.z : b.z;
+	return Vec3V(mx, my, mz);
+}
+
+// Returns the largest of a.x, a.y and a.z.
+PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a)
+{
+	// Reduce x/y first, then compare the winner against z.
+	PxF32 xyMax = a.y;
+	if(a.x >= a.y)
+		xyMax = a.x;
+
+	if(xyMax >= a.z)
+		return xyMax;
+	return a.z;
+}
+
+// Returns the smallest of a.x, a.y and a.z.
+PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a)
+{
+	// Reduce x/y first, then compare the winner against z.
+	PxF32 xyMin = a.y;
+	if(a.x <= a.y)
+		xyMin = a.x;
+
+	if(xyMin <= a.z)
+		return xyMin;
+	return a.z;
+}
+
+// Per component: yields 1.0f when the component is >= 0.0f and -1.0f otherwise.
+PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a)
+{
+	const PxF32 sx = a.x >= 0.f ? 1.f : -1.f;
+	const PxF32 sy = a.y >= 0.f ? 1.f : -1.f;
+	const PxF32 sz = a.z >= 0.f ? 1.f : -1.f;
+	return Vec3V(sx, sy, sz);
+}
+
+// Clamps a component-wise: the upper bound maxV is applied first (V3Min), then the lower bound minV (V3Max).
+PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV)
+{
+	const Vec3V cappedAbove = V3Min(a, maxV);
+	return V3Max(cappedAbove, minV);
+}
+
+// Component-wise absolute value, computed as the maximum of a and its negation.
+PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a)
+{
+	const Vec3V negated = V3Neg(a);
+	return V3Max(a, negated);
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b)
+{
+	return BOOL_TO_U32((a.x > b.x) & (a.y > b.y) & (a.z > b.z));
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z));
+}
+
+PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b)
+{
+	return BOOL_TO_U32((a.x == b.x) & (a.y == b.y) & (a.z == b.z));
+}
+
+PX_FORCE_INLINE Vec3V V3Round(const Vec3V a)
+{
+	return Vec3V(floorf(a.x + 0.5f), floorf(a.y + 0.5f), floorf(a.z + 0.5f));
+}
+
+PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a)
+{
+	return Vec3V(sinf(a.x), sinf(a.y), sinf(a.z));
+}
+
+PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a)
+{
+	return Vec3V(cosf(a.x), cosf(a.y), cosf(a.z));
+}
+
+PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a)
+{
+	return Vec3V(a.y, a.z, a.z);
+}
+
+PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a)
+{
+	return Vec3V(a.x, a.y, a.x);
+}
+
+PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a)
+{
+	return Vec3V(a.y, a.z, a.x);
+}
+
+PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a)
+{
+	return Vec3V(a.z, a.x, a.y);
+}
+
+PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a)
+{
+	return Vec3V(a.z, a.z, a.y);
+}
+
+PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a)
+{
+	return Vec3V(a.y, a.x, a.x);
+}
+
+PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1)
+{
+	return Vec3V(0.0f, v1.z, v0.y);
+}
+
+PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1)
+{
+	return Vec3V(v0.z, 0.0f, v1.x);
+}
+
+PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1)
+{
+	return Vec3V(v1.y, v0.x, 0.0f);
+}
+
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+	return FloatV(a.x + a.y + a.z);
+}
+
+// Returns BOOL_TO_U32(true) when any component of a lies strictly above max or strictly below min.
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	const bool aboveMax = a.x > max.x || a.y > max.y || a.z > max.z;
+	const bool belowMin = a.x < min.x || a.y < min.y || a.z < min.z;
+	return BOOL_TO_U32(aboveMax || belowMin);
+}
+
+// Returns BOOL_TO_U32(true) when every component of a satisfies min <= component <= max (bounds inclusive).
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	const bool withinMax = a.x <= max.x && a.y <= max.y && a.z <= max.z;
+	const bool withinMin = a.x >= min.x && a.y >= min.y && a.z >= min.z;
+	return BOOL_TO_U32(withinMax && withinMin);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+	return V3OutOfBounds(a, V3Neg(bounds), bounds);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+	return V3InBounds(a, V3Neg(bounds), bounds);
+}
+
+// Transposes, in place, the 3x3 matrix whose columns are col0, col1 and col2.
+// The three off-diagonal pairs are exchanged; the diagonal entries are left untouched.
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+	// Save one half of each off-diagonal pair before it is overwritten.
+	const PxF32 saved[3] = { col0.y, col0.z, col1.z };
+
+	col0.y = col1.x;
+	col0.z = col2.x;
+	col1.z = col2.y;
+
+	col1.x = saved[0];
+	col2.x = saved[1];
+	col2.y = saved[2];
+}
+
+/////////////////////////
+// VEC4V
+/////////////////////////
+
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+	return Vec4V(f.x, f.x, f.x, f.x);
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+	return Vec4V(floatVArray[0].x, floatVArray[1].x, floatVArray[2].x, floatVArray[3].x);
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+	return Vec4V(x.x, y.x, z.x, w.x);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	return Vec4V(x.w, y.w, z.w, w.w);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	return Vec4V(x.z, y.z, z.z, w.z);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	return Vec4V(x.y, y.y, z.y, w.y);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	return Vec4V(x.x, y.x, z.x, w.x);
+}
+
+PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b)
+{
+	return Vec4V(a.x, b.x, a.y, b.y);
+}
+
+PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b)
+{
+	return Vec4V(a.z, b.z, a.w, b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4UnitX()
+{
+	return Vec4V(1.0f, 0.0f, 0.0f, 0.0f);
+}
+
+PX_FORCE_INLINE Vec4V V4UnitY()
+{
+	return Vec4V(0.0f, 1.0f, 0.0f, 0.0f);
+}
+
+PX_FORCE_INLINE Vec4V V4UnitZ()
+{
+	return Vec4V(0.0f, 0.0f, 1.0f, 0.0f);
+}
+
+PX_FORCE_INLINE Vec4V V4UnitW()
+{
+	return Vec4V(0.0f, 0.0f, 0.0f, 1.0f);
+}
+
+PX_FORCE_INLINE FloatV V4GetX(const Vec4V f)
+{
+	return FloatV(f.x);
+}
+
+PX_FORCE_INLINE FloatV V4GetY(const Vec4V f)
+{
+	return FloatV(f.y);
+}
+
+PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f)
+{
+	return FloatV(f.z);
+}
+
+PX_FORCE_INLINE FloatV V4GetW(const Vec4V f)
+{
+	return FloatV(f.w);
+}
+
+PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f)
+{
+	return Vec4V(f.x, v.y, v.z, v.w);
+}
+
+PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f)
+{
+	return Vec4V(v.x, f.x, v.z, v.w);
+}
+
+PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f)
+{
+	return Vec4V(v.x, v.y, f.x, v.w);
+}
+
+PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f)
+{
+	return Vec4V(v.x, v.y, v.z, f.x);
+}
+
+PX_FORCE_INLINE Vec4V V4SetW(const Vec3V v, const FloatV f)
+{
+	return Vec4V(v.x, v.y, v.z, f.x);
+}
+
+PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v)
+{
+	return Vec4V(v.x, v.y, v.z, 0.0f);
+}
+
+PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V v)
+{
+	return Vec4V(v.y, v.x, v.w, v.z);
+}
+
+PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V v)
+{
+	return Vec4V(v.x, v.z, v.x, v.z);
+}
+
+PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V v)
+{
+	return Vec4V(v.y, v.w, v.y, v.w);
+}
+
+PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V v)
+{
+	return Vec4V(v.y, v.z, v.x, v.w);
+}
+
+// Compile-time swizzle: output component i is the input component selected by the i-th template index
+// (0 = x, 1 = y, 2 = z, 3 = w). The indices are not range-checked.
+template <PxU8 _x, PxU8 _y, PxU8 _z, PxU8 _w>
+PX_FORCE_INLINE Vec4V V4Perm(const Vec4V v)
+{
+	const PxF32 lanes[4] = { v.x, v.y, v.z, v.w };
+	const PxF32 outX = lanes[_x];
+	const PxF32 outY = lanes[_y];
+	const PxF32 outZ = lanes[_z];
+	const PxF32 outW = lanes[_w];
+	return Vec4V(outX, outY, outZ, outW);
+}
+
+PX_FORCE_INLINE Vec4V V4Zero()
+{
+	return V4Load(0.0f);
+}
+
+PX_FORCE_INLINE Vec4V V4One()
+{
+	return V4Load(1.0f);
+}
+
+PX_FORCE_INLINE Vec4V V4Eps()
+{
+	return V4Load(PX_EPS_REAL);
+}
+
+PX_FORCE_INLINE Vec4V V4Neg(const Vec4V c)
+{
+	return Vec4V(-c.x, -c.y, -c.z, -c.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b)
+{
+	return Vec4V(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b)
+{
+	return Vec4V(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b)
+{
+	return Vec4V(a.x * b.x, a.y * b.x, a.z * b.x, a.w * b.x);
+}
+
+PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b)
+{
+	return Vec4V(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b)
+{
+	const PxF32 bInv = 1.0f / b.x;
+	return Vec4V(a.x * bInv, a.y * bInv, a.z * bInv, a.w * bInv);
+}
+
+PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b)
+{
+	VECMATHAOS_ASSERT(b.x != 0 && b.y != 0 && b.z != 0 && b.w != 0);
+	return Vec4V(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b)
+{
+	const PxF32 bInv = 1.0f / b.x;
+	return Vec4V(a.x * bInv, a.y * bInv, a.z * bInv, a.w * bInv);
+}
+
+PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b)
+{
+	return Vec4V(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a)
+{
+	return Vec4V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+}
+
+PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a)
+{
+	return Vec4V(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a)
+{
+	return Vec4V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z), PxRecipSqrt(a.w));
+}
+
+PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a)
+{
+	return Vec4V(PxRecipSqrt(a.x), PxRecipSqrt(a.y), PxRecipSqrt(a.z), PxRecipSqrt(a.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a)
+{
+	return Vec4V(PxSqrt(a.x), PxSqrt(a.y), PxSqrt(a.z), PxSqrt(a.w));
+}
+
+PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	return V4Add(V4Scale(a, b), c);
+}
+
+PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	return V4Sub(c, V4Scale(a, b));
+}
+
+PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	return V4Add(V4Mul(a, b), c);
+}
+
+PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	return V4Sub(c, V4Mul(a, b));
+}
+
+PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a)
+{
+	return FloatV(a.x + a.y + a.z + a.w);
+}
+
+PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b)
+{
+	return FloatV(a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w);
+}
+
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b)
+{
+	return FloatV(a.x * b.x + a.y * b.y + a.z * b.z);
+}
+
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+	return Vec4V(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
+}
+
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+	return FloatV(PxSqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w));
+}
+
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+	return V4Dot(a, a);
+}
+
+// Returns a divided by its 4-component length. Asserts (VECMATHAOS_ASSERT) that a is not the zero vector.
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+	VECMATHAOS_ASSERT(0 != a.x || 0 != a.y || 0 != a.z || 0 != a.w);
+	return V4ScaleInv(a, V4Length(a));
+}
+
+// Normalizes a, or returns unsafeReturnValue when the length of a does not exceed PX_EPS_REAL.
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue)
+{
+	const FloatV len = V4Length(a);
+
+	// Too short to normalize reliably: hand back the caller-supplied fallback.
+	if(PX_EPS_REAL >= len.x)
+		return unsafeReturnValue;
+
+	return V4ScaleInv(a, len);
+}
+// "Fast" normalize. In this scalar implementation it does the same work as V4Normalize:
+// a is divided by V4Length(a), and a zero vector trips VECMATHAOS_ASSERT.
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+	VECMATHAOS_ASSERT(0 != a.x || 0 != a.y || 0 != a.z || 0 != a.w);
+	return V4ScaleInv(a, V4Length(a));
+}
+
+// Per-component select: takes the component from a where the matching lane of c is non-zero, otherwise from b.
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+	const PxF32 px = c.ux ? a.x : b.x;
+	const PxF32 py = c.uy ? a.y : b.y;
+	const PxF32 pz = c.uz ? a.z : b.z;
+	const PxF32 pw = c.uw ? a.w : b.w;
+	return Vec4V(px, py, pz, pw);
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+	return BoolV(BOOL_TO_U32(a.x > b.x), BOOL_TO_U32(a.y > b.y), BOOL_TO_U32(a.z > b.z), BOOL_TO_U32(a.w > b.w));
+}
+
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	return BoolV(BOOL_TO_U32(a.x >= b.x), BOOL_TO_U32(a.y >= b.y), BOOL_TO_U32(a.z >= b.z), BOOL_TO_U32(a.w >= b.w));
+}
+
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+	return BoolV(BOOL_TO_U32(a.x == b.x), BOOL_TO_U32(a.y == b.y), BOOL_TO_U32(a.z == b.z), BOOL_TO_U32(a.w == b.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+	return Vec4V(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y, a.z > b.z ? a.z : b.z, a.w > b.w ? a.w : b.w);
+}
+
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+	return Vec4V(a.x < b.x ? a.x : b.x, a.y < b.y ? a.y : b.y, a.z < b.z ? a.z : b.z, a.w < b.w ? a.w : b.w);
+}
+
+// Returns the largest of the four components of a.
+// The vector is reduced pairwise: max(x, y) and max(z, w) are computed first, then the larger of the two wins.
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+	const PxF32 t0 = (a.x >= a.y) ? a.x : a.y; // max of the x/y pair
+	const PxF32 t1 = (a.z >= a.w) ? a.z : a.w; // max of the z/w pair (must select a.z, not a.x)
+	return t0 >= t1 ? t0 : t1;
+}
+
+// Returns the smallest of the four components of a.
+// The vector is reduced pairwise: min(x, y) and min(z, w) are computed first, then the smaller of the two wins.
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+	const PxF32 t0 = (a.x <= a.y) ? a.x : a.y; // min of the x/y pair
+	const PxF32 t1 = (a.z <= a.w) ? a.z : a.w; // min of the z/w pair (must select a.z, not a.x)
+	return t0 <= t1 ? t0 : t1;
+}
+
+// Clamps a component-wise: the upper bound maxV is applied first (V4Min), then the lower bound minV (V4Max).
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+	const Vec4V cappedAbove = V4Min(a, maxV);
+	return V4Max(cappedAbove, minV);
+}
+
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+	return Vec4V(floorf(a.x + 0.5f), floorf(a.y + 0.5f), floorf(a.z + 0.5f), floorf(a.w + 0.5f));
+}
+
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+	return Vec4V(sinf(a.x), sinf(a.y), sinf(a.z), sinf(a.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a)
+{
+	return Vec4V(cosf(a.x), cosf(a.y), cosf(a.z), cosf(a.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+	return BOOL_TO_U32((a.x > b.x) & (a.y > b.y) & (a.z > b.z) & (a.w > b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z) & (a.w >= b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+	return BOOL_TO_U32((a.x >= b.x) & (a.y >= b.y) & (a.z >= b.z));
+}
+
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+	return BOOL_TO_U32((a.x == b.x) & (a.y == b.y) & (a.z == b.z) & (a.w == b.w));
+}
+
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+	return BOOL_TO_U32((a.x > b.x) | (a.y > b.y) | (a.z > b.z));
+}
+
+// Transposes, in place, the 4x4 matrix whose columns are col0..col3.
+// The six off-diagonal pairs are exchanged; the diagonal entries are left untouched.
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+	// Save one half of each off-diagonal pair before it is overwritten.
+	const PxF32 saved[6] = { col0.y, col0.z, col0.w, col1.z, col1.w, col2.w };
+
+	col0.y = col1.x;
+	col0.z = col2.x;
+	col0.w = col3.x;
+	col1.z = col2.y;
+	col1.w = col3.y;
+	col2.w = col3.z;
+
+	col1.x = saved[0];
+	col2.x = saved[1];
+	col3.x = saved[2];
+	col2.y = saved[3];
+	col3.y = saved[4];
+	col3.z = saved[5];
+}
+
+PX_FORCE_INLINE BoolV BFFFF()
+{
+	return BoolV(FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFFT()
+{
+	return BoolV(FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFTF()
+{
+	return BoolV(FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFFTT()
+{
+	return BoolV(FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFTFF()
+{
+	return BoolV(FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFTFT()
+{
+	return BoolV(FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFTTF()
+{
+	return BoolV(FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BFTTT()
+{
+	return BoolV(FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTFFF()
+{
+	return BoolV(TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTFFT()
+{
+	return BoolV(TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTFTF()
+{
+	return BoolV(TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTFTT()
+{
+	return BoolV(TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTTFF()
+{
+	return BoolV(TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTTFT()
+{
+	return BoolV(TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32, TRUE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTTTF()
+{
+	return BoolV(TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, FALSE_TO_U32);
+}
+PX_FORCE_INLINE BoolV BTTTT()
+{
+	return BoolV(TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32, TRUE_TO_U32);
+}
+
+PX_FORCE_INLINE BoolV BXMask()
+{
+	return BTFFF();
+}
+PX_FORCE_INLINE BoolV BYMask()
+{
+	return BFTFF();
+}
+PX_FORCE_INLINE BoolV BZMask()
+{
+	return BFFTF();
+}
+PX_FORCE_INLINE BoolV BWMask()
+{
+	return BFFFT();
+}
+
+PX_FORCE_INLINE BoolV BGetX(const BoolV a)
+{
+	return BoolV(a.ux, a.ux, a.ux, a.ux);
+}
+
+PX_FORCE_INLINE BoolV BGetY(const BoolV a)
+{
+	return BoolV(a.uy, a.uy, a.uy, a.uy);
+}
+
+PX_FORCE_INLINE BoolV BGetZ(const BoolV a)
+{
+	return BoolV(a.uz, a.uz, a.uz, a.uz);
+}
+
+PX_FORCE_INLINE BoolV BGetW(const BoolV a)
+{
+	return BoolV(a.uw, a.uw, a.uw, a.uw);
+}
+
+PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f)
+{
+	return BoolV(f.ux, v.uy, v.uz, v.uw);
+}
+
+PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f)
+{
+	return BoolV(v.ux, f.uy, v.uz, v.uw);
+}
+
+PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f)
+{
+	return BoolV(v.ux, v.uy, f.uz, v.uw);
+}
+
+PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f)
+{
+	return BoolV(v.ux, v.uy, v.uz, f.uw);
+}
+
+// Broadcasts one lane of a, chosen by the compile-time index, into all four lanes of the result.
+// The BoolV is read as an array of PxU32; index is not range-checked.
+template <int index>
+BoolV BSplatElement(BoolV a)
+{
+	const PxU32* lanes = reinterpret_cast<const PxU32*>(&a);
+	const PxU32 chosen = lanes[index];
+	return BoolV(chosen, chosen, chosen, chosen);
+}
+
+PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b)
+{
+	return BoolV(BOOL_TO_U32(a.ux && b.ux), BOOL_TO_U32(a.uy && b.uy), BOOL_TO_U32(a.uz && b.uz), BOOL_TO_U32(a.uw && b.uw));
+}
+
+PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b)
+{
+	return BoolV(a.ux & ~b.ux, a.uy & ~b.uy, a.uz & ~b.uz, a.uw & ~b.uw);
+}
+
+PX_FORCE_INLINE BoolV BNot(const BoolV a)
+{
+	return BoolV(~a.ux, ~a.uy, ~a.uz, ~a.uw);
+}
+
+PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b)
+{
+	return BoolV(BOOL_TO_U32(a.ux || b.ux), BOOL_TO_U32(a.uy || b.uy), BOOL_TO_U32(a.uz || b.uz), BOOL_TO_U32(a.uw || b.uw));
+}
+
+// Returns TRUE_TO_U32 when all four lanes of a equal the matching lanes of b, otherwise FALSE_TO_U32.
+PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b)
+{
+	const bool lanesMatch = a.ux == b.ux && a.uy == b.uy && a.uz == b.uz && a.uw == b.uw;
+	if(lanesMatch)
+		return TRUE_TO_U32;
+	return FALSE_TO_U32;
+}
+
+PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a)
+{
+	return BAllEq(a, BTTTT());
+}
+
+PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a)
+{
+	return BAllEq(a, BFFFF());
+}
+
+// Returns BTTTT() when the bitwise AND of all four lanes of a is non-zero, otherwise BFFFF().
+PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a)
+{
+	const PxU32 combined = a.ux & a.uy & a.uz & a.uw;
+	if(combined)
+		return BTTTT();
+	return BFFFF();
+}
+
+// Returns BTTTT() when the bitwise OR of all four lanes of a is non-zero, otherwise BFFFF().
+PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a)
+{
+	const PxU32 combined = a.ux | a.uy | a.uz | a.uw;
+	if(combined)
+		return BTTTT();
+	return BFFFF();
+}
+
+// Returns BTTTT() when the bitwise AND of the x, y and z lanes of a is non-zero, otherwise BFFFF().
+// The w lane is ignored.
+PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a)
+{
+	const PxU32 combined = a.ux & a.uy & a.uz;
+	if(combined)
+		return BTTTT();
+	return BFFFF();
+}
+
+// Returns BTTTT() when the bitwise OR of the x, y and z lanes of a is non-zero, otherwise BFFFF().
+// The w lane is ignored.
+PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a)
+{
+	const PxU32 combined = a.ux | a.uy | a.uz;
+	if(combined)
+		return BTTTT();
+	return BFFFF();
+}
+
+// Packs a into a 4-bit mask: bit 0 is taken from bit 0 of ux, bit 1 from bit 1 of uy,
+// bit 2 from bit 2 of uz and bit 3 from bit 3 of uw.
+PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a)
+{
+	const PxU32 bitX = a.ux & 1;
+	const PxU32 bitY = a.uy & 2;
+	const PxU32 bitZ = a.uz & 4;
+	const PxU32 bitW = a.uw & 8;
+	return bitX | bitY | bitZ | bitW;
+}
+
+//////////////////////////////////
+// MAT33V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b)
+{
+	return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z, a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z,
+	             a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z);
+}
+
+PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b)
+{
+	return Vec3V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z, a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z,
+	             a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z);
+}
+
+// Computes A * b + c by accumulating one scaled column at a time:
+// ((c + A.col0 * b.x) + A.col1 * b.y) + A.col2 * b.z.
+PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c)
+{
+	const Vec3V afterCol0 = V3ScaleAdd(A.col0, V3GetX(b), c);
+	const Vec3V afterCol1 = V3ScaleAdd(A.col1, V3GetY(b), afterCol0);
+	return V3ScaleAdd(A.col2, V3GetZ(b), afterCol1);
+}
+
+PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b)
+{
+	return Mat33V(M33MulV3(a, b.col0), M33MulV3(a, b.col1), M33MulV3(a, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b)
+{
+	return Mat33V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b)
+{
+	return Mat33V(V3Scale(a.col0, b), V3Scale(a.col1, b), V3Scale(a.col2, b));
+}
+
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+	return Mat33V(V3Sub(a.col0, b.col0), V3Sub(a.col1, b.col1), V3Sub(a.col2, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+	return Mat33V(V3Neg(a.col0), V3Neg(a.col1), V3Neg(a.col2));
+}
+
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+	return Mat33V(V3Abs(a.col0), V3Abs(a.col1), V3Abs(a.col2));
+}
+
+// Builds a matrix whose columns are the unit axes multiplied component-wise by d,
+// i.e. d on the diagonal and unitAxis * d products elsewhere.
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+	return Mat33V(V3Mul(V3UnitX(), d), V3Mul(V3UnitY(), d), V3Mul(V3UnitZ(), d));
+}
+
+// Inverts the 3x3 matrix a: every output entry is a 2x2 cofactor of a multiplied by 1 / det(a).
+// The determinant is not checked against zero before it is inverted.
+PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a)
+{
+	const Vec3V& c0 = a.col0;
+	const Vec3V& c1 = a.col1;
+	const Vec3V& c2 = a.col2;
+
+	const PxF32 det = c0.x * (c1.y * c2.z - c1.z * c2.y) -
+	                  c1.x * (c0.y * c2.z - c2.y * c0.z) +
+	                  c2.x * (c0.y * c1.z - c1.y * c0.z);
+
+	const PxF32 invDet = 1.0f / det;
+
+	const Vec3V inv0(invDet * (c1.y * c2.z - c2.y * c1.z),
+	                 invDet * (c2.y * c0.z - c0.y * c2.z),
+	                 invDet * (c0.y * c1.z - c1.y * c0.z));
+
+	const Vec3V inv1(invDet * (c2.x * c1.z - c1.x * c2.z),
+	                 invDet * (c0.x * c2.z - c2.x * c0.z),
+	                 invDet * (c1.x * c0.z - c0.x * c1.z));
+
+	const Vec3V inv2(invDet * (c1.x * c2.y - c2.x * c1.y),
+	                 invDet * (c2.x * c0.y - c0.x * c2.y),
+	                 invDet * (c0.x * c1.y - c1.x * c0.y));
+
+	return Mat33V(inv0, inv1, inv2);
+}
+
+PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m)
+{
+	return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2));
+}
+
+// Copies the three columns of m into out.column0, out.column1 and out.column2 via V3StoreU.
+PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out)
+{
+	// Debug-only check that out starts on a 16-byte boundary.
+	// NOTE(review): the stores below are the "U" variants, so this requirement looks stricter than this scalar
+	// path needs -- presumably kept for parity with other back ends; confirm against callers.
+	PX_ASSERT((size_t(&out) & 15) == 0);
+	V3StoreU(m.col0, out.column0);
+	V3StoreU(m.col1, out.column1);
+	V3StoreU(m.col2, out.column2);
+}
+
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+	return Mat33V(Vec3V(a.col0.x, a.col1.x, a.col2.x), Vec3V(a.col0.y, a.col1.y, a.col2.y),
+	              Vec3V(a.col0.z, a.col1.z, a.col2.z));
+}
+
+PX_FORCE_INLINE Mat33V M33Identity()
+{
+	return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ());
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+	return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z + a.col3.x,
+	             a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z + a.col3.y,
+	             a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z + a.col3.z);
+}
+
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
+	return Vec3V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z, a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z,
+	             a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z);
+}
+
+PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b)
+{
+	return Vec3V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z, a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z,
+	             a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z);
+}
+
+// Composes two 3x4 matrices. The first three columns of b are multiplied by the 3x3 part of a only
+// (M34Mul33V3); b.col3 goes through M34MulV3, which additionally adds a.col3.
+PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b)
+{
+	const Vec3V r0 = M34Mul33V3(a, b.col0);
+	const Vec3V r1 = M34Mul33V3(a, b.col1);
+	const Vec3V r2 = M34Mul33V3(a, b.col2);
+	const Vec3V r3 = M34MulV3(a, b.col3);
+	return Mat34V(r0, r1, r2, r3);
+}
+
+PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b)
+{
+	return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M34Mul33V3(const Mat34V& a, const Mat33V& b)
+{
+	return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2));
+}
+
+PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b)
+{
+	return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2));
+}
+
+PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b)
+{
+	return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3));
+}
+
+PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a)
+{
+	return Mat33V(Vec3V(a.col0.x, a.col1.x, a.col2.x), Vec3V(a.col0.y, a.col1.y, a.col2.y),
+	              Vec3V(a.col0.z, a.col1.z, a.col2.z));
+}
+
+//////////////////////////////////
+// MAT44V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b)
+{
+	return Vec4V(a.col0.x * b.x + a.col1.x * b.y + a.col2.x * b.z + a.col3.x * b.w,
+	             a.col0.y * b.x + a.col1.y * b.y + a.col2.y * b.z + a.col3.y * b.w,
+	             a.col0.z * b.x + a.col1.z * b.y + a.col2.z * b.z + a.col3.z * b.w,
+	             a.col0.w * b.x + a.col1.w * b.y + a.col2.w * b.z + a.col3.w * b.w);
+}
+
+PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b)
+{
+	return Vec4V(a.col0.x * b.x + a.col0.y * b.y + a.col0.z * b.z + a.col0.w * b.w,
+	             a.col1.x * b.x + a.col1.y * b.y + a.col1.z * b.z + a.col1.w * b.w,
+	             a.col2.x * b.x + a.col2.y * b.y + a.col2.z * b.z + a.col2.w * b.w,
+	             a.col3.x * b.x + a.col3.y * b.y + a.col3.z * b.z + a.col3.w * b.w);
+}
+
+PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b)
+{
+	return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3));
+}
+
+PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b)
+{
+	return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3));
+}
+
+// Inverts the 4x4 matrix a by cofactor expansion: sixteen cofactor terms are accumulated in dst[],
+// the determinant is formed from the first four of them, and every term is scaled by 1 / det.
+// The determinant is not checked against zero before it is inverted.
+PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a)
+{
+	PxF32 tmp[12]; // scratch for pairwise products; filled twice below
+	PxF32 dst[16]; // cofactor terms, later scaled by 1 / det
+	PxF32 det;
+
+	// Flatten a column by column: src[0..3] = col0, src[4..7] = col1, src[8..11] = col2, src[12..15] = col3.
+	const PxF32 src[16] = { a.col0.x, a.col0.y, a.col0.z, a.col0.w, a.col1.x, a.col1.y, a.col1.z, a.col1.w,
+		                    a.col2.x, a.col2.y, a.col2.z, a.col2.w, a.col3.x, a.col3.y, a.col3.z, a.col3.w };
+
+	// First pass: pairwise products of entries from src[8..15] (col2 and col3).
+	tmp[0] = src[10] * src[15];
+	tmp[1] = src[11] * src[14];
+	tmp[2] = src[9] * src[15];
+	tmp[3] = src[11] * src[13];
+	tmp[4] = src[9] * src[14];
+	tmp[5] = src[10] * src[13];
+	tmp[6] = src[8] * src[15];
+	tmp[7] = src[11] * src[12];
+	tmp[8] = src[8] * src[14];
+	tmp[9] = src[10] * src[12];
+	tmp[10] = src[8] * src[13];
+	tmp[11] = src[9] * src[12];
+
+	// dst[0..7]: combine those products with entries from src[0..7] (col0 and col1).
+	// Each term is built as a positive sum followed by a subtracted sum.
+	dst[0] = tmp[0] * src[5] + tmp[3] * src[6] + tmp[4] * src[7];
+	dst[0] -= tmp[1] * src[5] + tmp[2] * src[6] + tmp[5] * src[7];
+	dst[1] = tmp[1] * src[4] + tmp[6] * src[6] + tmp[9] * src[7];
+	dst[1] -= tmp[0] * src[4] + tmp[7] * src[6] + tmp[8] * src[7];
+	dst[2] = tmp[2] * src[4] + tmp[7] * src[5] + tmp[10] * src[7];
+	dst[2] -= tmp[3] * src[4] + tmp[6] * src[5] + tmp[11] * src[7];
+	dst[3] = tmp[5] * src[4] + tmp[8] * src[5] + tmp[11] * src[6];
+	dst[3] -= tmp[4] * src[4] + tmp[9] * src[5] + tmp[10] * src[6];
+	dst[4] = tmp[1] * src[1] + tmp[2] * src[2] + tmp[5] * src[3];
+	dst[4] -= tmp[0] * src[1] + tmp[3] * src[2] + tmp[4] * src[3];
+	dst[5] = tmp[0] * src[0] + tmp[7] * src[2] + tmp[8] * src[3];
+	dst[5] -= tmp[1] * src[0] + tmp[6] * src[2] + tmp[9] * src[3];
+	dst[6] = tmp[3] * src[0] + tmp[6] * src[1] + tmp[11] * src[3];
+	dst[6] -= tmp[2] * src[0] + tmp[7] * src[1] + tmp[10] * src[3];
+	dst[7] = tmp[4] * src[0] + tmp[9] * src[1] + tmp[10] * src[2];
+	dst[7] -= tmp[5] * src[0] + tmp[8] * src[1] + tmp[11] * src[2];
+
+	// Second pass: tmp[] is reused for pairwise products of entries from src[0..7] (col0 and col1).
+	tmp[0] = src[2] * src[7];
+	tmp[1] = src[3] * src[6];
+	tmp[2] = src[1] * src[7];
+	tmp[3] = src[3] * src[5];
+	tmp[4] = src[1] * src[6];
+	tmp[5] = src[2] * src[5];
+	tmp[6] = src[0] * src[7];
+	tmp[7] = src[3] * src[4];
+	tmp[8] = src[0] * src[6];
+	tmp[9] = src[2] * src[4];
+	tmp[10] = src[0] * src[5];
+	tmp[11] = src[1] * src[4];
+
+	// dst[8..15]: combine the new products with entries from src[8..15] (col2 and col3).
+	dst[8] = tmp[0] * src[13] + tmp[3] * src[14] + tmp[4] * src[15];
+	dst[8] -= tmp[1] * src[13] + tmp[2] * src[14] + tmp[5] * src[15];
+	dst[9] = tmp[1] * src[12] + tmp[6] * src[14] + tmp[9] * src[15];
+	dst[9] -= tmp[0] * src[12] + tmp[7] * src[14] + tmp[8] * src[15];
+	dst[10] = tmp[2] * src[12] + tmp[7] * src[13] + tmp[10] * src[15];
+	dst[10] -= tmp[3] * src[12] + tmp[6] * src[13] + tmp[11] * src[15];
+	dst[11] = tmp[5] * src[12] + tmp[8] * src[13] + tmp[11] * src[14];
+	dst[11] -= tmp[4] * src[12] + tmp[9] * src[13] + tmp[10] * src[14];
+	dst[12] = tmp[2] * src[10] + tmp[5] * src[11] + tmp[1] * src[9];
+	dst[12] -= tmp[4] * src[11] + tmp[0] * src[9] + tmp[3] * src[10];
+	dst[13] = tmp[8] * src[11] + tmp[0] * src[8] + tmp[7] * src[10];
+	dst[13] -= tmp[6] * src[10] + tmp[9] * src[11] + tmp[1] * src[8];
+	dst[14] = tmp[6] * src[9] + tmp[11] * src[11] + tmp[3] * src[8];
+	dst[14] -= tmp[10] * src[11] + tmp[2] * src[8] + tmp[7] * src[9];
+	dst[15] = tmp[10] * src[10] + tmp[4] * src[8] + tmp[9] * src[9];
+	dst[15] -= tmp[8] * src[9] + tmp[11] * src[10] + tmp[5] * src[8];
+
+	// Determinant: dot product of src[0..3] (col0) with the first four cofactor terms.
+	det = src[0] * dst[0] + src[1] * dst[1] + src[2] * dst[2] + src[3] * dst[3];
+
+	// From here on det holds the reciprocal of the determinant.
+	det = 1.0f / det;
+	for(PxU32 j = 0; j < 16; j++)
+	{
+		dst[j] *= det;
+	}
+
+	// Gather dst with stride 4 so that output column k is (dst[k], dst[k + 4], dst[k + 8], dst[k + 12]).
+	return Mat44V(Vec4V(dst[0], dst[4], dst[8], dst[12]), Vec4V(dst[1], dst[5], dst[9], dst[13]),
+	              Vec4V(dst[2], dst[6], dst[10], dst[14]), Vec4V(dst[3], dst[7], dst[11], dst[15]));
+}
+
+PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a)
+{
+	return Mat44V(Vec4V(a.col0.x, a.col1.x, a.col2.x, a.col3.x), Vec4V(a.col0.y, a.col1.y, a.col2.y, a.col3.y),
+	              Vec4V(a.col0.z, a.col1.z, a.col2.z, a.col3.z), Vec4V(a.col0.w, a.col1.w, a.col2.w, a.col3.w));
+}
+
+// Builds a Vec4V whose components are (x, y, z, w), in that order.
+PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w)
+{
+	return Vec4V(x, y, z, w);
+}
+
+/*
+PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b)
+{
+    return VecU16V(
+        PxU16(PxClamp<PxU32>((a).u32[0], 0, 0xFFFF)),
+        PxU16(PxClamp<PxU32>((a).u32[1], 0, 0xFFFF)),
+        PxU16(PxClamp<PxU32>((a).u32[2], 0, 0xFFFF)),
+        PxU16(PxClamp<PxU32>((a).u32[3], 0, 0xFFFF)),
+        PxU16(PxClamp<PxU32>((b).u32[0], 0, 0xFFFF)),
+        PxU16(PxClamp<PxU32>((b).u32[1], 0, 0xFFFF)),
+        PxU16(PxClamp<PxU32>((b).u32[2], 0, 0xFFFF)),
+        PxU16(PxClamp<PxU32>((b).u32[3], 0, 0xFFFF)));
+}
+*/
+
+/**
+Per-lane select: lane i of the result is a.u32[i] when the matching lane of c is non-zero,
+otherwise b.u32[i].
+*/
+PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b)
+{
+	const PxU32 r0 = c.ux ? a.u32[0] : b.u32[0];
+	const PxU32 r1 = c.uy ? a.u32[1] : b.u32[1];
+	const PxU32 r2 = c.uz ? a.u32[2] : b.u32[2];
+	const PxU32 r3 = c.uw ? a.u32[3] : b.u32[3];
+	return VecU32V(r0, r1, r2, r3);
+}
+
+/** Lane-wise bitwise OR of two 4 x 32-bit unsigned vectors. */
+PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b)
+{
+	const PxU32 r0 = a.u32[0] | b.u32[0];
+	const PxU32 r1 = a.u32[1] | b.u32[1];
+	const PxU32 r2 = a.u32[2] | b.u32[2];
+	const PxU32 r3 = a.u32[3] | b.u32[3];
+	return VecU32V(r0, r1, r2, r3);
+}
+
+/** Lane-wise bitwise XOR of two 4 x 32-bit unsigned vectors. */
+PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b)
+{
+	const PxU32 r0 = a.u32[0] ^ b.u32[0];
+	const PxU32 r1 = a.u32[1] ^ b.u32[1];
+	const PxU32 r2 = a.u32[2] ^ b.u32[2];
+	const PxU32 r3 = a.u32[3] ^ b.u32[3];
+	return VecU32V(r0, r1, r2, r3);
+}
+
+/** Lane-wise bitwise AND of two 4 x 32-bit unsigned vectors. */
+PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b)
+{
+	const PxU32 r0 = a.u32[0] & b.u32[0];
+	const PxU32 r1 = a.u32[1] & b.u32[1];
+	const PxU32 r2 = a.u32[2] & b.u32[2];
+	const PxU32 r3 = a.u32[3] & b.u32[3];
+	return VecU32V(r0, r1, r2, r3);
+}
+
+/** Lane-wise "and with complement": each result lane is a.u32[i] & ~b.u32[i]. */
+PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b)
+{
+	const PxU32 r0 = a.u32[0] & ~b.u32[0];
+	const PxU32 r1 = a.u32[1] & ~b.u32[1];
+	const PxU32 r2 = a.u32[2] & ~b.u32[2];
+	const PxU32 r3 = a.u32[3] & ~b.u32[3];
+	return VecU32V(r0, r1, r2, r3);
+}
+
+/*
+PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b)
+{
+    return VecU16V(
+        (a).u16[0]|(b).u16[0], (a).u16[1]|(b).u16[1], (a).u16[2]|(b).u16[2], (a).u16[3]|(b).u16[3],
+        (a).u16[4]|(b).u16[4], (a).u16[5]|(b).u16[5], (a).u16[6]|(b).u16[6], (a).u16[7]|(b).u16[7]);
+}
+*/
+
+/*
+PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b)
+{
+    return VecU16V(
+        (a).u16[0]&(b).u16[0], (a).u16[1]&(b).u16[1], (a).u16[2]&(b).u16[2], (a).u16[3]&(b).u16[3],
+        (a).u16[4]&(b).u16[4], (a).u16[5]&(b).u16[5], (a).u16[6]&(b).u16[6], (a).u16[7]&(b).u16[7]);
+}
+*/
+
+/*
+PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b)
+{
+    return VecU16V(
+        (a).u16[0]&~(b).u16[0], (a).u16[1]&~(b).u16[1], (a).u16[2]&~(b).u16[2], (a).u16[3]&~(b).u16[3],
+        (a).u16[4]&~(b).u16[4], (a).u16[5]&~(b).u16[5], (a).u16[6]&~(b).u16[6], (a).u16[7]&~(b).u16[7]);
+}
+*/
+
+/*
+template<int a> PX_FORCE_INLINE VecI32V V4ISplat()
+{
+    return VecI32V(a, a, a, a);
+}
+
+template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat()
+{
+    return VecU32V(a, a, a, a);
+}
+*/
+
+/*
+PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address)
+{
+    *address = val;
+}
+*/
+
+// Stores val into *address by plain assignment. This scalar implementation does no
+// alignment check of its own.
+PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address)
+{
+	*address = val;
+}
+
+/**
+Clears, in the raw bit pattern of each float lane of a, every bit that is set in the
+matching lane of b (a & ~b), and returns the result reinterpreted as a Vec4V.
+*/
+PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b)
+{
+	// View the float lanes as 32-bit integers so the bitwise helper can be reused.
+	const VecU32V& bits = *reinterpret_cast<const VecU32V*>(&a);
+	const VecU32V masked = V4U32Andc(bits, b);
+	return *reinterpret_cast<const Vec4V*>(&masked);
+}
+
+/**
+Per-lane a > b comparison returned as an integer mask: a lane is 0xFFFFffff where the
+comparison holds and 0 where it does not.
+*/
+PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b)
+{
+	const PxU32 allBits = 0xFFFFffff;
+	const PxU32 mx = a.x > b.x ? allBits : 0;
+	const PxU32 my = a.y > b.y ? allBits : 0;
+	const PxU32 mz = a.z > b.z ? allBits : 0;
+	const PxU32 mw = a.w > b.w ? allBits : 0;
+	return VecU32V(mx, my, mz, mw);
+}
+
+// Returns a copy of the VecU16V stored at addr. In this scalar implementation the body is
+// the same as V4U16LoadUnaligned; no alignment is checked here.
+PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr)
+{
+	return *addr;
+}
+
+// Returns a copy of the VecU16V stored at addr (plain dereference).
+PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr)
+{
+	return *addr;
+}
+
+/**
+Unsigned per-lane a > b over the eight 16-bit lanes. Each result lane is
+BOOL_TO_U16 of the comparison.
+*/
+PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b)
+{
+	const PxU16 r0 = BOOL_TO_U16(a.u16[0] > b.u16[0]);
+	const PxU16 r1 = BOOL_TO_U16(a.u16[1] > b.u16[1]);
+	const PxU16 r2 = BOOL_TO_U16(a.u16[2] > b.u16[2]);
+	const PxU16 r3 = BOOL_TO_U16(a.u16[3] > b.u16[3]);
+	const PxU16 r4 = BOOL_TO_U16(a.u16[4] > b.u16[4]);
+	const PxU16 r5 = BOOL_TO_U16(a.u16[5] > b.u16[5]);
+	const PxU16 r6 = BOOL_TO_U16(a.u16[6] > b.u16[6]);
+	const PxU16 r7 = BOOL_TO_U16(a.u16[7] > b.u16[7]);
+	return VecU16V(r0, r1, r2, r3, r4, r5, r6, r7);
+}
+
+/**
+Signed per-lane a > b over the eight 16-bit lanes (reads the i16 view of the operands).
+Each result lane is BOOL_TO_U16 of the comparison.
+*/
+PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b)
+{
+	const PxU16 r0 = BOOL_TO_U16(a.i16[0] > b.i16[0]);
+	const PxU16 r1 = BOOL_TO_U16(a.i16[1] > b.i16[1]);
+	const PxU16 r2 = BOOL_TO_U16(a.i16[2] > b.i16[2]);
+	const PxU16 r3 = BOOL_TO_U16(a.i16[3] > b.i16[3]);
+	const PxU16 r4 = BOOL_TO_U16(a.i16[4] > b.i16[4]);
+	const PxU16 r5 = BOOL_TO_U16(a.i16[5] > b.i16[5]);
+	const PxU16 r6 = BOOL_TO_U16(a.i16[6] > b.i16[6]);
+	const PxU16 r7 = BOOL_TO_U16(a.i16[7] > b.i16[7]);
+	return VecU16V(r0, r1, r2, r3, r4, r5, r6, r7);
+}
+
+/** Numeric conversion: each unsigned 32-bit lane is converted to PxF32 (value cast, not a bit reinterpretation). */
+PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a)
+{
+	const PxF32 fx = PxF32(a.u32[0]);
+	const PxF32 fy = PxF32(a.u32[1]);
+	const PxF32 fz = PxF32(a.u32[2]);
+	const PxF32 fw = PxF32(a.u32[3]);
+	return Vec4V(fx, fy, fz, fw);
+}
+
+/** Numeric conversion: each signed 32-bit lane is converted to PxF32 (value cast, not a bit reinterpretation). */
+PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a)
+{
+	const PxF32 fx = PxF32(a.i32[0]);
+	const PxF32 fy = PxF32(a.i32[1]);
+	const PxF32 fz = PxF32(a.i32[2]);
+	const PxF32 fw = PxF32(a.i32[3]);
+	return Vec4V(fx, fy, fz, fw);
+}
+
+/**
+Numeric conversion: each float lane of a is cast to PxI32 (a C++ float-to-int cast, so the
+fractional part is discarded).
+*/
+PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a)
+{
+	// Address the four lanes of the by-value copy as a flat float array.
+	const float* lanes = reinterpret_cast<const float*>(&a);
+	const PxI32 i0 = PxI32(lanes[0]);
+	const PxI32 i1 = PxI32(lanes[1]);
+	const PxI32 i2 = PxI32(lanes[2]);
+	const PxI32 i3 = PxI32(lanes[3]);
+	return VecI32V(i0, i1, i2, i3);
+}
+
+/** Returns the storage of a reinterpreted as a Vec4V; the bits are copied unchanged, no numeric conversion. */
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a)
+{
+	return *reinterpret_cast<Vec4V*>(&a);
+}
+
+/** Returns the storage of a reinterpreted as a Vec4V; the bits are copied unchanged, no numeric conversion. */
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a)
+{
+	return *reinterpret_cast<Vec4V*>(&a);
+}
+
+/** Returns the storage of a reinterpreted as a VecU32V; the bits are copied unchanged, no numeric conversion. */
+PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a)
+{
+	return *reinterpret_cast<VecU32V*>(&a);
+}
+
+/** Returns the storage of a reinterpreted as a VecI32V; the bits are copied unchanged, no numeric conversion. */
+PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a)
+{
+	return *reinterpret_cast<VecI32V*>(&a);
+}
+
+/** Broadcasts lane `index` (compile-time constant) of a to all four lanes of the result. */
+template <int index>
+PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a)
+{
+	const PxU32 lane = a.u32[index];
+	return VecU32V(lane, lane, lane, lane);
+}
+
+/**
+Broadcasts lane `index` (compile-time constant) of a BoolV to all four lanes of a VecU32V.
+The lanes are addressed as an array starting at a.ux.
+*/
+template <int index>
+PX_FORCE_INLINE VecU32V V4U32SplatElement(BoolV a)
+{
+	const PxU32* lanes = &a.ux;
+	const PxU32 picked = lanes[index];
+	return VecU32V(picked, picked, picked, picked);
+}
+
+/** Broadcasts float lane `index` (compile-time constant) of a to all four lanes of the result. */
+template <int index>
+PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a)
+{
+	// Address the four lanes of the by-value copy as a flat float array.
+	const float lane = reinterpret_cast<const float*>(&a)[index];
+	return Vec4V(lane, lane, lane, lane);
+}
+
+// Builds a VecU32V whose lanes are (x, y, z, w), in that order.
+PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w)
+{
+	return VecU32V(x, y, z, w);
+}
+
+/** Per-lane absolute value, computed as the maximum of a and its negation. */
+PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a)
+{
+	const Vec4V negated = V4Neg(a);
+	return V4Max(a, negated);
+}
+
+/** Per-lane integer equality test; each result lane is BOOL_TO_U32 of a.u32[i] == b.u32[i]. */
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+	const PxU32 e0 = BOOL_TO_U32(a.u32[0] == b.u32[0]);
+	const PxU32 e1 = BOOL_TO_U32(a.u32[1] == b.u32[1]);
+	const PxU32 e2 = BOOL_TO_U32(a.u32[2] == b.u32[2]);
+	const PxU32 e3 = BOOL_TO_U32(a.u32[3] == b.u32[3]);
+	return BoolV(e0, e1, e2, e3);
+}
+
+// Broadcasts the scalar i to all four lanes.
+PX_FORCE_INLINE VecU32V U4Load(const PxU32 i)
+{
+	return VecU32V(i, i, i, i);
+}
+
+// Loads four consecutive PxU32 values i[0..3] into a vector. i must point at at least
+// four readable elements.
+PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i)
+{
+	return VecU32V(i[0], i[1], i[2], i[3]);
+}
+
+// Loads four consecutive PxU32 values i[0..3] into a vector. In this scalar implementation
+// the body is the same as U4LoadU; no alignment is checked here.
+PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i)
+{
+	return VecU32V(i[0], i[1], i[2], i[3]);
+}
+
+// Broadcasts the scalar i to all four lanes.
+PX_FORCE_INLINE VecI32V I4Load(const PxI32 i)
+{
+	return VecI32V(i, i, i, i);
+}
+
+// Loads four consecutive PxI32 values i[0..3] into a vector. i must point at at least
+// four readable elements.
+PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i)
+{
+	return VecI32V(i[0], i[1], i[2], i[3]);
+}
+
+// Loads four consecutive PxI32 values i[0..3] into a vector. In this scalar implementation
+// the body is the same as I4LoadU; no alignment is checked here.
+PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i)
+{
+	return VecI32V(i[0], i[1], i[2], i[3]);
+}
+
+/** Lane-wise signed 32-bit addition. */
+PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b)
+{
+	const PxI32 s0 = a.i32[0] + b.i32[0];
+	const PxI32 s1 = a.i32[1] + b.i32[1];
+	const PxI32 s2 = a.i32[2] + b.i32[2];
+	const PxI32 s3 = a.i32[3] + b.i32[3];
+	return VecI32V(s0, s1, s2, s3);
+}
+
+/** Lane-wise signed 32-bit subtraction (a - b). */
+PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b)
+{
+	const PxI32 d0 = a.i32[0] - b.i32[0];
+	const PxI32 d1 = a.i32[1] - b.i32[1];
+	const PxI32 d2 = a.i32[2] - b.i32[2];
+	const PxI32 d3 = a.i32[3] - b.i32[3];
+	return VecI32V(d0, d1, d2, d3);
+}
+
+/** Per-lane signed a > b; each result lane is BOOL_TO_U32 of the comparison. */
+PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b)
+{
+	const PxU32 g0 = BOOL_TO_U32(a.i32[0] > b.i32[0]);
+	const PxU32 g1 = BOOL_TO_U32(a.i32[1] > b.i32[1]);
+	const PxU32 g2 = BOOL_TO_U32(a.i32[2] > b.i32[2]);
+	const PxU32 g3 = BOOL_TO_U32(a.i32[3] > b.i32[3]);
+	return BoolV(g0, g1, g2, g3);
+}
+
+/** Per-lane signed a == b; each result lane is BOOL_TO_U32 of the comparison. */
+PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b)
+{
+	const PxU32 e0 = BOOL_TO_U32(a.i32[0] == b.i32[0]);
+	const PxU32 e1 = BOOL_TO_U32(a.i32[1] == b.i32[1]);
+	const PxU32 e2 = BOOL_TO_U32(a.i32[2] == b.i32[2]);
+	const PxU32 e3 = BOOL_TO_U32(a.i32[3] == b.i32[3]);
+	return BoolV(e0, e1, e2, e3);
+}
+
+/**
+Per-lane select: lane i of the result is a.i32[i] when the matching lane of c is non-zero,
+otherwise b.i32[i].
+*/
+PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b)
+{
+	const PxI32 r0 = c.ux ? a.i32[0] : b.i32[0];
+	const PxI32 r1 = c.uy ? a.i32[1] : b.i32[1];
+	const PxI32 r2 = c.uz ? a.i32[2] : b.i32[2];
+	const PxI32 r3 = c.uw ? a.i32[3] : b.i32[3];
+	return VecI32V(r0, r1, r2, r3);
+}
+
+// Returns the signed integer vector (0, 0, 0, 0).
+PX_FORCE_INLINE VecI32V VecI32V_Zero()
+{
+	return VecI32V(0, 0, 0, 0);
+}
+
+// Returns the signed integer vector (1, 1, 1, 1).
+PX_FORCE_INLINE VecI32V VecI32V_One()
+{
+	return VecI32V(1, 1, 1, 1);
+}
+
+// Returns the signed integer vector (2, 2, 2, 2).
+PX_FORCE_INLINE VecI32V VecI32V_Two()
+{
+	return VecI32V(2, 2, 2, 2);
+}
+
+// Returns the signed integer vector (-1, -1, -1, -1).
+PX_FORCE_INLINE VecI32V VecI32V_MinusOne()
+{
+	return VecI32V(-1, -1, -1, -1);
+}
+
+// Returns the unsigned integer vector (0, 0, 0, 0).
+PX_FORCE_INLINE VecU32V U4Zero()
+{
+	return VecU32V(0, 0, 0, 0);
+}
+
+// Returns the unsigned integer vector (1, 1, 1, 1).
+PX_FORCE_INLINE VecU32V U4One()
+{
+	return VecU32V(1, 1, 1, 1);
+}
+
+// Returns the unsigned integer vector (2, 2, 2, 2).
+PX_FORCE_INLINE VecU32V U4Two()
+{
+	return VecU32V(2, 2, 2, 2);
+}
+
+// Converts a vector of shift counts into the VecShiftV form taken by VecI32V_LeftShift and
+// VecI32V_RightShift. In this scalar implementation the input is returned as-is.
+PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift)
+{
+	return shift;
+}
+
+/**
+Per-lane left shift: lane i of the result is a.i32[i] shifted left by count.i32[i] bits.
+
+The shift is carried out on the unsigned bit pattern and converted back to PxI32, because
+left-shifting a negative signed value is undefined behaviour before C++20. The resulting bits
+are the ones a two's-complement shift produces.
+
+Each count lane must be in [0, 31]; counts outside that range are still undefined.
+*/
+PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count)
+{
+	const PxI32 r0 = PxI32(PxU32(a.i32[0]) << count.i32[0]);
+	const PxI32 r1 = PxI32(PxU32(a.i32[1]) << count.i32[1]);
+	const PxI32 r2 = PxI32(PxU32(a.i32[2]) << count.i32[2]);
+	const PxI32 r3 = PxI32(PxU32(a.i32[3]) << count.i32[3]);
+	return VecI32V(r0, r1, r2, r3);
+}
+
+/**
+Per-lane right shift: lane i of the result is a.i32[i] >> count.i32[i], using the
+language's signed right-shift operator.
+*/
+PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count)
+{
+	const PxI32 r0 = a.i32[0] >> count.i32[0];
+	const PxI32 r1 = a.i32[1] >> count.i32[1];
+	const PxI32 r2 = a.i32[2] >> count.i32[2];
+	const PxI32 r3 = a.i32[3] >> count.i32[3];
+	return VecI32V(r0, r1, r2, r3);
+}
+
+/** Lane-wise bitwise AND of two signed 32-bit vectors. */
+PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b)
+{
+	const PxI32 r0 = a.i32[0] & b.i32[0];
+	const PxI32 r1 = a.i32[1] & b.i32[1];
+	const PxI32 r2 = a.i32[2] & b.i32[2];
+	const PxI32 r3 = a.i32[3] & b.i32[3];
+	return VecI32V(r0, r1, r2, r3);
+}
+
+/** Lane-wise bitwise OR of two signed 32-bit vectors. */
+PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b)
+{
+	const PxI32 r0 = a.i32[0] | b.i32[0];
+	const PxI32 r1 = a.i32[1] | b.i32[1];
+	const PxI32 r2 = a.i32[2] | b.i32[2];
+	const PxI32 r3 = a.i32[3] | b.i32[3];
+	return VecI32V(r0, r1, r2, r3);
+}
+
+/** Broadcasts lane 0 (x) of a to all four lanes. */
+PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a)
+{
+	const PxI32 lane = a.i32[0];
+	return VecI32V(lane, lane, lane, lane);
+}
+
+/** Broadcasts lane 1 (y) of a to all four lanes. */
+PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a)
+{
+	const PxI32 lane = a.i32[1];
+	return VecI32V(lane, lane, lane, lane);
+}
+
+/** Broadcasts lane 2 (z) of a to all four lanes. */
+PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a)
+{
+	const PxI32 lane = a.i32[2];
+	return VecI32V(lane, lane, lane, lane);
+}
+
+/** Broadcasts lane 3 (w) of a to all four lanes. */
+PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a)
+{
+	const PxI32 lane = a.i32[3];
+	return VecI32V(lane, lane, lane, lane);
+}
+
+/**
+Per-lane select: lane i of the result is a.i32[i] when the matching lane of c is non-zero,
+otherwise b.i32[i].
+*/
+PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b)
+{
+	const PxI32 r0 = c.ux ? a.i32[0] : b.i32[0];
+	const PxI32 r1 = c.uy ? a.i32[1] : b.i32[1];
+	const PxI32 r2 = c.uz ? a.i32[2] : b.i32[2];
+	const PxI32 r3 = c.uw ? a.i32[3] : b.i32[3];
+	return VecI32V(r0, r1, r2, r3);
+}
+
+/** Builds a vector from lane 0 of each argument: (a.i32[0], b.i32[0], c.i32[0], d.i32[0]). */
+PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d)
+{
+	const PxI32 x = a.i32[0];
+	const PxI32 y = b.i32[0];
+	const PxI32 z = c.i32[0];
+	const PxI32 w = d.i32[0];
+	return VecI32V(x, y, z, w);
+}
+
+// Writes lane 0 of a to *i. The other three lanes are ignored.
+PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i)
+{
+	*i = a.i32[0];
+}
+
+/** Converts each lane of a BoolV to PxI32 with a value cast. */
+PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg b)
+{
+	const PxI32 ix = PxI32(b.ux);
+	const PxI32 iy = PxI32(b.uy);
+	const PxI32 iz = PxI32(b.uz);
+	const PxI32 iw = PxI32(b.uw);
+	return VecI32V(ix, iy, iz, iw);
+}
+
+// Copies the four lanes of a BoolV (ux, uy, uz, uw) into a VecU32V unchanged.
+PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg b)
+{
+	return VecU32V(b.ux, b.uy, b.uz, b.uw);
+}
+
+/**
+Expands quaternion q = (x, y, z, w) into the three columns of a 3x3 matrix:
+
+    column0 = (1 - 2yy - 2zz, 2xy + 2zw,     2xz - 2yw)
+    column1 = (2xy - 2zw,     1 - 2xx - 2zz, 2yz + 2xw)
+    column2 = (2xz + 2yw,     2yz - 2xw,     1 - 2xx - 2yy)
+
+q is not normalized here.
+*/
+PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2)
+{
+	const FloatV qx = V4GetX(q);
+	const FloatV qy = V4GetY(q);
+	const FloatV qz = V4GetZ(q);
+	const FloatV qw = V4GetW(q);
+	const FloatV unit = FOne();
+
+	// Doubled imaginary components: every product below already carries its factor of 2.
+	const FloatV dx = FAdd(qx, qx);
+	const FloatV dy = FAdd(qy, qy);
+	const FloatV dz = FAdd(qz, qz);
+
+	// Squares.
+	const FloatV xx2 = FMul(dx, qx);
+	const FloatV yy2 = FMul(dy, qy);
+	const FloatV zz2 = FMul(dz, qz);
+
+	// Cross terms.
+	const FloatV xy2 = FMul(dx, qy);
+	const FloatV xz2 = FMul(dx, qz);
+	const FloatV xw2 = FMul(dx, qw);
+	const FloatV yz2 = FMul(dy, qz);
+	const FloatV yw2 = FMul(dy, qw);
+	const FloatV zw2 = FMul(dz, qw);
+
+	// 1 - 2xx is shared by the diagonal entries of column1 and column2.
+	const FloatV oneMinusXX = FSub(unit, xx2);
+
+	const FloatV m00 = FSub(FSub(unit, yy2), zz2);
+	const FloatV m10 = FAdd(xy2, zw2);
+	const FloatV m20 = FSub(xz2, yw2);
+
+	const FloatV m01 = FSub(xy2, zw2);
+	const FloatV m11 = FSub(oneMinusXX, zz2);
+	const FloatV m21 = FAdd(yz2, xw2);
+
+	const FloatV m02 = FAdd(xz2, yw2);
+	const FloatV m12 = FSub(yz2, xw2);
+	const FloatV m22 = FSub(oneMinusXX, yy2);
+
+	column0 = V3Merge(m00, m10, m20);
+	column1 = V3Merge(m01, m11, m21);
+	column2 = V3Merge(m02, m12, m22);
+}
+
+
+// The remaining functions are not used; they are kept below as commented-out code.
+
+/*
+PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr)
+{
+    return *addr;
+}
+*/
+
+/*
+PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr)
+{
+    return *addr;
+}
+*/
+
+/*
+PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V a)
+{
+    return Vec4V(PxCeil(a.x), PxCeil(a.y), PxCeil(a.z), PxCeil(a.w));
+}
+
+PX_FORCE_INLINE Vec4V V4Floor(const Vec4V a)
+{
+    return Vec4V(PxFloor(a.x), PxFloor(a.y), PxFloor(a.z), PxFloor(a.w));
+}
+*/
+
+/*
+PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V a, PxU32 power)
+{
+    PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate");
+    PX_UNUSED(power); // prevent warning in release builds
+    PxF32 ffffFFFFasFloat = PxF32(0xFFFF0000);
+    return VecU32V(
+        PxU32(PxClamp<PxF32>((a).x, 0.0f, ffffFFFFasFloat)),
+        PxU32(PxClamp<PxF32>((a).y, 0.0f, ffffFFFFasFloat)),
+        PxU32(PxClamp<PxF32>((a).z, 0.0f, ffffFFFFasFloat)),
+        PxU32(PxClamp<PxF32>((a).w, 0.0f, ffffFFFFasFloat)));
+}
+*/
+
+#endif // PSFOUNDATION_PSVECMATHAOSSCALARINLINE_H
diff --git a/PxShared/src/foundation/include/PsVecMathSSE.h b/PxShared/src/foundation/include/PsVecMathSSE.h
new file mode 100644
index 0000000..559fa68
--- /dev/null
+++ b/PxShared/src/foundation/include/PsVecMathSSE.h
@@ -0,0 +1,56 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECMATHSSE_H
+#define PSFOUNDATION_PSVECMATHSSE_H
+
+// File-local constants. The unnamed namespace gives every translation unit that includes
+// this header its own copy.
+namespace
+{
+	// The vector (0, 0, 0, -1), declared through PX_ALIGN(16, ...) and read with _mm_load_ps in
+	// QuatGetMat33V, where it is added to turn 2ww into 2ww - 1.
+	const PX_ALIGN(16, PxF32) minus1w[4] = { 0.0f, 0.0f, 0.0f, -1.0f };
+}
+
+// SSE version: expands quaternion q = (x, y, z, w) into the three columns of a 3x3 matrix.
+//
+// Each column is computed as v * (2 * q[i]) + a_i, where v is q with w cleared and a_i is a
+// permutation of (2ww-1, +/-2w*x, +/-2w*y, +/-2w*z) that supplies the w-dependent terms.
+// The diagonal is therefore 2ww - 1 + 2q[i]^2, which equals the usual 1 - 2(...) form only
+// when q has unit length. q is not normalized here.
+PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2)
+{
+    // q2 = (2x, 2y, 2z, 2w)
+    const __m128 q2 = V4Add(q, q);
+    const __m128 qw2 = V4MulAdd(q2, V4GetW(q), _mm_load_ps(minus1w));			// (2wx, 2wy, 2wz, 2ww-1)
+    const __m128 nw2 = Vec3V_From_Vec4V(V4Neg(qw2));							// (-2wx, -2wy, -2wz, 0)
+    // v = (x, y, z, 0), going by the lane comment on nw2 above.
+    const __m128 v = Vec3V_From_Vec4V(q);
+
+    // _mm_shuffle_ps(A, B, _MM_SHUFFLE(d, c, b, a)) yields (A[a], A[b], B[c], B[d]).
+    const __m128 a0 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 1, 2, 3));		// (2ww-1, 2wz, -2wy, 0)
+    // column0 = (2xx + 2ww-1, 2xy + 2wz, 2xz - 2wy, 0)
+    column0 = V4MulAdd(v, V4GetX(q2), a0);
+
+    const __m128 a1 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 2, 0, 3));		// (2ww-1, 2wx, -2wz, 0)
+    // a1 is rotated to (-2wz, 2ww-1, 2wx, 0) so that 2ww-1 lands on the diagonal (y) lane.
+    column1 = V4MulAdd(v, V4GetY(q2), _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 1, 0, 2)));
+
+    const __m128 a2 = _mm_shuffle_ps(qw2, nw2, _MM_SHUFFLE(3, 0, 1, 3));		// (2ww-1, 2wy, -2wx, 0)
+    // a2 is rotated to (2wy, -2wx, 2ww-1, 0) so that 2ww-1 lands on the diagonal (z) lane.
+    column2 = V4MulAdd(v, V4GetZ(q2), _mm_shuffle_ps(a2, a2, _MM_SHUFFLE(3, 0, 2, 1)));
+}
+
+#endif // PSFOUNDATION_PSVECMATHSSE_H
+
diff --git a/PxShared/src/foundation/include/PsVecMathUtilities.h b/PxShared/src/foundation/include/PsVecMathUtilities.h
new file mode 100644
index 0000000..21bc50a
--- /dev/null
+++ b/PxShared/src/foundation/include/PsVecMathUtilities.h
@@ -0,0 +1,57 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECMATHUTILITIES_H
+#define PSFOUNDATION_PSVECMATHUTILITIES_H
+
+#include "PsVecMath.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace aos
+{
+/*!
+    Lengthens the edge (p0, p1) in place along its own direction: p0 is moved backwards and
+    p1 forwards by the distance fatCoeff each, so the edge grows by 2 * fatCoeff in total.
+    fatCoeff is an absolute length, not a multiplicative factor.
+
+    A zero-length edge (p0 == p1) has no direction and is left unchanged.
+    */
+PX_FORCE_INLINE void makeFatEdge(Vec3V& p0, Vec3V& p1, const FloatVArg fatCoeff)
+{
+	const Vec3V delta = V3Sub(p1, p0);
+	const FloatV m = V3Length(delta);
+	// con is true only for a non-degenerate edge (length > 0).
+	const BoolV con = FIsGrtr(m, FZero());
+	// Unit direction scaled by fatCoeff. This is evaluated even when m == 0 (division by
+	// zero); the selects below discard the result in that case.
+	const Vec3V fatDelta = V3Scale(V3ScaleInv(delta, m), fatCoeff);
+	p0 = V3Sel(con, V3Sub(p0, fatDelta), p0);
+	p1 = V3Sel(con, V3Add(p1, fatDelta), p1);
+}
+}
+}
+}
+
+#endif
diff --git a/PxShared/src/foundation/include/PsVecQuat.h b/PxShared/src/foundation/include/PsVecQuat.h
new file mode 100644
index 0000000..c4726fc
--- /dev/null
+++ b/PxShared/src/foundation/include/PsVecQuat.h
@@ -0,0 +1,455 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECQUAT_H
+#define PSFOUNDATION_PSVECQUAT_H
+
+//#include "PsInlineAoS.h"
+
+#ifndef PX_PIDIV2
+#define PX_PIDIV2 1.570796327f
+#endif
+
+//////////////////////////////////
+// QuatV
+//////////////////////////////////
+// Builds a quaternion from its four components; forwards to V4LoadXYZW(x, y, z, w).
+PX_FORCE_INLINE QuatV QuatVLoadXYZW(const PxF32 x, const PxF32 y, const PxF32 z, const PxF32 w)
+{
+	return V4LoadXYZW(x, y, z, w);
+}
+
+// Loads a quaternion from the float array v; forwards to V4LoadU.
+// NOTE(review): presumably v points at 4 floats with no alignment requirement - confirm against V4LoadU.
+PX_FORCE_INLINE QuatV QuatVLoadU(const PxF32* v)
+{
+	return V4LoadU(v);
+}
+
+// Loads a quaternion from the float array v; forwards to V4LoadA.
+// NOTE(review): presumably v must point at 4 suitably aligned floats - confirm against V4LoadA.
+PX_FORCE_INLINE QuatV QuatVLoadA(const PxF32* v)
+{
+	return V4LoadA(v);
+}
+
+// Builds the quaternion (u * sin(a/2), cos(a/2)) from axis u and angle a.
+// u is used as given and is not normalized here.
+// NOTE(review): a is presumably in radians (it is combined with PX_PIDIV2) - confirm.
+PX_FORCE_INLINE QuatV QuatV_From_RotationAxisAngle(const Vec3V u, const FloatV a)
+{
+	// q = cos(a/2) + u*sin(a/2)
+	const FloatV half = FLoad(0.5f);
+	const FloatV hangle = FMul(a, half);
+	const FloatV piByTwo(FLoad(PX_PIDIV2));
+	const FloatV PiByTwoMinHangle(FSub(piByTwo, hangle));
+	// Pack (a/2, pi/2 - a/2, a/2) so that one vector sine yields both sin(a/2) (lane x) and
+	// cos(a/2) = sin(pi/2 - a/2) (lane y).
+	const Vec4V hangle2(Vec4V_From_Vec3V(V3Merge(hangle, PiByTwoMinHangle, hangle)));
+
+	/*const FloatV sina = FSin(hangle);
+	const FloatV cosa = FCos(hangle);*/
+
+	const Vec4V _sina = V4Sin(hangle2);
+	const FloatV sina = V4GetX(_sina);
+	const FloatV cosa = V4GetY(_sina);
+
+	// Imaginary part: axis scaled by sin(a/2). Real part: cos(a/2) written into w.
+	const Vec3V v = V3Scale(u, sina);
+	// return V4Sel(BTTTF(), Vec4V_From_Vec3V(v), V4Splat(cosa));
+	return V4SetW(Vec4V_From_Vec3V(v), cosa);
+}
+
+// Returns q normalized as a 4-component vector; forwards to V4Normalize.
+PX_FORCE_INLINE QuatV QuatNormalize(const QuatV q)
+{
+	return V4Normalize(q);
+}
+
+// Returns the length of q taken as a 4-component vector; forwards to V4Length.
+PX_FORCE_INLINE FloatV QuatLength(const QuatV q)
+{
+	return V4Length(q);
+}
+
+// Returns the squared length of q taken as a 4-component vector; forwards to V4LengthSq.
+PX_FORCE_INLINE FloatV QuatLengthSq(const QuatV q)
+{
+	return V4LengthSq(q);
+}
+
+// Returns the 4-component dot product of quaternions a and b; forwards to V4Dot.
+PX_FORCE_INLINE FloatV QuatDot(const QuatV a, const QuatV b)
+{
+	return V4Dot(a, b);
+}
+
+/** Returns the conjugate of q: all components are negated, then the original w is written back. */
+PX_FORCE_INLINE QuatV QuatConjugate(const QuatV q)
+{
+	const FloatV realPart = V4GetW(q);
+	const Vec4V negated = V4Neg(q);
+	return V4SetW(negated, realPart);
+}
+
+// Returns the imaginary (x, y, z) part of q as a Vec3V; forwards to Vec3V_From_Vec4V.
+PX_FORCE_INLINE Vec3V QuatGetImaginaryPart(const QuatV q)
+{
+	return Vec3V_From_Vec4V(q);
+}
+
+/**
+\brief Computes the rotation of the x-axis by q.
+
+The result is (2ww - 1 + 2xx, 2zw + 2xy, -2yw + 2xz). The diagonal term uses the 2ww - 1 form,
+so this matches the first column of the rotation matrix only for a unit-length q.
+*/
+PX_FORCE_INLINE Vec3V QuatGetBasisVector0(const QuatV q)
+{
+	/*const PxF32 x2 = x*2.0f;
+	const PxF32 w2 = w*2.0f;
+	return PxVec3(	(w * w2) - 1.0f + x*x2,
+	                (z * w2)        + y*x2,
+	                (-y * w2)       + z*x2);*/
+
+	const FloatV two = FLoad(2.f);
+	const FloatV w = V4GetW(q);
+	const Vec3V u = Vec3V_From_Vec4V(q);
+
+	const FloatV x2 = FMul(V3GetX(u), two);
+	const FloatV w2 = FMul(w, two);
+
+	// a = (x, y, z) * 2x
+	const Vec3V a = V3Scale(u, x2);
+	// tmp = (w, z, -y); scaled by 2w it supplies the w-dependent terms.
+	const Vec3V tmp = V3Merge(w, V3GetZ(u), FNeg(V3GetY(u)));
+	// const Vec3V b = V3Scale(tmp, w2);
+	// const Vec3V ab = V3Add(a, b);
+	const Vec3V ab = V3ScaleAdd(tmp, w2, a);
+	// Apply the remaining "- 1" to the x component only.
+	return V3SetX(ab, FSub(V3GetX(ab), FOne()));
+}
+
+/**
+\brief Computes the rotation of the y-axis by q.
+
+The result is (-2zw + 2xy, 2ww - 1 + 2yy, 2xw + 2yz). The diagonal term uses the 2ww - 1 form,
+so this matches the second column of the rotation matrix only for a unit-length q.
+*/
+PX_FORCE_INLINE Vec3V QuatGetBasisVector1(const QuatV q)
+{
+	/*const PxF32 y2 = y*2.0f;
+	const PxF32 w2 = w*2.0f;
+	return PxVec3(	(-z * w2)       + x*y2,
+	                (w * w2) - 1.0f + y*y2,
+	                (x * w2)        + z*y2);*/
+
+	const FloatV two = FLoad(2.f);
+	const FloatV w = V4GetW(q);
+	const Vec3V u = Vec3V_From_Vec4V(q);
+
+	const FloatV y2 = FMul(V3GetY(u), two);
+	const FloatV w2 = FMul(w, two);
+
+	// a = (x, y, z) * 2y
+	const Vec3V a = V3Scale(u, y2);
+	// tmp = (-z, w, x); scaled by 2w it supplies the w-dependent terms.
+	const Vec3V tmp = V3Merge(FNeg(V3GetZ(u)), w, V3GetX(u));
+	// const Vec3V b = V3Scale(tmp, w2);
+	// const Vec3V ab = V3Add(a, b);
+	const Vec3V ab = V3ScaleAdd(tmp, w2, a);
+	// Apply the remaining "- 1" to the y component only.
+	return V3SetY(ab, FSub(V3GetY(ab), FOne()));
+}
+
+/**
+\brief Computes the rotation of the z-axis by q.
+
+The result is (2yw + 2xz, -2xw + 2yz, 2ww - 1 + 2zz). The diagonal term uses the 2ww - 1 form,
+so this matches the third column of the rotation matrix only for a unit-length q.
+*/
+PX_FORCE_INLINE Vec3V QuatGetBasisVector2(const QuatV q)
+{
+	/*const PxF32 z2 = z*2.0f;
+	const PxF32 w2 = w*2.0f;
+	return PxVec3(	(y * w2)        + x*z2,
+	                (-x * w2)       + y*z2,
+	                (w * w2) - 1.0f + z*z2);*/
+
+	const FloatV two = FLoad(2.f);
+	const FloatV w = V4GetW(q);
+	const Vec3V u = Vec3V_From_Vec4V(q);
+
+	const FloatV z2 = FMul(V3GetZ(u), two);
+	const FloatV w2 = FMul(w, two);
+
+	// a = (x, y, z) * 2z
+	const Vec3V a = V3Scale(u, z2);
+	// tmp = (y, -x, w); scaled by 2w it supplies the w-dependent terms.
+	const Vec3V tmp = V3Merge(V3GetY(u), FNeg(V3GetX(u)), w);
+	/*const Vec3V b = V3Scale(tmp, w2);
+	const Vec3V ab = V3Add(a, b);*/
+	const Vec3V ab = V3ScaleAdd(tmp, w2, a);
+	// Apply the remaining "- 1" to the z component only.
+	return V3SetZ(ab, FSub(V3GetZ(ab), FOne()));
+}
+
+// Rotates vector v by quaternion q, evaluating
+//     2 * ( v*(w*w - 0.5) + (qv x v)*w + qv*(qv . v) )
+// where qv is the imaginary part of q. q is not normalized here.
+PX_FORCE_INLINE Vec3V QuatRotate(const QuatV q, const Vec3V v)
+{
+	/*
+	const PxVec3 qv(x,y,z);
+	return (v*(w*w-0.5f) + (qv.cross(v))*w + qv*(qv.dot(v)))*2;
+	*/
+
+	const FloatV two = FLoad(2.f);
+	// const FloatV half = FloatV_From_F32(0.5f);
+	const FloatV nhalf = FLoad(-0.5f);
+	const Vec3V u = Vec3V_From_Vec4V(q);
+	const FloatV w = V4GetW(q);
+	// const FloatV w2 = FSub(FMul(w, w), half);
+	// w2 = w*w - 0.5 (fused form of the commented line above)
+	const FloatV w2 = FScaleAdd(w, w, nhalf);
+	const Vec3V a = V3Scale(v, w2);
+	// const Vec3V b = V3Scale(V3Cross(u, v), w);
+	// const Vec3V c = V3Scale(u, V3Dot(u, v));
+	// return V3Scale(V3Add(V3Add(a, b), c), two);
+	// temp = (u x v)*w + a
+	const Vec3V temp = V3ScaleAdd(V3Cross(u, v), w, a);
+	return V3Scale(V3ScaleAdd(u, V3Dot(u, v), temp), two);
+}
+
+// Returns p + q.rotate(v): v rotated by quaternion q (same formula as QuatRotate), then
+// translated by p. q is not normalized here.
+PX_FORCE_INLINE Vec3V QuatTransform(const QuatV q, const Vec3V p, const Vec3V v)
+{
+	// p + q.rotate(v)
+	const FloatV two = FLoad(2.f);
+	// const FloatV half = FloatV_From_F32(0.5f);
+	const FloatV nhalf = FLoad(-0.5f);
+	const Vec3V u = Vec3V_From_Vec4V(q);
+	const FloatV w = V4GetW(q);
+	// const FloatV w2 = FSub(FMul(w, w), half);
+	// w2 = w*w - 0.5 (fused form of the commented line above)
+	const FloatV w2 = FScaleAdd(w, w, nhalf);
+	const Vec3V a = V3Scale(v, w2);
+	/*const Vec3V b = V3Scale(V3Cross(u, v), w);
+	const Vec3V c = V3Scale(u, V3Dot(u, v));
+	return V3ScaleAdd(V3Add(V3Add(a, b), c), two, p);*/
+	// temp = (u x v)*w + a;  z = u*(u . v) + temp;  result = z*2 + p
+	const Vec3V temp = V3ScaleAdd(V3Cross(u, v), w, a);
+	const Vec3V z = V3ScaleAdd(u, V3Dot(u, v), temp);
+	return V3ScaleAdd(z, two, p);
+}
+
+// Rotates vector v by the inverse rotation of q, evaluating
+//     2 * ( v*(w*w - 0.5) - (qv x v)*w + qv*(qv . v) )
+// where qv is the imaginary part of q. It differs from QuatRotate only in the sign of the
+// cross-product term. q is not normalized here.
+PX_FORCE_INLINE Vec3V QuatRotateInv(const QuatV q, const Vec3V v)
+{
+
+	//	const PxVec3 qv(x,y,z);
+	//	return (v*(w*w-0.5f) - (qv.cross(v))*w + qv*(qv.dot(v)))*2;
+
+	const FloatV two = FLoad(2.f);
+	const FloatV nhalf = FLoad(-0.5f);
+	const Vec3V u = Vec3V_From_Vec4V(q);
+	const FloatV w = V4GetW(q);
+	// w2 = w*w - 0.5
+	const FloatV w2 = FScaleAdd(w, w, nhalf);
+	const Vec3V a = V3Scale(v, w2);
+	/*const Vec3V b = V3Scale(V3Cross(u, v), w);
+	const Vec3V c = V3Scale(u, V3Dot(u, v));
+	return V3Scale(V3Add(V3Sub(a, b), c), two);*/
+	// temp = a - (u x v)*w, per the commented reference above
+	const Vec3V temp = V3NegScaleSub(V3Cross(u, v), w, a);
+	return V3Scale(V3ScaleAdd(u, V3Dot(u, v), temp), two);
+}
+
+/**
+Quaternion product a * b.
+
+With a = (va, wa) and b = (vb, wb):
+    real = wa*wb - va . vb
+    imag = va*wb + vb*wa + va x vb
+*/
+PX_FORCE_INLINE QuatV QuatMul(const QuatV a, const QuatV b)
+{
+	const FloatV wa = V4GetW(a);
+	const FloatV wb = V4GetW(b);
+	const Vec3V va = Vec3V_From_Vec4V(a);
+	const Vec3V vb = Vec3V_From_Vec4V(b);
+
+	const FloatV scalarPart = FSub(FMul(wa, wb), V3Dot(va, vb));
+
+	const Vec3V scaledSum = V3Add(V3Scale(va, wb), V3Scale(vb, wa));
+	const Vec3V vectorPart = V3Add(scaledSum, V3Cross(va, vb));
+
+	return V4SetW(Vec4V_From_Vec3V(vectorPart), scalarPart);
+}
+
+// Component-wise sum of two quaternions; forwards to V4Add.
+PX_FORCE_INLINE QuatV QuatAdd(const QuatV a, const QuatV b)
+{
+	return V4Add(a, b);
+}
+
+// Negates all four components of q; forwards to V4Neg.
+PX_FORCE_INLINE QuatV QuatNeg(const QuatV q)
+{
+	return V4Neg(q);
+}
+
+// Component-wise difference a - b of two quaternions; forwards to V4Sub.
+PX_FORCE_INLINE QuatV QuatSub(const QuatV a, const QuatV b)
+{
+	return V4Sub(a, b);
+}
+
+// Multiplies all four components of a by the scalar b; forwards to V4Scale.
+PX_FORCE_INLINE QuatV QuatScale(const QuatV a, const FloatV b)
+{
+	return V4Scale(a, b);
+}
+
+// Builds a quaternion from an array of FloatV values; forwards to V4Merge.
+// NOTE(review): presumably floatVArray holds 4 elements in x, y, z, w order - confirm against V4Merge.
+PX_FORCE_INLINE QuatV QuatMerge(const FloatV* const floatVArray)
+{
+	return V4Merge(floatVArray);
+}
+
+// Builds a quaternion from four FloatV components; forwards to V4Merge(x, y, z, w).
+PX_FORCE_INLINE QuatV QuatMerge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+	return V4Merge(x, y, z, w);
+}
+
+/** Returns the identity quaternion: the zero vector with w set to one. */
+PX_FORCE_INLINE QuatV QuatIdentity()
+{
+	const FloatV realPart = FOne();
+	const Vec4V zeroVec = V4Zero();
+	return V4SetW(zeroVec, realPart);
+}
+
+// Returns the result of isFiniteVec4V(q), i.e. the finiteness check applied to all four components.
+PX_FORCE_INLINE bool isFiniteQuatV(const QuatV q)
+{
+	return isFiniteVec4V(q);
+}
+
+/**
+Returns true when q passes isFiniteVec4V and its length differs from 1 by less than 1e-4,
+i.e. q is a unit quaternion within a tight tolerance. isSaneQuatV applies the same test with
+a looser tolerance.
+*/
+PX_FORCE_INLINE bool isValidQuatV(const QuatV q)
+{
+	const FloatV unitTolerance = FLoad(1e-4f);
+	// tmp = | |q| - 1 |
+	const FloatV tmp = FAbs(FSub(QuatLength(q), FOne()));
+	const BoolV con = FIsGrtr(unitTolerance, tmp);
+	// Logical && rather than bitwise &: both operands are truth values and the result is a bool.
+	return isFiniteVec4V(q) && (BAllEqTTTT(con) == 1);
+}
+
+/**
+Returns true when q passes isFiniteVec4V and its length differs from 1 by less than 1e-2.
+This is the same test as isValidQuatV with a looser tolerance.
+*/
+PX_FORCE_INLINE bool isSaneQuatV(const QuatV q)
+{
+	const FloatV unitTolerance = FLoad(1e-2f);
+	// tmp = | |q| - 1 |
+	const FloatV tmp = FAbs(FSub(QuatLength(q), FOne()));
+	const BoolV con = FIsGrtr(unitTolerance, tmp);
+	// Logical && rather than bitwise &: both operands are truth values and the result is a bool.
+	return isFiniteVec4V(q) && (BAllEqTTTT(con) == 1);
+}
+
+// Converts quaternion q = (x, y, z, w) into a 3x3 matrix with columns
+//     column0 = (1 - 2yy - 2zz, 2xy + 2zw,     2xz - 2yw)
+//     column1 = (2xy - 2zw,     1 - 2xx - 2zz, 2yz + 2xw)
+//     column2 = (2xz + 2yw,     2yz - 2xw,     1 - 2xx - 2yy)
+// q is not normalized here.
+PX_FORCE_INLINE Mat33V QuatGetMat33V(const QuatVArg q)
+{
+	// Earlier Vec4V-multiply formulation, kept for reference:
+	// const FloatV two = FloatV_From_F32(2.f);
+	// const FloatV one = FOne();
+
+	// const FloatV x = V4GetX(q);
+	// const FloatV y = V4GetY(q);
+	// const FloatV z = V4GetZ(q);
+	// const Vec4V _q = V4Mul(q, two);
+	//
+	////const FloatV w = V4GetW(q);
+
+	// const Vec4V t0 = V4Mul(_q, x); // 2xx, 2xy, 2xz, 2xw
+	// const Vec4V t1 = V4Mul(_q, y); // 2xy, 2yy, 2yz, 2yw
+	// const Vec4V t2 = V4Mul(_q, z); // 2xz, 2yz, 2zz, 2zw
+	////const Vec4V t3 = V4Mul(_q, w); // 2xw, 2yw, 2zw, 2ww
+
+	// const FloatV xx2 = V4GetX(t0);
+	// const FloatV xy2 = V4GetY(t0);
+	// const FloatV xz2 = V4GetZ(t0);
+	// const FloatV xw2 = V4GetW(t0);
+
+	// const FloatV yy2 = V4GetY(t1);
+	// const FloatV yz2 = V4GetZ(t1);
+	// const FloatV yw2 = V4GetW(t1);
+
+	// const FloatV zz2 = V4GetZ(t2);
+	// const FloatV zw2 = V4GetW(t2);
+
+	////const FloatV ww2 = V4GetW(t3);
+
+	// const FloatV c00 = FSub(one, FAdd(yy2, zz2));
+	// const FloatV c01 = FSub(xy2, zw2);
+	// const FloatV c02 = FAdd(xz2, yw2);
+
+	// const FloatV c10 = FAdd(xy2, zw2);
+	// const FloatV c11 = FSub(one, FAdd(xx2, zz2));
+	// const FloatV c12 = FSub(yz2, xw2);
+
+	// const FloatV c20 = FSub(xz2, yw2);
+	// const FloatV c21 = FAdd(yz2, xw2);
+	// const FloatV c22 = FSub(one, FAdd(xx2, yy2));
+
+	// const Vec3V c0 = V3Merge(c00, c10, c20);
+	// const Vec3V c1 = V3Merge(c01, c11, c21);
+	// const Vec3V c2 = V3Merge(c02, c12, c22);
+
+	// return Mat33V(c0, c1, c2);
+
+	const FloatV one = FOne();
+	const FloatV x = V4GetX(q);
+	const FloatV y = V4GetY(q);
+	const FloatV z = V4GetZ(q);
+	const FloatV w = V4GetW(q);
+
+	// Doubled components: every product below already carries its factor of 2.
+	const FloatV x2 = FAdd(x, x);
+	const FloatV y2 = FAdd(y, y);
+	const FloatV z2 = FAdd(z, z);
+
+	// xx = 2xx, yy = 2yy, zz = 2zz
+	const FloatV xx = FMul(x2, x);
+	const FloatV yy = FMul(y2, y);
+	const FloatV zz = FMul(z2, z);
+
+	// xy = 2xy, xz = 2xz, xw = 2xw
+	const FloatV xy = FMul(x2, y);
+	const FloatV xz = FMul(x2, z);
+	const FloatV xw = FMul(x2, w);
+
+	// yz = 2yz, yw = 2yw, zw = 2zw
+	const FloatV yz = FMul(y2, z);
+	const FloatV yw = FMul(y2, w);
+	const FloatV zw = FMul(z2, w);
+
+	// v = 1 - 2xx, shared by the diagonal entries of column1 and column2.
+	const FloatV v = FSub(one, xx);
+
+	const Vec3V column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw));
+	const Vec3V column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw));
+	const Vec3V column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy));
+	return Mat33V(column0, column1, column2);
+}
+
+// Converts a 3x3 matrix into a quaternion.
+// NOTE(review): a is presumably expected to be a pure rotation (orthonormal) matrix; nothing
+// here checks that - confirm against callers.
+//
+// Two cases, chosen by the sign of the trace:
+//  - trace >= 0: w is derived from sqrt(trace + 1) and (x, y, z) from the off-diagonal
+//    differences.
+//  - trace < 0: the largest diagonal element decides which of x, y, z is derived from the
+//    square root; the remaining components come from the off-diagonal sums and differences.
+PX_FORCE_INLINE QuatV Mat33GetQuatV(const Mat33V& a)
+{
+	const FloatV one = FOne();
+	const FloatV zero = FZero();
+	const FloatV half = FLoad(0.5f);
+	const FloatV two = FLoad(2.f);
+	const FloatV scale = FLoad(0.25f);
+	const FloatV a00 = V3GetX(a.col0);
+	const FloatV a11 = V3GetY(a.col1);
+	const FloatV a22 = V3GetZ(a.col2);
+
+	const FloatV a21 = V3GetZ(a.col1); // row=2, col=1;
+	const FloatV a12 = V3GetY(a.col2); // row=1, col=2;
+	const FloatV a02 = V3GetX(a.col2); // row=0, col=2;
+	const FloatV a20 = V3GetZ(a.col0); // row=2, col=0;
+	const FloatV a10 = V3GetY(a.col0); // row=1, col=0;
+	const FloatV a01 = V3GetX(a.col1); // row=0, col=1;
+
+	// v = off-diagonal differences (a21 - a12, a02 - a20, a10 - a01)
+	// g = off-diagonal sums        (a21 + a12, a02 + a20, a10 + a01)
+	const Vec3V vec0 = V3Merge(a21, a02, a10);
+	const Vec3V vec1 = V3Merge(a12, a20, a01);
+	const Vec3V v = V3Sub(vec0, vec1);
+	const Vec3V g = V3Add(vec0, vec1);
+
+	const FloatV trace = FAdd(a00, FAdd(a11, a22));
+
+	if(FAllGrtrOrEq(trace, zero))
+	{
+		// h = sqrt(trace + 1);  w = h / 2;  (x, y, z) = v / (2h)
+		const FloatV h = FSqrt(FAdd(trace, one));
+		const FloatV w = FMul(half, h);
+		const FloatV s = FMul(half, FRecip(h));
+		const Vec3V u = V3Scale(v, s);
+		return V4SetW(Vec4V_From_Vec3V(u), w);
+	}
+	else
+	{
+		const FloatV ntrace = FNeg(trace);
+		const Vec3V d = V3Merge(a00, a11, a22);
+		// con0: a00 is >= every diagonal element; con1: a11 is >= every diagonal element.
+		// When neither holds, a22 is the largest.
+		const BoolV con0 = BAllTrue3(V3IsGrtrOrEq(V3Splat(a00), d));
+		const BoolV con1 = BAllTrue3(V3IsGrtrOrEq(V3Splat(a11), d));
+
+		// t_i = 1 + 2*a_ii - trace, one candidate per diagonal element.
+		const FloatV t0 = FAdd(one, FScaleAdd(a00, two, ntrace));
+		const FloatV t1 = FAdd(one, FScaleAdd(a11, two, ntrace));
+		const FloatV t2 = FAdd(one, FScaleAdd(a22, two, ntrace));
+
+		const FloatV t = FSel(con0, t0, FSel(con1, t1, t2));
+
+		// h = 2*sqrt(t);  s = 1/h;  g0 = h/4 = sqrt(t)/2 is the dominant component.
+		const FloatV h = FMul(two, FSqrt(t));
+		const FloatV s = FRecip(h);
+		const FloatV g0 = FMul(scale, h);
+		const Vec3V vs = V3Scale(v, s);
+		const Vec3V gs = V3Scale(g, s);
+		const FloatV gsx = V3GetX(gs);
+		const FloatV gsy = V3GetY(gs);
+		const FloatV gsz = V3GetZ(gs);
+		// vs.x= (a21 - a12)*s; vs.y=(a02 - a20)*s; vs.z=(a10 - a01)*s;
+		// gs.x= (a21 + a12)*s; gs.y=(a02 + a20)*s; gs.z=(a10 + a01)*s;
+		// One candidate quaternion (x, y, z, w) per dominant axis; g0 sits in the dominant slot.
+		const Vec4V v0 = V4Merge(g0, gsz, gsy, V3GetX(vs));
+		const Vec4V v1 = V4Merge(gsz, g0, gsx, V3GetY(vs));
+		const Vec4V v2 = V4Merge(gsy, gsx, g0, V3GetZ(vs));
+		return V4Sel(con0, v0, V4Sel(con1, v1, v2));
+	}
+}
+
+#endif
diff --git a/PxShared/src/foundation/include/PsVecTransform.h b/PxShared/src/foundation/include/PsVecTransform.h
new file mode 100644
index 0000000..5c16339
--- /dev/null
+++ b/PxShared/src/foundation/include/PsVecTransform.h
@@ -0,0 +1,283 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSVECTRANSFORM_H
+#define PSFOUNDATION_PSVECTRANSFORM_H
+
+#include "PsVecMath.h"
+#include "foundation/PxTransform.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace aos
+{
+
+// SIMD rigid transform: a rotation quaternion q plus a translation p.
+// The commented-out one-liners inside the methods show the equivalent scalar PxTransform expression.
+class PsTransformV
+{
+  public:
+	QuatV q; // rotation
+	Vec3V p; // translation
+
+	// Converts a scalar PxTransform. Note this constructor is not explicit, so PxTransform
+	// converts implicitly. Unlike the (p, q) constructor below it does not assert on q.
+	PX_FORCE_INLINE PsTransformV(const PxTransform& orientation)
+	{
+		// const PxQuat oq = orientation.q;
+		// const PxF32 f[4] = {oq.x, oq.y, oq.z, oq.w};
+		q = QuatVLoadXYZW(orientation.q.x, orientation.q.y, orientation.q.z, orientation.q.w);
+		// q = QuatV_From_F32Array(&oq.x);
+		p = V3LoadU(orientation.p);
+	}
+
+	// Builds from translation p0 and rotation q0 (argument order is p, q; member order is q, p).
+	// With no arguments this yields zero translation and identity rotation.
+	PX_FORCE_INLINE PsTransformV(const Vec3VArg p0 = V3Zero(), const QuatVArg q0 = QuatIdentity()) : q(q0), p(p0)
+	{
+		PX_ASSERT(isSaneQuatV(q0));
+	}
+
+	// Composition: equivalent to transform(x), i.e. first x, then *this.
+	PX_FORCE_INLINE PsTransformV operator*(const PsTransformV& x) const
+	{
+		PX_ASSERT(x.isSane());
+		return transform(x);
+	}
+
+	// Returns the inverse transform: translation rotateInv(-p), rotation conjugate(q).
+	PX_FORCE_INLINE PsTransformV getInverse() const
+	{
+		PX_ASSERT(isFinite());
+		// return PxTransform(q.rotateInv(-p),q.getConjugate());
+		return PsTransformV(QuatRotateInv(q, V3Neg(p)), QuatConjugate(q));
+	}
+
+	// NOTE(review): despite its name this does not normalize q; it resets the transform
+	// to zero translation and identity rotation. Callers may rely on that, so it is left as is.
+	PX_FORCE_INLINE void normalize()
+	{
+		p = V3Zero();
+		q = QuatIdentity();
+	}
+
+	// Marks the transform as invalid by setting every component of p to FMax();
+	// q is reset to identity.
+	PX_FORCE_INLINE void Invalidate()
+	{
+		p = V3Splat(FMax());
+		q = QuatIdentity();
+	}
+
+	// Transforms a point: rotate by q, then translate by p.
+	PX_FORCE_INLINE Vec3V transform(const Vec3VArg input) const
+	{
+		PX_ASSERT(isFinite());
+		// return q.rotate(input) + p;
+		return QuatTransform(q, p, input);
+	}
+
+	// Inverse-transforms a point: subtract p, then rotate by the inverse of q.
+	PX_FORCE_INLINE Vec3V transformInv(const Vec3VArg input) const
+	{
+		PX_ASSERT(isFinite());
+		// return q.rotateInv(input-p);
+		return QuatRotateInv(q, V3Sub(input, p));
+	}
+
+	// Rotates a vector by q (translation is ignored).
+	PX_FORCE_INLINE Vec3V rotate(const Vec3VArg input) const
+	{
+		PX_ASSERT(isFinite());
+		// return q.rotate(input);
+		return QuatRotate(q, input);
+	}
+
+	// Rotates a vector by the inverse of q (translation is ignored).
+	PX_FORCE_INLINE Vec3V rotateInv(const Vec3VArg input) const
+	{
+		PX_ASSERT(isFinite());
+		// return q.rotateInv(input);
+		return QuatRotateInv(q, input);
+	}
+
+	//! Transform transform to parent (returns compound transform: first src, then *this)
+	PX_FORCE_INLINE PsTransformV transform(const PsTransformV& src) const
+	{
+		PX_ASSERT(src.isSane());
+		PX_ASSERT(isSane());
+		// src = [srct, srcr] -> [r*srct + t, r*srcr]
+		// return PxTransform(q.rotate(src.p) + p, q*src.q);
+		return PsTransformV(V3Add(QuatRotate(q, src.p), p), QuatMul(q, src.q));
+	}
+
+	/**
+	\brief returns true if finite and q is a unit quaternion
+	*/
+
+	PX_FORCE_INLINE bool isValid() const
+	{
+		// return p.isFinite() && q.isFinite() && q.isValid();
+		// Bitwise & is used instead of &&, so all three checks are always evaluated.
+		return isFiniteVec3V(p) & isFiniteQuatV(q) & isValidQuatV(q);
+	}
+
+	/**
+	\brief returns true if finite and quat magnitude is reasonably close to unit to allow for some accumulation of error
+	vs isValid
+	*/
+
+	PX_FORCE_INLINE bool isSane() const
+	{
+		// return isFinite() && q.isSane();
+		return isFinite() & isSaneQuatV(q);
+	}
+
+	/**
+	\brief returns true if all elems are finite (not NAN or INF, etc.)
+	*/
+	PX_FORCE_INLINE bool isFinite() const
+	{
+		// return p.isFinite() && q.isFinite();
+		return isFiniteVec3V(p) & isFiniteQuatV(q);
+	}
+
+	//! Transform transform from parent (returns compound transform: first src, then this->inverse)
+	PX_FORCE_INLINE PsTransformV transformInv(const PsTransformV& src) const
+	{
+		PX_ASSERT(src.isSane());
+		PX_ASSERT(isFinite());
+		// src = [srct, srcr] -> [r^-1*(srct-t), r^-1*srcr]
+		/*PxQuat qinv = q.getConjugate();
+		return PxTransform(qinv.rotate(src.p - p), qinv*src.q);*/
+		const QuatV qinv = QuatConjugate(q);
+		const Vec3V v = QuatRotate(qinv, V3Sub(src.p, p));
+		const QuatV rot = QuatMul(qinv, src.q);
+		return PsTransformV(v, rot);
+	}
+
+	// Returns the identity transform (zero translation, default identity rotation).
+	static PX_FORCE_INLINE PsTransformV createIdentity()
+	{
+		return PsTransformV(V3Zero());
+	}
+};
+
+// Builds a PsTransformV from a scalar PxTransform using the "A" load variants
+// (presumably 16-byte-aligned loads; confirm against QuatVLoadA / V3LoadA).
+PX_FORCE_INLINE PsTransformV loadTransformA(const PxTransform& transform)
+{
+	const QuatV rotation = QuatVLoadA(&transform.q.x);
+	const Vec3V translation = V3LoadA(&transform.p.x);
+	return PsTransformV(translation, rotation);
+}
+
+// Builds a PsTransformV from a scalar PxTransform using the "U" load variants
+// (presumably unaligned loads; confirm against QuatVLoadU / V3LoadU).
+PX_FORCE_INLINE PsTransformV loadTransformU(const PxTransform& transform)
+{
+	const QuatV rotation = QuatVLoadU(&transform.q.x);
+	const Vec3V translation = V3LoadU(&transform.p.x);
+	return PsTransformV(translation, rotation);
+}
+
+// SIMD rigid transform stored as a 3x3 rotation matrix (rot) plus a translation (p).
+class PsMatTransformV
+{
+  public:
+	Mat33V rot;
+	Vec3V p;
+
+	// Identity rotation, zero translation.
+	PX_FORCE_INLINE PsMatTransformV() : rot(M33Identity()), p(V3Zero())
+	{
+	}
+
+	// Builds from an explicit translation and rotation matrix.
+	PX_FORCE_INLINE PsMatTransformV(const Vec3VArg _p, const Mat33V& _rot) : rot(_rot), p(_p)
+	{
+	}
+
+	// Converts a quaternion-based transform by expanding its quaternion into matrix columns.
+	PX_FORCE_INLINE PsMatTransformV(const PsTransformV& other) : p(other.p)
+	{
+		QuatGetMat33V(other.q, rot.col0, rot.col1, rot.col2);
+	}
+
+	// Builds from a translation and a quaternion, expanding the quaternion into matrix columns.
+	PX_FORCE_INLINE PsMatTransformV(const Vec3VArg _p, const QuatV& quat) : p(_p)
+	{
+		QuatGetMat33V(quat, rot.col0, rot.col1, rot.col2);
+	}
+
+	// Column accessors for the rotation matrix.
+	PX_FORCE_INLINE Vec3V getCol0() const
+	{
+		return rot.col0;
+	}
+
+	PX_FORCE_INLINE Vec3V getCol1() const
+	{
+		return rot.col1;
+	}
+
+	PX_FORCE_INLINE Vec3V getCol2() const
+	{
+		return rot.col2;
+	}
+
+	// Column mutators for the rotation matrix.
+	PX_FORCE_INLINE void setCol0(const Vec3VArg col0)
+	{
+		rot.col0 = col0;
+	}
+
+	PX_FORCE_INLINE void setCol1(const Vec3VArg col1)
+	{
+		rot.col1 = col1;
+	}
+
+	PX_FORCE_INLINE void setCol2(const Vec3VArg col2)
+	{
+		rot.col2 = col2;
+	}
+
+	// Transforms a point: p + rot * input.
+	PX_FORCE_INLINE Vec3V transform(const Vec3VArg input) const
+	{
+		const Vec3V rotated = M33MulV3(rot, input);
+		return V3Add(p, rotated);
+	}
+
+	// Inverse-transforms a point: rot^T * (input - p).
+	PX_FORCE_INLINE Vec3V transformInv(const Vec3VArg input) const
+	{
+		const Vec3V relative = V3Sub(input, p);
+		return M33TrnspsMulV3(rot, relative);
+	}
+
+	// Rotates a vector: rot * input.
+	PX_FORCE_INLINE Vec3V rotate(const Vec3VArg input) const
+	{
+		return M33MulV3(rot, input);
+	}
+
+	// Rotates a vector by the transposed matrix: rot^T * input.
+	PX_FORCE_INLINE Vec3V rotateInv(const Vec3VArg input) const
+	{
+		return M33TrnspsMulV3(rot, input);
+	}
+
+	// Expresses src relative to *this: translation rot^T * (src.p - p), rotation rot^T * src.rot.
+	PX_FORCE_INLINE PsMatTransformV transformInv(const PsMatTransformV& src) const
+	{
+		const Vec3V localPos = M33TrnspsMulV3(rot, V3Sub(src.p, p));
+		const Mat33V localRot = M33MulM33(M33Trnsps(rot), src.rot);
+		return PsMatTransformV(localPos, localRot);
+	}
+};
+}
+}
+}
+
+#endif
diff --git a/PxShared/src/foundation/include/nx/PsNXAbort.h b/PxShared/src/foundation/include/nx/PsNXAbort.h
new file mode 100644
index 0000000..3b0413e
--- /dev/null
+++ b/PxShared/src/foundation/include/nx/PsNXAbort.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef PX_FOUNDATION_PX_NX_ABORT_H
+#define PX_FOUNDATION_PX_NX_ABORT_H
+
+#include "foundation/PxPreprocessor.h"
+#include "nn/nn_Assert.h"
+#include "nn/nn_Log.h"
+
+// Logs `message` and then trips NN_ASSERT whenever a message was supplied.
+// Declared inline because this definition lives in a header: without it, including the
+// header from more than one translation unit produces duplicate-symbol link errors.
+// NOTE(review): `message` is handed to NN_LOG as its first argument; if NN_LOG is
+// printf-style, a '%' in the message would be interpreted as a format specifier - confirm.
+// NOTE(review): a NULL message passes the assert, so the function then returns normally.
+inline void abort(const char* message)
+{
+	NN_LOG(message);
+	NN_ASSERT(message == NULL);
+}
+
+#endif // PX_FOUNDATION_PX_NX_ABORT_H
diff --git a/PxShared/src/foundation/include/nx/PsNXIntrinsics.h b/PxShared/src/foundation/include/nx/PsNXIntrinsics.h
new file mode 100644
index 0000000..789e39c
--- /dev/null
+++ b/PxShared/src/foundation/include/nx/PsNXIntrinsics.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PX_FOUNDATION_PS_NX_INTRINSICS_H
+#define PX_FOUNDATION_PS_NX_INTRINSICS_H
+
+#include "Ps.h"
+#include "foundation/PxAssert.h"
+
+// this file is for internal intrinsics - that is, intrinsics that are used in
+// cross platform code but do not appear in the API
+
+#if !PX_NX
+	#error "This file should only be included by NX builds!!"
+#endif
+
+#include <math.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+	/*
+	 * Implements a memory barrier by calling the compiler builtin __sync_synchronize().
+	 */
+	PX_FORCE_INLINE void memoryBarrier()
+	{
+		__sync_synchronize();
+	}
+
+	/*!
+	Returns the index of the highest set bit. Not valid for zero arg.
+	Table-driven de Bruijn lookup, see http://graphics.stanford.edu/~seander/bithacks.html
+	*/
+	PX_FORCE_INLINE PxU32 highestSetBitUnsafe(PxU32 v)
+	{
+		static const PxU32 deBruijnLog2[32] =
+		{
+			0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+			8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+		};
+
+		// Smear the top set bit into every lower position (v becomes 2^k - 1).
+		v |= v >> 1;
+		v |= v >> 2;
+		v |= v >> 4;
+		v |= v >> 8;
+		v |= v >> 16;
+
+		// The top five bits of the product select the table entry.
+		const PxU32 slot = PxU32(v * 0x07C4ACDDU) >> 27;
+		return deBruijnLog2[slot];
+	}
+
+	/*!
+	Returns the index of the lowest set bit. Undefined for zero arg.
+	*/
+	PX_FORCE_INLINE PxU32 lowestSetBitUnsafe(PxU32 v)
+	{
+		// http://graphics.stanford.edu/~seander/bithacks.html
+		static const PxU32 MultiplyDeBruijnBitPosition[32] =
+		{
+			0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+			31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+		};
+		// Isolate the lowest set bit using unsigned arithmetic only: negating a signed
+		// copy of v overflows when v == 0x80000000.
+		const PxU32 lowestBit = v & (0u - v);
+		return MultiplyDeBruijnBitPosition[PxU32(lowestBit * 0x077CB531U) >> 27];
+	}
+
+	/*!
+	Returns the number of leading zeros in v. Returns 32 for v=0.
+	*/
+	PX_FORCE_INLINE PxU32 countLeadingZeros(PxU32 v)
+	{
+		PxU32 result = 0;
+		// 1u keeps the shift unsigned; (1 << 31) shifts into the sign bit of a signed int.
+		PxU32 testBit = (1u << 31);
+		// Walk the probe bit down from the MSB until it hits a set bit or falls off the end.
+		while((v & testBit) == 0 && testBit != 0)
+		{
+			++result;
+			testBit >>= 1;
+		}
+		return result;
+	}
+
+	/*!
+	Prefetch aligned cache size around \c ptr+offset.
+	The trailing arguments (0, 3) are passed straight to __builtin_prefetch.
+	*/
+	PX_FORCE_INLINE void prefetchLine(const void* ptr, PxU32 offset = 0)
+	{
+		// Const-correct cast: the previous C-style cast silently dropped the const qualifier.
+		__builtin_prefetch(reinterpret_cast<const char* PX_RESTRICT>(ptr) + offset, 0, 3);
+	}
+
+	/*!
+	Prefetch \c count bytes starting at \c ptr, one 64-byte line at a time.
+	A \c count of zero prefetches nothing.
+	*/
+	PX_FORCE_INLINE void prefetch(const void* ptr, PxU32 count = 1)
+	{
+		// With count == 0 the last-byte address below becomes ptr-1; for a line-aligned ptr
+		// the line counter would then be 0 and the do/while would wrap around instead of stopping.
+		if(count == 0)
+			return;
+
+		const char* cp = static_cast<const char*>(ptr);
+		const PxU64 p = size_t(ptr);
+		// Index of the first and last 64-byte line touched by [ptr, ptr+count).
+		const PxU64 startLine = p >> 6, endLine = (p + count - 1) >> 6;
+		PxU64 lines = endLine - startLine + 1;
+		do
+		{
+			prefetchLine(cp);
+			cp += 64;
+		} while(--lines);
+	}
+
+	//! \brief platform-specific reciprocal (a plain scalar divide on this platform)
+	PX_CUDA_CALLABLE PX_FORCE_INLINE float recipFast(float a)
+	{
+		return 1.0f / a;
+	}
+
+	//! \brief platform-specific fast reciprocal square root (1 / sqrtf on this platform)
+	PX_CUDA_CALLABLE PX_FORCE_INLINE float recipSqrtFast(float a)
+	{
+		const float root = ::sqrtf(a);
+		return 1.0f / root;
+	}
+
+	//! \brief platform-specific floor (forwards to floorf)
+	PX_CUDA_CALLABLE PX_FORCE_INLINE float floatFloor(float x)
+	{
+		return ::floorf(x);
+	}
+
+	#define PX_PRINTF printf
+	#define PX_EXPECT_TRUE(x) x
+	#define PX_EXPECT_FALSE(x) x
+
+} // namespace shdfnd
+} // namespace physx
+
+#define PX_EXPECT_TRUE(x) x
+#define PX_EXPECT_FALSE(x) x
+
+#endif
diff --git a/PxShared/src/foundation/include/unix/PsUnixAoS.h b/PxShared/src/foundation/include/unix/PsUnixAoS.h
new file mode 100644
index 0000000..122879f
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/PsUnixAoS.h
@@ -0,0 +1,47 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXAOS_H
+#define PSFOUNDATION_PSUNIXAOS_H
+
+// no includes here! this file should be included from PxcVecMath.h only!!!
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+
+#if PX_INTEL_FAMILY
+#include "sse2/PsUnixSse2AoS.h"
+#elif PX_NEON
+#include "neon/PsUnixNeonAoS.h"
+#else
+#error No SIMD implementation for this unix platform.
+#endif
+
+#endif // PSFOUNDATION_PSUNIXAOS_H
diff --git a/PxShared/src/foundation/include/unix/PsUnixFPU.h b/PxShared/src/foundation/include/unix/PsUnixFPU.h
new file mode 100644
index 0000000..edd5522
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/PsUnixFPU.h
@@ -0,0 +1,69 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXFPU_H
+#define PSFOUNDATION_PSUNIXFPU_H
+
+#include "foundation/PxPreprocessor.h"
+
+#if PX_LINUX || PX_PS4 || PX_OSX
+
+#if PX_X86 || PX_X64
+#if PX_EMSCRIPTEN
+#include <emmintrin.h>
+#endif
+#include <xmmintrin.h>
+#elif PX_NEON
+#include <arm_neon.h>
+#endif
+
+
+// On x86/x64 (except Emscripten): saves the current MXCSR value in mControlWord and installs
+// a known control word for the lifetime of the guard. On every other target this is a no-op.
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{
+#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64)
+	mControlWord = _mm_getcsr();
+	// set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6))
+	_mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6));
+#endif
+}
+
+// On x86/x64 (except Emscripten): writes the control word saved by the constructor back to
+// MXCSR, with the _MM_EXCEPT_MASK bits cleared. On every other target this is a no-op.
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{
+#if !PX_EMSCRIPTEN && (PX_X86 || PX_X64)
+	// restore control word and clear exception flags
+	// (setting exception state flags cause exceptions on the first following fp operation)
+	_mm_setcsr(mControlWord & ~_MM_EXCEPT_MASK);
+#endif
+}
+
+#else
+#error No SIMD implementation for this unix platform.
+#endif // PX_LINUX || PX_PS4 || PX_OSX
+
+#endif // #ifndef PSFOUNDATION_PSUNIXFPU_H
diff --git a/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h b/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h
new file mode 100644
index 0000000..e54f2c8
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/PsUnixInlineAoS.h
@@ -0,0 +1,48 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXINLINEAOS_H
+#define PSFOUNDATION_PSUNIXINLINEAOS_H
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+
+// Remove this define when all platforms use simd solver.
+#define PX_SUPPORT_SIMD
+
+#if PX_INTEL_FAMILY
+#include "sse2/PsUnixSse2InlineAoS.h"
+#elif PX_NEON
+#include "neon/PsUnixNeonInlineAoS.h"
+#else
+#error No SIMD implementation for this unix platform.
+#endif
+
+#endif // PSFOUNDATION_PSUNIXINLINEAOS_H
diff --git a/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h b/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h
new file mode 100644
index 0000000..4c6c892
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/PsUnixIntrinsics.h
@@ -0,0 +1,153 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXINTRINSICS_H
+#define PSFOUNDATION_PSUNIXINTRINSICS_H
+
+#include "Ps.h"
+#include "foundation/PxAssert.h"
+#include <math.h>
+
+#if PX_ANDROID
+#include <signal.h> // for Ns::debugBreak() { raise(SIGTRAP); }
+#endif
+
+#if 0
+#include <libkern/OSAtomic.h>
+#endif
+
+// this file is for internal intrinsics - that is, intrinsics that are used in
+// cross platform code but do not appear in the API
+
+#if !(PX_LINUX || PX_ANDROID || PX_PS4 || PX_APPLE_FAMILY)
+#error "This file should only be included by unix builds!!"
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+//! \brief Implements a memory barrier by calling the compiler builtin __sync_synchronize().
+PX_FORCE_INLINE void memoryBarrier()
+{
+	__sync_synchronize();
+}
+
+/*!
+Return the index of the highest set bit. Undefined for zero arg.
+*/
+PX_INLINE uint32_t highestSetBitUnsafe(uint32_t v)
+{
+	// The top set bit sits just below the leading zeros, so its index is 31 minus their count.
+	const int leadingZeros = __builtin_clz(v);
+	return 31 - leadingZeros;
+}
+
+/*!
+Return the index of the lowest set bit (count of trailing zeros). Undefined for zero arg.
+Note the return type is int32_t, unlike highestSetBitUnsafe which returns uint32_t.
+*/
+PX_INLINE int32_t lowestSetBitUnsafe(uint32_t v)
+{
+	return __builtin_ctz(v);
+}
+
+/*!
+Returns the number of leading zero bits in v. Returns 32 for v=0.
+*/
+PX_INLINE uint32_t countLeadingZeros(uint32_t v)
+{
+	// Zero is special-cased here and never reaches __builtin_clz.
+	return v ? __builtin_clz(v) : 32;
+}
+
+/*!
+Prefetch the cache line (64 bytes on x86, 32 bytes on ARM) around \c ptr+offset.
+The trailing arguments (0, 3) are passed straight to __builtin_prefetch.
+*/
+PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0)
+{
+	__builtin_prefetch(reinterpret_cast<const char* PX_RESTRICT>(ptr) + offset, 0, 3);
+}
+
+/*!
+Prefetch \c count bytes starting at \c ptr, one cache line at a time
+(32-byte lines on Android/iOS, 64-byte lines elsewhere).
+A \c count of zero prefetches nothing.
+*/
+#if PX_ANDROID || PX_IOS
+PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1)
+{
+	// With count == 0 the last-byte address below becomes ptr-1; for a line-aligned ptr
+	// the line counter would then be 0 and the do/while would wrap around instead of stopping.
+	if(count == 0)
+		return;
+
+	const char* cp = static_cast<const char*>(ptr);
+	size_t p = reinterpret_cast<size_t>(ptr);
+	// Index of the first and last 32-byte line touched by [ptr, ptr+count).
+	uint32_t startLine = uint32_t(p >> 5), endLine = uint32_t((p + count - 1) >> 5);
+	uint32_t lines = endLine - startLine + 1;
+	do
+	{
+		prefetchLine(cp);
+		cp += 32;
+	} while(--lines);
+}
+#else
+PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1)
+{
+	// Same zero-length guard as the 32-byte variant above.
+	if(count == 0)
+		return;
+
+	const char* cp = reinterpret_cast<const char*>(ptr);
+	uint64_t p = size_t(ptr);
+	// Index of the first and last 64-byte line touched by [ptr, ptr+count).
+	uint64_t startLine = p >> 6, endLine = (p + count - 1) >> 6;
+	uint64_t lines = endLine - startLine + 1;
+	do
+	{
+		prefetchLine(cp);
+		cp += 64;
+	} while(--lines);
+}
+#endif
+
+//! \brief platform-specific reciprocal
+//! On this platform it is a plain scalar divide (no approximation instruction is used).
+PX_CUDA_CALLABLE PX_FORCE_INLINE float recipFast(float a)
+{
+	return 1.0f / a;
+}
+
+//! \brief platform-specific fast reciprocal square root
+//! On this platform it is computed as 1 / sqrtf(a), despite the "fast" name.
+PX_CUDA_CALLABLE PX_FORCE_INLINE float recipSqrtFast(float a)
+{
+	return 1.0f / ::sqrtf(a);
+}
+
+//! \brief platform-specific floor
+//! Forwards to the C library floorf.
+PX_CUDA_CALLABLE PX_FORCE_INLINE float floatFloor(float x)
+{
+	return ::floorf(x);
+}
+
+#define NS_EXPECT_TRUE(x) x
+#define NS_EXPECT_FALSE(x) x
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSUNIXINTRINSICS_H
diff --git a/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h b/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h
new file mode 100644
index 0000000..7f54733
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/PsUnixTrigConstants.h
@@ -0,0 +1,82 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXTRIGCONSTANTS_H
+#define PSFOUNDATION_PSUNIXTRIGCONSTANTS_H
+
+// PX_GLOBALCONST marks the constant tables below as weak extern const objects, so the
+// definitions can live in this header. The commented-out line is the __declspec(selectany)
+// spelling of the same idea.
+//#define PX_GLOBALCONST extern const __declspec(selectany)
+#define PX_GLOBALCONST extern const __attribute__((weak))
+
+// Four packed floats wrapped in PX_ALIGN_PREFIX(16)/PX_ALIGN_SUFFIX(16)
+// (presumably 16-byte alignment so a table can be loaded as one SIMD vector - confirm).
+PX_ALIGN_PREFIX(16)
+struct PX_VECTORF32
+{
+	float f[4];
+} PX_ALIGN_SUFFIX(16);
+
+PX_GLOBALCONST PX_VECTORF32 g_PXSinCoefficients0 = { { 1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXSinCoefficients1 = { { 2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXSinCoefficients2 = { { 2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXCosCoefficients0 = { { 1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXCosCoefficients1 = { { 2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXCosCoefficients2 = { { 4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXTanCoefficients0 = { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXTanCoefficients1 = { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXTanCoefficients2 = { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinCoefficients0 = { { -0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinCoefficients1 = { { 0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinCoefficients2 = { { -1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXATanCoefficients0 = { { 1.0f, 0.333333334f, 0.2f, 0.142857143f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXATanCoefficients1 = { { 1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXATanCoefficients2 = { { 5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXSinEstCoefficients = { { 1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXCosEstCoefficients = { { 1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXTanEstCoefficients = { { 2.484f, -1.954923183e-1f, 2.467401101f, PxInvPi } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXATanEstCoefficients = { { 7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, PxPiDivTwo } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinEstCoefficients = { { -1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXASinEstConstants = { { 1.00000011921f, PxPiDivTwo, 0.0f, 0.0f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXPiConstants0 = { { PxPi, PxTwoPi, PxInvPi, PxInvTwoPi } };
+PX_GLOBALCONST PX_VECTORF32 g_PXReciprocalTwoPi = { { PxInvTwoPi, PxInvTwoPi, PxInvTwoPi, PxInvTwoPi } };
+PX_GLOBALCONST PX_VECTORF32 g_PXTwoPi = { { PxTwoPi, PxTwoPi, PxTwoPi, PxTwoPi } };
+
+#endif
diff --git a/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h b/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h
new file mode 100644
index 0000000..60a5be8
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/neon/PsUnixNeonAoS.h
@@ -0,0 +1,129 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXNEONAOS_H
+#define PSFOUNDATION_PSUNIXNEONAOS_H
+
+// no includes here! this file should be included from PxcVecMath.h only!!!
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+
+// only ARM NEON compatible platforms should reach this
+#include <arm_neon.h>
+
+typedef float32x2_t FloatV; // scalar type: a two-lane (64-bit) NEON float vector
+typedef float32x4_t Vec3V; // three-component vector stored in a four-lane register
+typedef float32x4_t Vec4V; // four-component vector
+typedef uint32x4_t BoolV; // four 32-bit lane masks
+typedef float32x4_t QuatV; // quaternion stored in four float lanes
+
+typedef uint32x4_t VecU32V; // 4 x unsigned 32-bit
+typedef int32x4_t VecI32V; // 4 x signed 32-bit
+typedef uint16x8_t VecU16V; // 8 x unsigned 16-bit
+typedef int16x8_t VecI16V; // 8 x signed 16-bit
+typedef uint8x16_t VecU8V; // 16 x unsigned 8-bit
+
+#define FloatVArg FloatV & // the *Arg macros expand to a reference type; callers write e.g. "const FloatVArg x"
+#define Vec3VArg Vec3V &
+#define Vec4VArg Vec4V &
+#define BoolVArg BoolV &
+#define VecU32VArg VecU32V &
+#define VecI32VArg VecI32V &
+#define VecU16VArg VecU16V &
+#define VecI16VArg VecI16V &
+#define VecU8VArg VecU8V &
+#define QuatVArg QuatV &
+
+// KS - TODO - make an actual VecCrossV type for NEON
+#define VecCrossV Vec3V // until then VecCrossV is simply another name for Vec3V
+
+typedef VecI32V VecShiftV; // shift amounts are carried in a signed 32-bit lane vector
+#define VecShiftVArg VecShiftV &
+
+// 3x3 matrix held as three Vec3V columns, each member declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat33V
+{
+	// Default construction performs no initialisation of the columns.
+	Mat33V() {}
+
+	// Builds the matrix from its three columns (copied in order).
+	Mat33V(const Vec3V& column0, const Vec3V& column1, const Vec3V& column2)
+	: col0(column0), col1(column1), col2(column2)
+	{
+	}
+
+	Vec3V PX_ALIGN(16, col0);
+	Vec3V PX_ALIGN(16, col1);
+	Vec3V PX_ALIGN(16, col2);
+} PX_ALIGN_SUFFIX(16);
+
+// Matrix held as four Vec3V columns, each member declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat34V
+{
+	// Default construction performs no initialisation of the columns.
+	Mat34V() {}
+
+	// Builds the matrix from its four columns (copied in order).
+	Mat34V(const Vec3V& column0, const Vec3V& column1, const Vec3V& column2, const Vec3V& column3)
+	: col0(column0), col1(column1), col2(column2), col3(column3)
+	{
+	}
+
+	Vec3V PX_ALIGN(16, col0);
+	Vec3V PX_ALIGN(16, col1);
+	Vec3V PX_ALIGN(16, col2);
+	Vec3V PX_ALIGN(16, col3);
+} PX_ALIGN_SUFFIX(16);
+
+// Matrix held as three Vec4V columns, each member declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat43V
+{
+	// Default construction performs no initialisation of the columns.
+	Mat43V() {}
+
+	// Builds the matrix from its three columns (copied in order).
+	Mat43V(const Vec4V& column0, const Vec4V& column1, const Vec4V& column2)
+	: col0(column0), col1(column1), col2(column2)
+	{
+	}
+
+	Vec4V PX_ALIGN(16, col0);
+	Vec4V PX_ALIGN(16, col1);
+	Vec4V PX_ALIGN(16, col2);
+} PX_ALIGN_SUFFIX(16);
+
+// 4x4 matrix held as four Vec4V columns, each member declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat44V
+{
+	// Default construction performs no initialisation of the columns.
+	Mat44V() {}
+
+	// Builds the matrix from its four columns (copied in order).
+	Mat44V(const Vec4V& column0, const Vec4V& column1, const Vec4V& column2, const Vec4V& column3)
+	: col0(column0), col1(column1), col2(column2), col3(column3)
+	{
+	}
+
+	Vec4V PX_ALIGN(16, col0);
+	Vec4V PX_ALIGN(16, col1);
+	Vec4V PX_ALIGN(16, col2);
+	Vec4V PX_ALIGN(16, col3);
+} PX_ALIGN_SUFFIX(16);
+
+#endif // PSFOUNDATION_PSUNIXNEONAOS_H
diff --git a/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h b/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h
new file mode 100644
index 0000000..2a0578d
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/neon/PsUnixNeonInlineAoS.h
@@ -0,0 +1,3582 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXNEONINLINEAOS_H
+#define PSFOUNDATION_PSUNIXNEONINLINEAOS_H
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+
+// improved estimates: the hardware estimate refined by one Newton iteration (template argument 1)
+#define VRECIPEQ recipq_newton<1>
+#define VRECIPE recip_newton<1>
+#define VRECIPSQRTEQ rsqrtq_newton<1>
+#define VRECIPSQRTE rsqrt_newton<1>
+
+// "exact": the hardware estimate refined by four Newton iterations (five for VRECIP when PX_NX is set)
+#define VRECIPQ recipq_newton<4>
+#if PX_NX
+// StabilizationTests.AveragePoint needs more precision to succeed.
+#define VRECIP recip_newton<5> 
+#else
+#define VRECIP recip_newton<4>
+#endif
+#define VRECIPSQRTQ rsqrtq_newton<4>
+#define VRECIPSQRT rsqrt_newton<4>
+
+#define VECMATH_AOS_EPSILON (1e-3f)
+
+// Remove this define when all platforms use simd solver.
+#define PX_SUPPORT_SIMD
+
+//////////////////////////////////////////////////////////////////////
+//Test that Vec3V and FloatV are legal
+//////////////////////////////////
+
+#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f
+// Returns true when the two lanes of 'a' hold (approximately) the same value, which is the
+// invariant the FloatV functions in this file assert on.
+// The lanes are accepted when they agree either absolutely or relative to lane 0, both
+// within FLOAT_COMPONENTS_EQUAL_THRESHOLD.
+PX_FORCE_INLINE bool isValidFloatV(const FloatV a)
+{
+	PX_ALIGN(16, PxF32) data[4];
+	vst1_f32(reinterpret_cast<float32_t*>(data), a);
+	const float32_t x = data[0];
+	const float32_t y = data[1];
+	const float32_t difference = x - y;
+
+	// Absolute comparison.
+	if (PxAbs(difference) < FLOAT_COMPONENTS_EQUAL_THRESHOLD)
+	{
+		return true;
+	}
+
+	// Relative comparison. Lane 0 being zero is rejected explicitly instead of dividing by
+	// it: the quotient would be infinite, which is not dependable when the code is built
+	// with fast-math style flags. Under IEEE rules the outcome (false) is the same as before.
+	if (x != 0.0f && PxAbs(difference / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD)
+	{
+		return true;
+	}
+
+	return false;
+}
+
+// A Vec3V is considered valid when its fourth lane (w) compares equal to 0.0f.
+PX_FORCE_INLINE bool isValidVec3V(const Vec3V a)
+{
+	return vgetq_lane_f32(a, 3) == 0.0f;
+}
+
+// Returns true when the address 'a' is a multiple of 16 (its low four bits are clear).
+PX_FORCE_INLINE bool isAligned16(const void* a)
+{
+	const size_t address = size_t(a);
+	const size_t lowBits = address & 0x0f;
+	return lowBits == 0;
+}
+
+#if PX_DEBUG // debug builds: each check forwards its predicate to PX_ASSERT
+#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a))
+#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a))
+#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(static_cast<const void*>(a)))
+#else // other builds: the checks expand to nothing, so their argument expression is never evaluated
+#define ASSERT_ISVALIDVEC3V(a)
+#define ASSERT_ISVALIDFLOATV(a) 
+#define ASSERT_ISALIGNED16(a)
+#endif
+
+// Scalar reductions over a BoolV. All of them first squeeze the low byte of each 32-bit
+// lane into a single 32-bit word and then test that word.
+namespace internalUnitNeonSimd
+{
+// Narrows 'mask' 32 -> 16 -> 8 bits per lane and returns the first 32 bits of the result,
+// i.e. the low bytes of the four input lanes. The three-lane tests below mask with
+// 0xffFFff, so they rely on lanes 0..2 occupying the three low bytes of that word.
+PX_FORCE_INLINE PxU32 packLaneBytes(const BoolV mask)
+{
+	const uint16x4_t upperHalf = vget_high_u16(vreinterpretq_u16_u32(mask));
+	const uint16x4_t narrowed = vmovn_u32(mask);
+	const uint8x8_t bytes = vmovn_u16(vcombine_u16(narrowed, upperHalf));
+	return vget_lane_u32(vreinterpret_u32_u8(bytes), 0);
+}
+
+// 1 when the packed bytes of all four lanes are 0xff, otherwise 0.
+PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a)
+{
+	const PxU32 packed = packLaneBytes(a);
+	return PxU32(packed == 0xffffFFFF);
+}
+
+// 1 when the packed bytes of the first three lanes are 0xff, otherwise 0.
+PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a)
+{
+	const PxU32 packed = packLaneBytes(a);
+	return PxU32((packed & 0xffFFff) == 0xffFFff);
+}
+
+// 1 when the packed byte of any of the four lanes is non-zero, otherwise 0.
+PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a)
+{
+	const PxU32 packed = packLaneBytes(a);
+	return PxU32(packed != 0x0);
+}
+
+// 1 when the packed byte of any of the first three lanes is non-zero, otherwise 0.
+PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a)
+{
+	const PxU32 packed = packLaneBytes(a);
+	return PxU32((packed & 0xffFFff) != 0);
+}
+}
+
+// Comparison helpers used by the vector-math unit tests and by asserts in this file.
+namespace _VecMathTests
+{
+// PT: deliberately returns a Vec3V that violates the W == 0.0f rule, for unit-testing 'isValidVec3V'
+PX_FORCE_INLINE Vec3V getInvalidVec3V()
+{
+	PX_ALIGN(16, PxF32) allOnes[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
+	return V4LoadA(allOnes);
+}
+
+// Exact equality of two FloatVs, decided on lane 0.
+PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const uint32x2_t equalMask = vceq_f32(a, b);
+	return 0 != vget_lane_u32(equalMask, 0);
+}
+
+// Exact equality of two Vec3Vs as reported by V3AllEq.
+PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const bool same = (V3AllEq(a, b) != 0);
+	return same;
+}
+
+// Exact equality of two Vec4Vs as reported by V4AllEq.
+PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b)
+{
+	const bool same = (V4AllEq(a, b) != 0);
+	return same;
+}
+
+// True when all four lanes of the two masks compare equal.
+PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b)
+{
+	const uint32x4_t laneEqual = vceqq_u32(a, b);
+	return 0 != internalUnitNeonSimd::BAllTrue4_R(laneEqual);
+}
+
+// Non-zero when all four unsigned lanes compare equal.
+PX_FORCE_INLINE PxU32 V4U32AllEq(const VecU32V a, const VecU32V b)
+{
+	const BoolV laneEqual = V4IsEqU32(a, b);
+	return internalUnitNeonSimd::BAllTrue4_R(laneEqual);
+}
+
+// Boolean form of V4U32AllEq.
+PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b)
+{
+	return 0 != V4U32AllEq(a, b);
+}
+
+// Lane-wise equality mask for signed 32-bit vectors.
+PX_FORCE_INLINE BoolV V4IsEqI32(const VecI32V a, const VecI32V b)
+{
+	const BoolV laneEqual = vceqq_s32(a, b);
+	return laneEqual;
+}
+
+// Non-zero when all four signed lanes compare equal.
+PX_FORCE_INLINE PxU32 V4I32AllEq(const VecI32V a, const VecI32V b)
+{
+	const BoolV laneEqual = V4IsEqI32(a, b);
+	return internalUnitNeonSimd::BAllTrue4_R(laneEqual);
+}
+
+// Boolean form of V4I32AllEq.
+PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b)
+{
+	return 0 != V4I32AllEq(a, b);
+}
+
+// True when |a - b| < VECMATH_AOS_EPSILON holds in both lanes.
+PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+
+	const float32x2_t tolerance = vdup_n_f32(VECMATH_AOS_EPSILON);
+	const float32x2_t delta = vsub_f32(a, b);
+	// vcagt compares magnitudes: |tolerance| > |delta|
+	const uint32x2_t withinTolerance = vcagt_f32(tolerance, delta);
+	// the pairwise minimum is non-zero only if both lane masks are set
+	const uint32x2_t bothLanes = vpmin_u32(withinTolerance, withinTolerance);
+	return vget_lane_u32(bothLanes, 0) != 0x0;
+}
+
+// True when |a - b| < VECMATH_AOS_EPSILON holds in the first three lanes.
+PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t tolerance = vdupq_n_f32(VECMATH_AOS_EPSILON);
+	const float32x4_t delta = vsubq_f32(a, b);
+	// vcagtq compares magnitudes: |tolerance| > |delta|
+	const uint32x4_t withinTolerance = vcagtq_f32(tolerance, delta);
+	return 0 != internalUnitNeonSimd::BAllTrue3_R(withinTolerance);
+}
+
+// True when |a - b| < VECMATH_AOS_EPSILON holds in all four lanes.
+PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b)
+{
+	const float32x4_t tolerance = vdupq_n_f32(VECMATH_AOS_EPSILON);
+	const float32x4_t delta = vsubq_f32(a, b);
+	// vcagtq compares magnitudes: |tolerance| > |delta|
+	const uint32x4_t withinTolerance = vcagtq_f32(tolerance, delta);
+	return 0x0 != internalUnitNeonSimd::BAllTrue4_R(withinTolerance);
+}
+}
+
+#if 0 // debugging printfs: compiled out; change to 1 to get the printVec/printVar helpers and PRINT_* macros below
+#include <stdio.h>
+PX_FORCE_INLINE void printVec(const float32x4_t& v, const char* name) // prints the four float lanes
+{
+	PX_ALIGN(16, float32_t) data[4];
+	vst1q_f32(data, v);
+	printf("%s: (%f, %f, %f, %f)\n", name, data[0], data[1], data[2], data[3]);
+}
+
+PX_FORCE_INLINE void printVec(const float32x2_t& v, const char* name) // prints the two float lanes
+{
+	PX_ALIGN(16, float32_t) data[2];
+	vst1_f32(data, v);
+	printf("%s: (%f, %f)\n", name, data[0], data[1]);
+}
+
+PX_FORCE_INLINE void printVec(const uint32x4_t& v, const char* name) // prints four 32-bit lanes in hex
+{
+	PX_ALIGN(16, uint32_t) data[4];
+	vst1q_u32(data, v);
+	printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]);
+}
+
+PX_FORCE_INLINE void printVec(const uint16x8_t& v, const char* name) // prints eight 16-bit lanes in hex
+{
+	PX_ALIGN(16, uint16_t) data[8];
+	vst1q_u16(data, v);
+	printf("%s: (0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3],
+		data[4], data[5], data[6], data[7]);
+}
+
+PX_FORCE_INLINE void printVec(const int32x4_t& v, const char* name) // prints four signed 32-bit lanes in hex
+{
+	PX_ALIGN(16, int32_t) data[4];
+	vst1q_s32(data, v);
+	printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]);
+}
+
+PX_FORCE_INLINE void printVec(const int16x8_t& v, const char* name) // prints eight signed 16-bit lanes in hex
+{
+	PX_ALIGN(16, int16_t) data[8];
+	vst1q_s16(data, v);
+	printf("%s: (0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3],
+		data[4], data[5], data[6], data[7]);
+}
+
+PX_FORCE_INLINE void printVec(const uint16x4_t& v, const char* name) // prints four 16-bit lanes in hex
+{
+	PX_ALIGN(16, uint16_t) data[4];
+	vst1_u16(data, v);
+	printf("%s: (0x%x, 0x%x, 0x%x, 0x%x)\n", name, data[0], data[1], data[2], data[3]);
+}
+
+PX_FORCE_INLINE void printVec(const uint32x2_t& v, const char* name) // prints two 32-bit lanes in hex
+{
+	PX_ALIGN(16, uint32_t) data[2];
+	vst1_u32(data, v);
+	printf("%s: (0x%x, 0x%x)\n", name, data[0], data[1]);
+}
+
+PX_FORCE_INLINE void printVar(const PxU32 v, const char* name) // prints one unsigned scalar in hex
+{
+	printf("%s: 0x%x\n", name, v);
+}
+
+PX_FORCE_INLINE void printVar(const PxF32 v, const char* name) // prints one float scalar
+{
+	printf("%s: %f\n", name, v);
+}
+
+#define PRINT_VAR(X) printVar((X), #X) // label is the stringised expression
+#define PRINT_VEC(X) printVec((X), #X) // label is the stringised expression
+#define PRINT_VEC_TITLE(TITLE, X) printVec((X), TITLE #X) // label is TITLE concatenated with the stringised expression
+#endif // debugging printf
+
+/////////////////////////////////////////////////////////////////////
+////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS
+/////////////////////////////////////////////////////////////////////
+
+// True when both lanes of 'a' pass PxIsFinite.
+PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a)
+{
+	PX_ALIGN(16, PxF32) lanes[4];
+	vst1_f32(reinterpret_cast<float32_t*>(lanes), a);
+	if(!PxIsFinite(lanes[0]))
+		return false;
+	return PxIsFinite(lanes[1]);
+}
+
+// True when the x, y and z lanes of 'a' pass PxIsFinite (w is not inspected).
+PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a)
+{
+	PX_ALIGN(16, PxF32) lanes[4];
+	vst1q_f32(reinterpret_cast<float32_t*>(lanes), a);
+	if(!PxIsFinite(lanes[0]))
+		return false;
+	if(!PxIsFinite(lanes[1]))
+		return false;
+	return PxIsFinite(lanes[2]);
+}
+
+// True when all four lanes of 'a' pass PxIsFinite.
+PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a)
+{
+	PX_ALIGN(16, PxF32) lanes[4];
+	vst1q_f32(reinterpret_cast<float32_t*>(lanes), a);
+	if(!PxIsFinite(lanes[0]))
+		return false;
+	if(!PxIsFinite(lanes[1]))
+		return false;
+	if(!PxIsFinite(lanes[2]))
+		return false;
+	return PxIsFinite(lanes[3]);
+}
+
+// Returns true when the scalar held in 'a' (lane 0) is zero.
+// The comparison is done in floating point rather than on the raw bit pattern, so that
+// -0.0f (bit pattern 0x80000000) is reported as zero as well.
+PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const uint32x2_t isZero = vceq_f32(a, vdup_n_f32(0.0f));
+	return vget_lane_u32(isZero, 0) != 0;
+}
+
+// Returns true when any of the x, y, z lanes of 'a' is zero (w is ignored).
+// The comparison is done in floating point rather than on the raw bit pattern, so that
+// -0.0f (bit pattern 0x80000000) is reported as zero as well.
+PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a)
+{
+	const uint32x4_t isZero = vceqq_f32(a, vdupq_n_f32(0.0f));
+	return internalUnitNeonSimd::BAnyTrue3_R(isZero) != 0;
+}
+
+// Returns true when any of the four lanes of 'a' is zero.
+// The comparison is done in floating point rather than on the raw bit pattern, so that
+// -0.0f (bit pattern 0x80000000) is reported as zero as well.
+PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a)
+{
+	const uint32x4_t isZero = vceqq_f32(a, vdupq_n_f32(0.0f));
+	return internalUnitNeonSimd::BAnyTrue4_R(isZero) != 0;
+}
+
+/////////////////////////////////////////////////////////////////////
+////VECTORISED FUNCTION IMPLEMENTATIONS
+/////////////////////////////////////////////////////////////////////
+
+// Broadcasts the scalar 'f' into both lanes of a FloatV.
+PX_FORCE_INLINE FloatV FLoad(const PxF32 f)
+{
+	const float32_t& scalar = reinterpret_cast<const float32_t&>(f);
+	return vdup_n_f32(scalar);
+}
+
+// Loads the two consecutive floats at 'f' into a FloatV.
+// 'f' is expected to be 16-byte aligned (asserted in debug builds).
+PX_FORCE_INLINE FloatV FLoadA(const PxF32* const f)
+{
+	ASSERT_ISALIGNED16(f);
+	const float32_t* const source = reinterpret_cast<const float32_t*>(f);
+	return vld1_f32(source);
+}
+
+// Builds the Vec3V (f, f, f, 0).
+PX_FORCE_INLINE Vec3V V3Load(const PxF32 f)
+{
+	PX_ALIGN(16, PxF32) lanes[4] = { f, f, f, 0.0f };
+	const Vec3V splat = V4LoadA(lanes);
+	return splat;
+}
+
+// Broadcasts the scalar 'f' into all four lanes of a Vec4V.
+PX_FORCE_INLINE Vec4V V4Load(const PxF32 f)
+{
+	const float32_t& scalar = reinterpret_cast<const float32_t&>(f);
+	return vdupq_n_f32(scalar);
+}
+
+// Broadcasts a bool into a BoolV: true becomes 0xFFFFFFFF in every lane, false becomes 0.
+PX_FORCE_INLINE BoolV BLoad(const bool f)
+{
+	// bool -> 0/1 -> 0/-1 -> 0/0xFFFFFFFF
+	const PxI32 asInt = static_cast<PxI32>(f);
+	const PxU32 laneMask = static_cast<PxU32>(-asInt);
+	return vdupq_n_u32(laneMask);
+}
+
+// Loads a PxVec3 as (x, y, z, 0). 'f' is expected to be 16-byte aligned (asserted in debug builds).
+PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f)
+{
+	ASSERT_ISALIGNED16(&f);
+	PX_ALIGN(16, PxF32) padded[4] = { f.x, f.y, f.z, 0.0f };
+	const Vec3V loaded = V4LoadA(padded);
+	return loaded;
+}
+
+// Loads a PxVec3 as (x, y, z, 0) with no alignment requirement on 'f'.
+PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f)
+{
+	PX_ALIGN(16, PxF32) padded[4] = { f.x, f.y, f.z, 0.0f };
+	const Vec3V loaded = V4LoadA(padded);
+	return loaded;
+}
+
+// Same implementation as V3LoadA(const PxVec3&): copies x, y, z and writes 0 into w.
+PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f)
+{
+	ASSERT_ISALIGNED16(&f);
+	PX_ALIGN(16, PxF32) padded[4] = { f.x, f.y, f.z, 0.0f };
+	const Vec3V loaded = V4LoadA(padded);
+	return loaded;
+}
+
+// Loads f[0..2] as (x, y, z, 0). 'f' is expected to be 16-byte aligned (asserted in debug builds).
+PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* f)
+{
+	ASSERT_ISALIGNED16(f);
+	PX_ALIGN(16, PxF32) padded[4] = { f[0], f[1], f[2], 0.0f };
+	const Vec3V loaded = V4LoadA(padded);
+	return loaded;
+}
+
+// Loads f[0..2] as (x, y, z, 0) with no alignment requirement on 'f'.
+PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* f)
+{
+	PX_ALIGN(16, PxF32) padded[4] = { f[0], f[1], f[2], 0.0f };
+	const Vec3V loaded = V4LoadA(padded);
+	return loaded;
+}
+
+// Converts a Vec4V to a Vec3V by overwriting lane 3 (w) with 0.
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v)
+{
+	const Vec3V wCleared = vsetq_lane_f32(0.0f, v, 3);
+	return wCleared;
+}
+
+// Converts a Vec4V to a Vec3V without touching lane 3; w keeps whatever 'v' held.
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(Vec4V v)
+{
+	const Vec3V unchanged = v;
+	return unchanged;
+}
+
+// Converts a Vec3V to a Vec4V; both are the same underlying type, so this is a plain copy.
+PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f)
+{
+	const Vec4V unchanged = f;
+	return unchanged;
+}
+
+// Widens a FloatV to a Vec4V by using it as both the low and the high half.
+PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f)
+{
+	const Vec4V widened = vcombine_f32(f, f);
+	return widened;
+}
+
+// Widens a FloatV to a Vec3V; lane 3 (w) is set to 0.
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f)
+{
+	const Vec4V widened = Vec4V_From_FloatV(f);
+	return Vec3V_From_Vec4V(widened);
+}
+
+// Widens a FloatV to a Vec3V without clearing lane 3 (w).
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f)
+{
+	const Vec4V widened = Vec4V_From_FloatV(f);
+	return Vec3V_From_Vec4V_WUndefined(widened);
+}
+
+// Loads a PxVec3 into a Vec4V. Despite the "WUndefined" name this implementation
+// writes 0 into lane 3.
+PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f)
+{
+	PX_ALIGN(16, PxF32) padded[4] = { f.x, f.y, f.z, 0.0f };
+	const Vec4V loaded = V4LoadA(padded);
+	return loaded;
+}
+
+// Converts a PxMat33 to a Mat33V, loading each column with V3LoadU.
+PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m)
+{
+	const Vec3V c0 = V3LoadU(m.column0);
+	const Vec3V c1 = V3LoadU(m.column1);
+	const Vec3V c2 = V3LoadU(m.column2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Converts a Mat33V back to a PxMat33, storing each column with V3StoreU.
+PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out)
+{
+	PxVec3& dst0 = out.column0;
+	PxVec3& dst1 = out.column1;
+	PxVec3& dst2 = out.column2;
+	V3StoreU(m.col0, dst0);
+	V3StoreU(m.col1, dst1);
+	V3StoreU(m.col2, dst2);
+}
+
+// Loads four consecutive floats from 'f', which is expected to be 16-byte aligned
+// (asserted in debug builds).
+PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f)
+{
+	ASSERT_ISALIGNED16(f);
+	const float32_t* const source = reinterpret_cast<const float32_t*>(f);
+	return vld1q_f32(source);
+}
+
+// Stores the four lanes of 'a' to 'f', which is expected to be 16-byte aligned
+// (asserted in debug builds).
+PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f)
+{
+	ASSERT_ISALIGNED16(f);
+	float32_t* const destination = reinterpret_cast<float32_t*>(f);
+	vst1q_f32(destination, a);
+}
+
+// Stores the four lanes of 'a' to 'f' with no alignment requirement on 'f': the vector is
+// first written to an aligned local buffer and then copied out one float at a time.
+PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f)
+{
+	PX_ALIGN(16, PxF32) staged[4];
+	vst1q_f32(reinterpret_cast<float32_t*>(staged), a);
+	for(int lane = 0; lane < 4; ++lane)
+		f[lane] = staged[lane];
+}
+
+// Stores the four lane masks of 'a' to 'u' (expected 16-byte aligned, asserted in debug builds).
+PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* u)
+{
+	ASSERT_ISALIGNED16(u);
+	uint32_t* const destination = reinterpret_cast<uint32_t*>(u);
+	vst1q_u32(destination, a);
+}
+
+// Stores the four unsigned lanes of 'uv' to 'u' (expected 16-byte aligned, asserted in debug builds).
+PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u)
+{
+	ASSERT_ISALIGNED16(u);
+	uint32_t* const destination = reinterpret_cast<uint32_t*>(u);
+	vst1q_u32(destination, uv);
+}
+
+// Stores the four signed lanes of 'iv' to 'i' (expected 16-byte aligned, asserted in debug builds).
+PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i)
+{
+	ASSERT_ISALIGNED16(i);
+	int32_t* const destination = reinterpret_cast<int32_t*>(i);
+	vst1q_s32(destination, iv);
+}
+
+// Loads four consecutive floats from 'f'; no alignment is asserted.
+PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f)
+{
+	const float32_t* const source = reinterpret_cast<const float32_t*>(f);
+	return vld1q_f32(source);
+}
+
+// Builds a BoolV from four bools: lane i is 0xFFFFFFFF when f[i] is true and 0 otherwise.
+PX_FORCE_INLINE BoolV BLoad(const bool* const f)
+{
+	PX_ALIGN(16, PxU32) laneMasks[4];
+	for(int lane = 0; lane < 4; ++lane)
+	{
+		// bool -> 0/1 -> 0/-1 -> 0/0xFFFFFFFF
+		laneMasks[lane] = static_cast<PxU32>(-static_cast<PxI32>(f[lane]));
+	}
+	return vld1q_u32(laneMasks);
+}
+
+// Writes lane 0 of 'a' to *f.
+// The value is extracted and assigned as a scalar; the original author notes that a vst1
+// lane store here "causes vst1 alignment bug".
+PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const PxF32 lane0 = vget_lane_f32(a, 0);
+	f[0] = lane0;
+}
+
+// Writes lane 0 of the mask 'a' to *f.
+PX_FORCE_INLINE void Store_From_BoolV(const BoolV a, PxU32* PX_RESTRICT f)
+{
+	const uint32x2_t lowHalf = vget_low_u32(a);
+	f[0] = vget_lane_u32(lowHalf, 0);
+}
+
+// Writes the x, y, z lanes of 'a' into 'f' (expected 16-byte aligned, asserted in debug builds).
+PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f)
+{
+	ASSERT_ISALIGNED16(&f);
+	PX_ALIGN(16, PxF32) lanes[4];
+	vst1q_f32(reinterpret_cast<float32_t*>(lanes), a);
+	const PxVec3 xyz(lanes[0], lanes[1], lanes[2]);
+	f = xyz;
+}
+
+// Writes the x, y, z lanes of 'a' into 'f' with no alignment requirement on 'f'.
+PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f)
+{
+	PX_ALIGN(16, PxF32) lanes[4];
+	vst1q_f32(reinterpret_cast<float32_t*>(lanes), a);
+	const PxVec3 xyz(lanes[0], lanes[1], lanes[2]);
+	f = xyz;
+}
+
+//////////////////////////////////
+// FLOATV
+//////////////////////////////////
+
+// FloatV holding 0.0f.
+PX_FORCE_INLINE FloatV FZero()
+{
+	const FloatV zero = FLoad(0.0f);
+	return zero;
+}
+
+// FloatV holding 1.0f.
+PX_FORCE_INLINE FloatV FOne()
+{
+	const FloatV one = FLoad(1.0f);
+	return one;
+}
+
+// FloatV holding 0.5f.
+PX_FORCE_INLINE FloatV FHalf()
+{
+	const FloatV half = FLoad(0.5f);
+	return half;
+}
+
+// FloatV holding PX_EPS_REAL.
+PX_FORCE_INLINE FloatV FEps()
+{
+	const FloatV epsilon = FLoad(PX_EPS_REAL);
+	return epsilon;
+}
+
+// FloatV holding 1e-6f.
+PX_FORCE_INLINE FloatV FEps6()
+{
+	const FloatV epsilon6 = FLoad(1e-6f);
+	return epsilon6;
+}
+
+// FloatV holding PX_MAX_REAL.
+PX_FORCE_INLINE FloatV FMax()
+{
+	const FloatV largest = FLoad(PX_MAX_REAL);
+	return largest;
+}
+
+// FloatV holding -PX_MAX_REAL.
+PX_FORCE_INLINE FloatV FNegMax()
+{
+	const FloatV mostNegative = FLoad(-PX_MAX_REAL);
+	return mostNegative;
+}
+
+// The I* constants return a FloatV whose lanes carry an integer bit pattern
+// (reinterpreted, not converted), e.g. IOne() holds the bits 0x00000001, not 1.0f.
+
+// Integer 0 in both lanes.
+PX_FORCE_INLINE FloatV IZero()
+{
+	const uint32x2_t bits = vdup_n_u32(0);
+	return vreinterpret_f32_u32(bits);
+}
+
+// Integer 1 in both lanes.
+PX_FORCE_INLINE FloatV IOne()
+{
+	const uint32x2_t bits = vdup_n_u32(1);
+	return vreinterpret_f32_u32(bits);
+}
+
+// Integer 2 in both lanes.
+PX_FORCE_INLINE FloatV ITwo()
+{
+	const uint32x2_t bits = vdup_n_u32(2);
+	return vreinterpret_f32_u32(bits);
+}
+
+// Integer 3 in both lanes.
+PX_FORCE_INLINE FloatV IThree()
+{
+	const uint32x2_t bits = vdup_n_u32(3);
+	return vreinterpret_f32_u32(bits);
+}
+
+// Integer 4 in both lanes.
+PX_FORCE_INLINE FloatV IFour()
+{
+	const uint32x2_t bits = vdup_n_u32(4);
+	return vreinterpret_f32_u32(bits);
+}
+
+// Returns -f.
+PX_FORCE_INLINE FloatV FNeg(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const FloatV negated = vneg_f32(f);
+	return negated;
+}
+
+// Returns a + b.
+PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const FloatV sum = vadd_f32(a, b);
+	return sum;
+}
+
+// Returns a - b.
+PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const FloatV difference = vsub_f32(a, b);
+	return difference;
+}
+
+// Returns a * b.
+PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const FloatV product = vmul_f32(a, b);
+	return product;
+}
+
+// Reciprocal of each lane of a two-lane vector: starts from the vrecpe estimate and applies
+// 'n' refinement steps, each multiplying the estimate by the vrecps correction term.
+template <int n>
+PX_FORCE_INLINE float32x2_t recip_newton(const float32x2_t& in)
+{
+	float32x2_t estimate = vrecpe_f32(in);
+	int remaining = n;
+	while(remaining-- > 0)
+	{
+		const float32x2_t correction = vrecps_f32(in, estimate);
+		estimate = vmul_f32(estimate, correction);
+	}
+	return estimate;
+}
+
+// Four-lane counterpart of recip_newton.
+template <int n>
+PX_FORCE_INLINE float32x4_t recipq_newton(const float32x4_t& in)
+{
+	float32x4_t estimate = vrecpeq_f32(in);
+	int remaining = n;
+	while(remaining-- > 0)
+	{
+		const float32x4_t correction = vrecpsq_f32(estimate, in);
+		estimate = vmulq_f32(estimate, correction);
+	}
+	return estimate;
+}
+
+// Reciprocal square root of each lane of a two-lane vector: starts from the vrsqrte estimate
+// and applies 'n' refinement steps, each multiplying the estimate by the vrsqrts correction
+// term computed from (estimate * estimate, in).
+template <int n>
+PX_FORCE_INLINE float32x2_t rsqrt_newton(const float32x2_t& in)
+{
+	float32x2_t estimate = vrsqrte_f32(in);
+	int remaining = n;
+	while(remaining-- > 0)
+	{
+		const float32x2_t squared = vmul_f32(estimate, estimate);
+		const float32x2_t correction = vrsqrts_f32(squared, in);
+		estimate = vmul_f32(estimate, correction);
+	}
+	return estimate;
+}
+
+// Four-lane counterpart of rsqrt_newton.
+template <int n>
+PX_FORCE_INLINE float32x4_t rsqrtq_newton(const float32x4_t& in)
+{
+	float32x4_t estimate = vrsqrteq_f32(in);
+	int remaining = n;
+	while(remaining-- > 0)
+	{
+		const float32x4_t squared = vmulq_f32(estimate, estimate);
+		const float32x4_t correction = vrsqrtsq_f32(squared, in);
+		estimate = vmulq_f32(estimate, correction);
+	}
+	return estimate;
+}
+
+// Returns a / b, computed as a * VRECIP(b) (the more accurate reciprocal).
+PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const FloatV inverse = VRECIP(b);
+	return vmul_f32(a, inverse);
+}
+
+// Returns a / b, computed as a * VRECIPE(b) (the cheaper reciprocal estimate).
+PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const FloatV inverse = VRECIPE(b);
+	return vmul_f32(a, inverse);
+}
+
+// Returns 1 / a using VRECIP (the more accurate reciprocal).
+PX_FORCE_INLINE FloatV FRecip(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV inverse = VRECIP(a);
+	return inverse;
+}
+
+// Returns 1 / a using VRECIPE (the cheaper reciprocal estimate).
+PX_FORCE_INLINE FloatV FRecipFast(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV inverse = VRECIPE(a);
+	return inverse;
+}
+
+// Returns 1 / sqrt(a) using VRECIPSQRT (the more accurate variant).
+PX_FORCE_INLINE FloatV FRsqrt(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV inverseRoot = VRECIPSQRT(a);
+	return inverseRoot;
+}
+
+// Returns sqrt(a), computed as a * (1 / sqrt(a)).
+// When a compares equal to zero, 'a' itself is returned instead of the product.
+PX_FORCE_INLINE FloatV FSqrt(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const BoolV isZero = FIsEq(a, FZero());
+	const FloatV root = vmul_f32(a, VRECIPSQRT(a));
+	return FSel(isZero, a, root);
+}
+
+// Returns 1 / sqrt(a) using VRECIPSQRTE (the cheaper estimate).
+PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV inverseRoot = VRECIPSQRTE(a);
+	return inverseRoot;
+}
+
+// Multiply-accumulate via vmla: returns c + a * b.
+PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDFLOATV(c);
+	const FloatV accumulated = vmla_f32(c, a, b);
+	return accumulated;
+}
+
+// Multiply-subtract via vmls: returns c - a * b.
+PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDFLOATV(c);
+	const FloatV reduced = vmls_f32(c, a, b);
+	return reduced;
+}
+
+// Returns |a|.
+PX_FORCE_INLINE FloatV FAbs(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV magnitude = vabs_f32(a);
+	return magnitude;
+}
+
+// Bitwise select: takes bits from 'a' where the low half of 'c' is set and from 'b'
+// elsewhere. 'c' is asserted to be all-true or all-false, so the result is 'a' or 'b'.
+PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b)
+{
+	PX_ASSERT(	_VecMathTests::allElementsEqualBoolV(c, BTTTT()) || 
+				_VecMathTests::allElementsEqualBoolV(c, BFFFF()));
+	const FloatV selected = vbsl_f32(vget_low_u32(c), a, b);
+	ASSERT_ISVALIDFLOATV(selected);
+	return selected;
+}
+
+// a > b, with the lane-0 result broadcast to all four lanes of the BoolV.
+PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const uint32x2_t compared = vcgt_f32(a, b);
+	return vdupq_lane_u32(compared, 0);
+}
+
+// a >= b, with the lane-0 result broadcast to all four lanes of the BoolV.
+PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const uint32x2_t compared = vcge_f32(a, b);
+	return vdupq_lane_u32(compared, 0);
+}
+
+// a == b, with the lane-0 result broadcast to all four lanes of the BoolV.
+PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const uint32x2_t compared = vceq_f32(a, b);
+	return vdupq_lane_u32(compared, 0);
+}
+
+// Lane-wise maximum of a and b. No lane-validity asserts are performed on the inputs.
+PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b)
+{
+	const FloatV larger = vmax_f32(a, b);
+	return larger;
+}
+
+// Lane-wise minimum of a and b. No lane-validity asserts are performed on the inputs.
+PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b)
+{
+	const FloatV smaller = vmin_f32(a, b);
+	return smaller;
+}
+
+// Clamps 'a': the upper bound maxV is applied first, then the lower bound minV.
+// Only the two bounds are validity-checked.
+PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV)
+{
+	ASSERT_ISVALIDFLOATV(minV);
+	ASSERT_ISVALIDFLOATV(maxV);
+	const FloatV cappedAbove = vmin_f32(a, maxV);
+	return vmax_f32(cappedAbove, minV);
+}
+
+// Returns the lane-0 mask of a > b (non-zero when true, 0 when false).
+PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const uint32x2_t compared = vcgt_f32(a, b);
+	return vget_lane_u32(compared, 0);
+}
+
+// Returns the lane-0 mask of a >= b (non-zero when true, 0 when false).
+PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const uint32x2_t compared = vcge_f32(a, b);
+	return vget_lane_u32(compared, 0);
+}
+
+// Returns the lane-0 mask of a == b (non-zero when true, 0 when false).
+PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const uint32x2_t compared = vceq_f32(a, b);
+	return vget_lane_u32(compared, 0);
+}
+
+// Rounds 'a' to an integral value, computed as truncate((a + 0.5f) - s) where s is 1.0f
+// when the sign bit of 'a' is set and 0.0f otherwise. The truncation goes through a
+// signed 32-bit integer conversion.
+PX_FORCE_INLINE FloatV FRound(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+
+	// sign bit moved to bit 0, then converted to 0.0f / 1.0f
+	const uint32x2_t signBit = vshr_n_u32(vreinterpret_u32_f32(a), 31);
+	const float32x2_t signAsFloat = vcvt_f32_u32(signBit);
+
+	const float32x2_t shiftedUp = vadd_f32(a, vdup_n_f32(0.5f));
+	const float32x2_t biased = vsub_f32(shiftedUp, signAsFloat);
+
+	// float -> int -> float drops the fractional part
+	const int32x2_t truncated = vcvt_s32_f32(biased);
+	return vcvt_f32_s32(truncated);
+}
+
+PX_FORCE_INLINE FloatV FSin(const FloatV a) // sine of 'a' (radians): range reduction followed by an odd polynomial up to V^23
+{
+	ASSERT_ISVALIDFLOATV(a);
+
+	// Range reduction: V1 = a - 2*PI * round(a / (2*PI)), which puts V1 in roughly [-PI, PI].
+	const FloatV recipTwoPi = FLoadA(g_PXReciprocalTwoPi.f); // FLoadA reads the first two lanes of the constant
+	const FloatV twoPi = FLoadA(g_PXTwoPi.f);
+	const FloatV tmp = FMul(a, recipTwoPi);
+	const FloatV b = FRound(tmp); // FRound converts through int32, so this reduction only holds while a / (2*PI) fits in that range
+	const FloatV V1 = FNegScaleSub(twoPi, b, a); // a - twoPi * b
+
+	// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+	//           V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+	const FloatV V2 = FMul(V1, V1);
+	const FloatV V3 = FMul(V2, V1); // each further odd power multiplies the previous one by V^2
+	const FloatV V5 = FMul(V3, V2);
+	const FloatV V7 = FMul(V5, V2);
+	const FloatV V9 = FMul(V7, V2);
+	const FloatV V11 = FMul(V9, V2);
+	const FloatV V13 = FMul(V11, V2);
+	const FloatV V15 = FMul(V13, V2);
+	const FloatV V17 = FMul(V15, V2);
+	const FloatV V19 = FMul(V17, V2);
+	const FloatV V21 = FMul(V19, V2);
+	const FloatV V23 = FMul(V21, V2);
+
+	const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	const FloatV S1 = V4GetY(sinCoefficients0); // lane X of sinCoefficients0 is not read; the V1 term is added unscaled below
+	const FloatV S2 = V4GetZ(sinCoefficients0);
+	const FloatV S3 = V4GetW(sinCoefficients0);
+	const FloatV S4 = V4GetX(sinCoefficients1);
+	const FloatV S5 = V4GetY(sinCoefficients1);
+	const FloatV S6 = V4GetZ(sinCoefficients1);
+	const FloatV S7 = V4GetW(sinCoefficients1);
+	const FloatV S8 = V4GetX(sinCoefficients2);
+	const FloatV S9 = V4GetY(sinCoefficients2);
+	const FloatV S10 = V4GetZ(sinCoefficients2);
+	const FloatV S11 = V4GetW(sinCoefficients2);
+
+	FloatV Result;
+	Result = FScaleAdd(S1, V3, V1); // V1 + S1 * V^3; the terms are accumulated from lowest to highest power
+	Result = FScaleAdd(S2, V5, Result);
+	Result = FScaleAdd(S3, V7, Result);
+	Result = FScaleAdd(S4, V9, Result);
+	Result = FScaleAdd(S5, V11, Result);
+	Result = FScaleAdd(S6, V13, Result);
+	Result = FScaleAdd(S7, V15, Result);
+	Result = FScaleAdd(S8, V17, Result);
+	Result = FScaleAdd(S9, V19, Result);
+	Result = FScaleAdd(S10, V21, Result);
+	Result = FScaleAdd(S11, V23, Result);
+
+	return Result;
+}
+
+PX_FORCE_INLINE FloatV FCos(const FloatV a) // cosine of 'a' (radians): range reduction followed by an even polynomial up to V^22
+{
+	ASSERT_ISVALIDFLOATV(a);
+
+	// Range reduction: V1 = a - 2*PI * round(a / (2*PI)), which puts V1 in roughly [-PI, PI].
+	const FloatV recipTwoPi = FLoadA(g_PXReciprocalTwoPi.f); // FLoadA reads the first two lanes of the constant
+	const FloatV twoPi = FLoadA(g_PXTwoPi.f);
+	const FloatV tmp = FMul(a, recipTwoPi);
+	const FloatV b = FRound(tmp); // FRound converts through int32, so this reduction only holds while a / (2*PI) fits in that range
+	const FloatV V1 = FNegScaleSub(twoPi, b, a); // a - twoPi * b
+
+	// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+	//           V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+	const FloatV V2 = FMul(V1, V1);
+	const FloatV V4 = FMul(V2, V2); // even powers are built from products of two earlier powers
+	const FloatV V6 = FMul(V4, V2);
+	const FloatV V8 = FMul(V4, V4);
+	const FloatV V10 = FMul(V6, V4);
+	const FloatV V12 = FMul(V6, V6);
+	const FloatV V14 = FMul(V8, V6);
+	const FloatV V16 = FMul(V8, V8);
+	const FloatV V18 = FMul(V10, V8);
+	const FloatV V20 = FMul(V10, V10);
+	const FloatV V22 = FMul(V12, V10);
+
+	const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	const FloatV C1 = V4GetY(cosCoefficients0); // lane X of cosCoefficients0 is not read; the constant term is FOne() below
+	const FloatV C2 = V4GetZ(cosCoefficients0);
+	const FloatV C3 = V4GetW(cosCoefficients0);
+	const FloatV C4 = V4GetX(cosCoefficients1);
+	const FloatV C5 = V4GetY(cosCoefficients1);
+	const FloatV C6 = V4GetZ(cosCoefficients1);
+	const FloatV C7 = V4GetW(cosCoefficients1);
+	const FloatV C8 = V4GetX(cosCoefficients2);
+	const FloatV C9 = V4GetY(cosCoefficients2);
+	const FloatV C10 = V4GetZ(cosCoefficients2);
+	const FloatV C11 = V4GetW(cosCoefficients2);
+
+	FloatV Result;
+	Result = FScaleAdd(C1, V2, FOne()); // 1 + C1 * V^2; the terms are accumulated from lowest to highest power
+	Result = FScaleAdd(C2, V4, Result);
+	Result = FScaleAdd(C3, V6, Result);
+	Result = FScaleAdd(C4, V8, Result);
+	Result = FScaleAdd(C5, V10, Result);
+	Result = FScaleAdd(C6, V12, Result);
+	Result = FScaleAdd(C7, V14, Result);
+	Result = FScaleAdd(C8, V16, Result);
+	Result = FScaleAdd(C9, V18, Result);
+	Result = FScaleAdd(C10, V20, Result);
+	Result = FScaleAdd(C11, V22, Result);
+
+	return Result;
+}
+
+// Returns 1 when a > max or a < min, otherwise 0.
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(min);
+	ASSERT_ISVALIDFLOATV(max);
+
+	const BoolV aboveMax = FIsGrtr(a, max);
+	const BoolV belowMin = FIsGrtr(min, a);
+	const BoolV outside = BOr(aboveMax, belowMin);
+	return PxU32(!BAllEqFFFF(outside));
+}
+
+// Returns the result of BAllEqTTTT on (a >= min && max >= a), converted to PxU32.
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(min);
+	ASSERT_ISVALIDFLOATV(max);
+
+	const BoolV atLeastMin = FIsGrtrOrEq(a, min);
+	const BoolV atMostMax = FIsGrtrOrEq(max, a);
+	const BoolV inside = BAnd(atLeastMin, atMostMax);
+	return PxU32(BAllEqTTTT(inside));
+}
+
+// Symmetric scalar range test: |a| > |bounds| (vcagt_f32 compares absolute values).
+// Returns the raw lane-0 mask, i.e. 0xffffFFFF when out of bounds and 0 otherwise.
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(bounds);
+	const uint32x2_t absGreaterMask = vcagt_f32(a, bounds);
+	return vget_lane_u32(absGreaterMask, 0);
+}
+
+// Symmetric scalar range test: |bounds| >= |a| (vcage_f32 compares absolute values).
+// Returns the raw lane-0 mask, i.e. 0xffffFFFF when in bounds and 0 otherwise.
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(bounds);
+	const uint32x2_t absGreaterOrEqMask = vcage_f32(bounds, a);
+	return vget_lane_u32(absGreaterOrEqMask, 0);
+}
+
+//////////////////////////////////
+// VEC3V
+//////////////////////////////////
+
+// Builds {f[0], f[1], f[0], 0}; for a splatted FloatV that is (f, f, f, 0).
+PX_FORCE_INLINE Vec3V V3Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+
+	// Keep lane 0 and clear lane 1, which ends up in the w slot after the combine.
+	const uint32x2_t keepLane0 = { 0xffffFFFF, 0x0 };
+	const uint32x2_t fBits = vreinterpret_u32_f32(f);
+	const float32x2_t zAndZeroW = vreinterpret_f32_u32(vand_u32(fBits, keepLane0));
+
+	return vcombine_f32(f, zAndZeroW);
+}
+
+// Assembles a Vec3V from three scalars; the resulting lanes are {x[1], y[0], z[0], 0}.
+PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+
+	const uint32x2_t keepLane0 = { 0xffffFFFF, 0x0 };
+	const uint32x2_t xBits = vreinterpret_u32_f32(x);
+	const uint32x2_t yBits = vreinterpret_u32_f32(y);
+	const uint32x2_t zBits = vreinterpret_u32_f32(z);
+
+	const uint32x2_t xyPair = vext_u32(xBits, yBits, 1);     // {x[1], y[0]}
+	const uint32x2_t zAndZero = vand_u32(zBits, keepLane0);  // {z[0], 0}
+	return vreinterpretq_f32_u32(vcombine_u32(xyPair, zAndZero));
+}
+
+// Returns the constant (1, 0, 0) with w = 0.
+PX_FORCE_INLINE Vec3V V3UnitX()
+{
+	const float32x4_t unitX = { 1.0f, 0.0f, 0.0f, 0.0f };
+	return unitX;
+}
+
+// Returns the constant (0, 1, 0) with w = 0.
+PX_FORCE_INLINE Vec3V V3UnitY()
+{
+	const float32x4_t unitY = { 0.0f, 1.0f, 0.0f, 0.0f };
+	return unitY;
+}
+
+// Returns the constant (0, 0, 1) with w = 0.
+PX_FORCE_INLINE Vec3V V3UnitZ()
+{
+	const float32x4_t unitZ = { 0.0f, 0.0f, 1.0f, 0.0f };
+	return unitZ;
+}
+
+// Extracts x and duplicates it into both lanes of the returned FloatV.
+PX_FORCE_INLINE FloatV V3GetX(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	return vdup_lane_f32(vget_low_f32(f), 0);
+}
+
+// Extracts y and duplicates it into both lanes of the returned FloatV.
+PX_FORCE_INLINE FloatV V3GetY(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	return vdup_lane_f32(vget_low_f32(f), 1);
+}
+
+// Extracts z and duplicates it into both lanes of the returned FloatV.
+PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	return vdup_lane_f32(vget_high_f32(f), 0);
+}
+
+// Returns v with its x lane taken from f; the BFTTT mask presumably keeps y, z, w from v.
+PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t fInAllLanes = vcombine_f32(f, f);
+	return V4Sel(BFTTT(), v, fInAllLanes);
+}
+
+// Returns v with its y lane taken from f; the BTFTT mask presumably keeps x, z, w from v.
+PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t fInAllLanes = vcombine_f32(f, f);
+	return V4Sel(BTFTT(), v, fInAllLanes);
+}
+
+// Returns v with its z lane taken from f; the BTTFT mask presumably keeps x, y, w from v.
+PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t fInAllLanes = vcombine_f32(f, f);
+	return V4Sel(BTTFT(), v, fInAllLanes);
+}
+
+// Gathers the x components of three vectors: (a.x, b.x, c.x, 0).
+PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+
+	// val[0] of a zip interleaves lane 0 of both inputs.
+	const float32x2x2_t abZip = vzip_f32(vget_low_f32(a), vget_low_f32(b));   // val[0] = {a.x, b.x}
+	const float32x2x2_t c0Zip = vzip_f32(vget_low_f32(c), vdup_n_f32(0.0f)); // val[0] = {c.x, 0}
+
+	return vcombine_f32(abZip.val[0], c0Zip.val[0]);
+}
+
+// Gathers the y components of three vectors: (a.y, b.y, c.y, 0).
+PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+
+	// val[1] of a zip interleaves lane 1 of both inputs.
+	const float32x2x2_t abZip = vzip_f32(vget_low_f32(a), vget_low_f32(b));   // val[1] = {a.y, b.y}
+	const float32x2x2_t c0Zip = vzip_f32(vget_low_f32(c), vdup_n_f32(0.0f)); // val[1] = {c.y, 0}
+
+	return vcombine_f32(abZip.val[1], c0Zip.val[1]);
+}
+
+// Gathers the z components of three vectors: (a.z, b.z, c.z, c.w).
+// The last lane is c's own w, which is expected to be zero for a valid Vec3V.
+PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+
+	const float32x2x2_t abZip = vzip_f32(vget_high_f32(a), vget_high_f32(b)); // val[0] = {a.z, b.z}
+	const float32x2_t cZW = vget_high_f32(c);                                // {c.z, c.w}
+
+	return vcombine_f32(abZip.val[0], cZW);
+}
+
+// Returns a vector with all four lanes set to 0.0f.
+PX_FORCE_INLINE Vec3V V3Zero()
+{
+	const float32x4_t allZero = vdupq_n_f32(0.0f);
+	return allZero;
+}
+
+// Returns V3Load(PX_EPS_REAL), i.e. the epsilon constant loaded as a Vec3V.
+PX_FORCE_INLINE Vec3V V3Eps()
+{
+	const Vec3V epsilon = V3Load(PX_EPS_REAL);
+	return epsilon;
+}
+
+// Returns V3Load(1.0f), i.e. the value one loaded as a Vec3V.
+PX_FORCE_INLINE Vec3V V3One()
+{
+	const Vec3V ones = V3Load(1.0f);
+	return ones;
+}
+
+// Negates x, y, z and writes +0.0f back into w (plain negation would leave -0.0f there).
+PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	const float32x4_t negated = vnegq_f32(f);
+	return vsetq_lane_f32(0.0f, negated, 3);
+}
+
+// Lane-wise sum a + b.
+PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t sum = vaddq_f32(a, b);
+	return sum;
+}
+
+// Adds the scalar b, expanded through Vec3V_From_FloatV, to the vector a.
+PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const float32x4_t sum = vaddq_f32(a, Vec3V_From_FloatV(b));
+	return sum;
+}
+
+// Lane-wise difference a - b.
+PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t difference = vsubq_f32(a, b);
+	return difference;
+}
+
+// Subtracts the scalar b, expanded through Vec3V_From_FloatV, from the vector a.
+PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const float32x4_t difference = vsubq_f32(a, Vec3V_From_FloatV(b));
+	return difference;
+}
+
+// Multiplies every lane of a by lane 0 of b, then forces w to zero.
+PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const float32x4_t scaled = vmulq_lane_f32(a, b, 0);
+	return vsetq_lane_f32(0.0f, scaled, 3);
+}
+
+// Lane-wise product a * b.
+PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t product = vmulq_f32(a, b);
+	return product;
+}
+
+// Divides a by the scalar b: multiplies by VRECIP(b) (lane 0), then forces w to zero.
+PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const float32x2_t reciprocal = VRECIP(b);
+	const float32x4_t scaled = vmulq_lane_f32(a, reciprocal, 0);
+	return vsetq_lane_f32(0.0f, scaled, 3);
+}
+
+// Lane-wise a / b via VRECIPQ(b). The reciprocal's w lane is zeroed first so the
+// w product does not carry the reciprocal of b's zero w lane.
+PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t rawRecip = VRECIPQ(b);
+	const float32x4_t recipXYZ = vsetq_lane_f32(0.0f, rawRecip, 3);
+	return vmulq_f32(a, recipXYZ);
+}
+
+// Divides a by the scalar b using the VRECIPE reciprocal, then forces w to zero.
+PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const float32x2_t reciprocalEstimate = VRECIPE(b);
+	const float32x4_t scaled = vmulq_lane_f32(a, reciprocalEstimate, 0);
+	return vsetq_lane_f32(0.0f, scaled, 3);
+}
+
+// Lane-wise a / b using the VRECIPEQ reciprocal; the reciprocal's w lane is zeroed
+// before the multiply.
+PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t rawRecip = VRECIPEQ(b);
+	const float32x4_t recipXYZ = vsetq_lane_f32(0.0f, rawRecip, 3);
+	return vmulq_f32(a, recipXYZ);
+}
+
+// Lane-wise reciprocal through VRECIPQ, with w forced to zero.
+PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const float32x4_t reciprocal = VRECIPQ(a);
+	return vsetq_lane_f32(0.0f, reciprocal, 3);
+}
+
+// Lane-wise reciprocal through VRECIPEQ, with w forced to zero.
+PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const float32x4_t reciprocalEstimate = VRECIPEQ(a);
+	return vsetq_lane_f32(0.0f, reciprocalEstimate, 3);
+}
+
+// Lane-wise reciprocal square root through VRECIPSQRTQ, with w forced to zero.
+PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const float32x4_t invSqrt = VRECIPSQRTQ(a);
+	return vsetq_lane_f32(0.0f, invSqrt, 3);
+}
+
+// Lane-wise reciprocal square root through VRECIPSQRTEQ, with w forced to zero.
+PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const float32x4_t invSqrtEstimate = VRECIPSQRTEQ(a);
+	return vsetq_lane_f32(0.0f, invSqrtEstimate, 3);
+}
+
+// Returns c + a * b, where b is a scalar read from lane 0, with w forced to zero.
+PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDVEC3V(c);
+
+	// Multiply-accumulate: tmp = c + a * b[0] in every lane.
+	float32x4_t tmp = vmlaq_lane_f32(c, a, b, 0);
+	// using vsetq_lane_f32 resulted in failures,
+	// probably related to a compiler bug on
+	// ndk r9d-win32, gcc 4.8, cardhu/shield
+
+	// code with issue
+	// return vsetq_lane_f32(0.0f, tmp, 3);
+
+	// workaround: split into 64-bit halves, clear lane 1 of the high half
+	// (the w lane) and recombine. Despite the names, lane order is
+	// w_z = {z, w} and y_x = {x, y}.
+	float32x2_t w_z = vget_high_f32(tmp);
+	float32x2_t y_x = vget_low_f32(tmp);
+	w_z = vset_lane_f32(0.0f, w_z, 1);
+	return vcombine_f32(y_x, w_z);
+}
+
+// Returns c - a * b, where b is a scalar read from lane 0, with w forced to zero.
+PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDVEC3V(c);
+
+	// Multiply-subtract: tmp = c - a * b[0] in every lane.
+	float32x4_t tmp = vmlsq_lane_f32(c, a, b, 0);
+	// using vsetq_lane_f32 resulted in failures,
+	// probably related to a compiler bug on
+	// ndk r9d-win32, gcc 4.8, cardhu/shield
+
+	// code with issue
+	// return vsetq_lane_f32(0.0f, tmp, 3);
+
+	// workaround: split into 64-bit halves, clear lane 1 of the high half
+	// (the w lane) and recombine. Despite the names, lane order is
+	// w_z = {z, w} and y_x = {x, y}.
+	float32x2_t w_z = vget_high_f32(tmp);
+	float32x2_t y_x = vget_low_f32(tmp);
+	w_z = vset_lane_f32(0.0f, w_z, 1);
+	return vcombine_f32(y_x, w_z);
+}
+
+// Lane-wise multiply-accumulate: c + a * b.
+PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	const float32x4_t accumulated = vmlaq_f32(c, a, b);
+	return accumulated;
+}
+
+// Lane-wise multiply-subtract: c - a * b.
+PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	const float32x4_t remainder = vmlsq_f32(c, a, b);
+	return remainder;
+}
+
+// Lane-wise absolute value.
+PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const float32x4_t magnitude = vabsq_f32(a);
+	return magnitude;
+}
+
+// Dot product of a and b, returned in both lanes of a FloatV.
+// The w lanes are not masked: the sum includes a.w * b.w, which is zero for valid inputs.
+PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+
+	const float32x4_t products = vmulq_f32(a, b);
+
+	// First pairwise add gives {x + y, z + w}.
+	const float32x2_t pairSums = vpadd_f32(vget_low_f32(products), vget_high_f32(products));
+	// Second pairwise add puts x + y + z (+ w) in both lanes.
+	const float32x2_t total = vpadd_f32(pairSums, pairSums);
+
+	return total;
+}
+
+// Cross product a x b = (ay*bz - az*by, az*bx - ax*bz, ax*by - ay*bx), with w = 0.
+// Works on 64-bit halves. Variable names and trailing comments list lanes in
+// "high, low" order, e.g. ay_ax holds {lane0 = ax, lane1 = ay}.
+PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+
+	// Mask that keeps lane 0 and clears lane 1.
+	const uint32x2_t TF = { 0xffffFFFF, 0x0 };
+	const float32x2_t ay_ax = vget_low_f32(a);  // d2
+	const float32x2_t aw_az = vget_high_f32(a); // d3
+	const float32x2_t by_bx = vget_low_f32(b);  // d4
+	const float32x2_t bw_bz = vget_high_f32(b); // d5
+	// Hi, Lo
+	const float32x2_t bz_by = vext_f32(by_bx, bw_bz, 1); // bz, by
+	const float32x2_t az_ay = vext_f32(ay_ax, aw_az, 1); // az, ay
+
+	const float32x2_t azbx = vmul_f32(aw_az, by_bx);      // 0, az*bx
+	const float32x2_t aybz_axby = vmul_f32(ay_ax, bz_by); // ay*bz, ax*by
+
+	// vmls_f32(acc, p, q) computes acc - p * q per lane.
+	const float32x2_t azbxSUBaxbz = vmls_f32(azbx, bw_bz, ay_ax);                  // 0, az*bx-ax*bz
+	const float32x2_t aybzSUBazby_axbySUBaybx = vmls_f32(aybz_axby, by_bx, az_ay); // ay*bz-az*by, ax*by-ay*bx
+
+	// Low half of the result is {x, y} of the cross product; high half is {z, 0}.
+	const float32x2_t retLow = vext_f32(aybzSUBazby_axbySUBaybx, azbxSUBaxbz, 1);           // az*bx-ax*bz, ay*bz-az*by
+	const uint32x2_t retHigh = vand_u32(TF, vreinterpret_u32_f32(aybzSUBazby_axbySUBaybx)); // 0, ax*by-ay*bx
+
+	return vcombine_f32(retLow, vreinterpret_f32_u32(retHigh));
+}
+
+// Pass-through on this platform: the input vector is returned unchanged as a VecCrossV.
+PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const VecCrossV prepared = a;
+	return prepared;
+}
+
+// Euclidean length of a: FSqrt of the summed squares, returned as a FloatV.
+// The w lane is not masked; it contributes a.w * a.w, which is zero for a valid Vec3V.
+PX_FORCE_INLINE FloatV V3Length(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const float32x4_t squares = vmulq_f32(a, a);
+
+	// {x*x + y*y, z*z + w*w}
+	const float32x2_t pairSums = vpadd_f32(vget_low_f32(squares), vget_high_f32(squares));
+	// Squared length in both lanes.
+	const float32x2_t lengthSq = vpadd_f32(pairSums, pairSums);
+
+	return FSqrt(lengthSq);
+}
+
+// Squared length of a, computed as V3Dot(a, a).
+PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const FloatV lengthSq = V3Dot(a, a);
+	return lengthSq;
+}
+
+// Returns a divided by its own length. No zero-length guard is applied here;
+// V3NormalizeSafe is the guarded variant.
+PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	//PX_ASSERT(!FAllEq(V4LengthSq(a), FZero()));
+	const FloatV length = V3Length(a);
+	return V3ScaleInv(a, length);
+}
+
+// Normalizes a by scaling with VRECIPSQRTE(dot(a, a)). No zero-length guard is applied.
+PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	//PX_ASSERT(!FAllEq(V4LengthSq(a), FZero()));
+	const FloatV lengthSq = V3Dot(a, a);
+	return V3Scale(a, VRECIPSQRTE(lengthSq));
+}
+
+// Normalizes a when its length is greater than zero; otherwise returns unsafeReturnValue.
+// The division is always evaluated; the select only decides which value is returned.
+PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const FloatV len = V3Length(a);
+	const FloatV zeroLen = vdup_n_f32(0.0f);
+	const uint32x4_t hasLength = FIsGrtr(len, zeroLen);
+	return V3Sel(hasLength, V3ScaleInv(a, len), unsafeReturnValue);
+}
+
+// Bitwise select: takes bits from a where the mask c is set and from b elsewhere.
+PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b)
+{
+	const float32x4_t selected = vbslq_f32(c, a, b);
+	ASSERT_ISVALIDVEC3V(selected);
+	return selected;
+}
+
+// Lane-wise a > b; each lane of the result is all-ones when true and zero when false.
+PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const uint32x4_t greaterMask = vcgtq_f32(a, b);
+	return greaterMask;
+}
+
+// Lane-wise a >= b; each lane of the result is all-ones when true and zero when false.
+PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const uint32x4_t greaterOrEqMask = vcgeq_f32(a, b);
+	return greaterOrEqMask;
+}
+
+// Lane-wise a == b; each lane of the result is all-ones when true and zero when false.
+PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const uint32x4_t equalMask = vceqq_f32(a, b);
+	return equalMask;
+}
+
+// Lane-wise maximum of a and b.
+PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t larger = vmaxq_f32(a, b);
+	return larger;
+}
+
+// Lane-wise minimum of a and b.
+PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const float32x4_t smaller = vminq_f32(a, b);
+	return smaller;
+}
+
+// Returns max(x, y, z) in both lanes of a FloatV. The w lane is excluded by
+// duplicating z before the pairwise reduction.
+PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const float32x2_t xy = vget_low_f32(a);
+	const float32x2_t zDup = vdup_lane_f32(vget_high_f32(a), 0); // {z, z}
+
+	const float32x2_t partial = vpmax_f32(zDup, xy);         // {z, max(x, y)}
+	const float32x2_t overall = vpmax_f32(partial, partial); // max(x, y, z) in both lanes
+
+	return overall;
+}
+
+// Returns min(x, y, z) in both lanes of a FloatV. The w lane is excluded by
+// duplicating z before the pairwise reduction.
+PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const float32x2_t xy = vget_low_f32(a);
+	const float32x2_t zDup = vdup_lane_f32(vget_high_f32(a), 0); // {z, z}
+
+	const float32x2_t partial = vpmin_f32(zDup, xy);         // {z, min(x, y)}
+	const float32x2_t overall = vpmin_f32(partial, partial); // min(x, y, z) in both lanes
+
+	return overall;
+}
+
+// Per lane: (a >= 0.0f) ? 1.0f : -1.0f, with the constants taken from V3One() and V3Neg(V3One()).
+PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V zeros = V3Zero();
+	const Vec3V plusOne = V3One();
+	const Vec3V minusOne = V3Neg(plusOne);
+	const BoolV nonNegative = V3IsGrtrOrEq(a, zeros);
+	return V3Sel(nonNegative, plusOne, minusOne);
+}
+
+// Clamps a lane-wise: first limits it to maxV from above, then to minV from below.
+PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV)
+{
+	ASSERT_ISVALIDVEC3V(minV);
+	ASSERT_ISVALIDVEC3V(maxV);
+	const Vec3V cappedAbove = V3Min(a, maxV);
+	return V3Max(cappedAbove, minV);
+}
+
+// Reduces the lane-wise a > b mask with BAllTrue3_R (three-lane "all true" reduction).
+PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const BoolV greaterMask = V4IsGrtr(a, b);
+	return internalUnitNeonSimd::BAllTrue3_R(greaterMask);
+}
+
+// Reduces the lane-wise a >= b mask with BAllTrue3_R (three-lane "all true" reduction).
+PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const BoolV greaterOrEqMask = V4IsGrtrOrEq(a, b);
+	return internalUnitNeonSimd::BAllTrue3_R(greaterOrEqMask);
+}
+
+// Reduces the lane-wise a == b mask with BAllTrue3_R (three-lane "all true" reduction).
+PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const BoolV equalMask = V4IsEq(a, b);
+	return internalUnitNeonSimd::BAllTrue3_R(equalMask);
+}
+
+// Rounds each lane to the nearest integer (halves away from zero):
+// truncate(a + 0.5 - signBit(a)), where signBit is 1.0 for negative lanes and 0.0 otherwise.
+// The truncation is a float -> int32 -> float round trip.
+PX_FORCE_INLINE Vec3V V3Round(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	// Sign bit of each lane moved down to bit 0 and converted to 0.0f / 1.0f.
+	const uint32x4_t signBits = vshrq_n_u32(vreinterpretq_u32_f32(a), 31);
+	const float32x4_t negativeAsOne = vcvtq_f32_u32(signBits);
+	const Vec3V biased = V3Sub(V3Add(a, V3Load(0.5f)), negativeAsOne);
+	const int32x4_t truncated = vcvtq_s32_f32(biased);
+	return vcvtq_f32_s32(truncated);
+}
+
+// Lane-wise sine of a (radians), evaluated as an odd-power polynomial up to V^23
+// after reducing the angle by whole turns.
+PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	// Range reduction: V1 = a - 2*PI * round(a / (2*PI)), which brings the angle to
+	// roughly -PI <= V1 <= PI (presumably V4NegMulSub(x, y, z) is z - x * y, matching V3NegMulSub).
+	const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec3V tmp = V4Mul(a, recipTwoPi);
+	const Vec3V b = V3Round(tmp);
+	const Vec3V V1 = V4NegMulSub(twoPi, b, a);
+
+	// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+	//           V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+	// Odd powers of V1, each built from the previous one times V1^2.
+	const Vec3V V2 = V3Mul(V1, V1);
+	const Vec3V V3 = V3Mul(V2, V1);
+	const Vec3V V5 = V3Mul(V3, V2);
+	const Vec3V V7 = V3Mul(V5, V2);
+	const Vec3V V9 = V3Mul(V7, V2);
+	const Vec3V V11 = V3Mul(V9, V2);
+	const Vec3V V13 = V3Mul(V11, V2);
+	const Vec3V V15 = V3Mul(V13, V2);
+	const Vec3V V17 = V3Mul(V15, V2);
+	const Vec3V V19 = V3Mul(V17, V2);
+	const Vec3V V21 = V3Mul(V19, V2);
+	const Vec3V V23 = V3Mul(V21, V2);
+
+	const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	// Coefficients S1..S11 are read starting at lane Y of the first table; lane X is not used.
+	const FloatV S1 = V4GetY(sinCoefficients0);
+	const FloatV S2 = V4GetZ(sinCoefficients0);
+	const FloatV S3 = V4GetW(sinCoefficients0);
+	const FloatV S4 = V4GetX(sinCoefficients1);
+	const FloatV S5 = V4GetY(sinCoefficients1);
+	const FloatV S6 = V4GetZ(sinCoefficients1);
+	const FloatV S7 = V4GetW(sinCoefficients1);
+	const FloatV S8 = V4GetX(sinCoefficients2);
+	const FloatV S9 = V4GetY(sinCoefficients2);
+	const FloatV S10 = V4GetZ(sinCoefficients2);
+	const FloatV S11 = V4GetW(sinCoefficients2);
+
+	// Accumulate the series term by term, starting from V1
+	// (presumably V4ScaleAdd(v, s, acc) is acc + v * s, matching V3ScaleAdd).
+	Vec3V Result;
+	Result = V4ScaleAdd(V3, S1, V1);
+	Result = V4ScaleAdd(V5, S2, Result);
+	Result = V4ScaleAdd(V7, S3, Result);
+	Result = V4ScaleAdd(V9, S4, Result);
+	Result = V4ScaleAdd(V11, S5, Result);
+	Result = V4ScaleAdd(V13, S6, Result);
+	Result = V4ScaleAdd(V15, S7, Result);
+	Result = V4ScaleAdd(V17, S8, Result);
+	Result = V4ScaleAdd(V19, S9, Result);
+	Result = V4ScaleAdd(V21, S10, Result);
+	Result = V4ScaleAdd(V23, S11, Result);
+
+	return Result;
+}
+
+// Lane-wise cosine of a (radians), evaluated as an even-power polynomial up to V^22
+// after reducing the angle by whole turns. The w lane is cleared before returning.
+PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	
+	// Range reduction: V1 = a - 2*PI * round(a / (2*PI)), which brings the angle to
+	// roughly -PI <= V1 <= PI (presumably V4NegMulSub(x, y, z) is z - x * y, matching V3NegMulSub).
+	const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec3V tmp = V4Mul(a, recipTwoPi);
+	const Vec3V b = V3Round(tmp);
+	const Vec3V V1 = V4NegMulSub(twoPi, b, a);
+
+	// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+	//           V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+	// Even powers of V1, built from products of lower even powers.
+	const Vec3V V2 = V3Mul(V1, V1);
+	const Vec3V V4 = V3Mul(V2, V2);
+	const Vec3V V6 = V3Mul(V4, V2);
+	const Vec3V V8 = V3Mul(V4, V4);
+	const Vec3V V10 = V3Mul(V6, V4);
+	const Vec3V V12 = V3Mul(V6, V6);
+	const Vec3V V14 = V3Mul(V8, V6);
+	const Vec3V V16 = V3Mul(V8, V8);
+	const Vec3V V18 = V3Mul(V10, V8);
+	const Vec3V V20 = V3Mul(V10, V10);
+	const Vec3V V22 = V3Mul(V12, V10);
+
+	const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	// Coefficients C1..C11 are read starting at lane Y of the first table; lane X is not used.
+	const FloatV C1 = V4GetY(cosCoefficients0);
+	const FloatV C2 = V4GetZ(cosCoefficients0);
+	const FloatV C3 = V4GetW(cosCoefficients0);
+	const FloatV C4 = V4GetX(cosCoefficients1);
+	const FloatV C5 = V4GetY(cosCoefficients1);
+	const FloatV C6 = V4GetZ(cosCoefficients1);
+	const FloatV C7 = V4GetW(cosCoefficients1);
+	const FloatV C8 = V4GetX(cosCoefficients2);
+	const FloatV C9 = V4GetY(cosCoefficients2);
+	const FloatV C10 = V4GetZ(cosCoefficients2);
+	const FloatV C11 = V4GetW(cosCoefficients2);
+
+	// Accumulate the series term by term, starting from 1 in every lane
+	// (presumably V4ScaleAdd(v, s, acc) is acc + v * s, matching V3ScaleAdd).
+	Vec3V Result;
+	Result = V4ScaleAdd(V2, C1, V4One());
+	Result = V4ScaleAdd(V4, C2, Result);
+	Result = V4ScaleAdd(V6, C3, Result);
+	Result = V4ScaleAdd(V8, C4, Result);
+	Result = V4ScaleAdd(V10, C5, Result);
+	Result = V4ScaleAdd(V12, C6, Result);
+	Result = V4ScaleAdd(V14, C7, Result);
+	Result = V4ScaleAdd(V16, C8, Result);
+	Result = V4ScaleAdd(V18, C9, Result);
+	Result = V4ScaleAdd(V20, C10, Result);
+	Result = V4ScaleAdd(V22, C11, Result);
+
+	// The series starts from V4One(), so w holds 1 at this point; clear it for the Vec3V result.
+	return V4ClearW(Result);
+}
+
+// Swizzle: returns (a.y, a.z, a.z, a.w).
+PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const float32x2_t lowXY = vget_low_f32(a);
+	const float32x2_t highZW = vget_high_f32(a);
+	const float32x2_t midYZ = vext_f32(lowXY, highZW, 1); // {y, z}
+	return vcombine_f32(midYZ, highZW);
+}
+
+// Swizzle: returns (a.x, a.y, a.x, 0).
+PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const uint32x2_t keepLane0 = { 0xffffFFFF, 0x0 };
+
+	const uint32x2_t lowXY = vget_low_u32(vreinterpretq_u32_f32(a));
+	const uint32x2_t xAndZero = vand_u32(lowXY, keepLane0); // {x, 0}
+	return vreinterpretq_f32_u32(vcombine_u32(lowXY, xAndZero));
+}
+
+// Swizzle: returns (a.y, a.z, a.x, 0).
+PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const uint32x2_t keepLane0 = { 0xffffFFFF, 0x0 };
+
+	const uint32x4_t bits = vreinterpretq_u32_f32(a);
+	const uint32x2_t lowXY = vget_low_u32(bits);
+	const uint32x2_t highZW = vget_high_u32(bits);
+	const uint32x2_t midYZ = vext_u32(lowXY, highZW, 1);    // {y, z}
+	const uint32x2_t xAndZero = vand_u32(lowXY, keepLane0); // {x, 0}
+	return vreinterpretq_f32_u32(vcombine_u32(midYZ, xAndZero));
+}
+
+// Swizzle: returns (a.z, a.x, a.y, a.w).
+PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const uint32x4_t bits = vreinterpretq_u32_f32(a);
+	const uint32x2_t lowXY = vget_low_u32(bits);
+	const uint32x2_t swappedWZ = vrev64_u32(vget_high_u32(bits)); // {w, z}
+
+	const uint32x2_t pairZX = vext_u32(swappedWZ, lowXY, 1); // {z, x}
+	const uint32x2_t pairYW = vext_u32(lowXY, swappedWZ, 1); // {y, w}
+
+	return vreinterpretq_f32_u32(vcombine_u32(pairZX, pairYW));
+}
+
+// Swizzle: returns (a.z, a.z, a.y, a.w).
+PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const uint32x4_t bits = vreinterpretq_u32_f32(a);
+	const uint32x2_t lowXY = vget_low_u32(bits);
+	const uint32x2_t swappedWZ = vrev64_u32(vget_high_u32(bits)); // {w, z}
+
+	const uint32x2_t pairYW = vext_u32(lowXY, swappedWZ, 1); // {y, w}
+	const uint32x2_t pairZZ = vdup_lane_u32(swappedWZ, 1);   // {z, z}
+
+	return vreinterpretq_f32_u32(vcombine_u32(pairZZ, pairYW));
+}
+
+// Swizzle: returns (a.y, a.x, a.x, 0).
+PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const uint32x2_t keepLane0 = { 0xffffFFFF, 0x0 };
+
+	const uint32x2_t lowXY = vget_low_u32(vreinterpretq_u32_f32(a));
+	const uint32x2_t swappedYX = vrev64_u32(lowXY);         // {y, x}
+	const uint32x2_t xAndZero = vand_u32(lowXY, keepLane0); // {x, 0}
+	return vreinterpretq_f32_u32(vcombine_u32(swappedYX, xAndZero));
+}
+
+// Two-source swizzle: returns (v1.w, v1.z, v0.y, v1.w). The "Zero" slots are filled from
+// v1's w lane, which is zero for a valid Vec3V.
+PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+
+	const uint32x2_t v0XY = vget_low_u32(vreinterpretq_u32_f32(v0));
+	const uint32x2_t v1WZ = vrev64_u32(vget_high_u32(vreinterpretq_u32_f32(v1))); // {v1.w, v1.z}
+	const uint32x2_t v0YandW = vext_u32(v0XY, v1WZ, 1);                           // {v0.y, v1.w}
+
+	return vreinterpretq_f32_u32(vcombine_u32(v1WZ, v0YandW));
+}
+
+// Two-source swizzle: returns (v0.z, v0.w, v1.x, 0). The middle "Zero" slot is v0's
+// w lane, which is zero for a valid Vec3V.
+PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+
+	const uint32x2_t keepLane0 = { 0xffffFFFF, 0x0 };
+
+	const uint32x2_t v0ZW = vget_high_u32(vreinterpretq_u32_f32(v0));
+	const uint32x2_t v1XY = vget_low_u32(vreinterpretq_u32_f32(v1));
+	const uint32x2_t v1XandZero = vand_u32(v1XY, keepLane0); // {v1.x, 0}
+
+	return vreinterpretq_f32_u32(vcombine_u32(v0ZW, v1XandZero));
+}
+
+// Two-source swizzle: returns (v1.y, v0.x, 0, 0).
+PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+
+	const uint32x2_t v0XY = vget_low_u32(vreinterpretq_u32_f32(v0));
+	const uint32x2_t v1XY = vget_low_u32(vreinterpretq_u32_f32(v1));
+	const uint32x2_t v1YthenV0X = vext_u32(v1XY, v0XY, 1); // {v1.y, v0.x}
+	const uint32x2_t zeroPair = vdup_n_u32(0);
+
+	return vreinterpretq_f32_u32(vcombine_u32(v1YthenV0X, zeroPair));
+}
+
+// Horizontal sum x + y + z (+ w, which is zero for a valid Vec3V), returned in both lanes.
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	// {x + y, z + w}
+	const float32x2_t pairSums = vpadd_f32(vget_low_f32(a), vget_high_f32(a));
+	// Full sum replicated in both lanes.
+	const float32x2_t total = vpadd_f32(pairSums, pairSums);
+
+	return total;
+}
+
+// Builds the lane-wise mask (a > max) | (min > a) and reduces it with BAnyTrue3_R
+// (three-lane "any true" reduction).
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+
+	const BoolV aboveMax = V3IsGrtr(a, max);
+	const BoolV belowMin = V3IsGrtr(min, a);
+	const BoolV outside = BOr(aboveMax, belowMin);
+	return internalUnitNeonSimd::BAnyTrue3_R(outside);
+}
+
+// Builds the lane-wise mask (a >= min) & (max >= a) and reduces it over x, y and z.
+// The reduction uses BAllTrue3_R, the same three-lane helper that V3AllGrtrOrEq uses,
+// so the unused w lane cannot influence the result. For valid Vec3V inputs (w == 0)
+// this matches the former four-lane reduction.
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+
+	const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a));
+	return internalUnitNeonSimd::BAllTrue3_R(c);
+}
+
+// Symmetric range test: builds the lane-wise mask |a| > bounds and reduces it with
+// BAnyTrue3_R (three-lane "any true" reduction).
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+
+	const Vec3V magnitude = V3Abs(a);
+	const BoolV exceeds = V3IsGrtr(magnitude, bounds);
+	return internalUnitNeonSimd::BAnyTrue3_R(exceeds);
+}
+
+// Symmetric range test: builds the lane-wise mask bounds >= |a| and reduces it over
+// x, y and z. The reduction uses BAllTrue3_R, the same three-lane helper that
+// V3AllGrtrOrEq uses, so the unused w lane cannot influence the result. For valid
+// Vec3V inputs (w == 0) this matches the former four-lane reduction.
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+
+	const BoolV greaterOrEq = V3IsGrtrOrEq(bounds, V3Abs(a));
+	return internalUnitNeonSimd::BAllTrue3_R(greaterOrEq);
+}
+
+// In-place transpose of the 3x3 matrix held in col0..col2. A zero fourth column is
+// used so the outputs keep w = 0; the fourth transposed row is discarded.
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+	ASSERT_ISVALIDVEC3V(col0);
+	ASSERT_ISVALIDVEC3V(col1);
+	ASSERT_ISVALIDVEC3V(col2);
+
+	const Vec3V zeroCol = V3Zero();
+	// First zip stage pairs column 0 with 2 and column 1 with the zero column.
+	const float32x4x2_t zip02 = vzipq_f32(col0, col2);
+	const float32x4x2_t zip13 = vzipq_f32(col1, zeroCol);
+	// Second stage interleaves those pairs into rows.
+	const float32x4x2_t rowsXY = vzipq_f32(zip02.val[0], zip13.val[0]);
+	const float32x4x2_t rowsZW = vzipq_f32(zip02.val[1], zip13.val[1]);
+	col0 = rowsXY.val[0]; // (c0.x, c1.x, c2.x, 0)
+	col1 = rowsXY.val[1]; // (c0.y, c1.y, c2.y, 0)
+	col2 = rowsZW.val[0]; // (c0.z, c1.z, c2.z, 0)
+}
+
+//////////////////////////////////
+// VEC4V
+//////////////////////////////////
+
+// Builds {f[0], f[1], f[0], f[1]}; for a splatted FloatV that is f in all four lanes.
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t replicated = vcombine_f32(f, f);
+	return replicated;
+}
+
+// Packs four scalars from an array into one Vec4V. Lanes are taken as
+// {arr[0][1], arr[1][0], arr[2][1], arr[3][0]}; the array must hold four FloatV values.
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+	ASSERT_ISVALIDFLOATV(floatVArray[0]);
+	ASSERT_ISVALIDFLOATV(floatVArray[1]);
+	ASSERT_ISVALIDFLOATV(floatVArray[2]);
+	ASSERT_ISVALIDFLOATV(floatVArray[3]);
+
+	const uint32x2_t bits0 = vreinterpret_u32_f32(floatVArray[0]);
+	const uint32x2_t bits1 = vreinterpret_u32_f32(floatVArray[1]);
+	const uint32x2_t bits2 = vreinterpret_u32_f32(floatVArray[2]);
+	const uint32x2_t bits3 = vreinterpret_u32_f32(floatVArray[3]);
+
+	const uint32x2_t lowerPair = vext_u32(bits0, bits1, 1);
+	const uint32x2_t upperPair = vext_u32(bits2, bits3, 1);
+
+	return vreinterpretq_f32_u32(vcombine_u32(lowerPair, upperPair));
+}
+
+// Packs four scalars into one Vec4V. Lanes are taken as {x[1], y[0], z[1], w[0]}.
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+	ASSERT_ISVALIDFLOATV(w);
+
+	const uint32x2_t xBits = vreinterpret_u32_f32(x);
+	const uint32x2_t yBits = vreinterpret_u32_f32(y);
+	const uint32x2_t zBits = vreinterpret_u32_f32(z);
+	const uint32x2_t wBits = vreinterpret_u32_f32(w);
+
+	const uint32x2_t lowerPair = vext_u32(xBits, yBits, 1);
+	const uint32x2_t upperPair = vext_u32(zBits, wBits, 1);
+
+	return vreinterpretq_f32_u32(vcombine_u32(lowerPair, upperPair));
+}
+
+// Gathers the w lanes of four vectors: (x.w, y.w, z.w, w.w).
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	// Zipping the high halves gives val[0] = z lanes and val[1] = w lanes.
+	const float32x2x2_t zipXY = vzip_f32(vget_high_f32(x), vget_high_f32(y));
+	const float32x2x2_t zipZW = vzip_f32(vget_high_f32(z), vget_high_f32(w));
+
+	return vcombine_f32(zipXY.val[1], zipZW.val[1]);
+}
+
+// Gathers the z lanes of four vectors: (x.z, y.z, z.z, w.z).
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	// Zipping the high halves gives val[0] = z lanes and val[1] = w lanes.
+	const float32x2x2_t zipXY = vzip_f32(vget_high_f32(x), vget_high_f32(y));
+	const float32x2x2_t zipZW = vzip_f32(vget_high_f32(z), vget_high_f32(w));
+
+	return vcombine_f32(zipXY.val[0], zipZW.val[0]);
+}
+
+// Gathers the y lanes of four vectors: (x.y, y.y, z.y, w.y).
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	// Zipping the low halves gives val[0] = x lanes and val[1] = y lanes.
+	const float32x2x2_t zipXY = vzip_f32(vget_low_f32(x), vget_low_f32(y));
+	const float32x2x2_t zipZW = vzip_f32(vget_low_f32(z), vget_low_f32(w));
+
+	return vcombine_f32(zipXY.val[1], zipZW.val[1]);
+}
+
+// Gathers the x lanes of four vectors: (x.x, y.x, z.x, w.x).
+PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	// Zipping the low halves gives val[0] = x lanes and val[1] = y lanes.
+	const float32x2x2_t zipXY = vzip_f32(vget_low_f32(x), vget_low_f32(y));
+	const float32x2x2_t zipZW = vzip_f32(vget_low_f32(z), vget_low_f32(w));
+
+	return vcombine_f32(zipXY.val[0], zipZW.val[0]);
+}
+
+// Interleaves the low halves of a and b: (a.x, b.x, a.y, b.y).
+PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b)
+{
+	const float32x4x2_t interleaved = vzipq_f32(a, b);
+	return interleaved.val[0];
+}
+
+// Interleaves the high halves of a and b: (a.z, b.z, a.w, b.w).
+PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b)
+{
+	const float32x4x2_t interleaved = vzipq_f32(a, b);
+	return interleaved.val[1];
+}
+
+// Returns the constant (0, 0, 0, 1).
+PX_FORCE_INLINE Vec4V V4UnitW()
+{
+	const float32x2_t zeroPair = vreinterpret_f32_u32(vmov_n_u32(0));
+	const float32x2_t onePair = vmov_n_f32(1.0f);
+	const float32x2_t zeroThenOne = vext_f32(zeroPair, onePair, 1); // {0, 1}
+	return vcombine_f32(zeroPair, zeroThenOne);
+}
+
+// Returns the constant (1, 0, 0, 0).
+PX_FORCE_INLINE Vec4V V4UnitX()
+{
+	const float32x2_t zeroPair = vreinterpret_f32_u32(vmov_n_u32(0));
+	const float32x2_t onePair = vmov_n_f32(1.0f);
+	const float32x2_t oneThenZero = vext_f32(onePair, zeroPair, 1); // {1, 0}
+	return vcombine_f32(oneThenZero, zeroPair);
+}
+
+// Returns the constant (0, 1, 0, 0).
+PX_FORCE_INLINE Vec4V V4UnitY()
+{
+	const float32x2_t zeroPair = vreinterpret_f32_u32(vmov_n_u32(0));
+	const float32x2_t onePair = vmov_n_f32(1.0f);
+	const float32x2_t zeroThenOne = vext_f32(zeroPair, onePair, 1); // {0, 1}
+	return vcombine_f32(zeroThenOne, zeroPair);
+}
+
+// Returns the constant (0, 0, 1, 0).
+PX_FORCE_INLINE Vec4V V4UnitZ()
+{
+	const float32x2_t zeroPair = vreinterpret_f32_u32(vmov_n_u32(0));
+	const float32x2_t onePair = vmov_n_f32(1.0f);
+	const float32x2_t oneThenZero = vext_f32(onePair, zeroPair, 1); // {1, 0}
+	return vcombine_f32(zeroPair, oneThenZero);
+}
+
+// Extracts w and duplicates it into both lanes of the returned FloatV.
+PX_FORCE_INLINE FloatV V4GetW(const Vec4V f)
+{
+	return vdup_lane_f32(vget_high_f32(f), 1);
+}
+
+// Extracts x and duplicates it into both lanes of the returned FloatV.
+PX_FORCE_INLINE FloatV V4GetX(const Vec4V f)
+{
+	return vdup_lane_f32(vget_low_f32(f), 0);
+}
+
+// Extracts y and duplicates it into both lanes of the returned FloatV.
+PX_FORCE_INLINE FloatV V4GetY(const Vec4V f)
+{
+	return vdup_lane_f32(vget_low_f32(f), 1);
+}
+
+// Extracts z and duplicates it into both lanes of the returned FloatV.
+PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f)
+{
+	return vdup_lane_f32(vget_high_f32(f), 0);
+}
+
+// Returns v with its w lane taken from f; the BTTTF mask presumably keeps x, y, z from v.
+PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t fInAllLanes = vcombine_f32(f, f);
+	return V4Sel(BTTTF(), v, fInAllLanes);
+}
+
+// Returns v with its x lane taken from f; the BFTTT mask presumably keeps y, z, w from v.
+PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t fInAllLanes = vcombine_f32(f, f);
+	return V4Sel(BFTTT(), v, fInAllLanes);
+}
+
+// Returns v with its y lane taken from f; the BTFTT mask presumably keeps x, z, w from v.
+PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t fInAllLanes = vcombine_f32(f, f);
+	return V4Sel(BTFTT(), v, fInAllLanes);
+}
+
+// Returns v with its z lane taken from f; the BTTFT mask presumably keeps x, y, w from v.
+PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const float32x4_t fInAllLanes = vcombine_f32(f, f);
+	return V4Sel(BTTFT(), v, fInAllLanes);
+}
+
+// Returns v with its w lane taken from V4Zero(); the BTTTF mask presumably keeps x, y, z from v.
+PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v)
+{
+	const Vec4V zeros = V4Zero();
+	return V4Sel(BTTTF(), v, zeros);
+}
+
+// Swizzle: returns (a.y, a.x, a.w, a.z), i.e. swaps the lanes within each 64-bit half.
+PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a)
+{
+	const float32x2_t lowXY = vget_low_f32(a);
+	const float32x2_t highZW = vget_high_f32(a);
+	const float32x2_t swappedYX = vext_f32(lowXY, lowXY, 1);
+	const float32x2_t swappedWZ = vext_f32(highZW, highZW, 1);
+	return vcombine_f32(swappedYX, swappedWZ);
+}
+
+// Swizzle: returns (a.x, a.z, a.x, a.z).
+PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a)
+{
+	// Zipping the two halves gives val[0] = {x, z} and val[1] = {y, w}.
+	const float32x2x2_t zipped = vzip_f32(vget_low_f32(a), vget_high_f32(a));
+	const float32x2_t pairXZ = zipped.val[0];
+	return vcombine_f32(pairXZ, pairXZ);
+}
+
+// Swizzle: returns (a.y, a.w, a.y, a.w).
+PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a)
+{
+	// Zipping the two halves gives val[0] = {x, z} and val[1] = {y, w}.
+	const float32x2x2_t zipped = vzip_f32(vget_low_f32(a), vget_high_f32(a));
+	const float32x2_t pairYW = zipped.val[1];
+	return vcombine_f32(pairYW, pairYW);
+}
+
+// Swizzle: returns (a.y, a.z, a.x, a.w).
+PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a)
+{
+	const uint32x4_t bits = vreinterpretq_u32_f32(a);
+	const uint32x2_t lowXY = vget_low_u32(bits);
+	const uint32x2_t highZW = vget_high_u32(bits);
+	const uint32x2_t pairYZ = vext_u32(lowXY, highZW, 1); // {y, z}
+	const uint32x2_t pairWX = vext_u32(highZW, lowXY, 1); // {w, x}
+	const uint32x2_t pairXW = vrev64_u32(pairWX);         // {x, w}
+	return vreinterpretq_f32_u32(vcombine_u32(pairYZ, pairXW));
+}
+
+// Generic compile-time swizzle: returns (V[E0], V[E1], V[E2], V[E3]), where each
+// selector picks a source lane (0 = x, 1 = y, 2 = z, 3 = w).
+// Implemented as two byte-table lookups (vtbl2_u8) over the 16 bytes of V: each
+// ControlElement entry holds the four byte indices of one source lane.
+template <PxU8 E0, PxU8 E1, PxU8 E2, PxU8 E3>
+PX_FORCE_INLINE Vec4V V4Perm(const Vec4V V)
+{
+	// ControlElement has exactly four entries; reject selectors that would index past it.
+	static_assert(E0 < 4 && E1 < 4 && E2 < 4 && E3 < 4, "V4Perm lane selectors must be in the range [0, 3]");
+
+	static const uint32_t ControlElement[4] =
+	{
+#if 1
+		0x03020100, // XM_SWIZZLE_X
+		0x07060504, // XM_SWIZZLE_Y
+		0x0B0A0908, // XM_SWIZZLE_Z
+		0x0F0E0D0C, // XM_SWIZZLE_W
+#else
+		0x00010203, // XM_SWIZZLE_X
+		0x04050607, // XM_SWIZZLE_Y
+		0x08090A0B, // XM_SWIZZLE_Z
+		0x0C0D0E0F, // XM_SWIZZLE_W
+#endif
+	};
+
+	// The source vector viewed as a 16-byte lookup table (low half, then high half).
+	uint8x8x2_t tbl;
+	tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(V));
+	tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(V));
+
+	// Low 64 bits of the result: byte indices for E0 in the lower word, E1 in the upper word.
+	uint8x8_t idx =
+	    vcreate_u8(static_cast<uint64_t>(ControlElement[E0]) | (static_cast<uint64_t>(ControlElement[E1]) << 32));
+	const uint8x8_t rL = vtbl2_u8(tbl, idx);
+	// High 64 bits of the result: E2 and E3.
+	idx = vcreate_u8(static_cast<uint64_t>(ControlElement[E2]) | (static_cast<uint64_t>(ControlElement[E3]) << 32));
+	const uint8x8_t rH = vtbl2_u8(tbl, idx);
+	return vreinterpretq_f32_u8(vcombine_u8(rL, rH));
+}
+
+// PT: this seems measurably slower than the hardcoded version
+/*PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a)
+{
+    return V4Perm<1, 2, 0, 3>(a);
+}*/
+
+// Returns a vector whose four lanes all have an all-zero bit pattern.
+PX_FORCE_INLINE Vec4V V4Zero()
+{
+	// Built from an integer zero and reinterpreted, rather than from the float literal 0.0f.
+	const uint32x4_t zeroBits = vmovq_n_u32(0);
+	return vreinterpretq_f32_u32(zeroBits);
+}
+
+// Returns (1, 1, 1, 1).
+PX_FORCE_INLINE Vec4V V4One()
+{
+	const Vec4V ones = vmovq_n_f32(1.0f);
+	return ones;
+}
+
+// Returns PX_EPS_REAL loaded through V4Load.
+PX_FORCE_INLINE Vec4V V4Eps()
+{
+	const Vec4V eps = V4Load(PX_EPS_REAL);
+	return eps;
+}
+
+// Component-wise negation: -f.
+PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f)
+{
+	const Vec4V negated = vnegq_f32(f);
+	return negated;
+}
+
+// Component-wise sum: a + b.
+PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b)
+{
+	const Vec4V sum = vaddq_f32(a, b);
+	return sum;
+}
+
+// Component-wise difference: a - b.
+PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b)
+{
+	const Vec4V difference = vsubq_f32(a, b);
+	return difference;
+}
+
+// Multiplies every lane of a by lane 0 of the scalar vector b.
+PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b)
+{
+	const Vec4V scaled = vmulq_lane_f32(a, b, 0);
+	return scaled;
+}
+
+// Component-wise product: a * b.
+PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b)
+{
+	const Vec4V product = vmulq_f32(a, b);
+	return product;
+}
+
+// Divides every lane of a by the scalar b, as a * (1 / b) with the reciprocal taken from VRECIP.
+PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const float32x2_t reciprocal = VRECIP(b);
+	// Only lane 0 of the reciprocal is used as the multiplier.
+	return vmulq_lane_f32(a, reciprocal, 0);
+}
+
+// Component-wise division, computed as a * VRECIPQ(b).
+PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b)
+{
+	const float32x4_t reciprocal = VRECIPQ(b);
+	const Vec4V quotient = vmulq_f32(a, reciprocal);
+	return quotient;
+}
+
+// Same contract as V4ScaleInv, but the reciprocal of b comes from the VRECIPE macro.
+PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const float32x2_t reciprocal = VRECIPE(b);
+	// Only lane 0 of the reciprocal is used as the multiplier.
+	return vmulq_lane_f32(a, reciprocal, 0);
+}
+
+// Same contract as V4Div, but the reciprocal of b comes from the VRECIPEQ macro.
+PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b)
+{
+	const float32x4_t reciprocal = VRECIPEQ(b);
+	const Vec4V quotient = vmulq_f32(a, reciprocal);
+	return quotient;
+}
+
+// Component-wise reciprocal of a, via VRECIPQ.
+PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a)
+{
+	const Vec4V reciprocal = VRECIPQ(a);
+	return reciprocal;
+}
+
+// Component-wise reciprocal of a, via VRECIPEQ.
+PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a)
+{
+	const Vec4V reciprocal = VRECIPEQ(a);
+	return reciprocal;
+}
+
+// Component-wise reciprocal square root of a, via VRECIPSQRTQ.
+PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a)
+{
+	const Vec4V invSqrt = VRECIPSQRTQ(a);
+	return invSqrt;
+}
+
+// Component-wise reciprocal square root of a, via VRECIPSQRTEQ.
+PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a)
+{
+	const Vec4V invSqrt = VRECIPSQRTEQ(a);
+	return invSqrt;
+}
+
+// Component-wise square root, computed as a * rsqrt(a).
+PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a)
+{
+	// Lanes equal to zero are passed through unchanged instead of going through a * rsqrt(a).
+	const BoolV isZero = V4IsEq(a, V4Zero());
+	const Vec4V viaRsqrt = V4Mul(a, VRECIPSQRTQ(a));
+	return V4Sel(isZero, a, viaRsqrt);
+}
+
+// Returns c + a * b, where b is a scalar taken from lane 0 of the FloatV.
+PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V accumulated = vmlaq_lane_f32(c, a, b, 0);
+	return accumulated;
+}
+
+// Returns c - a * b, where b is a scalar taken from lane 0 of the FloatV.
+PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V remainder = vmlsq_lane_f32(c, a, b, 0);
+	return remainder;
+}
+
+// Component-wise multiply-accumulate: c + a * b.
+PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	const Vec4V accumulated = vmlaq_f32(c, a, b);
+	return accumulated;
+}
+
+// Component-wise multiply-subtract: c - a * b.
+PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	const Vec4V remainder = vmlsq_f32(c, a, b);
+	return remainder;
+}
+
+// Component-wise absolute value.
+PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a)
+{
+	const Vec4V magnitude = vabsq_f32(a);
+	return magnitude;
+}
+
+// Horizontal sum of the four lanes, evaluated as (x + z) + (y + w).
+PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a)
+{
+	// (x, x, y, y) + (z, z, w, w) = (x+z, x+z, y+w, y+w)
+	const Vec4V partial = V4Add(V4UnpackXY(a, a), V4UnpackZW(a, a));
+	const FloatV sumXZ = V4GetX(partial);
+	const FloatV sumYW = V4GetZ(partial);
+	return FAdd(sumXZ, sumYW);
+}
+
+// Four-component dot product; the sum is returned in both lanes of the FloatV.
+PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b)
+{
+	const float32x4_t products = vmulq_f32(a, b);
+
+	// First pairwise add yields (x+y, z+w); the second folds that into every lane.
+	const float32x2_t pairSums = vpadd_f32(vget_low_f32(products), vget_high_f32(products));
+	return vpadd_f32(pairSums, pairSums);
+}
+
+// Dot product of the XYZ parts of aa and bb; the W lanes are ignored.
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V aa, const Vec4V bb)
+{
+	// W is cleared on both inputs first so that the 4-lane reduction below only sums x, y and z.
+	// TODO: find a better implementation that does not need to clear W.
+	const float32x4_t products = vmulq_f32(V4ClearW(aa), V4ClearW(bb));
+
+	// First pairwise add yields (x+y, z+0); the second folds that into every lane.
+	const float32x2_t pairSums = vpadd_f32(vget_low_f32(products), vget_high_f32(products));
+	return vpadd_f32(pairSums, pairSums);
+}
+
+// Cross product of the XYZ parts of a and b; the W lane of the result is forced to zero.
+// Lane annotations below are written "lane1, lane0" (high lane first), matching the variable names.
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+	const uint32x2_t TF = { 0xffffFFFF, 0x0 }; // mask: keeps lane 0, clears lane 1
+	const float32x2_t ay_ax = vget_low_f32(a);  // ay, ax
+	const float32x2_t aw_az = vget_high_f32(a); // aw, az
+	const float32x2_t by_bx = vget_low_f32(b);  // by, bx
+	const float32x2_t bw_bz = vget_high_f32(b); // bw, bz
+	// Hi, Lo
+	const float32x2_t bz_by = vext_f32(by_bx, bw_bz, 1); // bz, by
+	const float32x2_t az_ay = vext_f32(ay_ax, aw_az, 1); // az, ay
+
+	// The high lane of azbx involves the W inputs; it is a don't-care that the vext below discards.
+	const float32x2_t azbx = vmul_f32(aw_az, by_bx);      // aw*by (unused), az*bx
+	const float32x2_t aybz_axby = vmul_f32(ay_ax, bz_by); // ay*bz, ax*by
+
+	const float32x2_t azbxSUBaxbz = vmls_f32(azbx, bw_bz, ay_ax);                  // (unused), az*bx-ax*bz
+	const float32x2_t aybzSUBazby_axbySUBaybx = vmls_f32(aybz_axby, by_bx, az_ay); // ay*bz-az*by, ax*by-ay*bx
+
+	// Result X/Y: lane 0 = ay*bz-az*by, lane 1 = az*bx-ax*bz.
+	const float32x2_t retLow = vext_f32(aybzSUBazby_axbySUBaybx, azbxSUBaxbz, 1);           // az*bx-ax*bz, ay*bz-az*by
+	// Result Z/W: lane 0 = ax*by-ay*bx, lane 1 masked to zero.
+	const uint32x2_t retHigh = vand_u32(TF, vreinterpret_u32_f32(aybzSUBazby_axbySUBaybx)); // 0, ax*by-ay*bx
+
+	return vcombine_f32(retLow, vreinterpret_f32_u32(retHigh));
+}
+
+// Euclidean length over all four lanes: sqrt(x*x + y*y + z*z + w*w).
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+	// V4Dot(a, a) performs the same multiply and two pairwise adds, giving the squared length.
+	const FloatV lengthSq = V4Dot(a, a);
+	return FSqrt(lengthSq);
+}
+
+// Squared length over all four lanes: dot(a, a).
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+	const FloatV lengthSq = V4Dot(a, a);
+	return lengthSq;
+}
+
+// Returns a divided by its length. No zero-length check is performed here.
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+	const FloatV length = V4Length(a);
+	return V4ScaleInv(a, length);
+}
+
+// Returns a scaled by FRsqrtFast(dot(a, a)). No zero-length check is performed here.
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+	const FloatV invLength = FRsqrtFast(V4Dot(a, a));
+	return V4Scale(a, invLength);
+}
+
+// Returns a / |a| when the length is greater than zero, otherwise unsafeReturnValue.
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue)
+{
+	const FloatV len = V4Length(a);
+	const uint32x4_t hasLength = FIsGrtr(len, FZero());
+	const Vec4V normalized = V4ScaleInv(a, len);
+	// The division is always evaluated; the select only decides which result is returned.
+	return V4Sel(hasLength, normalized, unsafeReturnValue);
+}
+
+// Per-lane integer equality mask: all bits set where a == b, zero elsewhere.
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+	const BoolV mask = vceqq_u32(a, b);
+	return mask;
+}
+
+// Bitwise select: takes bits from a where c is set and from b where c is clear.
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+	const Vec4V chosen = vbslq_f32(c, a, b);
+	return chosen;
+}
+
+// Per-lane mask of a > b.
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+	const BoolV mask = vcgtq_f32(a, b);
+	return mask;
+}
+
+// Per-lane mask of a >= b.
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV mask = vcgeq_f32(a, b);
+	return mask;
+}
+
+// Per-lane mask of a == b (floating-point comparison).
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV mask = vceqq_f32(a, b);
+	return mask;
+}
+
+// Component-wise maximum of a and b.
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+	const Vec4V larger = vmaxq_f32(a, b);
+	return larger;
+}
+
+// Component-wise minimum of a and b.
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+	const Vec4V smaller = vminq_f32(a, b);
+	return smaller;
+}
+
+// Largest of the four lanes of a, returned in both lanes of the FloatV.
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+	// First pairwise max yields (max(z, w), max(x, y)); the second reduces that pair.
+	const float32x2_t pairMax = vpmax_f32(vget_high_f32(a), vget_low_f32(a));
+	return vpmax_f32(pairMax, pairMax);
+}
+
+// Smallest of the four lanes of a, returned in both lanes of the FloatV.
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+	// First pairwise min yields (min(z, w), min(x, y)); the second reduces that pair.
+	const float32x2_t pairMin = vpmin_f32(vget_high_f32(a), vget_low_f32(a));
+	return vpmin_f32(pairMin, pairMin);
+}
+
+// Component-wise clamp of a to [minV, maxV]; the upper bound is applied first, then the lower bound.
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+	const Vec4V capped = V4Min(a, maxV);
+	return V4Max(capped, minV);
+}
+
+// Reduces the per-lane a > b mask with BAllTrue4_R (all four lanes).
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+	const BoolV greater = V4IsGrtr(a, b);
+	return internalUnitNeonSimd::BAllTrue4_R(greater);
+}
+
+// Reduces the per-lane a >= b mask with BAllTrue4_R (all four lanes).
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterOrEqual = V4IsGrtrOrEq(a, b);
+	return internalUnitNeonSimd::BAllTrue4_R(greaterOrEqual);
+}
+
+// Reduces the per-lane a >= b mask with BAllTrue3_R (the three-lane variant).
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterOrEqual = V4IsGrtrOrEq(a, b);
+	return internalUnitNeonSimd::BAllTrue3_R(greaterOrEqual);
+}
+
+// Reduces the per-lane a == b mask with BAllTrue4_R (all four lanes).
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV equal = V4IsEq(a, b);
+	return internalUnitNeonSimd::BAllTrue4_R(equal);
+}
+
+// Reduces the per-lane a > b mask with BAnyTrue3_R (the three-lane "any" variant).
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+	const BoolV greater = V4IsGrtr(a, b);
+	return internalUnitNeonSimd::BAnyTrue3_R(greater);
+}
+
+// Rounds each lane via truncation: truncate(a + 0.5f - signbit(a)).
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+	// Shifting the raw bits right by 31 leaves the sign bit: 1 where it is set, 0 otherwise.
+	const uint32x4_t signBit = vshrq_n_u32(vreinterpretq_u32_f32(a), 31);
+	const float32x4_t signAsFloat = vcvtq_f32_u32(signBit);
+
+	// (a + 0.5) - sign, so lanes with the sign bit set are biased by -0.5 instead of +0.5.
+	const Vec4V biased = V4Sub(V4Add(a, V4Load(0.5f)), signAsFloat);
+
+	// The float -> int32 -> float round trip drops the fractional part.
+	const int32x4_t truncated = vcvtq_s32_f32(biased);
+	return vcvtq_f32_s32(truncated);
+}
+
+// Component-wise sine, evaluated as an odd power series after range reduction.
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+	// Range reduction: x = a - 2*pi * round(a / (2*pi)).
+	const Vec4V invTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec4V turns = V4Round(V4Mul(a, invTwoPi));
+	const Vec4V x = V4NegMulSub(twoPi, turns, a);
+	const Vec4V x2 = V4Mul(x, x);
+
+	// Series coefficients for the x^3 ... x^23 terms; the X lane of the first table is not used.
+	const Vec4V coeffs0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V coeffs1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V coeffs2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	// sin(x) ~= x - x^3 / 3! + x^5 / 5! - ... - x^23 / 23! (for -PI <= x < PI)
+	// Each step raises the odd power by x^2 and accumulates coefficient * power.
+	Vec4V power = V4Mul(x2, x); // x^3
+	Vec4V sum = V4ScaleAdd(power, V4GetY(coeffs0), x);
+	power = V4Mul(power, x2); // x^5
+	sum = V4ScaleAdd(power, V4GetZ(coeffs0), sum);
+	power = V4Mul(power, x2); // x^7
+	sum = V4ScaleAdd(power, V4GetW(coeffs0), sum);
+	power = V4Mul(power, x2); // x^9
+	sum = V4ScaleAdd(power, V4GetX(coeffs1), sum);
+	power = V4Mul(power, x2); // x^11
+	sum = V4ScaleAdd(power, V4GetY(coeffs1), sum);
+	power = V4Mul(power, x2); // x^13
+	sum = V4ScaleAdd(power, V4GetZ(coeffs1), sum);
+	power = V4Mul(power, x2); // x^15
+	sum = V4ScaleAdd(power, V4GetW(coeffs1), sum);
+	power = V4Mul(power, x2); // x^17
+	sum = V4ScaleAdd(power, V4GetX(coeffs2), sum);
+	power = V4Mul(power, x2); // x^19
+	sum = V4ScaleAdd(power, V4GetY(coeffs2), sum);
+	power = V4Mul(power, x2); // x^21
+	sum = V4ScaleAdd(power, V4GetZ(coeffs2), sum);
+	power = V4Mul(power, x2); // x^23
+	sum = V4ScaleAdd(power, V4GetW(coeffs2), sum);
+
+	return sum;
+}
+
+// Component-wise cosine, evaluated as an even power series after range reduction.
+PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a)
+{
+	// Range reduction: x = a - 2*pi * round(a / (2*pi)).
+	const Vec4V invTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec4V turns = V4Round(V4Mul(a, invTwoPi));
+	const Vec4V x = V4NegMulSub(twoPi, turns, a);
+
+	// Even powers of x, each built as a product of two lower even powers.
+	const Vec4V x2 = V4Mul(x, x);
+	const Vec4V x4 = V4Mul(x2, x2);
+	const Vec4V x6 = V4Mul(x4, x2);
+	const Vec4V x8 = V4Mul(x4, x4);
+	const Vec4V x10 = V4Mul(x6, x4);
+	const Vec4V x12 = V4Mul(x6, x6);
+	const Vec4V x14 = V4Mul(x8, x6);
+	const Vec4V x16 = V4Mul(x8, x8);
+	const Vec4V x18 = V4Mul(x10, x8);
+	const Vec4V x20 = V4Mul(x10, x10);
+	const Vec4V x22 = V4Mul(x12, x10);
+
+	// Series coefficients for the x^2 ... x^22 terms; the X lane of the first table is not used.
+	const Vec4V coeffs0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V coeffs1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V coeffs2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	// cos(x) ~= 1 - x^2 / 2! + x^4 / 4! - ... - x^22 / 22! (for -PI <= x < PI)
+	Vec4V sum = V4ScaleAdd(x2, V4GetY(coeffs0), V4One());
+	sum = V4ScaleAdd(x4, V4GetZ(coeffs0), sum);
+	sum = V4ScaleAdd(x6, V4GetW(coeffs0), sum);
+	sum = V4ScaleAdd(x8, V4GetX(coeffs1), sum);
+	sum = V4ScaleAdd(x10, V4GetY(coeffs1), sum);
+	sum = V4ScaleAdd(x12, V4GetZ(coeffs1), sum);
+	sum = V4ScaleAdd(x14, V4GetW(coeffs1), sum);
+	sum = V4ScaleAdd(x16, V4GetX(coeffs2), sum);
+	sum = V4ScaleAdd(x18, V4GetY(coeffs2), sum);
+	sum = V4ScaleAdd(x20, V4GetZ(coeffs2), sum);
+	sum = V4ScaleAdd(x22, V4GetW(coeffs2), sum);
+
+	return sum;
+}
+
+// Transposes, in place, the 4x4 matrix whose columns are col0..col3.
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+	// Stage 1: interleave columns (0, 2) and (1, 3).
+	const float32x4x2_t zip02 = vzipq_f32(col0, col2);
+	const float32x4x2_t zip13 = vzipq_f32(col1, col3);
+
+	// Stage 2: interleave the stage-1 results; each output register is one row of the input.
+	const float32x4x2_t rows01 = vzipq_f32(zip02.val[0], zip13.val[0]);
+	const float32x4x2_t rows23 = vzipq_f32(zip02.val[1], zip13.val[1]);
+
+	col0 = rows01.val[0];
+	col1 = rows01.val[1];
+	col2 = rows23.val[0];
+	col3 = rows23.val[1];
+}
+
+//////////////////////////////////
+// BoolV
+//////////////////////////////////
+
+// Lane mask (F, F, F, F): every bit clear.
+PX_FORCE_INLINE BoolV BFFFF()
+{
+	const BoolV allFalse = vmovq_n_u32(0);
+	return allFalse;
+}
+
+// Lane mask (F, F, F, T); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BFFFT()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t ft = vext_u32(ff, tt, 1); // (ff[1], tt[0]) = (F, T)
+	return vcombine_u32(ff, ft);
+}
+
+// Lane mask (F, F, T, F); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BFFTF()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t tf = vext_u32(tt, ff, 1); // (tt[1], ff[0]) = (T, F)
+	return vcombine_u32(ff, tf);
+}
+
+// Lane mask (F, F, T, T); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BFFTT()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	return vcombine_u32(ff, tt);
+}
+
+// Lane mask (F, T, F, F); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BFTFF()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t ft = vext_u32(ff, tt, 1); // (ff[1], tt[0]) = (F, T)
+	return vcombine_u32(ft, ff);
+}
+
+// Lane mask (F, T, F, T); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BFTFT()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t ft = vext_u32(ff, tt, 1); // (ff[1], tt[0]) = (F, T)
+	return vcombine_u32(ft, ft);
+}
+
+// Lane mask (F, T, T, F); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BFTTF()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t ft = vext_u32(ff, tt, 1); // (ff[1], tt[0]) = (F, T)
+	const uint32x2_t tf = vext_u32(tt, ff, 1); // (tt[1], ff[0]) = (T, F)
+	return vcombine_u32(ft, tf);
+}
+
+// Lane mask (F, T, T, T); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BFTTT()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t ft = vext_u32(ff, tt, 1); // (ff[1], tt[0]) = (F, T)
+	return vcombine_u32(ft, tt);
+}
+
+// Lane mask (T, F, F, F); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BTFFF()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t tf = vext_u32(tt, ff, 1); // (tt[1], ff[0]) = (T, F)
+	return vcombine_u32(tf, ff);
+}
+
+// Lane mask (T, F, F, T); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BTFFT()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t tf = vext_u32(tt, ff, 1); // (tt[1], ff[0]) = (T, F)
+	const uint32x2_t ft = vext_u32(ff, tt, 1); // (ff[1], tt[0]) = (F, T)
+	return vcombine_u32(tf, ft);
+}
+
+// Lane mask (T, F, T, F); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BTFTF()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t tf = vext_u32(tt, ff, 1); // (tt[1], ff[0]) = (T, F)
+	return vcombine_u32(tf, tf);
+}
+
+// Lane mask (T, F, T, T); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BTFTT()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t tf = vext_u32(tt, ff, 1); // (tt[1], ff[0]) = (T, F)
+	return vcombine_u32(tf, tt);
+}
+
+// Lane mask (T, T, F, F); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BTTFF()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	return vcombine_u32(tt, ff);
+}
+
+// Lane mask (T, T, F, T); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BTTFT()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t ft = vext_u32(ff, tt, 1); // (ff[1], tt[0]) = (F, T)
+	return vcombine_u32(tt, ft);
+}
+
+// Lane mask (T, T, T, F); T means all 32 bits of the lane are set.
+PX_FORCE_INLINE BoolV BTTTF()
+{
+	const uint32x2_t ff = vmov_n_u32(0);
+	const uint32x2_t tt = vmov_n_u32(0xffffFFFF);
+	const uint32x2_t tf = vext_u32(tt, ff, 1); // (tt[1], ff[0]) = (T, F)
+	return vcombine_u32(tt, tf);
+}
+
+// Lane mask (T, T, T, T): every bit set.
+PX_FORCE_INLINE BoolV BTTTT()
+{
+	const BoolV allTrue = vmovq_n_u32(0xffffFFFF);
+	return allTrue;
+}
+
+// Mask selecting only the X lane; alias for BTFFF().
+PX_FORCE_INLINE BoolV BXMask()
+{
+	const BoolV xOnly = BTFFF();
+	return xOnly;
+}
+
+// Mask selecting only the Y lane; alias for BFTFF().
+PX_FORCE_INLINE BoolV BYMask()
+{
+	const BoolV yOnly = BFTFF();
+	return yOnly;
+}
+
+// Mask selecting only the Z lane; alias for BFFTF().
+PX_FORCE_INLINE BoolV BZMask()
+{
+	const BoolV zOnly = BFFTF();
+	return zOnly;
+}
+
+// Mask selecting only the W lane; alias for BFFFT().
+PX_FORCE_INLINE BoolV BWMask()
+{
+	const BoolV wOnly = BFFFT();
+	return wOnly;
+}
+
+// Broadcasts the X lane of f to all four lanes.
+PX_FORCE_INLINE BoolV BGetX(const BoolV f)
+{
+	// X is lane 0 of the low half.
+	return vdupq_lane_u32(vget_low_u32(f), 0);
+}
+
+// Broadcasts the Y lane of f to all four lanes.
+PX_FORCE_INLINE BoolV BGetY(const BoolV f)
+{
+	// Y is lane 1 of the low half.
+	return vdupq_lane_u32(vget_low_u32(f), 1);
+}
+
+// Broadcasts the Z lane of f to all four lanes.
+PX_FORCE_INLINE BoolV BGetZ(const BoolV f)
+{
+	// Z is lane 0 of the high half.
+	return vdupq_lane_u32(vget_high_u32(f), 0);
+}
+
+// Broadcasts the W lane of f to all four lanes.
+PX_FORCE_INLINE BoolV BGetW(const BoolV f)
+{
+	// W is lane 1 of the high half.
+	return vdupq_lane_u32(vget_high_u32(f), 1);
+}
+
+// Returns v with its X lane replaced by the X lane of f.
+PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f)
+{
+	// Y, Z and W come from v; the one lane left clear in the mask falls through to f.
+	const BoolV keepFromV = BFTTT();
+	return vbslq_u32(keepFromV, v, f);
+}
+
+// Returns v with its Y lane replaced by the Y lane of f.
+PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f)
+{
+	// X, Z and W come from v; the one lane left clear in the mask falls through to f.
+	const BoolV keepFromV = BTFTT();
+	return vbslq_u32(keepFromV, v, f);
+}
+
+// Returns v with its Z lane replaced by the Z lane of f.
+PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f)
+{
+	// X, Y and W come from v; the one lane left clear in the mask falls through to f.
+	const BoolV keepFromV = BTTFT();
+	return vbslq_u32(keepFromV, v, f);
+}
+
+// Returns v with its W lane replaced by the W lane of f.
+PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f)
+{
+	// X, Y and Z come from v; the one lane left clear in the mask falls through to f.
+	const BoolV keepFromV = BTTTF();
+	return vbslq_u32(keepFromV, v, f);
+}
+
+// Bitwise AND of two masks.
+PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b)
+{
+	const BoolV both = vandq_u32(a, b);
+	return both;
+}
+
+// Bitwise complement of a mask.
+PX_FORCE_INLINE BoolV BNot(const BoolV a)
+{
+	const BoolV inverted = vmvnq_u32(a);
+	return inverted;
+}
+
+// Returns a & ~b.
+PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b)
+{
+	// Written as an explicit NOT followed by AND rather than a single vbicq_u32(a, b).
+	const BoolV notB = vmvnq_u32(b);
+	return vandq_u32(a, notB);
+}
+
+// Bitwise OR of two masks.
+PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b)
+{
+	const BoolV either = vorrq_u32(a, b);
+	return either;
+}
+
+// Returns an all-lanes mask that is set only when the low byte of every lane of a is 0xff.
+PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a)
+{
+	// Narrow each 32-bit lane to 16 bits, then to 8 bits, so the four lanes pack into one 32-bit word.
+	const uint16x4_t lanes16 = vmovn_u32(a);
+	// Only pads the upper half of the combined register; lane 1 of the packed result is never read.
+	const uint16x4_t padding = vget_high_u16(vreinterpretq_u16_u32(a));
+	const uint8x8_t lanes8 = vmovn_u16(vcombine_u16(lanes16, padding));
+	const uint32x2_t packed = vreinterpret_u32_u8(lanes8);
+
+	// Lane 0 now holds one byte per input lane; all four must be 0xff.
+	const uint32x2_t verdict = vceq_u32(packed, vmov_n_u32(0xffffFFFF));
+	return vdupq_lane_u32(verdict, 0);
+}
+
+// Returns an all-lanes mask that is set when the low byte of any lane of a has a bit set.
+PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a)
+{
+	// Narrow each 32-bit lane to 16 bits, then to 8 bits, so the four lanes pack into one 32-bit word.
+	const uint16x4_t lanes16 = vmovn_u32(a);
+	// Only pads the upper half of the combined register; lane 1 of the packed result is never read.
+	const uint16x4_t padding = vget_high_u16(vreinterpretq_u16_u32(a));
+	const uint8x8_t lanes8 = vmovn_u16(vcombine_u16(lanes16, padding));
+	const uint32x2_t packed = vreinterpret_u32_u8(lanes8);
+
+	// vtst is true when (packed & mask) is non-zero, i.e. at least one packed byte has a bit set.
+	const uint32x2_t verdict = vtst_u32(packed, vmov_n_u32(0xffffFFFF));
+	return vdupq_lane_u32(verdict, 0);
+}
+
+// Like BAllTrue4, but only the X, Y and Z lanes are tested; W is masked out.
+PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a)
+{
+	// Narrow each 32-bit lane to 16 bits, then to 8 bits, so the four lanes pack into one 32-bit word.
+	const uint16x4_t lanes16 = vmovn_u32(a);
+	// Only pads the upper half of the combined register; lane 1 of the packed result is never read.
+	const uint16x4_t padding = vget_high_u16(vreinterpretq_u16_u32(a));
+	const uint8x8_t lanes8 = vmovn_u16(vcombine_u16(lanes16, padding));
+	const uint32x2_t packed = vreinterpret_u32_u8(lanes8);
+
+	// The low three bytes hold X, Y and Z; the top byte (W) is ignored.
+	const uint32x2_t xyzBytes = vmov_n_u32(0x00ffFFFF);
+	const uint32x2_t verdict = vceq_u32(vand_u32(packed, xyzBytes), xyzBytes);
+	return vdupq_lane_u32(verdict, 0);
+}
+
+// Like BAnyTrue4, but only the X, Y and Z lanes are tested; W is masked out.
+PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a)
+{
+	// Narrow each 32-bit lane to 16 bits, then to 8 bits, so the four lanes pack into one 32-bit word.
+	const uint16x4_t lanes16 = vmovn_u32(a);
+	// Only pads the upper half of the combined register; lane 1 of the packed result is never read.
+	const uint16x4_t padding = vget_high_u16(vreinterpretq_u16_u32(a));
+	const uint8x8_t lanes8 = vmovn_u16(vcombine_u16(lanes16, padding));
+	const uint32x2_t packed = vreinterpret_u32_u8(lanes8);
+
+	// The low three bytes hold X, Y and Z; the top byte (W) is ignored.
+	const uint32x2_t xyzBytes = vmov_n_u32(0x00ffFFFF);
+	const uint32x2_t verdict = vtst_u32(vand_u32(packed, xyzBytes), xyzBytes);
+	return vdupq_lane_u32(verdict, 0);
+}
+
+// Compares a and b lane by lane and reduces the result with BAllTrue4_R.
+PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b)
+{
+	const BoolV lanesEqual = vceqq_u32(a, b);
+	return internalUnitNeonSimd::BAllTrue4_R(lanesEqual);
+}
+
+// BAllEq against the all-true mask.
+PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a)
+{
+	const BoolV allTrue = BTTTT();
+	return BAllEq(a, allTrue);
+}
+
+// BAllEq against the all-false mask.
+PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a)
+{
+	const BoolV allFalse = BFFFF();
+	return BAllEq(a, allFalse);
+}
+
+// Packs the four lanes of a into an integer: X contributes bit 0, Y bit 1, Z bit 2, W bit 3.
+PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a)
+{
+	static PX_ALIGN(16, const PxU32) bitMaskData[4] = { 1, 2, 4, 8 };
+	const uint32x4_t laneBits = *(reinterpret_cast<const uint32x4_t*>(bitMaskData));
+
+	// Keep each lane's bit only where the corresponding mask lane has it set.
+	const uint32x4_t selected = vandq_u32(a, laneBits);
+
+	// Two pairwise adds sum the four lanes: first (x + y, z + w), then the total.
+	const uint32x2_t halves = vpadd_u32(vget_low_u32(selected), vget_high_u32(selected));
+	const uint32x2_t total = vpadd_u32(halves, halves);
+	return PxU32(vget_lane_u32(total, 0));
+}
+
+//////////////////////////////////
+// MAT33V
+//////////////////////////////////
+
+// Matrix * vector: col0 * b.x + col1 * b.y + col2 * b.z, summed left to right.
+PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b)
+{
+	const Vec3V termX = V3Scale(a.col0, V3GetX(b));
+	const Vec3V termY = V3Scale(a.col1, V3GetY(b));
+	const Vec3V termZ = V3Scale(a.col2, V3GetZ(b));
+	return V3Add(V3Add(termX, termY), termZ);
+}
+
+// Transpose(a) * b: component i of the result is dot(column i of a, b).
+PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b)
+{
+	const FloatV dot0 = V3Dot(a.col0, b);
+	const FloatV dot1 = V3Dot(a.col1, b);
+	const FloatV dot2 = V3Dot(a.col2, b);
+	return V3Merge(dot0, dot1, dot2);
+}
+
+// Returns A * b + c, accumulated one column at a time starting from c.
+PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c)
+{
+	const Vec3V afterCol0 = V3ScaleAdd(A.col0, V3GetX(b), c);
+	const Vec3V afterCol1 = V3ScaleAdd(A.col1, V3GetY(b), afterCol0);
+	return V3ScaleAdd(A.col2, V3GetZ(b), afterCol1);
+}
+
+// Matrix product a * b: each result column is a multiplied by the matching column of b.
+PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = M33MulV3(a, b.col0);
+	const Vec3V c1 = M33MulV3(a, b.col1);
+	const Vec3V c2 = M33MulV3(a, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Column-wise sum a + b.
+PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = V3Add(a.col0, b.col0);
+	const Vec3V c1 = V3Add(a.col1, b.col1);
+	const Vec3V c2 = V3Add(a.col2, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Scales every column of a by the scalar b.
+PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b)
+{
+	const Vec3V c0 = V3Scale(a.col0, b);
+	const Vec3V c1 = V3Scale(a.col1, b);
+	const Vec3V c2 = V3Scale(a.col2, b);
+	return Mat33V(c0, c1, c2);
+}
+
+// Inverse of a 3x3 matrix via cross products of its columns.
+// The rows of the inverse are cross12, cross20 and cross01 scaled by 1 / det, so the code below
+// gathers matching components of those three vectors into the result columns.
+// The reciprocal of the determinant comes from FRecipFast; there is no check for a zero determinant.
+PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a)
+{
+	const float32x2_t zeros = vreinterpret_f32_u32(vmov_n_u32(0));
+	const BoolV btttf = BTTTF();
+
+	const Vec3V cross01 = V3Cross(a.col0, a.col1);
+	const Vec3V cross12 = V3Cross(a.col1, a.col2);
+	const Vec3V cross20 = V3Cross(a.col2, a.col0);
+	// det = (col0 x col1) . col2
+	const FloatV dot = V3Dot(cross01, a.col2);
+	const FloatV invDet = FRecipFast(dot);
+
+	// Interleave cross12 and cross01:
+	//   mergeh = (c12.x, c01.x, c12.y, c01.y), mergel = (c12.z, c01.z, c12.w, c01.w)
+	const float32x4x2_t merge = vzipq_f32(cross12, cross01);
+	const float32x4_t mergeh = merge.val[0];
+	const float32x4_t mergel = merge.val[1];
+
+	// const Vec3V colInv0 = XMVectorPermute(mergeh,cross20,PxPermuteControl(0,4,1,7));
+	// Low zip gives (c12.x, c20.x, c01.x, c20.y); the mask then clears the W lane.
+	const float32x4_t colInv0_xxyy = vzipq_f32(mergeh, cross20).val[0];
+	const float32x4_t colInv0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(colInv0_xxyy), btttf));
+
+	// const Vec3V colInv1 = XMVectorPermute(mergeh,cross20,PxPermuteControl(2,5,3,7));
+	// zw0 = (c12.y, c01.y), yzero1 = (c20.y, 0) -> colInv1 = (c12.y, c20.y, c01.y, 0)
+	const float32x2_t zw0 = vget_high_f32(mergeh);
+	const float32x2_t xy1 = vget_low_f32(cross20);
+	const float32x2_t yzero1 = vext_f32(xy1, zeros, 1);
+	const float32x2x2_t merge1 = vzip_f32(zw0, yzero1);
+	const float32x4_t colInv1 = vcombine_f32(merge1.val[0], merge1.val[1]);
+
+	// const Vec3V colInv2 = XMVectorPermute(mergel,cross20,PxPermuteControl(0,6,1,7));
+	// x0y0 = (c12.z, c01.z), z1w1 = (c20.z, c20.w) -> colInv2 = (c12.z, c20.z, c01.z, c20.w)
+	// NOTE(review): W of colInv2 is cross20.w, which is only zero if V3Cross zeroes W - confirm.
+	const float32x2_t x0y0 = vget_low_f32(mergel);
+	const float32x2_t z1w1 = vget_high_f32(cross20);
+	const float32x2x2_t merge2 = vzip_f32(x0y0, z1w1);
+	const float32x4_t colInv2 = vcombine_f32(merge2.val[0], merge2.val[1]);
+
+	// Scale every gathered column by 1 / det (lane 0 of invDet).
+	return Mat33V(vmulq_lane_f32(colInv0, invDet, 0), vmulq_lane_f32(colInv1, invDet, 0),
+	              vmulq_lane_f32(colInv2, invDet, 0));
+}
+
+// Transpose: result column i is built from component i of each input column.
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+	const Vec3V rowX = V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2));
+	const Vec3V rowY = V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2));
+	const Vec3V rowZ = V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2));
+	return Mat33V(rowX, rowY, rowZ);
+}
+
+// Identity matrix: columns are the unit X, Y and Z vectors.
+PX_FORCE_INLINE Mat33V M33Identity()
+{
+	const Vec3V unitX = V3UnitX();
+	const Vec3V unitY = V3UnitY();
+	const Vec3V unitZ = V3UnitZ();
+	return Mat33V(unitX, unitY, unitZ);
+}
+
+// Column-wise difference a - b.
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = V3Sub(a.col0, b.col0);
+	const Vec3V c1 = V3Sub(a.col1, b.col1);
+	const Vec3V c2 = V3Sub(a.col2, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Negates every column of a.
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+	const Vec3V c0 = V3Neg(a.col0);
+	const Vec3V c1 = V3Neg(a.col1);
+	const Vec3V c2 = V3Neg(a.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Applies V3Abs to every column of a.
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+	const Vec3V c0 = V3Abs(a.col0);
+	const Vec3V c1 = V3Abs(a.col1);
+	const Vec3V c2 = V3Abs(a.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Expands v into a diagonal matrix: column i keeps only component i of v (selected from v with a
+// single-lane mask) and takes every other component from a zero vector.
+PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v)
+{
+	const BoolV bTFFF = BTFFF();
+	const BoolV bFTFF = BFTFF();
+	// Z-only mask. Using BTFTF() here would also copy v.x into the third column.
+	const BoolV bFFTF = BFFTF();
+
+	const Vec3V zero = V3Zero();
+
+	return Mat33V(V3Sel(bTFFF, v, zero), V3Sel(bFTFF, v, zero), V3Sel(bFFTF, v, zero));
+}
+
+// Builds a matrix whose columns are d multiplied component-wise by the unit X, Y and Z vectors.
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+	const Vec3V col0 = V3Mul(V3UnitX(), d);
+	const Vec3V col1 = V3Mul(V3UnitY(), d);
+	const Vec3V col2 = V3Mul(V3UnitZ(), d);
+	return Mat33V(col0, col1, col2);
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+// Affine transform of b: col0 * b.x + col1 * b.y + col2 * b.z + col3, summed left to right.
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+	const Vec3V termX = V3Scale(a.col0, V3GetX(b));
+	const Vec3V termY = V3Scale(a.col1, V3GetY(b));
+	const Vec3V termZ = V3Scale(a.col2, V3GetZ(b));
+	const Vec3V rotated = V3Add(V3Add(termX, termY), termZ);
+	return V3Add(rotated, a.col3);
+}
+
+// Multiplies b by the first three columns of a only; col3 is not added.
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
+	const Vec3V termX = V3Scale(a.col0, V3GetX(b));
+	const Vec3V termY = V3Scale(a.col1, V3GetY(b));
+	const Vec3V termZ = V3Scale(a.col2, V3GetZ(b));
+	return V3Add(V3Add(termX, termY), termZ);
+}
+
+// Transpose of the first three columns of a, times b: component i is dot(column i of a, b).
+PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b)
+{
+	const FloatV dot0 = V3Dot(a.col0, b);
+	const FloatV dot1 = V3Dot(a.col1, b);
+	const FloatV dot2 = V3Dot(a.col2, b);
+	return V3Merge(dot0, dot1, dot2);
+}
+
+// Composition a * b of two 3x4 transforms.
+PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b)
+{
+	// The first three columns of b go through a without a.col3 ...
+	const Vec3V c0 = M34Mul33V3(a, b.col0);
+	const Vec3V c1 = M34Mul33V3(a, b.col1);
+	const Vec3V c2 = M34Mul33V3(a, b.col2);
+	// ... while b.col3 goes through the full transform, including a.col3.
+	const Vec3V c3 = M34MulV3(a, b.col3);
+	return Mat34V(c0, c1, c2, c3);
+}
+
+// Multiplies the first three columns of a with the 3x3 matrix b; a.col3 is not used.
+PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b)
+{
+	const Vec3V c0 = M34Mul33V3(a, b.col0);
+	const Vec3V c1 = M34Mul33V3(a, b.col1);
+	const Vec3V c2 = M34Mul33V3(a, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Multiplies the first three columns of a with the first three columns of b; neither col3 is used.
+PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b)
+{
+	const Vec3V c0 = M34Mul33V3(a, b.col0);
+	const Vec3V c1 = M34Mul33V3(a, b.col1);
+	const Vec3V c2 = M34Mul33V3(a, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Column-wise sum a + b over all four columns.
+PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b)
+{
+	const Vec3V c0 = V3Add(a.col0, b.col0);
+	const Vec3V c1 = V3Add(a.col1, b.col1);
+	const Vec3V c2 = V3Add(a.col2, b.col2);
+	const Vec3V c3 = V3Add(a.col3, b.col3);
+	return Mat34V(c0, c1, c2, c3);
+}
+
+// Transpose of the first three columns of a; col3 is not used.
+PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a)
+{
+	const Vec3V rowX = V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2));
+	const Vec3V rowY = V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2));
+	const Vec3V rowZ = V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2));
+	return Mat33V(rowX, rowY, rowZ);
+}
+
+//////////////////////////////////
+// MAT44V
+//////////////////////////////////
+
+// Matrix * vector: col0 * b.x + col1 * b.y + col2 * b.z + col3 * b.w, summed left to right.
+PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b)
+{
+	const Vec4V termX = V4Scale(a.col0, V4GetX(b));
+	const Vec4V termY = V4Scale(a.col1, V4GetY(b));
+	const Vec4V termZ = V4Scale(a.col2, V4GetZ(b));
+	const Vec4V termW = V4Scale(a.col3, V4GetW(b));
+
+	const Vec4V sumXY = V4Add(termX, termY);
+	const Vec4V sumXYZ = V4Add(sumXY, termZ);
+	return V4Add(sumXYZ, termW);
+}
+
+// Transpose(a) * b: component i of the result is dot(column i of a, b).
+PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b)
+{
+	const FloatV dot0 = V4Dot(a.col0, b);
+	const FloatV dot1 = V4Dot(a.col1, b);
+	const FloatV dot2 = V4Dot(a.col2, b);
+	const FloatV dot3 = V4Dot(a.col3, b);
+	return V4Merge(dot0, dot1, dot2, dot3);
+}
+
+// Matrix product a * b: each result column is a multiplied by the matching column of b.
+PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b)
+{
+	const Vec4V c0 = M44MulV4(a, b.col0);
+	const Vec4V c1 = M44MulV4(a, b.col1);
+	const Vec4V c2 = M44MulV4(a, b.col2);
+	const Vec4V c3 = M44MulV4(a, b.col3);
+	return Mat44V(c0, c1, c2, c3);
+}
+
+// Column-wise sum a + b.
+PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b)
+{
+	const Vec4V c0 = V4Add(a.col0, b.col0);
+	const Vec4V c1 = V4Add(a.col1, b.col1);
+	const Vec4V c2 = V4Add(a.col2, b.col2);
+	const Vec4V c3 = V4Add(a.col3, b.col3);
+	return Mat44V(c0, c1, c2, c3);
+}
+
+// Returns the transpose of a, computed with two rounds of 4-lane interleaves.
+PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a)
+{
+	// Stage 1: interleave columns (0, 2) and (1, 3).
+	const float32x4x2_t zip02 = vzipq_f32(a.col0, a.col2);
+	const float32x4x2_t zip13 = vzipq_f32(a.col1, a.col3);
+
+	// Stage 2: interleave the stage-1 results; each output register is one row of a.
+	const float32x4x2_t rows01 = vzipq_f32(zip02.val[0], zip13.val[0]);
+	const float32x4x2_t rows23 = vzipq_f32(zip02.val[1], zip13.val[1]);
+
+	return Mat44V(rows01.val[0], rows01.val[1], rows23.val[0], rows23.val[1]);
+}
+
+// Inverse of a 4x4 matrix by cofactor expansion.
+// Six groups of pairwise products are accumulated into minor0..minor3, the determinant is formed
+// from row0 * minor0, everything is scaled by 1 / det, and the result is transposed on return.
+// NOTE(review): the structure appears to follow the well-known SSE cofactor 4x4 inverse, with
+// vrev64q_f32 and vextq_f32(x, x, 2) standing in for the lane shuffles - confirm before changing it.
+// The reciprocal of the determinant comes from VRECIPE; there is no check for a zero determinant.
+PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a)
+{
+	float32x4_t minor0, minor1, minor2, minor3;
+	float32x4_t row0, row1, row2, row3;
+	float32x4_t det, tmp1;
+
+	// These three zero stores are overwritten below before the variables are read.
+	tmp1 = vmovq_n_f32(0.0f);
+	row1 = vmovq_n_f32(0.0f);
+	row3 = vmovq_n_f32(0.0f);
+
+	// row1 and row3 are kept with their 64-bit halves swapped: (z, w, x, y).
+	row0 = a.col0;
+	row1 = vextq_f32(a.col1, a.col1, 2);
+	row2 = a.col2;
+	row3 = vextq_f32(a.col3, a.col3, 2);
+
+	// Products of row2 * row3 -> contributions to minor0 and minor1.
+	tmp1 = vmulq_f32(row2, row3);
+	tmp1 = vrev64q_f32(tmp1);
+	minor0 = vmulq_f32(row1, tmp1);
+	minor1 = vmulq_f32(row0, tmp1);
+	tmp1 = vextq_f32(tmp1, tmp1, 2);
+	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
+	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
+	minor1 = vextq_f32(minor1, minor1, 2);
+
+	// Products of row1 * row2 -> contributions to minor0 and minor3.
+	tmp1 = vmulq_f32(row1, row2);
+	tmp1 = vrev64q_f32(tmp1);
+	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
+	minor3 = vmulq_f32(row0, tmp1);
+	tmp1 = vextq_f32(tmp1, tmp1, 2);
+	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
+	minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
+	minor3 = vextq_f32(minor3, minor3, 2);
+
+	// Products of (half-swapped row1) * row3 -> contributions to minor0 and minor2.
+	// row2 is half-swapped here and stays that way for the rest of the function.
+	tmp1 = vmulq_f32(vextq_f32(row1, row1, 2), row3);
+	tmp1 = vrev64q_f32(tmp1);
+	row2 = vextq_f32(row2, row2, 2);
+	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
+	minor2 = vmulq_f32(row0, tmp1);
+	tmp1 = vextq_f32(tmp1, tmp1, 2);
+	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
+	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
+	minor2 = vextq_f32(minor2, minor2, 2);
+
+	// Products of row0 * row1 -> contributions to minor2 and minor3.
+	tmp1 = vmulq_f32(row0, row1);
+	tmp1 = vrev64q_f32(tmp1);
+	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
+	minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
+	tmp1 = vextq_f32(tmp1, tmp1, 2);
+	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
+	minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
+
+	// Products of row0 * row3 -> contributions to minor1 and minor2.
+	tmp1 = vmulq_f32(row0, row3);
+	tmp1 = vrev64q_f32(tmp1);
+	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
+	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
+	tmp1 = vextq_f32(tmp1, tmp1, 2);
+	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
+	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
+
+	// Products of row0 * row2 -> contributions to minor1 and minor3.
+	tmp1 = vmulq_f32(row0, row2);
+	tmp1 = vrev64q_f32(tmp1);
+	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
+	minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
+	tmp1 = vextq_f32(tmp1, tmp1, 2);
+	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
+	minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
+
+	// Determinant: horizontal sum of row0 * minor0, then its reciprocal broadcast to all lanes.
+	det = vmulq_f32(row0, minor0);
+	det = vaddq_f32(vextq_f32(det, det, 2), det);
+	det = vaddq_f32(vrev64q_f32(det), det);
+	det = vdupq_lane_f32(VRECIPE(vget_low_f32(det)), 0);
+
+	// Scale the minors by 1 / det; they form the transpose of the inverse, hence the final transpose.
+	minor0 = vmulq_f32(det, minor0);
+	minor1 = vmulq_f32(det, minor1);
+	minor2 = vmulq_f32(det, minor2);
+	minor3 = vmulq_f32(det, minor3);
+	Mat44V invTrans(minor0, minor1, minor2, minor3);
+	return M44Trnsps(invTrans);
+}
+
+// Builds the vector (x, y, z, w) from four scalars.
+PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w)
+{
+	const float32x4_t lanes = { x, y, z, w };
+	return lanes;
+}
+
+/*
+PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b)
+{
+    return vcombine_u16(vqmovn_u32(a), vqmovn_u32(b));
+}
+*/
+
+// Bitwise select: takes bits from a where c is set and from b where c is clear.
+PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b)
+{
+	const VecU32V chosen = vbslq_u32(c, a, b);
+	return chosen;
+}
+
+// Bitwise OR: a | b.
+PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b)
+{
+	const VecU32V combined = vorrq_u32(a, b);
+	return combined;
+}
+
+// Bitwise XOR: a ^ b.
+PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b)
+{
+	const VecU32V toggled = veorq_u32(a, b);
+	return toggled;
+}
+
+// Bitwise AND: a & b.
+PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b)
+{
+	const VecU32V masked = vandq_u32(a, b);
+	return masked;
+}
+
+// Returns a & ~b.
+PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b)
+{
+	// Deliberately not vbicq_u32(a, b): that form creates a gcc compiler bug in RTreeQueries.cpp.
+	const VecU32V notB = vmvnq_u32(b);
+	return vandq_u32(a, notB);
+}
+
+/*
+PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b)
+{
+    return vorrq_u16(a, b);
+}
+*/
+
+/*
+PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b)
+{
+    return vandq_u16(a, b);
+}
+*/
+/*
+PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b)
+{
+    return vbicq_u16(a, b);
+}
+*/
+
+PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) // Broadcasts i into all four signed 32-bit lanes.
+{
+	return vdupq_n_s32(i);
+}
+
+PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) // Loads four consecutive PxI32 values starting at i.
+{
+	return vld1q_s32(i);
+}
+
+PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) // Loads four PxI32 values from i; same intrinsic as I4LoadU, no alignment check is made here.
+{
+	return vld1q_s32(i);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) // Lane-wise a + b on signed 32-bit lanes.
+{
+	return vaddq_s32(a, b);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) // Lane-wise a - b on signed 32-bit lanes.
+{
+	return vsubq_s32(a, b);
+}
+
+PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) // Lane mask: all bits set where a > b (signed), zero elsewhere.
+{
+	return vcgtq_s32(a, b);
+}
+
+PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) // Lane mask: all bits set where a == b, zero elsewhere.
+{
+	return vceqq_s32(a, b);
+}
+
+PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) // Bitwise select: bits of a where c is set, bits of b elsewhere.
+{
+	return vbslq_s32(c, a, b);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Zero() // Integer vector (0, 0, 0, 0).
+{
+	return vdupq_n_s32(0);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_One() // Integer vector (1, 1, 1, 1).
+{
+	return vdupq_n_s32(1);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Two() // Integer vector (2, 2, 2, 2).
+{
+	return vdupq_n_s32(2);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_MinusOne() // Integer vector (-1, -1, -1, -1), i.e. all bits set.
+{
+	return vdupq_n_s32(-1);
+}
+
+PX_FORCE_INLINE VecU32V U4Zero() // Unsigned vector with 0 in every lane (via U4Load).
+{
+	return U4Load(0);
+}
+
+PX_FORCE_INLINE VecU32V U4One() // Unsigned vector with 1 in every lane (via U4Load).
+{
+	return U4Load(1);
+}
+
+PX_FORCE_INLINE VecU32V U4Two() // Unsigned vector with 2 in every lane (via U4Load).
+{
+	return U4Load(2);
+}
+
+PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) // Returns the shift counts unchanged; no conversion is performed in this implementation.
+{
+	return shift;
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) // Shifts each lane of a left by the matching lane of count.
+{
+	return vshlq_s32(a, count);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) // Shifts each lane of a right by the matching lane of count.
+{
+	return vshlq_s32(a, VecI32V_Sub(I4Load(0), count)); // vshlq_s32 shifts right for negative counts, so the count is negated (0 - count)
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) // Bitwise AND of the two integer vectors.
+{
+	return vandq_s32(a, b);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) // Bitwise OR of the two integer vectors.
+{
+	return vorrq_s32(a, b);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg f)
+{
+	// x is lane 0 of the low 64-bit half; broadcast it to all four lanes.
+	return vdupq_lane_s32(vget_low_s32(f), 0);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg f)
+{
+	// y is lane 1 of the low 64-bit half; broadcast it to all four lanes.
+	return vdupq_lane_s32(vget_low_s32(f), 1);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg f)
+{
+	// z is lane 0 of the high 64-bit half; broadcast it to all four lanes.
+	return vdupq_lane_s32(vget_high_s32(f), 0);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg f)
+{
+	// w is lane 1 of the high 64-bit half; broadcast it to all four lanes.
+	return vdupq_lane_s32(vget_high_s32(f), 1);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) // Bitwise select: bits of a where c is set, bits of b elsewhere.
+{
+	return vbslq_s32(c, a, b);
+}
+
+PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) // Writes lane 0 of a to *i; the other lanes are ignored.
+{
+	*i = vgetq_lane_s32(a, 0);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d)
+{
+	// Packs one element from each of the four inputs into a single vector.
+	// vext_s32(p, q, 1) yields (p[1], q[0]), so the result is (a[1], b[0], c[1], d[0]).
+	// NOTE(review): that equals (a.x, b.x, c.x, d.x) only when a and c hold the same value in
+	// lanes 0 and 1 (e.g. splatted inputs) - confirm against callers.
+	const int32x2_t lanesAB = vext_s32(vget_low_s32(a), vget_low_s32(b), 1);
+	const int32x2_t lanesCD = vext_s32(vget_low_s32(c), vget_low_s32(d), 1);
+
+	// lanesAB becomes the low half (lanes 0-1), lanesCD the high half (lanes 2-3).
+	return vcombine_s32(lanesAB, lanesCD);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) // Reinterprets the mask bits as signed 32-bit lanes; no value conversion.
+{
+	return vreinterpretq_s32_u32(a);
+}
+
+PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) // Returns the mask unchanged as an unsigned vector.
+{
+	return a;
+}
+
+/*
+template<int a> PX_FORCE_INLINE VecI32V V4ISplat()
+{
+    return vdupq_n_s32(a);
+}
+
+template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat()
+{
+    return vdupq_n_u32(a);
+}
+*/
+
+/*
+PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address)
+{
+    vst1q_u16((uint16_t*)address, val);
+}
+*/
+
+PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) // Stores the four lanes of val to *address; no alignment check is made here.
+{
+	vst1q_u32(reinterpret_cast<uint32_t*>(address), val);
+}
+
+PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr) // Loads four floats from *addr; same intrinsic as V4LoadUnaligned, no alignment check is made here.
+{
+	return vld1q_f32(reinterpret_cast<float32_t*>(addr));
+}
+
+PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr) // Loads four floats from *addr.
+{
+	return vld1q_f32(reinterpret_cast<float32_t*>(addr));
+}
+
+PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b) // Clears in the float bit patterns of a every bit that is set in b (a & ~b).
+{
+	return vreinterpretq_f32_u32(V4U32Andc(vreinterpretq_u32_f32(a), b));
+}
+
+PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) // Forwards to V4IsGrtr(a, b) and returns its mask as a VecU32V.
+{
+	return V4IsGrtr(a, b);
+}
+
+PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) // Loads eight 16-bit lanes from *addr; same intrinsic as the unaligned variant, no alignment check is made here.
+{
+	return vld1q_u16(reinterpret_cast<uint16_t*>(addr));
+}
+
+PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) // Loads eight 16-bit lanes from *addr.
+{
+	return vld1q_u16(reinterpret_cast<uint16_t*>(addr));
+}
+
+PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) // Lane mask (0xffff / 0) of the unsigned 16-bit comparison a > b.
+{
+	return vcgtq_u16(a, b);
+}
+
+PX_FORCE_INLINE VecU16V V4I16CompareGt(VecI16V a, VecI16V b) // Lane mask (0xffff / 0) of the signed 16-bit comparison a > b.
+{
+	return vcgtq_s16(a, b);
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) // Numeric conversion of each unsigned lane to float (not a bit reinterpretation).
+{
+	return vcvtq_f32_u32(a);
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) // Numeric conversion of each signed lane to float (not a bit reinterpretation).
+{
+	return vcvtq_f32_s32(a);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) // Numeric conversion of each float lane to a signed integer, rounding toward zero.
+{
+	return vcvtq_s32_f32(a);
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) // Bit-for-bit reinterpretation of the unsigned lanes as floats.
+{
+	return vreinterpretq_f32_u32(a);
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) // Bit-for-bit reinterpretation of the signed lanes as floats.
+{
+	return vreinterpretq_f32_s32(a);
+}
+
+PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) // Bit-for-bit reinterpretation of the float lanes as unsigned integers.
+{
+	return vreinterpretq_u32_f32(a);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) // Bit-for-bit reinterpretation of the float lanes as signed integers.
+{
+	return vreinterpretq_s32_f32(a);
+}
+
+template <int index>
+PX_FORCE_INLINE BoolV BSplatElement(BoolV a) // Broadcasts lane `index` (0..3) of a into all four lanes.
+{
+	if(index < 2)
+	{
+		return vdupq_lane_u32(vget_low_u32(a), index & 1); // mask keeps the lane immediate in 0..1 when this branch is dead (index >= 2)
+	}
+	else if(index == 2)
+	{
+		return vdupq_lane_u32(vget_high_u32(a), 0);
+	}
+	else // index == 3; unconditional so that every path returns a value (no fall-off-the-end)
+	{
+		return vdupq_lane_u32(vget_high_u32(a), 1);
+	}
+}
+
+template <int index>
+PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) // Broadcasts lane `index` (0..3) of a into all four lanes.
+{
+	if(index < 2)
+	{
+		return vdupq_lane_u32(vget_low_u32(a), index & 1); // mask keeps the lane immediate in 0..1 when this branch is dead (index >= 2)
+	}
+	else if(index == 2)
+	{
+		return vdupq_lane_u32(vget_high_u32(a), 0);
+	}
+	else // index == 3; unconditional so that every path returns a value (no fall-off-the-end)
+	{
+		return vdupq_lane_u32(vget_high_u32(a), 1);
+	}
+}
+
+template <int index>
+PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) // Broadcasts lane `index` (0..3) of a into all four lanes.
+{
+	if(index < 2)
+	{
+		return vdupq_lane_f32(vget_low_f32(a), index & 1); // mask keeps the lane immediate in 0..1 when this branch is dead (index >= 2)
+	}
+	else if(index == 2)
+	{
+		return vdupq_lane_f32(vget_high_f32(a), 0);
+	}
+	else // index == 3; unconditional so that every path returns a value (no fall-off-the-end)
+	{
+		return vdupq_lane_f32(vget_high_f32(a), 1);
+	}
+}
+
+PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) // Builds an unsigned vector whose lanes 0..3 are x, y, z, w.
+{
+	const uint32x4_t ret = { x, y, z, w };
+	return ret;
+}
+
+PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) // Broadcasts i into all four unsigned 32-bit lanes.
+{
+	return vdupq_n_u32(i);
+}
+
+PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) // Loads four consecutive PxU32 values starting at i.
+{
+	return vld1q_u32(i);
+}
+
+PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) // Loads four PxU32 values from i; same intrinsic as U4LoadU, no alignment check is made here.
+{
+	return vld1q_u32(i);
+}
+
+PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in) // Per-lane ceil(in): smallest integral value that is not less than in.
+{
+	// Lanes with |in| >= 2^23 are already integral; they (and NaN) are returned as-is because the int32 round trip saturates there.
+	const uint32x4_t inRange = vcaltq_f32(in, vdupq_n_f32(8388608.0f));
+	const float32x4_t rdToZero = vcvtq_f32_s32(vcvtq_s32_f32(in)); // truncate towards zero
+	const uint32x4_t gt = vcgtq_f32(in, rdToZero); // truncation moved a positive value down, so one must be added back
+	return vbslq_f32(inRange, vbslq_f32(gt, vaddq_f32(rdToZero, vdupq_n_f32(1.0f)), rdToZero), in);
+}
+
+PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in) // Per-lane floor(in): largest integral value that is not greater than in.
+{
+	// Lanes with |in| >= 2^23 are already integral; they (and NaN) are returned as-is because the int32 round trip saturates there.
+	const uint32x4_t inRange = vcaltq_f32(in, vdupq_n_f32(8388608.0f));
+	const float32x4_t rdToZero = vcvtq_f32_s32(vcvtq_s32_f32(in)); // truncate towards zero
+	const uint32x4_t lt = vcltq_f32(in, rdToZero); // truncation moved a negative value up, so one must be subtracted
+	return vbslq_f32(inRange, vbslq_f32(lt, vsubq_f32(rdToZero, vdupq_n_f32(1.0f)), rdToZero), in);
+}
+
+PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power) // Converts each float lane to an unsigned integer; power must be 0.
+{
+	PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate");
+	PX_UNUSED(power); // prevent warning in release builds
+
+	return vcvtq_u32_f32(in); // float -> u32 conversion rounding toward zero; out-of-range lanes saturate
+}
+
+PX_FORCE_INLINE void QuatGetMat33V(const QuatVArg q, Vec3V& column0, Vec3V& column1, Vec3V& column2) // Writes the three columns of the 3x3 matrix built from quaternion q = (x, y, z, w).
+{
+	const FloatV one = FOne();
+	const FloatV x = V4GetX(q);
+	const FloatV y = V4GetY(q);
+	const FloatV z = V4GetZ(q);
+	const FloatV w = V4GetW(q);
+
+	const FloatV x2 = FAdd(x, x); // 2x
+	const FloatV y2 = FAdd(y, y); // 2y
+	const FloatV z2 = FAdd(z, z); // 2z
+
+	const FloatV xx = FMul(x2, x); // 2x^2
+	const FloatV yy = FMul(y2, y); // 2y^2
+	const FloatV zz = FMul(z2, z); // 2z^2
+
+	const FloatV xy = FMul(x2, y); // 2xy
+	const FloatV xz = FMul(x2, z); // 2xz
+	const FloatV xw = FMul(x2, w); // 2xw
+
+	const FloatV yz = FMul(y2, z); // 2yz
+	const FloatV yw = FMul(y2, w); // 2yw
+	const FloatV zw = FMul(z2, w); // 2zw
+
+	const FloatV v = FSub(one, xx); // 1 - 2x^2, shared by columns 1 and 2
+
+	column0 = V3Merge(FSub(FSub(one, yy), zz), FAdd(xy, zw), FSub(xz, yw)); // (1 - 2y^2 - 2z^2, 2xy + 2zw, 2xz - 2yw)
+	column1 = V3Merge(FSub(xy, zw), FSub(v, zz), FAdd(yz, xw)); // (2xy - 2zw, 1 - 2x^2 - 2z^2, 2yz + 2xw)
+	column2 = V3Merge(FAdd(xz, yw), FSub(yz, xw), FSub(v, yy)); // (2xz + 2yw, 2yz - 2xw, 1 - 2x^2 - 2y^2)
+}
+
+#endif // PSFOUNDATION_PSUNIXNEONINLINEAOS_H
diff --git a/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h
new file mode 100644
index 0000000..9c76438
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2AoS.h
@@ -0,0 +1,179 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXSSE2AOS_H
+#define PSFOUNDATION_PSUNIXSSE2AOS_H
+
+// no includes here! this file should be included from PxcVecMath.h only!!!
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+#if PX_EMSCRIPTEN
+typedef int8_t   __int8_t;
+typedef int16_t  __int16_t;
+typedef int32_t  __int32_t;
+typedef int64_t  __int64_t;
+typedef uint16_t __uint16_t;
+typedef uint32_t __uint32_t;
+typedef uint64_t __uint64_t;
+#endif
+
+typedef union UnionM128 // 128-bit value viewable as __m128, __m128i or as arrays of scalar lanes.
+{
+	UnionM128() // leaves the contents uninitialised
+	{
+	}
+	UnionM128(__m128 in) // implicit construction from a float vector
+	{
+		m128 = in;
+	}
+
+	UnionM128(__m128i in) // implicit construction from an integer vector
+	{
+		m128i = in;
+	}
+
+	operator __m128() // implicit conversion back to a float vector
+	{
+		return m128;
+	}
+
+	operator const __m128() const // same conversion for const objects
+	{
+		return m128;
+	}
+
+	float m128_f32[4]; // all members below alias the same 16 bytes
+	__int8_t m128_i8[16];
+	__int16_t m128_i16[8];
+	__int32_t m128_i32[4];
+	__int64_t m128_i64[2];
+	__uint16_t m128_u16[8];
+	__uint32_t m128_u32[4];
+	__uint64_t m128_u64[2];
+	__m128 m128;
+	__m128i m128i;
+} UnionM128;
+
+typedef __m128 FloatV;
+typedef __m128 Vec3V;
+typedef __m128 Vec4V;
+typedef __m128 BoolV;
+typedef __m128 QuatV;
+typedef __m128i VecI32V;
+typedef UnionM128 VecU32V;
+typedef UnionM128 VecU16V;
+typedef UnionM128 VecI16V;
+typedef UnionM128 VecU8V;
+
+#define FloatVArg FloatV &
+#define Vec3VArg Vec3V &
+#define Vec4VArg Vec4V &
+#define BoolVArg BoolV &
+#define VecU32VArg VecU32V &
+#define VecI32VArg VecI32V &
+#define VecU16VArg VecU16V &
+#define VecI16VArg VecI16V &
+#define VecU8VArg VecU8V &
+#define QuatVArg QuatV &
+
+// Optimization for situations in which you cross product multiple vectors with the same vector.
+// Avoids 2X shuffles per product
+struct VecCrossV // Pair of vectors cached for repeated cross products with the same operand (see the comment above).
+{
+	Vec3V mL1; // NOTE(review): presumably a pre-shuffled copy of the operand - the code that fills it is not in this header
+	Vec3V mR1; // NOTE(review): presumably the second pre-shuffled copy - confirm at the point of use
+};
+
+struct VecShiftV // Wrapper type holding the shift-count vector used by the integer shift helpers.
+{
+	VecI32V shift; // shift counts, stored as an integer vector
+};
+#define VecShiftVArg VecShiftV &
+
+PX_ALIGN_PREFIX(16)
+struct Mat33V // 3x3 matrix stored as three Vec3V columns, each declared with 16-byte alignment.
+{
+	Mat33V() // leaves the columns uninitialised
+	{
+	}
+	Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2) // copies the three columns
+	{
+	}
+	Vec3V PX_ALIGN(16, col0);
+	Vec3V PX_ALIGN(16, col1);
+	Vec3V PX_ALIGN(16, col2);
+} PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct Mat34V // Matrix stored as four Vec3V columns, each declared with 16-byte alignment.
+{
+	Mat34V() // leaves the columns uninitialised
+	{
+	}
+	Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) // copies the four columns
+	{
+	}
+	Vec3V PX_ALIGN(16, col0);
+	Vec3V PX_ALIGN(16, col1);
+	Vec3V PX_ALIGN(16, col2);
+	Vec3V PX_ALIGN(16, col3);
+} PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct Mat43V // Matrix stored as three Vec4V columns, each declared with 16-byte alignment.
+{
+	Mat43V() // leaves the columns uninitialised
+	{
+	}
+	Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2) // copies the three columns
+	{
+	}
+	Vec4V PX_ALIGN(16, col0);
+	Vec4V PX_ALIGN(16, col1);
+	Vec4V PX_ALIGN(16, col2);
+} PX_ALIGN_SUFFIX(16);
+
+PX_ALIGN_PREFIX(16)
+struct Mat44V // 4x4 matrix stored as four Vec4V columns, each declared with 16-byte alignment.
+{
+	Mat44V() // leaves the columns uninitialised
+	{
+	}
+	Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3) // copies the four columns
+	{
+	}
+	Vec4V PX_ALIGN(16, col0);
+	Vec4V PX_ALIGN(16, col1);
+	Vec4V PX_ALIGN(16, col2);
+	Vec4V PX_ALIGN(16, col3);
+} PX_ALIGN_SUFFIX(16);
+
+#endif // PSFOUNDATION_PSUNIXSSE2AOS_H
diff --git a/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h
new file mode 100644
index 0000000..0355538
--- /dev/null
+++ b/PxShared/src/foundation/include/unix/sse2/PsUnixSse2InlineAoS.h
@@ -0,0 +1,3226 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSUNIXSSE2INLINEAOS_H
+#define PSFOUNDATION_PSUNIXSSE2INLINEAOS_H
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+
+// Remove this define when all platforms use simd solver.
+#define PX_SUPPORT_SIMD
+
+#ifdef __SSE4_2__
+#include "smmintrin.h"
+#endif
+
+#include "../../PsVecMathSSE.h"
+
+#define PX_FPCLASS_SNAN 0x0001 /* signaling NaN */
+#define PX_FPCLASS_QNAN 0x0002 /* quiet NaN */
+#define PX_FPCLASS_NINF 0x0004 /* negative infinity */
+#define PX_FPCLASS_PINF 0x0200 /* positive infinity */
+
+PX_FORCE_INLINE __m128 m128_I2F(__m128i n) // Reinterprets an integer vector as a float vector; the bits are unchanged.
+{
+	return _mm_castsi128_ps(n);
+}
+PX_FORCE_INLINE __m128i m128_F2I(__m128 n) // Reinterprets a float vector as an integer vector; the bits are unchanged.
+{
+	return _mm_castps_si128(n);
+}
+
+//////////////////////////////////////////////////////////////////////
+//Test that Vec3V and FloatV are legal
+//////////////////////////////////////////////////////////////////////
+
+#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f
+PX_FORCE_INLINE static bool isValidFloatV(const FloatV a)
+{
+	// A FloatV is accepted when lanes y, z and w all match lane x, either absolutely or
+	// relative to x, within FLOAT_COMPONENTS_EQUAL_THRESHOLD.
+	const PxF32 lane0 = V4ReadX(a);
+	const PxF32 dy = lane0 - V4ReadY(a);
+	const PxF32 dz = lane0 - V4ReadZ(a);
+	const PxF32 dw = lane0 - V4ReadW(a);
+
+	// Absolute differences first: when they pass, the divisions below are never evaluated.
+	const bool absMatch = (PxAbs(dy) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	                      (PxAbs(dz) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	                      (PxAbs(dw) < FLOAT_COMPONENTS_EQUAL_THRESHOLD);
+	if(absMatch)
+	{
+		return true;
+	}
+
+	// Otherwise fall back to the differences relative to lane x. This is the branch that
+	// accepts large values whose lanes differ by more than the absolute threshold.
+	const bool relMatch = (PxAbs(dy / lane0) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	                      (PxAbs(dz / lane0) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	                      (PxAbs(dw / lane0) < FLOAT_COMPONENTS_EQUAL_THRESHOLD);
+
+	// Neither test passing means the lanes do not represent a single scalar.
+	return relMatch;
+}
+
+PX_FORCE_INLINE bool isValidVec3V(const Vec3V a) // A Vec3V is valid only when its fourth lane is exactly 0.0f.
+{
+	PX_ALIGN(16, PxF32 lanes[4]);
+	V4StoreA(a, lanes); // spill to an aligned scratch array so lane 3 can be read as a scalar
+	return (0.0f == lanes[3]);
+}
+
+PX_FORCE_INLINE bool isFiniteLength(const Vec3V a) // NOTE(review): despite the name this compares the squared length against zero (presumably "length is non-zero") - confirm intent.
+{
+	return !FAllEq(V4LengthSq(a), FZero());
+}
+
+PX_FORCE_INLINE bool isAligned16(void* a) // True when the address a is a multiple of 16.
+{
+	return(0 == (size_t(a) & 0x0f)); // the low four address bits must all be clear
+}
+
+// ASSERT_ISFINITELENGTH is deactivated because a lot of code calls a SIMD normalisation function with a zero-length vector and then ignores the result.
+
+#if PX_DEBUG
+#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a))
+#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a))
+#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(reinterpret_cast<void*>(a)))
+#define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a))
+#else
+#define ASSERT_ISVALIDVEC3V(a)
+#define ASSERT_ISVALIDFLOATV(a) 
+#define ASSERT_ISALIGNED16(a)
+#define ASSERT_ISFINITELENGTH(a)
+#endif
+
+
+namespace internalUnitSSE2Simd
+{
+PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a)
+{
+	// movemask packs the sign bit of each lane into bits 0..3; 0xf means all four lanes are true.
+	return PxU32(_mm_movemask_ps(a) == 0xf);
+}
+
+PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a)
+{
+	// Only the sign bits of lanes x, y, z (bits 0..2 of the movemask) are considered; w is ignored.
+	return PxU32((_mm_movemask_ps(a) & 0x7) == 0x7);
+}
+
+PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a)
+{
+	// Any sign bit set in the four-lane movemask means at least one lane is true.
+	return PxU32(_mm_movemask_ps(a) != 0x0);
+}
+
+PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a)
+{
+	// Any sign bit set among lanes x, y, z (bits 0..2 of the movemask); w is ignored.
+	return PxU32((_mm_movemask_ps(a) & 0x7) != 0x0);
+}
+
+PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b) // Returns non-zero when all four lanes of the re-encoded masks a and b compare equal.
+{
+	// This is a bit of a bodge.
+	//_mm_comieq_ss returns 1 if either value is nan so we need to re-cast a and b with true encoded as a non-nan
+	// number.
+	// There must be a better way of doing this in sse.
+	const BoolV one = FOne();
+	const BoolV zero = FZero();
+	const BoolV a1 = V4Sel(a, one, zero); // a used as the select mask between 1.0f and 0.0f
+	const BoolV b1 = V4Sel(b, one, zero); // same re-encoding for b
+	return (
+	    _mm_comieq_ss(a1, b1) && // lane 0
+	    _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 1, 1, 1)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(1, 1, 1, 1))) && // lane 1
+	    _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(2, 2, 2, 2))) && // lane 2
+	    _mm_comieq_ss(_mm_shuffle_ps(a1, a1, _MM_SHUFFLE(3, 3, 3, 3)), _mm_shuffle_ps(b1, b1, _MM_SHUFFLE(3, 3, 3, 3)))); // lane 3
+}
+
+#if !PX_EMSCRIPTEN
+const PX_ALIGN(16, PxF32 gMaskXYZ[4]) = { physx::PxUnionCast<PxF32>(0xffffffff), physx::PxUnionCast<PxF32>(0xffffffff), // x and y: all bits set
+	                                      physx::PxUnionCast<PxF32>(0xffffffff), 0 }; // z: all bits set, w: zero - ANDing with this mask clears W
+} // closes namespace internalUnitSSE2Simd
+#else
+// emscripten doesn't like the PxUnionCast data structure
+// the following is what windows and xbox does -- using these for emscripten
+const PX_ALIGN(16, PxU32 gMaskXYZ[4]) = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; } // same mask as integers; the brace closes namespace internalUnitSSE2Simd
+#endif
+
+namespace _VecMathTests
+{
+// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V'
+PX_FORCE_INLINE Vec3V getInvalidVec3V() // Returns (1, 1, 1, 1): W is 1.0f, so isValidVec3V rejects it.
+{
+	const float f = 1.0f;
+	return _mm_load1_ps(&f);
+}
+
+PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b) // Exact equality of two FloatV values.
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_comieq_ss(a, b) != 0; // only lane 0 is compared; the asserts above check (debug builds) that the lanes agree
+}
+
+PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b) // Converts the result of V3AllEq(a, b) to bool.
+{
+	return V3AllEq(a, b) != 0;
+}
+
+PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b) // Converts the result of V4AllEq(a, b) to bool.
+{
+	return V4AllEq(a, b) != 0;
+}
+
+PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b) // True when the integer compare of the two masks (VecI32V_IsEq) is true in all four lanes.
+{
+	return internalUnitSSE2Simd::BAllTrue4_R(VecI32V_IsEq(m128_F2I(a), m128_F2I(b))) != 0;
+}
+
+PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b) // True when V4IsEqU32(a, b) is true in all four lanes.
+{
+	return internalUnitSSE2Simd::BAllTrue4_R(V4IsEqU32(a, b)) != 0;
+}
+
+PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b) // True when all four 32-bit integer lanes of a and b are equal.
+{
+	BoolV c = m128_I2F(_mm_cmpeq_epi32(a, b)); // per-lane mask: all ones where equal
+	return internalUnitSSE2Simd::BAllTrue4_R(c) != 0;
+}
+
+#define VECMATH_AOS_EPSILON (1e-3f)
+
+PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b) // True when a - b lies strictly inside (-VECMATH_AOS_EPSILON, +VECMATH_AOS_EPSILON).
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const FloatV c = FSub(a, b);
+	const FloatV minError = FLoad(-VECMATH_AOS_EPSILON);
+	const FloatV maxError = FLoad(VECMATH_AOS_EPSILON);
+	return _mm_comigt_ss(c, minError) && _mm_comilt_ss(c, maxError); // scalar compares: only lane 0 is inspected
+}
+
+PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b)
+{
+	// True when x, y and z of (a - b) each lie strictly inside (-VECMATH_AOS_EPSILON, +VECMATH_AOS_EPSILON); w is not inspected.
+	const Vec3V diff = V3Sub(a, b);
+	const Vec3V lower = V3Load(-VECMATH_AOS_EPSILON);
+	const Vec3V upper = V3Load(VECMATH_AOS_EPSILON);
+	const Vec3V dx = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(0, 0, 0, 0)); // the scalar compares read lane 0 only,
+	const Vec3V dy = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(1, 1, 1, 1)); // so each component is splatted first
+	const Vec3V dz = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(2, 2, 2, 2));
+	return (_mm_comigt_ss(dx, lower) && _mm_comilt_ss(dx, upper) && _mm_comigt_ss(dy, lower) &&
+	        _mm_comilt_ss(dy, upper) && _mm_comigt_ss(dz, lower) && _mm_comilt_ss(dz, upper));
+}
+
+PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b)
+{
+	// True when every component of (a - b) lies strictly inside (-VECMATH_AOS_EPSILON, +VECMATH_AOS_EPSILON).
+	const Vec4V diff = V4Sub(a, b);
+	const Vec4V lower = V4Load(-VECMATH_AOS_EPSILON);
+	const Vec4V upper = V4Load(VECMATH_AOS_EPSILON);
+	const Vec4V dx = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(0, 0, 0, 0)); // the scalar compares read lane 0 only,
+	const Vec4V dy = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(1, 1, 1, 1)); // so each component is splatted first
+	const Vec4V dz = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(2, 2, 2, 2));
+	const Vec4V dw = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(3, 3, 3, 3));
+	return (_mm_comigt_ss(dx, lower) && _mm_comilt_ss(dx, upper) && _mm_comigt_ss(dy, lower) &&
+	        _mm_comilt_ss(dy, upper) && _mm_comigt_ss(dz, lower) && _mm_comilt_ss(dz, upper) &&
+	        _mm_comigt_ss(dw, lower) && _mm_comilt_ss(dw, upper));
+}
+}
+
+/////////////////////////////////////////////////////////////////////
+////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS
+/////////////////////////////////////////////////////////////////////
+
+PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a) // True when no lane of a is +/-infinity or NaN.
+{
+	// Non-finite <=> all exponent bits (0x7f800000) set. The former PX_FPCLASS_* flag mask shared no bits with the 1.0f select inside FiniteTestEq, so that test could never fail.
+	const __m128i expMask = _mm_set1_epi32(0x7f800000);
+	const __m128i expBits = _mm_and_si128(m128_F2I(a), expMask);
+	const __m128i nonFinite = _mm_cmpeq_epi32(expBits, expMask); // all ones in every lane holding inf or NaN
+	return _mm_movemask_ps(m128_I2F(nonFinite)) == 0;
+}
+
+PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a) // True when none of x, y, z is +/-infinity or NaN; w is ignored.
+{
+	// Non-finite <=> all exponent bits (0x7f800000) set. The former PX_FPCLASS_* flag mask shared no bits with the 1.0f select inside FiniteTestEq, so that test could never fail.
+	const __m128i expMask = _mm_set1_epi32(0x7f800000);
+	const __m128i expBits = _mm_and_si128(m128_F2I(a), expMask);
+	const __m128i nonFinite = _mm_cmpeq_epi32(expBits, expMask); // all ones in every lane holding inf or NaN
+	return (_mm_movemask_ps(m128_I2F(nonFinite)) & 0x7) == 0; // bits 0..2 = x, y, z
+}
+
+PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a)
+{
+	// Returns true when none of the four lanes is +/-infinity or NaN.
+	//
+	// An IEEE-754 single is non-finite exactly when all eight exponent bits (0x7f800000) are
+	// set, so the exponent field of every lane is isolated and compared against that pattern.
+	//
+	// The previous implementation ANDed the input with the PX_FPCLASS_* classification flags
+	// (0x207, low mantissa bits only) and passed the result through FiniteTestEq, which selects
+	// against 1.0f (0x3f800000). The two patterns share no bits, so the comparison was always
+	// "equal" and every input, including inf and NaN, was reported as finite.
+	const __m128i expMask = _mm_set1_epi32(0x7f800000);
+	const __m128i expBits = _mm_and_si128(m128_F2I(a), expMask);
+	const __m128i nonFinite = _mm_cmpeq_epi32(expBits, expMask); // all ones in every lane holding inf or NaN
+
+	return _mm_movemask_ps(m128_I2F(nonFinite)) == 0;
+}
+
+PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a) // True when the scalar held in a compares equal to zero.
+{
+	ASSERT_ISVALIDFLOATV(a);
+	return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) != 0; // lane 0 is the one tested
+}
+
+PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a) // True when at least one of x, y, z compares equal to zero; w is not inspected.
+{
+	return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || // x
+	        _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || // y
+	        _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero())); // z
+}
+
+PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a) // True when at least one of the four components compares equal to zero.
+{
+	return (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()) || // x
+	        _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()) || // y
+	        _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero()) || // z
+	        _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), FZero())); // w
+}
+
+/////////////////////////////////////////////////////////////////////
+////VECTORISED FUNCTION IMPLEMENTATIONS
+/////////////////////////////////////////////////////////////////////
+
+PX_FORCE_INLINE FloatV FLoad(const PxF32 f) // Broadcasts the scalar f into all four lanes.
+{
+	return _mm_load1_ps(&f);
+}
+
+PX_FORCE_INLINE Vec3V V3Load(const PxF32 f) // Returns (f, f, f, 0).
+{
+	return _mm_set_ps(0.0f, f, f, f); // _mm_set_ps takes its arguments from the highest lane (w) down to the lowest (x)
+}
+
+PX_FORCE_INLINE Vec4V V4Load(const PxF32 f) // Returns (f, f, f, f).
+{
+	return _mm_load1_ps(&f);
+}
+
+PX_FORCE_INLINE BoolV BLoad(const bool f) // Broadcasts f as a lane mask: all bits set for true, zero for false.
+{
+	const PxU32 i = -PxI32(f); // true -> -1 -> 0xffffffff, false -> 0
+	return _mm_load1_ps(reinterpret_cast<const float*>(&i));
+}
+
+PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f) // Loads a 16-byte aligned PxVec3 as (x, y, z, 0).
+{
+	ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f));
+#if !PX_EMSCRIPTEN
+	return _mm_and_ps(reinterpret_cast<const Vec3V&>(f), V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); // reads a full 16 bytes at &f, then masks W to zero
+#else
+	return _mm_and_ps((Vec3V&)f, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ);
+#endif
+}
+
+PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f) // Builds (f.x, f.y, f.z, 0) from the three members; no alignment requirement.
+{
+	return _mm_set_ps(0.0f, f.z, f.y, f.x);
+}
+
+PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f) // Builds (f.x, f.y, f.z, 0); in this implementation only the three members are read.
+{
+	ASSERT_ISALIGNED16(const_cast<PxVec3*>(&f));
+	return _mm_set_ps(0.0f, f.z, f.y, f.x);
+}
+
+PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f) // Loads (f[0], f[1], f[2], 0) from a 16-byte aligned array.
+{
+	ASSERT_ISALIGNED16(const_cast<PxF32*>(f));
+#if !PX_EMSCRIPTEN
+	return _mm_and_ps(V4LoadA(f), V4LoadA(internalUnitSSE2Simd::gMaskXYZ)); // reads four floats (f[3] included), then masks W to zero
+#else
+	return _mm_and_ps((Vec3V&)*f, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ);
+#endif
+}
+
+PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i) // Builds (i[0], i[1], i[2], 0); only three floats are read.
+{
+	return _mm_set_ps(0.0f, i[2], i[1], i[0]);
+}
+
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v) // Converts to a Vec3V by passing v through V4ClearW.
+{
+	return V4ClearW(v);
+}
+
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v) // Returns v unchanged; W keeps whatever value it had.
+{
+	return v;
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f) // Returns f unchanged as a Vec4V; debug builds assert that W is zero.
+{
+	ASSERT_ISVALIDVEC3V(f);
+	return f; // ok if it is implemented as the same type.
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f) // Builds (f.x, f.y, f.z, 0); despite the name, this implementation sets W to zero.
+{
+	return _mm_set_ps(0.0f, f.z, f.y, f.x);
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f) // Returns f unchanged as a Vec4V.
+{
+	return f;
+}
+
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f) // Converts a FloatV to a Vec3V via Vec3V_From_Vec4V, which clears W.
+{
+	ASSERT_ISVALIDFLOATV(f);
+	return Vec3V_From_Vec4V(Vec4V_From_FloatV(f));
+}
+
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f) // Reinterprets a FloatV as a Vec3V without clearing W.
+{
+	ASSERT_ISVALIDFLOATV(f); // f is a FloatV (all lanes equal); the Vec3V check (W == 0) used here before rejected every non-zero scalar
+	return Vec3V_From_Vec4V_WUndefined(Vec4V_From_FloatV(f));
+}
+
+PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m) // Loads the three columns of m with V3LoadU into a Mat33V.
+{
+	return Mat33V(V3LoadU(m.column0), V3LoadU(m.column1), V3LoadU(m.column2));
+}
+
+PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out) // Stores the three columns of m into out with V3StoreU.
+{
+	V3StoreU(m.col0, out.column0);
+	V3StoreU(m.col1, out.column1);
+	V3StoreU(m.col2, out.column2);
+}
+
+PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f) // Loads four floats from a 16-byte aligned address.
+{
+	ASSERT_ISALIGNED16(const_cast<PxF32*>(f));
+	return _mm_load_ps(f);
+}
+
+PX_FORCE_INLINE void V4StoreA(Vec4V a, PxF32* f) // Stores four floats to a 16-byte aligned address.
+{
+	ASSERT_ISALIGNED16(f);
+	_mm_store_ps(f, a);
+}
+
+PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f) // Stores four floats to f; no alignment requirement.
+{
+	_mm_storeu_ps(f, a);
+}
+
+PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f) // Stores the four mask lanes of a to a 16-byte aligned PxU32 array.
+{
+	ASSERT_ISALIGNED16(f);
+	_mm_store_ps(reinterpret_cast<PxF32*>(f), a);
+}
+
+PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u) // Stores the four lanes of uv to a 16-byte aligned PxU32 array.
+{
+	ASSERT_ISALIGNED16(u);
+	_mm_store_ps(reinterpret_cast<float*>(u), uv);
+}
+
+PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i) // Stores the four lanes of iv to a 16-byte aligned PxI32 array.
+{
+	ASSERT_ISALIGNED16(i);
+	_mm_store_ps(reinterpret_cast<float*>(i), m128_I2F(iv));
+}
+
+PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f) // Loads four floats from f; no alignment requirement.
+{
+	return _mm_loadu_ps(f);
+}
+
+PX_FORCE_INLINE BoolV BLoad(const bool* const f) // Builds a lane mask from f[0..3]: all bits set where true, zero where false.
+{
+	const PX_ALIGN(16, PxI32) b[4] = { -PxI32(f[0]), -PxI32(f[1]), -PxI32(f[2]), -PxI32(f[3]) }; // true -> -1 (all ones), false -> 0
+	return _mm_load_ps(reinterpret_cast<const float*>(&b));
+}
+
+PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f) // Writes the scalar held in a (lane 0) to *f.
+{
+	ASSERT_ISVALIDFLOATV(a);
+	_mm_store_ss(f, a);
+}
+
+PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f) // Writes x, y, z of a into a 16-byte aligned PxVec3.
+{
+	ASSERT_ISALIGNED16(&f);
+	PX_ALIGN(16, PxF32) f2[4]; // aligned scratch array; f receives only its first three floats
+	_mm_store_ps(f2, a);
+	f = PxVec3(f2[0], f2[1], f2[2]);
+}
+
+PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f) // Writes x, y, z of a into f; f itself need not be aligned.
+{
+	PX_ALIGN(16, PxF32) f2[4]; // aligned scratch array, so the aligned store below is valid
+	_mm_store_ps(f2, a);
+	f = PxVec3(f2[0], f2[1], f2[2]);
+}
+
+PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2) // Writes the 32 bits of lane 0 of b to *b2; the other lanes are ignored.
+{
+	_mm_store_ss(reinterpret_cast<PxF32*>(b2), b);
+}
+
+PX_FORCE_INLINE VecU32V U4Load(const PxU32 i) // Broadcasts the bit pattern of i into all four lanes.
+{
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&i));
+}
+
+PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) // Loads four PxU32 values from i; no alignment requirement.
+{
+	return _mm_loadu_ps(reinterpret_cast<const PxF32*>(i));
+}
+
+PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) // Loads four PxU32 values from a 16-byte aligned address.
+{
+	ASSERT_ISALIGNED16(const_cast<PxU32*>(i));
+	return _mm_load_ps(reinterpret_cast<const PxF32*>(i));
+}
+
+//////////////////////////////////
+// FLOATV
+//////////////////////////////////
+
+PX_FORCE_INLINE FloatV FZero() // FloatV constant 0.0f.
+{
+	return FLoad(0.0f);
+}
+
+PX_FORCE_INLINE FloatV FOne() // FloatV constant 1.0f.
+{
+	return FLoad(1.0f);
+}
+
+PX_FORCE_INLINE FloatV FHalf() // FloatV constant 0.5f.
+{
+	return FLoad(0.5f);
+}
+
+PX_FORCE_INLINE FloatV FEps() // FloatV constant PX_EPS_REAL.
+{
+	return FLoad(PX_EPS_REAL);
+}
+
+PX_FORCE_INLINE FloatV FEps6() // FloatV constant 1e-6f.
+{
+	return FLoad(1e-6f);
+}
+
+PX_FORCE_INLINE FloatV FMax() // FloatV constant PX_MAX_REAL.
+{
+	return FLoad(PX_MAX_REAL);
+}
+
+PX_FORCE_INLINE FloatV FNegMax() // FloatV constant -PX_MAX_REAL.
+{
+	return FLoad(-PX_MAX_REAL);
+}
+
+PX_FORCE_INLINE FloatV IZero() // Integer bit pattern 0 in every lane, carried in a FloatV.
+{
+	const PxU32 zero = 0;
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&zero));
+}
+
+PX_FORCE_INLINE FloatV IOne() // Integer bit pattern 1 in every lane, carried in a FloatV (not the float 1.0f).
+{
+	const PxU32 one = 1;
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&one));
+}
+
+PX_FORCE_INLINE FloatV ITwo() // Integer bit pattern 2 in every lane, carried in a FloatV (not the float 2.0f).
+{
+	const PxU32 two = 2;
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&two));
+}
+
+PX_FORCE_INLINE FloatV IThree() // Integer bit pattern 3 in every lane, carried in a FloatV (not the float 3.0f).
+{
+	const PxU32 three = 3;
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&three));
+}
+
+PX_FORCE_INLINE FloatV IFour() // Integer bit pattern 4 in every lane, carried in a FloatV (not the float 4.0f).
+{
+	const PxU32 four = 4;
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&four));
+}
+
+PX_FORCE_INLINE FloatV FNeg(const FloatV f) // Returns -f, computed as 0 - f.
+{
+	ASSERT_ISVALIDFLOATV(f);
+	return _mm_sub_ps(_mm_setzero_ps(), f);
+}
+
+PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b) // Returns a + b.
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+/*
+	if(!isValidFloatV(a))
+	{
+assert(false);
+	}
+	if(!isValidFloatV(b))
+	{
+assert(false);
+	}
+*/
+	return _mm_add_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b) // Returns a - b.
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_sub_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_mul_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_div_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_mul_ps(a, _mm_rcp_ps(b));
+}
+
+PX_FORCE_INLINE FloatV FRecip(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	return _mm_div_ps(FOne(), a);
+}
+
+PX_FORCE_INLINE FloatV FRecipFast(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	return _mm_rcp_ps(a);
+}
+
+// Reciprocal square root computed as FOne() / sqrt(a) with a true divide and square root
+// (FRsqrtFast uses the approximate _mm_rsqrt_ps instruction instead).
+PX_FORCE_INLINE FloatV FRsqrt(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const __m128 root = _mm_sqrt_ps(a);
+	return _mm_div_ps(FOne(), root);
+}
+
+PX_FORCE_INLINE FloatV FSqrt(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	return _mm_sqrt_ps(a);
+}
+
+PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	return _mm_rsqrt_ps(a);
+}
+
+// Multiply-add: returns a * b + c, evaluated as a separate multiply followed by an add.
+PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDFLOATV(c);
+	const FloatV product = FMul(a, b);
+	return FAdd(product, c);
+}
+
+PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDFLOATV(c);
+	return FSub(c, FMul(a, b));
+}
+
+// Absolute value: clears the top bit (the sign bit) of every lane of a.
+PX_FORCE_INLINE FloatV FAbs(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	// Every bit set except bit 31, replicated across the four lanes.
+	PX_ALIGN(16, const PxU32) signlessBits[4] = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF };
+	const __m128 keepMagnitude = _mm_load_ps(reinterpret_cast<const PxF32*>(signlessBits));
+	return _mm_and_ps(a, keepMagnitude);
+}
+
+// Bitwise select between two FloatV values: bits of a are taken where the mask c is set,
+// bits of b where it is clear. The mask is asserted to be uniformly true or uniformly false.
+PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b)
+{
+	PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c,BTTTT()) ||
+			  _VecMathTests::allElementsEqualBoolV(c,BFFFF()));
+	const __m128 fromB = _mm_andnot_ps(c, b); // bits of b where c is 0
+	const __m128 fromA = _mm_and_ps(c, a);    // bits of a where c is 1
+	const __m128 picked = _mm_or_ps(fromB, fromA);
+	ASSERT_ISVALIDFLOATV(picked);
+	return picked;
+}
+
+PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_cmpgt_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_cmpge_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_cmpeq_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_max_ps(a, b);
+}
+
+PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_min_ps(a, b);
+}
+
+// Clamps a to [minV, maxV]: the upper bound is applied first (min with maxV),
+// then the lower bound (max with minV). Only the bounds are validated, not a.
+PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV)
+{
+	ASSERT_ISVALIDFLOATV(minV);
+	ASSERT_ISVALIDFLOATV(maxV);
+	const __m128 cappedAbove = _mm_min_ps(a, maxV);
+	return _mm_max_ps(cappedAbove, minV);
+}
+
+// Returns non-zero when a > b, zero otherwise.
+// _mm_comigt_ss is a scalar compare, so only the lowest lane of each operand is examined;
+// both operands are asserted to be valid FloatV values beforehand.
+PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_comigt_ss(a, b);
+}
+
+PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_comige_ss(a, b);
+}
+
+PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_comieq_ss(a, b);
+}
+
+// Rounds every lane of a to an integral value.
+// When __SSE4_2__ is defined the hardware round-to-nearest instruction is used; otherwise
+// the rounding is emulated through 32-bit integer conversions.
+// NOTE(review): the emulated path goes through _mm_cvtps_epi32/_mm_cvttps_epi32, so it is
+// presumably only meaningful while a fits in a signed 32-bit integer - confirm with callers.
+PX_FORCE_INLINE FloatV FRound(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+#ifdef __SSE4_2__
+	return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#else
+	const FloatV bias = FLoad(0.5f);
+	// Logical shift moves the sign bit of the converted integer down to bit 0, giving
+	// 1.0f in lanes whose converted integer is negative and 0.0f in the others.
+	const __m128i converted = _mm_cvtps_epi32(a);
+	const __m128 isNegative = _mm_cvtepi32_ps(_mm_srli_epi32(converted, 31));
+	// a + 0.5, minus 1 for negative lanes, then truncate toward zero.
+	const FloatV shifted = FSub(FAdd(a, bias), isNegative);
+	return _mm_cvtepi32_ps(_mm_cvttps_epi32(shifted));
+#endif
+}
+
+// Sine of a FloatV, evaluated as an odd-power polynomial (terms up to V^23) on a
+// range-reduced copy of the input. Coefficients come from the g_PXSinCoefficients tables.
+PX_FORCE_INLINE FloatV FSin(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+
+	// Range reduction: V1 = a - twoPi * round(a * recipTwoPi), i.e. the nearest whole
+	// multiple of the period is subtracted before the polynomial is evaluated.
+	// (assumes g_PXTwoPi / g_PXReciprocalTwoPi hold 2*PI and 1/(2*PI) - TODO confirm)
+	const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const FloatV twoPi = V4LoadA(g_PXTwoPi.f);
+	const FloatV tmp = FMul(a, recipTwoPi);
+	const FloatV b = FRound(tmp);
+	const FloatV V1 = FNegScaleSub(twoPi, b, a);
+
+	// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+	//			 V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+	// Odd powers of V1, each obtained from the previous one by multiplying with V1^2.
+	const FloatV V2 = FMul(V1, V1);
+	const FloatV V3 = FMul(V2, V1);
+	const FloatV V5 = FMul(V3, V2);
+	const FloatV V7 = FMul(V5, V2);
+	const FloatV V9 = FMul(V7, V2);
+	const FloatV V11 = FMul(V9, V2);
+	const FloatV V13 = FMul(V11, V2);
+	const FloatV V15 = FMul(V13, V2);
+	const FloatV V17 = FMul(V15, V2);
+	const FloatV V19 = FMul(V17, V2);
+	const FloatV V21 = FMul(V19, V2);
+	const FloatV V23 = FMul(V21, V2);
+
+	const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	// S1..S11 are read from lanes y,z,w of table 0 and all lanes of tables 1 and 2;
+	// lane x of table 0 is not used here.
+	const FloatV S1 = V4GetY(sinCoefficients0);
+	const FloatV S2 = V4GetZ(sinCoefficients0);
+	const FloatV S3 = V4GetW(sinCoefficients0);
+	const FloatV S4 = V4GetX(sinCoefficients1);
+	const FloatV S5 = V4GetY(sinCoefficients1);
+	const FloatV S6 = V4GetZ(sinCoefficients1);
+	const FloatV S7 = V4GetW(sinCoefficients1);
+	const FloatV S8 = V4GetX(sinCoefficients2);
+	const FloatV S9 = V4GetY(sinCoefficients2);
+	const FloatV S10 = V4GetZ(sinCoefficients2);
+	const FloatV S11 = V4GetW(sinCoefficients2);
+
+	// Accumulate the series term by term, starting from V1 + S1 * V1^3.
+	FloatV Result;
+	Result = FScaleAdd(S1, V3, V1);
+	Result = FScaleAdd(S2, V5, Result);
+	Result = FScaleAdd(S3, V7, Result);
+	Result = FScaleAdd(S4, V9, Result);
+	Result = FScaleAdd(S5, V11, Result);
+	Result = FScaleAdd(S6, V13, Result);
+	Result = FScaleAdd(S7, V15, Result);
+	Result = FScaleAdd(S8, V17, Result);
+	Result = FScaleAdd(S9, V19, Result);
+	Result = FScaleAdd(S10, V21, Result);
+	Result = FScaleAdd(S11, V23, Result);
+
+	return Result;
+}
+
+PX_FORCE_INLINE FloatV FCos(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+
+	// Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
+	const FloatV recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const FloatV twoPi = V4LoadA(g_PXTwoPi.f);
+	const FloatV tmp = FMul(a, recipTwoPi);
+	const FloatV b = FRound(tmp);
+	const FloatV V1 = FNegScaleSub(twoPi, b, a);
+
+	// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+	//           V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+	const FloatV V2 = FMul(V1, V1);
+	const FloatV V4 = FMul(V2, V2);
+	const FloatV V6 = FMul(V4, V2);
+	const FloatV V8 = FMul(V4, V4);
+	const FloatV V10 = FMul(V6, V4);
+	const FloatV V12 = FMul(V6, V6);
+	const FloatV V14 = FMul(V8, V6);
+	const FloatV V16 = FMul(V8, V8);
+	const FloatV V18 = FMul(V10, V8);
+	const FloatV V20 = FMul(V10, V10);
+	const FloatV V22 = FMul(V12, V10);
+
+	const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	const FloatV C1 = V4GetY(cosCoefficients0);
+	const FloatV C2 = V4GetZ(cosCoefficients0);
+	const FloatV C3 = V4GetW(cosCoefficients0);
+	const FloatV C4 = V4GetX(cosCoefficients1);
+	const FloatV C5 = V4GetY(cosCoefficients1);
+	const FloatV C6 = V4GetZ(cosCoefficients1);
+	const FloatV C7 = V4GetW(cosCoefficients1);
+	const FloatV C8 = V4GetX(cosCoefficients2);
+	const FloatV C9 = V4GetY(cosCoefficients2);
+	const FloatV C10 = V4GetZ(cosCoefficients2);
+	const FloatV C11 = V4GetW(cosCoefficients2);
+
+	FloatV Result;
+	Result = FScaleAdd(C1, V2, V4One());
+	Result = FScaleAdd(C2, V4, Result);
+	Result = FScaleAdd(C3, V6, Result);
+	Result = FScaleAdd(C4, V8, Result);
+	Result = FScaleAdd(C5, V10, Result);
+	Result = FScaleAdd(C6, V12, Result);
+	Result = FScaleAdd(C7, V14, Result);
+	Result = FScaleAdd(C8, V16, Result);
+	Result = FScaleAdd(C9, V18, Result);
+	Result = FScaleAdd(C10, V20, Result);
+	Result = FScaleAdd(C11, V22, Result);
+
+	return Result;
+}
+
+// Returns non-zero when a lies outside [min, max], i.e. when a > max or min > a.
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(min);
+	ASSERT_ISVALIDFLOATV(max);
+	const BoolV aboveMax = FIsGrtr(a, max);
+	const BoolV belowMin = FIsGrtr(min, a);
+	// Out of bounds as soon as the combined mask is not entirely false.
+	return !BAllEqFFFF(BOr(aboveMax, belowMin));
+}
+
+// Returns non-zero when min <= a <= max, i.e. when both a >= min and max >= a hold
+// (as reported by BAllEqTTTT on the combined mask).
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(min);
+	// Terminating ';' added: the call is now a complete statement however the macro
+	// expands, matching every other assert in this file.
+	ASSERT_ISVALIDFLOATV(max);
+	const BoolV c = BAnd(FIsGrtrOrEq(a, min), FIsGrtrOrEq(max, a));
+	return BAllEqTTTT(c);
+}
+
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(bounds);
+	return FOutOfBounds(a, FNeg(bounds), bounds);
+}
+
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(bounds);
+	return FInBounds(a, FNeg(bounds), bounds);
+}
+
+//////////////////////////////////
+// VEC3V
+//////////////////////////////////
+
+// Builds a Vec3V from a FloatV: lanes x, y, z take the value of f and lane w is zero.
+PX_FORCE_INLINE Vec3V V3Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// Replace lane 0 with zero, then reverse the lane order so the zero lands in w.
+	const __m128 zeroInX = _mm_move_ss(f, FZero());
+	return _mm_shuffle_ps(zeroInX, zeroInX, _MM_SHUFFLE(0, 1, 2, 3));
+}
+
+PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+	// static on zero causes compiler crash on x64 debug_opt
+	const __m128 zero = FZero();
+	const __m128 xy = _mm_move_ss(x, y);
+	const __m128 z0 = _mm_move_ss(zero, z);
+
+	return _mm_shuffle_ps(xy, z0, _MM_SHUFFLE(1, 0, 0, 1));
+}
+
+PX_FORCE_INLINE Vec3V V3UnitX()
+{
+	const PX_ALIGN(16, PxF32) x[4] = { 1.0f, 0.0f, 0.0f, 0.0f };
+	const __m128 x128 = _mm_load_ps(x);
+	return x128;
+}
+
+PX_FORCE_INLINE Vec3V V3UnitY()
+{
+	const PX_ALIGN(16, PxF32) y[4] = { 0.0f, 1.0f, 0.0f, 0.0f };
+	const __m128 y128 = _mm_load_ps(y);
+	return y128;
+}
+
+PX_FORCE_INLINE Vec3V V3UnitZ()
+{
+	const PX_ALIGN(16, PxF32) z[4] = { 0.0f, 0.0f, 1.0f, 0.0f };
+	const __m128 z128 = _mm_load_ps(z);
+	return z128;
+}
+
+PX_FORCE_INLINE FloatV V3GetX(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+// Broadcasts the y component (lane 1) of f into every lane of the returned FloatV.
+PX_FORCE_INLINE FloatV V3GetY(const Vec3V f)
+{
+	// Terminating ';' added to match V3GetX/V3GetZ; harmless if the macro expands to nothing.
+	ASSERT_ISVALIDVEC3V(f);
+	return _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
+}
+
+PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	return _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+}
+
+PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	return V4Sel(BFTTT(), v, f);
+}
+
+PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	return V4Sel(BTFTT(), v, f);
+}
+
+PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	return V4Sel(BTTFT(), v, f);
+}
+
+PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0));
+	return V3SetY(r, V3GetX(b));
+}
+
+// Gathers the y components of a, b and c into a single vector (a.y, b.y, c.y, c.w).
+PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	// Terminating ';' added to match V3ColX/V3ColZ; harmless if the macro expands to nothing.
+	ASSERT_ISVALIDVEC3V(c);
+	// r = (a.y, a.w, c.y, c.w); V3SetY then substitutes b.y for the second lane.
+	Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1));
+	return V3SetY(r, V3GetY(b));
+}
+
+PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	Vec3V r = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2));
+	return V3SetY(r, V3GetZ(b));
+}
+
+PX_FORCE_INLINE Vec3V V3Zero()
+{
+	return V3Load(0.0f);
+}
+
+PX_FORCE_INLINE Vec3V V3Eps()
+{
+	return V3Load(PX_EPS_REAL);
+}
+PX_FORCE_INLINE Vec3V V3One()
+{
+	return V3Load(1.0f);
+}
+
+PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	return _mm_sub_ps(_mm_setzero_ps(), f);
+}
+
+PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_add_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_sub_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_mul_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_mul_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_div_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return V4ClearW(_mm_div_ps(a, b));
+}
+
+PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	return _mm_mul_ps(a, _mm_rcp_ps(b));
+}
+
+PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return V4ClearW(_mm_mul_ps(a, _mm_rcp_ps(b)));
+}
+
+// Component-wise reciprocal V3One() / a using a true divide. The x, y, z lanes come from
+// the quotient; the w lane is taken from V3Zero() via the BTTTF() select mask.
+PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 quotient = _mm_div_ps(V3One(), a);
+	return V4Sel(BTTTF(), quotient, V3Zero());
+}
+
+PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zero = V3Zero();
+	const __m128 tttf = BTTTF();
+	const __m128 recipA = _mm_rcp_ps(a);
+	return V4Sel(tttf, recipA, zero);
+}
+
+PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zero = V3Zero();
+	const __m128 tttf = BTTTF();
+	const __m128 recipA = _mm_div_ps(V3One(), _mm_sqrt_ps(a));
+	return V4Sel(tttf, recipA, zero);
+}
+
+PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zero = V3Zero();
+	const __m128 tttf = BTTTF();
+	const __m128 recipA = _mm_rsqrt_ps(a);
+	return V4Sel(tttf, recipA, zero);
+}
+
+PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDVEC3V(c);
+	return V3Add(V3Scale(a, b), c);
+}
+
+PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDVEC3V(c);
+	return V3Sub(c, V3Scale(a, b));
+}
+
+PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	return V3Add(V3Mul(a, b), c);
+}
+
+PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	return V3Sub(c, V3Mul(a, b));
+}
+
+PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return V3Max(a, V3Neg(a));
+}
+
+// Dot product of a and b, returned in every lane of a FloatV.
+// The fallback path sums the products of all four lanes, so it assumes the w-lane product
+// is zero - presumably what ASSERT_ISVALIDVEC3V guarantees; TODO confirm.
+PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+#ifdef __SSE4_2__
+	return _mm_dp_ps(a, b, 0x7f);
+#else
+	// Lane contents below are listed low to high, writing xx for a.x*b.x and so on.
+	const __m128 prod = _mm_mul_ps(a, b);                                              // (xx, yy, zz, ww)
+	const __m128 halvesSwapped = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); // (zz, ww, xx, yy)
+	const __m128 pairSums = _mm_add_ps(prod, halvesSwapped);                          // (xx+zz, yy+ww, zz+xx, ww+yy)
+	const __m128 pairsSwapped = _mm_shuffle_ps(pairSums, pairSums, _MM_SHUFFLE(2, 3, 0, 1));
+	// Every lane now holds xx + yy + zz + ww.
+	return _mm_add_ps(pairsSwapped, pairSums);
+#endif
+}
+
+// Cross product a x b:
+//   (a.y*b.z - a.z*b.y,  a.z*b.x - a.x*b.z,  a.x*b.y - a.y*b.x)
+// The w lane evaluates to a.w*b.w - a.w*b.w.
+PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	// Rotated copies of each operand; w stays in place.
+	const __m128 aYZX = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+	const __m128 aZXY = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
+	const __m128 bYZX = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+	const __m128 bZXY = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));
+	const __m128 positive = _mm_mul_ps(aYZX, bZXY);
+	const __m128 negative = _mm_mul_ps(aZXY, bYZX);
+	return _mm_sub_ps(positive, negative);
+}
+
+PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	VecCrossV v;
+	v.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	v.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	return v;
+}
+
+PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 r2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	const __m128 l2 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	return _mm_sub_ps(_mm_mul_ps(a.mL1, l2), _mm_mul_ps(a.mR1, r2));
+}
+
+PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 r2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	const __m128 l2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	return _mm_sub_ps(_mm_mul_ps(b.mR1, r2), _mm_mul_ps(b.mL1, l2));
+}
+
+PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b)
+{
+	return _mm_sub_ps(_mm_mul_ps(a.mL1, b.mR1), _mm_mul_ps(a.mR1, b.mL1));
+}
+
+PX_FORCE_INLINE FloatV V3Length(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return _mm_sqrt_ps(V3Dot(a, a));
+}
+
+PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return V3Dot(a, a);
+}
+
+PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISFINITELENGTH(a);
+	return V3ScaleInv(a, _mm_sqrt_ps(V3Dot(a, a)));
+}
+
+PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISFINITELENGTH(a);
+	return V3Scale(a, _mm_rsqrt_ps(V3Dot(a, a)));
+}
+
+// Normalizes a. When the length of a is not greater than PX_EPS_REAL the caller-supplied
+// unsafeReturnValue is returned instead of a / length.
+PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	// The threshold is compared against a FloatV length through FIsGrtr, which validates
+	// both operands as FloatV. Use the FloatV epsilon (FEps) rather than the Vec3V one so
+	// the threshold, and therefore the select mask, is the same in all four lanes.
+	// NOTE(review): V3Eps() is a Vec3V whose w lane presumably differs from x/y/z - confirm.
+	const __m128 eps = FEps();
+	const __m128 length = V3Length(a);
+	const __m128 isGreaterThanZero = FIsGrtr(length, eps);
+	return V3Sel(isGreaterThanZero, V3ScaleInv(a, length), unsafeReturnValue);
+}
+
+PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(_mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)));
+	return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a));
+}
+
+PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_cmpgt_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_cmpge_ps(a, b);
+}
+
+PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_cmpeq_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_max_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return _mm_min_ps(a, b);
+}
+
+// Returns the largest of a.x, a.y and a.z in every lane of a FloatV; a.w is not considered.
+PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	// Broadcast each of the three components, then reduce pairwise.
+	const __m128 xxxx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+	const __m128 yyyy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1));
+	const __m128 zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+	const __m128 maxXY = _mm_max_ps(xxxx, yyyy);
+	return _mm_max_ps(maxXY, zzzz);
+}
+
+PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const __m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+	const __m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1));
+	const __m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+
+	return _mm_min_ps(_mm_min_ps(shuf1, shuf2), shuf3);
+}
+
+// return (a >= 0.0f) ? 1.0f : -1.0f;
+PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 plusOne = V3One();
+	const __m128 minusOne = V3Neg(plusOne);
+	// Lanes where a >= 0 select V3One(), all other lanes select its negation.
+	const BoolV nonNegative = V3IsGrtrOrEq(a, V3Zero());
+	return V3Sel(nonNegative, plusOne, minusOne);
+}
+
+PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV)
+{
+	ASSERT_ISVALIDVEC3V(maxV);
+	ASSERT_ISVALIDVEC3V(minV);
+	return V3Max(V3Min(a, maxV), minV);
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtr(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return internalUnitSSE2Simd::BAllTrue3_R(V4IsGrtrOrEq(a, b));
+}
+
+PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	return internalUnitSSE2Simd::BAllTrue3_R(V4IsEq(a, b));
+}
+
+PX_FORCE_INLINE Vec3V V3Round(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+#ifdef __SSE4_2__
+	return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#else
+	// return _mm_round_ps(a, 0x0);
+	const Vec3V half = V3Load(0.5f);
+	const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31));
+	const Vec3V aRound = V3Sub(V3Add(a, half), signBit);
+	__m128i tmp = _mm_cvttps_epi32(aRound);
+	return _mm_cvtepi32_ps(tmp);
+#endif
+}
+
+PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	// Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
+	const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec3V tmp = V3Scale(a, recipTwoPi);
+	const Vec3V b = V3Round(tmp);
+	const Vec3V V1 = V3NegScaleSub(b, twoPi, a);
+
+	// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+	//           V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+	const Vec3V V2 = V3Mul(V1, V1);
+	const Vec3V V3 = V3Mul(V2, V1);
+	const Vec3V V5 = V3Mul(V3, V2);
+	const Vec3V V7 = V3Mul(V5, V2);
+	const Vec3V V9 = V3Mul(V7, V2);
+	const Vec3V V11 = V3Mul(V9, V2);
+	const Vec3V V13 = V3Mul(V11, V2);
+	const Vec3V V15 = V3Mul(V13, V2);
+	const Vec3V V17 = V3Mul(V15, V2);
+	const Vec3V V19 = V3Mul(V17, V2);
+	const Vec3V V21 = V3Mul(V19, V2);
+	const Vec3V V23 = V3Mul(V21, V2);
+
+	const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	const FloatV S1 = V4GetY(sinCoefficients0);
+	const FloatV S2 = V4GetZ(sinCoefficients0);
+	const FloatV S3 = V4GetW(sinCoefficients0);
+	const FloatV S4 = V4GetX(sinCoefficients1);
+	const FloatV S5 = V4GetY(sinCoefficients1);
+	const FloatV S6 = V4GetZ(sinCoefficients1);
+	const FloatV S7 = V4GetW(sinCoefficients1);
+	const FloatV S8 = V4GetX(sinCoefficients2);
+	const FloatV S9 = V4GetY(sinCoefficients2);
+	const FloatV S10 = V4GetZ(sinCoefficients2);
+	const FloatV S11 = V4GetW(sinCoefficients2);
+
+	Vec3V Result;
+	Result = V3ScaleAdd(V3, S1, V1);
+	Result = V3ScaleAdd(V5, S2, Result);
+	Result = V3ScaleAdd(V7, S3, Result);
+	Result = V3ScaleAdd(V9, S4, Result);
+	Result = V3ScaleAdd(V11, S5, Result);
+	Result = V3ScaleAdd(V13, S6, Result);
+	Result = V3ScaleAdd(V15, S7, Result);
+	Result = V3ScaleAdd(V17, S8, Result);
+	Result = V3ScaleAdd(V19, S9, Result);
+	Result = V3ScaleAdd(V21, S10, Result);
+	Result = V3ScaleAdd(V23, S11, Result);
+
+	ASSERT_ISVALIDVEC3V(Result);
+	return Result;
+}
+
+PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	// Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
+	const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec3V tmp = V3Scale(a, recipTwoPi);
+	const Vec3V b = V3Round(tmp);
+	const Vec3V V1 = V3NegScaleSub(b, twoPi, a);
+
+	// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+	//           V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+	const Vec3V V2 = V3Mul(V1, V1);
+	const Vec3V V4 = V3Mul(V2, V2);
+	const Vec3V V6 = V3Mul(V4, V2);
+	const Vec3V V8 = V3Mul(V4, V4);
+	const Vec3V V10 = V3Mul(V6, V4);
+	const Vec3V V12 = V3Mul(V6, V6);
+	const Vec3V V14 = V3Mul(V8, V6);
+	const Vec3V V16 = V3Mul(V8, V8);
+	const Vec3V V18 = V3Mul(V10, V8);
+	const Vec3V V20 = V3Mul(V10, V10);
+	const Vec3V V22 = V3Mul(V12, V10);
+
+	const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	const FloatV C1 = V4GetY(cosCoefficients0);
+	const FloatV C2 = V4GetZ(cosCoefficients0);
+	const FloatV C3 = V4GetW(cosCoefficients0);
+	const FloatV C4 = V4GetX(cosCoefficients1);
+	const FloatV C5 = V4GetY(cosCoefficients1);
+	const FloatV C6 = V4GetZ(cosCoefficients1);
+	const FloatV C7 = V4GetW(cosCoefficients1);
+	const FloatV C8 = V4GetX(cosCoefficients2);
+	const FloatV C9 = V4GetY(cosCoefficients2);
+	const FloatV C10 = V4GetZ(cosCoefficients2);
+	const FloatV C11 = V4GetW(cosCoefficients2);
+
+	Vec3V Result;
+	Result = V3ScaleAdd(V2, C1, V3One());
+	Result = V3ScaleAdd(V4, C2, Result);
+	Result = V3ScaleAdd(V6, C3, Result);
+	Result = V3ScaleAdd(V8, C4, Result);
+	Result = V3ScaleAdd(V10, C5, Result);
+	Result = V3ScaleAdd(V12, C6, Result);
+	Result = V3ScaleAdd(V14, C7, Result);
+	Result = V3ScaleAdd(V16, C8, Result);
+	Result = V3ScaleAdd(V18, C9, Result);
+	Result = V3ScaleAdd(V20, C10, Result);
+	Result = V3ScaleAdd(V22, C11, Result);
+
+	ASSERT_ISVALIDVEC3V(Result);
+	return Result;
+}
+
+PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1));
+}
+
+PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0));
+}
+
+PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+}
+
+PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
+}
+
+PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2));
+}
+
+PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1));
+}
+
+PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+	return _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3));
+}
+
+PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+	return _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2));
+}
+
+// Builds a vector from V3Zero() with its x replaced by v1.y and its y replaced by v0.x.
+PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+	// Assembled from accessors rather than a single shuffle; there may be a cheaper way.
+	const FloatV v1y = V3GetY(v1);
+	const FloatV v0x = V3GetX(v0);
+	const Vec3V withX = V3SetX(V3Zero(), v1y);
+	return V3SetY(withX, v0x);
+}
+
+// Sum of the components of a, returned in every lane of a FloatV.
+// The horizontal-add path adds all four lanes (x + y + z + w) while the fallback path adds
+// only x + y + z; the two agree when a.w is zero.
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+#ifdef __SSE4_2__
+	Vec3V r = _mm_hadd_ps(a, a);
+	r = _mm_hadd_ps(r, r);
+	return r;
+#else
+	__m128 shuf1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+	__m128 shuf2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+	__m128 shuf3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+	return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3);
+#endif
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV c = BOr(V3IsGrtr(a, max), V3IsGrtr(min, a));
+	return !BAllEqFFFF(c);
+}
+
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV c = BAnd(V3IsGrtrOrEq(a, min), V3IsGrtrOrEq(max, a));
+	return BAllEqTTTT(c);
+}
+
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+	return V3OutOfBounds(a, V3Neg(bounds), bounds);
+}
+
+// Symmetric bounds test: forwards to the three-argument V3InBounds with the range
+// [-bounds, bounds].
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	// Terminating ';' added to match the two-argument V3OutOfBounds; harmless if the macro
+	// expands to nothing.
+	ASSERT_ISVALIDVEC3V(bounds);
+	return V3InBounds(a, V3Neg(bounds), bounds);
+}
+
+// Transposes, in place, the 3x3 matrix whose columns are col0, col1 and col2.
+// A zero vector stands in for the missing fourth column, so the w lanes of the outputs are zero.
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+	ASSERT_ISVALIDVEC3V(col0);
+	ASSERT_ISVALIDVEC3V(col1);
+	ASSERT_ISVALIDVEC3V(col2);
+
+	const Vec3V zeroCol = _mm_setzero_ps();
+	// Interleave the column pairs; lanes are listed low to high.
+	const Vec3V lo01 = _mm_unpacklo_ps(col0, col1);    // (c0.x, c1.x, c0.y, c1.y)
+	const Vec3V lo2z = _mm_unpacklo_ps(col2, zeroCol); // (c2.x, 0,    c2.y, 0)
+	const Vec3V hi01 = _mm_unpackhi_ps(col0, col1);    // (c0.z, c1.z, c0.w, c1.w)
+	const Vec3V hi2z = _mm_unpackhi_ps(col2, zeroCol); // (c2.z, 0,    c2.w, 0)
+	// All inputs have been read above, so the references can be overwritten safely.
+	col0 = _mm_movelh_ps(lo01, lo2z); // (c0.x, c1.x, c2.x, 0)
+	col1 = _mm_movehl_ps(lo2z, lo01); // (c0.y, c1.y, c2.y, 0)
+	col2 = _mm_movelh_ps(hi01, hi2z); // (c0.z, c1.z, c2.z, 0)
+}
+
+//////////////////////////////////
+// VEC4V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0));
+	return f;
+}
+
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+	ASSERT_ISVALIDFLOATV(floatVArray[0]);
+	ASSERT_ISVALIDFLOATV(floatVArray[1]);
+	ASSERT_ISVALIDFLOATV(floatVArray[2]);
+	ASSERT_ISVALIDFLOATV(floatVArray[3]);
+	const __m128 xw = _mm_move_ss(floatVArray[1], floatVArray[0]); // y, y, y, x
+	const __m128 yz = _mm_move_ss(floatVArray[2], floatVArray[3]); // z, z, z, w
+	return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+// Packs four FloatV values into one Vec4V (x, y, z, w).
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+	ASSERT_ISVALIDFLOATV(w);
+	// Lanes listed low to high.
+	const __m128 xThenY = _mm_move_ss(y, x); // (x, y, y, y)
+	const __m128 wThenZ = _mm_move_ss(z, w); // (w, z, z, z)
+	// Low half takes lanes 0,1 of xThenY; high half takes lanes 2,0 of wThenZ.
+	return _mm_shuffle_ps(xThenY, wThenZ, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+// Collects the w lanes of the four inputs into one vector: (x.w, y.w, z.w, w.w).
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	// Lanes listed low to high.
+	const Vec4V highXZ = _mm_unpackhi_ps(x, z); // (x.z, z.z, x.w, z.w)
+	const Vec4V highYW = _mm_unpackhi_ps(y, w); // (y.z, w.z, y.w, w.w)
+	return _mm_unpackhi_ps(highXZ, highYW);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz = _mm_unpackhi_ps(x, z);
+	const Vec4V yw = _mm_unpackhi_ps(y, w);
+	return _mm_unpacklo_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz = _mm_unpacklo_ps(x, z);
+	const Vec4V yw = _mm_unpacklo_ps(y, w);
+	return _mm_unpackhi_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V xz = _mm_unpacklo_ps(x, z);
+	const Vec4V yw = _mm_unpacklo_ps(y, w);
+	return _mm_unpacklo_ps(xz, yw);
+}
+
+PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b)
+{
+	return _mm_unpacklo_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b)
+{
+	return _mm_unpackhi_ps(a, b);
+}
+
+PX_FORCE_INLINE Vec4V V4UnitW()
+{
+	const PX_ALIGN(16, PxF32) w[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+	const __m128 w128 = _mm_load_ps(w);
+	return w128;
+}
+
+// Returns the unit vector (1, 0, 0, 0).
+PX_FORCE_INLINE Vec4V V4UnitX()
+{
+	// 16-byte aligned storage so the aligned load below is valid.
+	const PX_ALIGN(16, PxF32) unitX[4] = { 1.0f, 0.0f, 0.0f, 0.0f };
+	return _mm_load_ps(unitX);
+}
+
+// Returns the unit vector (0, 1, 0, 0).
+PX_FORCE_INLINE Vec4V V4UnitY()
+{
+	// 16-byte aligned storage so the aligned load below is valid.
+	const PX_ALIGN(16, PxF32) unitY[4] = { 0.0f, 1.0f, 0.0f, 0.0f };
+	return _mm_load_ps(unitY);
+}
+
+// Returns the unit vector (0, 0, 1, 0).
+PX_FORCE_INLINE Vec4V V4UnitZ()
+{
+	// 16-byte aligned storage so the aligned load below is valid.
+	const PX_ALIGN(16, PxF32) unitZ[4] = { 0.0f, 0.0f, 1.0f, 0.0f };
+	return _mm_load_ps(unitZ);
+}
+
+// Broadcasts lane 3 (w) of f to all four lanes.
+PX_FORCE_INLINE FloatV V4GetW(const Vec4V f)
+{
+	const FloatV splatW = _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3));
+	return splatW;
+}
+
+// Broadcasts lane 0 (x) of f to all four lanes.
+PX_FORCE_INLINE FloatV V4GetX(const Vec4V f)
+{
+	const FloatV splatX = _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
+	return splatX;
+}
+
+// Broadcasts lane 1 (y) of f to all four lanes.
+PX_FORCE_INLINE FloatV V4GetY(const Vec4V f)
+{
+	const FloatV splatY = _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
+	return splatY;
+}
+
+// Broadcasts lane 2 (z) of f to all four lanes.
+PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f)
+{
+	const FloatV splatZ = _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+	return splatZ;
+}
+
+// Returns v with its w lane taken from f.
+PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BTTTF();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its x lane taken from f.
+PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BFTTT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its y lane taken from f.
+PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BTFTT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its z lane taken from f.
+PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BTTFT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Bitwise-ANDs v with internalUnitSSE2Simd::gMaskXYZ and returns the result.
+// NOTE(review): judging by its name the mask keeps x/y/z and zeroes w - confirm at the mask's definition.
+PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v)
+{
+#if !PX_EMSCRIPTEN
+	return _mm_and_ps(v, V4LoadA(internalUnitSSE2Simd::gMaskXYZ));
+#else
+	// Emscripten build: the mask is reinterpreted in place instead of being loaded through V4LoadA.
+	return _mm_and_ps(v, (VecI32V&)internalUnitSSE2Simd::gMaskXYZ);
+#endif
+}
+
+// Returns (a.y, a.x, a.w, a.z).
+PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a)
+{
+	// _MM_SHUFFLE lists source lanes from result lane 3 down to result lane 0.
+	const Vec4V permuted = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+	return permuted;
+}
+
+// Returns (a.x, a.z, a.x, a.z).
+PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a)
+{
+	// _MM_SHUFFLE lists source lanes from result lane 3 down to result lane 0.
+	const Vec4V permuted = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0));
+	return permuted;
+}
+
+// Returns (a.y, a.w, a.y, a.w).
+PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a)
+{
+	// _MM_SHUFFLE lists source lanes from result lane 3 down to result lane 0.
+	const Vec4V permuted = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1));
+	return permuted;
+}
+
+// Returns (a.y, a.z, a.x, a.w).
+PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a)
+{
+	// _MM_SHUFFLE lists source lanes from result lane 3 down to result lane 0.
+	const Vec4V permuted = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+	return permuted;
+}
+
+// Compile-time permutation: result lane 0 = a[x], lane 1 = a[y], lane 2 = a[z], lane 3 = a[w].
+template <PxU8 x, PxU8 y, PxU8 z, PxU8 w>
+PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a)
+{
+	// _MM_SHUFFLE takes its selectors from lane 3 down to lane 0, hence the reversed order.
+	const Vec4V permuted = _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x));
+	return permuted;
+}
+
+// Returns V4Load(0.0f).
+PX_FORCE_INLINE Vec4V V4Zero()
+{
+	const Vec4V zeros = V4Load(0.0f);
+	return zeros;
+}
+
+// Returns V4Load(1.0f).
+PX_FORCE_INLINE Vec4V V4One()
+{
+	const Vec4V ones = V4Load(1.0f);
+	return ones;
+}
+
+// Returns V4Load(PX_EPS_REAL), the epsilon constant used by the Vec4V routines.
+PX_FORCE_INLINE Vec4V V4Eps()
+{
+	const Vec4V epsilon = V4Load(PX_EPS_REAL);
+	return epsilon;
+}
+
+// Per-lane negation, computed as 0 - f.
+PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f)
+{
+	const __m128 zeros = _mm_setzero_ps();
+	return _mm_sub_ps(zeros, f);
+}
+
+// Per-lane sum a + b.
+PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b)
+{
+	const Vec4V sum = _mm_add_ps(a, b);
+	return sum;
+}
+
+// Per-lane difference a - b.
+PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b)
+{
+	const Vec4V difference = _mm_sub_ps(a, b);
+	return difference;
+}
+
+// Per-lane product of a and the FloatV b.
+// NOTE(review): b is not asserted here; presumably it holds the same scalar in every lane.
+PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b)
+{
+	const Vec4V scaled = _mm_mul_ps(a, b);
+	return scaled;
+}
+
+// Per-lane product a * b.
+PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b)
+{
+	const Vec4V product = _mm_mul_ps(a, b);
+	return product;
+}
+
+// Per-lane quotient a / b using full-precision division; b is a validated FloatV.
+PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V quotient = _mm_div_ps(a, b);
+	return quotient;
+}
+
+// Per-lane quotient a / b using full-precision division.
+PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b)
+{
+	const Vec4V quotient = _mm_div_ps(a, b);
+	return quotient;
+}
+
+// Approximate a / b: multiplies a by the hardware reciprocal estimate of the validated FloatV b.
+PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 approxRecip = _mm_rcp_ps(b);
+	return _mm_mul_ps(a, approxRecip);
+}
+
+// Approximate per-lane a / b: multiplies a by the hardware reciprocal estimate of b.
+PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b)
+{
+	const __m128 approxRecip = _mm_rcp_ps(b);
+	return _mm_mul_ps(a, approxRecip);
+}
+
+// Per-lane reciprocal, computed as V4One() / a with full-precision division.
+PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a)
+{
+	const Vec4V numerator = V4One();
+	return _mm_div_ps(numerator, a);
+}
+
+// Per-lane reciprocal using the hardware estimate (_mm_rcp_ps).
+PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a)
+{
+	const Vec4V approxRecip = _mm_rcp_ps(a);
+	return approxRecip;
+}
+
+// Per-lane reciprocal square root, computed as V4One() / sqrt(a).
+PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a)
+{
+	const __m128 root = _mm_sqrt_ps(a);
+	return _mm_div_ps(V4One(), root);
+}
+
+// Per-lane reciprocal square root using the hardware estimate (_mm_rsqrt_ps).
+PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a)
+{
+	const Vec4V approxRsqrt = _mm_rsqrt_ps(a);
+	return approxRsqrt;
+}
+
+// Per-lane square root.
+PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a)
+{
+	const Vec4V root = _mm_sqrt_ps(a);
+	return root;
+}
+
+// Returns a * b + c, where b is a validated FloatV.
+PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V scaled = V4Scale(a, b);
+	return V4Add(scaled, c);
+}
+
+// Returns c - a * b, where b is a validated FloatV.
+PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V scaled = V4Scale(a, b);
+	return V4Sub(c, scaled);
+}
+
+// Returns a * b + c per lane (separate multiply and add, not fused).
+PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	const Vec4V product = V4Mul(a, b);
+	return V4Add(product, c);
+}
+
+// Returns c - a * b per lane.
+PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	const Vec4V product = V4Mul(a, b);
+	return V4Sub(c, product);
+}
+
+// Per-lane absolute value, computed as max(a, -a).
+PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a)
+{
+	const Vec4V negated = V4Neg(a);
+	return V4Max(a, negated);
+}
+
+// Returns x + y + z + w of a as a FloatV.
+PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a)
+{
+#ifdef __SSE4_2__
+	// Two horizontal adds: first pairwise sums, then the sum of those pairs.
+	const Vec4V pairSums = _mm_hadd_ps(a, a);
+	return _mm_hadd_ps(pairSums, pairSums);
+#else
+	const Vec4V lowDup = V4UnpackXY(a, a);        // x,x,y,y
+	const Vec4V highDup = V4UnpackZW(a, a);       // z,z,w,w
+	const Vec4V partial = V4Add(lowDup, highDup); // x+z,x+z,y+w,y+w
+	const FloatV sumXZ = V4GetX(partial);
+	const FloatV sumYW = V4GetZ(partial);
+	return FAdd(sumXZ, sumYW);                    // (x+z) + (y+w)
+#endif
+}
+
+// Four-component dot product of a and b, replicated into every lane.
+PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b)
+{
+#ifdef __SSE4_2__
+	// Mask 0xff: multiply all four lanes and write the sum to all four lanes.
+	return _mm_dp_ps(a, b, 0xff);
+#else
+	const __m128 products = _mm_mul_ps(a, b);                                         // x,y,z,w
+	const __m128 rotated1 = _mm_shuffle_ps(products, products, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+	const __m128 rotated2 = _mm_shuffle_ps(products, products, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+	const __m128 rotated3 = _mm_shuffle_ps(products, products, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+	// Every lane now sees each of the four products exactly once across the four vectors.
+	const __m128 sumA = _mm_add_ps(rotated2, rotated3);
+	const __m128 sumB = _mm_add_ps(products, rotated1);
+	return _mm_add_ps(sumA, sumB);
+#endif
+}
+
+// Three-component (x, y, z) dot product of a and b, replicated into every lane; w is ignored.
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b)
+{
+#ifdef __SSE4_2__
+	// Mask 0x7f: multiply lanes 0..2 only and write the sum to all four lanes.
+	return _mm_dp_ps(a, b, 0x7f);
+#else
+	const __m128 products = _mm_mul_ps(a, b);                                       // x,y,z,w
+	const __m128 splatX = _mm_shuffle_ps(products, products, _MM_SHUFFLE(0, 0, 0, 0)); // x,x,x,x
+	const __m128 splatY = _mm_shuffle_ps(products, products, _MM_SHUFFLE(1, 1, 1, 1)); // y,y,y,y
+	const __m128 splatZ = _mm_shuffle_ps(products, products, _MM_SHUFFLE(2, 2, 2, 2)); // z,z,z,z
+	const __m128 sumXY = _mm_add_ps(splatX, splatY);
+	return _mm_add_ps(sumXY, splatZ);
+#endif
+}
+
+// Cross product of the x/y/z parts of a and b. The w lane evaluates to a.w*b.w - a.w*b.w.
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+	const __m128 aYZX = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	const __m128 bZXY = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	const __m128 aZXY = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	const __m128 bYZX = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	const __m128 positiveTerms = _mm_mul_ps(aYZX, bZXY);
+	const __m128 negativeTerms = _mm_mul_ps(aZXY, bYZX);
+	return _mm_sub_ps(positiveTerms, negativeTerms);
+}
+
+// Euclidean length of the four-component vector a: sqrt(dot(a, a)).
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+	const FloatV lengthSquared = V4Dot(a, a);
+	return _mm_sqrt_ps(lengthSquared);
+}
+
+// Squared length of the four-component vector a: dot(a, a).
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+	const FloatV lengthSquared = V4Dot(a, a);
+	return lengthSquared;
+}
+
+// Returns a divided by its length. There is no zero-length guard; see V4NormalizeSafe.
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+	ASSERT_ISFINITELENGTH(a);
+	const FloatV lengthSquared = V4Dot(a, a);
+	const FloatV length = _mm_sqrt_ps(lengthSquared);
+	return V4ScaleInv(a, length);
+}
+
+// Like V4Normalize, but divides through V4ScaleInvFast (reciprocal estimate), so the result is approximate.
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+	ASSERT_ISFINITELENGTH(a);
+	const FloatV lengthSquared = V4Dot(a, a);
+	const FloatV length = _mm_sqrt_ps(lengthSquared);
+	return V4ScaleInvFast(a, length);
+}
+
+// Normalizes the four-component vector a. If its length is not greater than the epsilon threshold,
+// unsafeReturnValue is returned instead.
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec3V unsafeReturnValue)
+{
+	// Use the Vec4V epsilon so all four lanes (including w) are compared against the same threshold.
+	// A Vec3V constant does not guarantee what its w lane holds, which could make the w lane of the
+	// result come from a / length while x/y/z come from unsafeReturnValue.
+	const __m128 eps = V4Eps();
+	const __m128 length = V4Length(a);
+	const __m128 isGreaterThanZero = V4IsGrtr(length, eps);
+	// Lanes where length > eps take the normalized value; the others take the fallback.
+	return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue);
+}
+
+// Per-lane 32-bit integer equality; each lane is all-ones where a == b and zero otherwise.
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+	const __m128i equalMask = _mm_cmpeq_epi32(m128_F2I(a), m128_F2I(b));
+	return m128_I2F(equalMask);
+}
+
+// Bitwise select: takes bits of a where c is set and bits of b where c is clear.
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+	const __m128 fromB = _mm_andnot_ps(c, b); // ~c & b
+	const __m128 fromA = _mm_and_ps(c, a);    //  c & a
+	return _mm_or_ps(fromB, fromA);
+}
+
+// Per-lane a > b; each lane is all-ones when true and zero otherwise.
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterMask = _mm_cmpgt_ps(a, b);
+	return greaterMask;
+}
+
+// Per-lane a >= b; each lane is all-ones when true and zero otherwise.
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterOrEqualMask = _mm_cmpge_ps(a, b);
+	return greaterOrEqualMask;
+}
+
+// Per-lane floating-point a == b; each lane is all-ones when true and zero otherwise.
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV equalMask = _mm_cmpeq_ps(a, b);
+	return equalMask;
+}
+
+// Per-lane maximum of a and b.
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+	const Vec4V larger = _mm_max_ps(a, b);
+	return larger;
+}
+
+// Per-lane minimum of a and b.
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+	const Vec4V smaller = _mm_min_ps(a, b);
+	return smaller;
+}
+
+// Largest of the four lanes of a, replicated into every lane.
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+	// Three rotations of a, so each lane sees all four components across {a, rot1, rot2, rot3}.
+	const __m128 rot1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+	const __m128 rot2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+	const __m128 rot3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+
+	const __m128 maxA = _mm_max_ps(a, rot1);
+	const __m128 maxB = _mm_max_ps(rot2, rot3);
+	return _mm_max_ps(maxA, maxB);
+}
+
+// Smallest of the four lanes of a, replicated into every lane.
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+	// Three rotations of a, so each lane sees all four components across {a, rot1, rot2, rot3}.
+	const __m128 rot1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+	const __m128 rot2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+	const __m128 rot3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+
+	const __m128 minA = _mm_min_ps(a, rot1);
+	const __m128 minB = _mm_min_ps(rot2, rot3);
+	return _mm_min_ps(minA, minB);
+}
+
+// Per-lane clamp of a to [minV, maxV]: the upper bound is applied first, then the lower bound.
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+	const Vec4V cappedAbove = V4Min(a, maxV);
+	return V4Max(cappedAbove, minV);
+}
+
+// Reduces the per-lane a > b mask through BAllTrue4_R.
+// NOTE(review): presumably non-zero only when all four lanes compare true - confirm in BAllTrue4_R.
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterMask = V4IsGrtr(a, b);
+	return internalUnitSSE2Simd::BAllTrue4_R(greaterMask);
+}
+
+// Reduces the per-lane a >= b mask through BAllTrue4_R.
+// NOTE(review): presumably non-zero only when all four lanes compare true - confirm in BAllTrue4_R.
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterOrEqualMask = V4IsGrtrOrEq(a, b);
+	return internalUnitSSE2Simd::BAllTrue4_R(greaterOrEqualMask);
+}
+
+// Reduces the per-lane a >= b mask through BAllTrue3_R.
+// NOTE(review): presumably only the x/y/z lanes are considered - confirm in BAllTrue3_R.
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterOrEqualMask = V4IsGrtrOrEq(a, b);
+	return internalUnitSSE2Simd::BAllTrue3_R(greaterOrEqualMask);
+}
+
+// Reduces the per-lane a == b mask through BAllTrue4_R.
+// NOTE(review): presumably non-zero only when all four lanes compare equal - confirm in BAllTrue4_R.
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV equalMask = V4IsEq(a, b);
+	return internalUnitSSE2Simd::BAllTrue4_R(equalMask);
+}
+
+// Reduces the per-lane a > b mask through BAnyTrue3_R.
+// NOTE(review): presumably non-zero when any of the x/y/z lanes compares true - confirm in BAnyTrue3_R.
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterMask = V4IsGrtr(a, b);
+	return internalUnitSSE2Simd::BAnyTrue3_R(greaterMask);
+}
+
+// Rounds each lane of a to an integral value, returned as floats.
+// NOTE(review): the two paths may treat exact .5 ties differently (the SSE4 path uses the
+// hardware nearest-int mode, the fallback adds 0.5 and truncates) - confirm callers do not depend on ties.
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+#ifdef __SSE4_2__
+	return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#else
+	// return _mm_round_ps(a, 0x0);
+	const Vec4V half = V4Load(0.5f);
+	// Convert a to int32, shift the sign bit down to bit 0 and convert back:
+	// 1.0f in lanes whose converted integer is negative, 0.0f otherwise.
+	const __m128 signBit = _mm_cvtepi32_ps(_mm_srli_epi32(_mm_cvtps_epi32(a), 31));
+	// a + 0.5 for non-negative lanes, a - 0.5 for negative lanes.
+	const Vec4V aRound = V4Sub(V4Add(a, half), signBit);
+	// Truncate toward zero; goes through int32, so inputs are assumed to fit in that range.
+	__m128i tmp = _mm_cvttps_epi32(aRound);
+	return _mm_cvtepi32_ps(tmp);
+#endif
+}
+
+// Per-lane sine approximation: reduces the argument with V4Round, then evaluates an odd
+// polynomial up to V^23 using coefficients loaded from g_PXSinCoefficients0..2.
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+	// Range reduction: V1 = a - twoPi * round(a * recipTwoPi).
+	const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec4V tmp = V4Mul(a, recipTwoPi);
+	const Vec4V b = V4Round(tmp);
+	const Vec4V V1 = V4NegMulSub(twoPi, b, a);
+
+	// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
+	//           V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
+	// Odd powers of V1, each built from the previous one times V1^2.
+	const Vec4V V2 = V4Mul(V1, V1);
+	const Vec4V V3 = V4Mul(V2, V1);
+	const Vec4V V5 = V4Mul(V3, V2);
+	const Vec4V V7 = V4Mul(V5, V2);
+	const Vec4V V9 = V4Mul(V7, V2);
+	const Vec4V V11 = V4Mul(V9, V2);
+	const Vec4V V13 = V4Mul(V11, V2);
+	const Vec4V V15 = V4Mul(V13, V2);
+	const Vec4V V17 = V4Mul(V15, V2);
+	const Vec4V V19 = V4Mul(V17, V2);
+	const Vec4V V21 = V4Mul(V19, V2);
+	const Vec4V V23 = V4Mul(V21, V2);
+
+	const Vec4V sinCoefficients0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V sinCoefficients1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V sinCoefficients2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	// Broadcast one coefficient per term. Lane x of sinCoefficients0 is not read here;
+	// presumably it is the coefficient of the linear term (1), which is applied by seeding with V1.
+	const FloatV S1 = V4GetY(sinCoefficients0);
+	const FloatV S2 = V4GetZ(sinCoefficients0);
+	const FloatV S3 = V4GetW(sinCoefficients0);
+	const FloatV S4 = V4GetX(sinCoefficients1);
+	const FloatV S5 = V4GetY(sinCoefficients1);
+	const FloatV S6 = V4GetZ(sinCoefficients1);
+	const FloatV S7 = V4GetW(sinCoefficients1);
+	const FloatV S8 = V4GetX(sinCoefficients2);
+	const FloatV S9 = V4GetY(sinCoefficients2);
+	const FloatV S10 = V4GetZ(sinCoefficients2);
+	const FloatV S11 = V4GetW(sinCoefficients2);
+
+	// Accumulate the series term by term, starting from the linear term V1.
+	Vec4V Result;
+	Result = V4MulAdd(S1, V3, V1);
+	Result = V4MulAdd(S2, V5, Result);
+	Result = V4MulAdd(S3, V7, Result);
+	Result = V4MulAdd(S4, V9, Result);
+	Result = V4MulAdd(S5, V11, Result);
+	Result = V4MulAdd(S6, V13, Result);
+	Result = V4MulAdd(S7, V15, Result);
+	Result = V4MulAdd(S8, V17, Result);
+	Result = V4MulAdd(S9, V19, Result);
+	Result = V4MulAdd(S10, V21, Result);
+	Result = V4MulAdd(S11, V23, Result);
+
+	return Result;
+}
+
+// Per-lane cosine approximation: reduces the argument with V4Round, then evaluates an even
+// polynomial up to V^22 using coefficients loaded from g_PXCosCoefficients0..2.
+PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a)
+{
+	// Range reduction: V1 = a - twoPi * round(a * recipTwoPi).
+	const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec4V tmp = V4Mul(a, recipTwoPi);
+	const Vec4V b = V4Round(tmp);
+	const Vec4V V1 = V4NegMulSub(twoPi, b, a);
+
+	// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+	//           V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+	// Even powers of V1, built from products of lower even powers.
+	const Vec4V V2 = V4Mul(V1, V1);
+	const Vec4V V4 = V4Mul(V2, V2);
+	const Vec4V V6 = V4Mul(V4, V2);
+	const Vec4V V8 = V4Mul(V4, V4);
+	const Vec4V V10 = V4Mul(V6, V4);
+	const Vec4V V12 = V4Mul(V6, V6);
+	const Vec4V V14 = V4Mul(V8, V6);
+	const Vec4V V16 = V4Mul(V8, V8);
+	const Vec4V V18 = V4Mul(V10, V8);
+	const Vec4V V20 = V4Mul(V10, V10);
+	const Vec4V V22 = V4Mul(V12, V10);
+
+	const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	// Broadcast one coefficient per term. Lane x of cosCoefficients0 is not read here;
+	// presumably it is the constant term (1), which is applied by seeding with V4One().
+	const FloatV C1 = V4GetY(cosCoefficients0);
+	const FloatV C2 = V4GetZ(cosCoefficients0);
+	const FloatV C3 = V4GetW(cosCoefficients0);
+	const FloatV C4 = V4GetX(cosCoefficients1);
+	const FloatV C5 = V4GetY(cosCoefficients1);
+	const FloatV C6 = V4GetZ(cosCoefficients1);
+	const FloatV C7 = V4GetW(cosCoefficients1);
+	const FloatV C8 = V4GetX(cosCoefficients2);
+	const FloatV C9 = V4GetY(cosCoefficients2);
+	const FloatV C10 = V4GetZ(cosCoefficients2);
+	const FloatV C11 = V4GetW(cosCoefficients2);
+
+	// Accumulate the series term by term, starting from the constant term 1.
+	Vec4V Result;
+	Result = V4MulAdd(C1, V2, V4One());
+	Result = V4MulAdd(C2, V4, Result);
+	Result = V4MulAdd(C3, V6, Result);
+	Result = V4MulAdd(C4, V8, Result);
+	Result = V4MulAdd(C5, V10, Result);
+	Result = V4MulAdd(C6, V12, Result);
+	Result = V4MulAdd(C7, V14, Result);
+	Result = V4MulAdd(C8, V16, Result);
+	Result = V4MulAdd(C9, V18, Result);
+	Result = V4MulAdd(C10, V20, Result);
+	Result = V4MulAdd(C11, V22, Result);
+
+	return Result;
+}
+
+// Transposes the 4x4 matrix formed by the four vectors, in place.
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+	// Interleave pairs of inputs; all four temporaries are formed before any output is written.
+	const Vec4V low01 = _mm_unpacklo_ps(col0, col1);  // c0.x, c1.x, c0.y, c1.y
+	const Vec4V low23 = _mm_unpacklo_ps(col2, col3);  // c2.x, c3.x, c2.y, c3.y
+	const Vec4V high01 = _mm_unpackhi_ps(col0, col1); // c0.z, c1.z, c0.w, c1.w
+	const Vec4V high23 = _mm_unpackhi_ps(col2, col3); // c2.z, c3.z, c2.w, c3.w
+	col0 = _mm_movelh_ps(low01, low23);   // all x components
+	col1 = _mm_movehl_ps(low23, low01);   // all y components
+	col2 = _mm_movelh_ps(high01, high23); // all z components
+	col3 = _mm_movehl_ps(high23, high01); // all w components
+}
+
+//////////////////////////////////
+// BoolV
+//////////////////////////////////
+
+// Mask with all four lanes false (all bits zero).
+PX_FORCE_INLINE BoolV BFFFF()
+{
+	const BoolV allFalse = _mm_setzero_ps();
+	return allFalse;
+}
+
+// Mask (x, y, z, w) = (F, F, F, T); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BFFFT()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, 0, 0, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (F, F, T, F); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BFFTF()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, -1, 0, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (F, F, T, T); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BFFTT()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, -1, 0, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (F, T, F, F); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BFTFF()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, 0, -1, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (F, T, F, T); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BFTFT()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, 0, -1, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (F, T, T, F); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BFTTF()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, -1, -1, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (F, T, T, T); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BFTTT()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, -1, -1, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (T, F, F, F); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BTFFF()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, 0, 0, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (T, F, F, T); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BTFFT()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, 0, 0, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (T, F, T, F); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BTFTF()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, -1, 0, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (T, F, T, T); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BTFTT()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, -1, 0, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (T, T, F, F); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BTTFF()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, 0, -1, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (T, T, F, T); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BTTFT()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, 0, -1, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask (x, y, z, w) = (T, T, T, F); a true lane has all 32 bits set.
+PX_FORCE_INLINE BoolV BTTTF()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, -1, -1, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask with all four lanes true (all bits set).
+PX_FORCE_INLINE BoolV BTTTT()
+{
+	const __m128i laneBits = _mm_set_epi32(-1, -1, -1, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask with only the x lane set; same bit pattern as BTFFF().
+PX_FORCE_INLINE BoolV BXMask()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, 0, 0, -1);
+	return m128_I2F(laneBits);
+}
+
+// Mask with only the y lane set; same bit pattern as BFTFF().
+PX_FORCE_INLINE BoolV BYMask()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, 0, -1, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask with only the z lane set; same bit pattern as BFFTF().
+PX_FORCE_INLINE BoolV BZMask()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(0, -1, 0, 0);
+	return m128_I2F(laneBits);
+}
+
+// Mask with only the w lane set; same bit pattern as BFFFT().
+PX_FORCE_INLINE BoolV BWMask()
+{
+	// _mm_set_epi32 takes its arguments from lane 3 (w) down to lane 0 (x).
+	const __m128i laneBits = _mm_set_epi32(-1, 0, 0, 0);
+	return m128_I2F(laneBits);
+}
+
+// Broadcasts the x lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetX(const BoolV f)
+{
+	const BoolV splatX = _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
+	return splatX;
+}
+
+// Broadcasts the y lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetY(const BoolV f)
+{
+	const BoolV splatY = _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
+	return splatY;
+}
+
+// Broadcasts the z lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetZ(const BoolV f)
+{
+	const BoolV splatZ = _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+	return splatZ;
+}
+
+// Broadcasts the w lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetW(const BoolV f)
+{
+	const BoolV splatW = _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3));
+	return splatW;
+}
+
+// Returns v with its x lane taken from f.
+PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f)
+{
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BFTTT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its y lane taken from f.
+PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f)
+{
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BTFTT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its z lane taken from f.
+PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f)
+{
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BTTFT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its w lane taken from f.
+PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f)
+{
+	// V4Sel takes lanes from v where the mask is set and from f where it is clear.
+	const BoolV keepFromV = BTTTF();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Bitwise a & b.
+PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b)
+{
+	const BoolV both = _mm_and_ps(a, b);
+	return both;
+}
+
+// Bitwise complement of a, computed as a XOR all-ones.
+PX_FORCE_INLINE BoolV BNot(const BoolV a)
+{
+	const BoolV allOnes = BTTTT();
+	return _mm_xor_ps(a, allOnes);
+}
+
+// Bitwise a & ~b. _mm_andnot_ps complements its FIRST operand, hence the swapped argument order.
+PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b)
+{
+	const BoolV aWithoutB = _mm_andnot_ps(b, a);
+	return aWithoutB;
+}
+
+// Bitwise a | b.
+PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b)
+{
+	const BoolV either = _mm_or_ps(a, b);
+	return either;
+}
+
+// AND of all four lanes of a, replicated into every lane.
+PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV wzwz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3));
+	const BoolV pairs = _mm_and_ps(yxyx, wzwz); // lane 0 = y&w, lane 1 = x&z
+	const BoolV splat0 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV splat1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_and_ps(splat0, splat1);
+}
+
+// OR of all four lanes of a, replicated into every lane.
+PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV wzwz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3));
+	const BoolV pairs = _mm_or_ps(yxyx, wzwz); // lane 0 = y|w, lane 1 = x|z
+	const BoolV splat0 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV splat1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_or_ps(splat0, splat1);
+}
+
+// AND of the x, y and z lanes of a, replicated into every lane; w is ignored.
+PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+	const BoolV pairs = _mm_and_ps(yxyx, zzzz); // lane 0 = y&z, lane 1 = x&z
+	const BoolV splat0 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV splat1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_and_ps(splat0, splat1);
+}
+
+// OR of the x, y and z lanes of a, replicated into every lane; w is ignored.
+PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+	const BoolV pairs = _mm_or_ps(yxyx, zzzz); // lane 0 = y|z, lane 1 = x|z
+	const BoolV splat0 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV splat1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_or_ps(splat0, splat1);
+}
+
+// Compares a and b lane by lane as 32-bit integers and reduces the result through BAllTrue4_R.
+PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b)
+{
+	const __m128i laneEqual = _mm_cmpeq_epi32(m128_F2I(a), m128_F2I(b));
+	const BoolV equalMask = m128_I2F(laneEqual);
+	return internalUnitSSE2Simd::BAllTrue4_R(equalMask);
+}
+
+// Returns 1 when the sign bit of every lane of a is set (movemask == 15), otherwise 0.
+PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a)
+{
+	const int signBits = _mm_movemask_ps(a);
+	return PxU32(signBits == 15);
+}
+
+// Returns 1 when the sign bit of every lane of a is clear (movemask == 0), otherwise 0.
+PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a)
+{
+	const int signBits = _mm_movemask_ps(a);
+	return PxU32(signBits == 0);
+}
+
+// Packs the sign bit of each lane into bits 0..3 of the result (bit 0 = x, bit 3 = w).
+PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a)
+{
+	const int signBits = _mm_movemask_ps(a);
+	return PxU32(signBits);
+}
+
+//////////////////////////////////
+// MAT33V
+//////////////////////////////////
+
+// Matrix * vector: a.col0 * b.x + a.col1 * b.y + a.col2 * b.z.
+PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b)
+{
+	const Vec3V xTerm = V3Scale(a.col0, V3GetX(b));
+	const Vec3V yTerm = V3Scale(a.col1, V3GetY(b));
+	const Vec3V zTerm = V3Scale(a.col2, V3GetZ(b));
+	// Summation order is (x + y) + z.
+	const Vec3V xyTerms = V3Add(xTerm, yTerm);
+	return V3Add(xyTerms, zTerm);
+}
+
+// Transpose(a) * b: each result component is the dot product of one column of a with b.
+PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b)
+{
+	const FloatV dotCol0 = V3Dot(a.col0, b);
+	const FloatV dotCol1 = V3Dot(a.col1, b);
+	const FloatV dotCol2 = V3Dot(a.col2, b);
+	return V3Merge(dotCol0, dotCol1, dotCol2);
+}
+
+// Returns A * b + c, accumulated as ((c + col0*b.x) + col1*b.y) + col2*b.z.
+PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c)
+{
+	const FloatV bx = V3GetX(b);
+	const FloatV by = V3GetY(b);
+	const FloatV bz = V3GetZ(b);
+	const Vec3V afterCol0 = V3ScaleAdd(A.col0, bx, c);
+	const Vec3V afterCol1 = V3ScaleAdd(A.col1, by, afterCol0);
+	return V3ScaleAdd(A.col2, bz, afterCol1);
+}
+
+// Matrix product a * b: each result column is a multiplied by the matching column of b.
+PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = M33MulV3(a, b.col0);
+	const Vec3V c1 = M33MulV3(a, b.col1);
+	const Vec3V c2 = M33MulV3(a, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Column-wise sum a + b.
+PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = V3Add(a.col0, b.col0);
+	const Vec3V c1 = V3Add(a.col1, b.col1);
+	const Vec3V c2 = V3Add(a.col2, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Scales every column of a by the FloatV b.
+PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b)
+{
+	const Vec3V c0 = V3Scale(a.col0, b);
+	const Vec3V c1 = V3Scale(a.col1, b);
+	const Vec3V c2 = V3Scale(a.col2, b);
+	return Mat33V(c0, c1, c2);
+}
+
+// Inverse of a 3x3 matrix via cross products of its columns (adjugate / determinant).
+// The determinant reciprocal uses the hardware estimate (_mm_rcp_ps), so the result is approximate,
+// and there is no check for a zero determinant.
+PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a)
+{
+	const BoolV tfft = BTFFT();
+	const BoolV tttf = BTTTF();
+	const FloatV zero = FZero();
+	// The rows of the adjugate are col1 x col2, col2 x col0 and col0 x col1.
+	const Vec3V cross01 = V3Cross(a.col0, a.col1);
+	const Vec3V cross12 = V3Cross(a.col1, a.col2);
+	const Vec3V cross20 = V3Cross(a.col2, a.col0);
+	// Determinant as the triple product (col0 x col1) . col2.
+	const FloatV dot = V3Dot(cross01, a.col2);
+	const FloatV invDet = _mm_rcp_ps(dot);
+	// The shuffles below transpose the three cross products into the columns of the inverse.
+	const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01); // c12.x, c01.x, c12.y, c01.y
+	const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01); // c12.z, c01.z, c12.w, c01.w
+	Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20);       // c12.x, c20.x, c01.x, c20.y
+	// Force the w lane of column 0 to zero.
+	colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0));
+	const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2));
+	const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0));
+	// Column 1: x and w from zppd, y and z from pbwp -> c12.y, c20.y, c01.y, c20.w.
+	const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(BTFFT(), pbwp), _mm_and_ps(BTFFT(), zppd));
+	const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0));
+	const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0));
+	// Column 2: x and w from xppd, y and z from pcyp -> c12.z, c20.z, c01.z, c20.w.
+	const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd));
+
+	// Divide the adjugate by the determinant.
+	return Mat33V(_mm_mul_ps(colInv0, invDet), _mm_mul_ps(colInv1, invDet), _mm_mul_ps(colInv2, invDet));
+}
+
+// Transpose of a: result column i is built from component i of each column of a.
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+	const Vec3V rowX = V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2));
+	const Vec3V rowY = V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2));
+	const Vec3V rowZ = V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2));
+	return Mat33V(rowX, rowY, rowZ);
+}
+
+// Identity matrix: columns are the unit X, Y and Z vectors.
+PX_FORCE_INLINE Mat33V M33Identity()
+{
+	const Vec3V c0 = V3UnitX();
+	const Vec3V c1 = V3UnitY();
+	const Vec3V c2 = V3UnitZ();
+	return Mat33V(c0, c1, c2);
+}
+
+// Column-wise difference a - b.
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = V3Sub(a.col0, b.col0);
+	const Vec3V c1 = V3Sub(a.col1, b.col1);
+	const Vec3V c2 = V3Sub(a.col2, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Applies V3Neg to every column of a.
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+	const Vec3V c0 = V3Neg(a.col0);
+	const Vec3V c1 = V3Neg(a.col1);
+	const Vec3V c2 = V3Neg(a.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Applies V3Abs to every column of a.
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+	const Vec3V c0 = V3Abs(a.col0);
+	const Vec3V c1 = V3Abs(a.col1);
+	const Vec3V c2 = V3Abs(a.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Expands v into a diagonal matrix: column i keeps only component i of v, every other entry
+// comes from V3Zero().
+PX_FORCE_INLINE Mat33V PromoteVec3V(const Vec3V v)
+{
+	const BoolV bTFFF = BTFFF();
+	const BoolV bFTFF = BFTFF();
+	// Select z only. This mask was previously BTFTF(), which also let v.x through and produced
+	// (v.x, 0, v.z) as the third column.
+	const BoolV bFFTF = BFFTF();
+
+	const Vec3V zero = V3Zero();
+
+	// Each column takes v where its mask lane is set and zero elsewhere.
+	return Mat33V(V3Sel(bTFFF, v, zero), V3Sel(bFTFF, v, zero), V3Sel(bFFTF, v, zero));
+}
+
+// Builds a diagonal matrix from d by multiplying d with each unit axis vector.
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+	const Vec3V diagX = V3Mul(V3UnitX(), d); // keeps d.x, other lanes multiplied by 0
+	const Vec3V diagY = V3Mul(V3UnitY(), d); // keeps d.y, other lanes multiplied by 0
+	const Vec3V diagZ = V3Mul(V3UnitZ(), d); // keeps d.z, other lanes multiplied by 0
+	return Mat33V(diagX, diagY, diagZ);
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+// Transforms b by the 3x4 matrix a: col0*b.x + col1*b.y + col2*b.z + col3.
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+	const Vec3V xTerm = V3Scale(a.col0, V3GetX(b));
+	const Vec3V yTerm = V3Scale(a.col1, V3GetY(b));
+	const Vec3V zTerm = V3Scale(a.col2, V3GetZ(b));
+	// Summation order is ((x + y) + z) + col3.
+	const Vec3V xyTerms = V3Add(xTerm, yTerm);
+	const Vec3V xyzTerms = V3Add(xyTerms, zTerm);
+	return V3Add(xyzTerms, a.col3);
+}
+
+// Multiplies b by the first three columns of a only (col3 is not added).
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
+	const Vec3V xTerm = V3Scale(a.col0, V3GetX(b));
+	const Vec3V yTerm = V3Scale(a.col1, V3GetY(b));
+	const Vec3V zTerm = V3Scale(a.col2, V3GetZ(b));
+	// Summation order is (x + y) + z.
+	const Vec3V xyTerms = V3Add(xTerm, yTerm);
+	return V3Add(xyTerms, zTerm);
+}
+
+// Transpose of the first three columns of a times b: one dot product per column.
+PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b)
+{
+	const FloatV dotCol0 = V3Dot(a.col0, b);
+	const FloatV dotCol1 = V3Dot(a.col1, b);
+	const FloatV dotCol2 = V3Dot(a.col2, b);
+	return V3Merge(dotCol0, dotCol1, dotCol2);
+}
+
+// Composes two 3x4 matrices: the first three columns of b go through M34Mul33V3 (no col3 added),
+// while b.col3 goes through M34MulV3 (a.col3 added).
+PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b)
+{
+	const Vec3V c0 = M34Mul33V3(a, b.col0);
+	const Vec3V c1 = M34Mul33V3(a, b.col1);
+	const Vec3V c2 = M34Mul33V3(a, b.col2);
+	const Vec3V c3 = M34MulV3(a, b.col3);
+	return Mat34V(c0, c1, c2, c3);
+}
+
+// Multiplies the first three columns of a by the 3x3 matrix b, column by column.
+PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b)
+{
+	const Vec3V c0 = M34Mul33V3(a, b.col0);
+	const Vec3V c1 = M34Mul33V3(a, b.col1);
+	const Vec3V c2 = M34Mul33V3(a, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Multiplies the first three columns of a by the first three columns of b; col3 of both is ignored.
+PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b)
+{
+	const Vec3V c0 = M34Mul33V3(a, b.col0);
+	const Vec3V c1 = M34Mul33V3(a, b.col1);
+	const Vec3V c2 = M34Mul33V3(a, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Column-wise sum a + b over all four columns.
+PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b)
+{
+	const Vec3V c0 = V3Add(a.col0, b.col0);
+	const Vec3V c1 = V3Add(a.col1, b.col1);
+	const Vec3V c2 = V3Add(a.col2, b.col2);
+	const Vec3V c3 = V3Add(a.col3, b.col3);
+	return Mat34V(c0, c1, c2, c3);
+}
+
+// Transpose of the first three columns of a; col3 is ignored.
+PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a)
+{
+	const Vec3V rowX = V3Merge(V3GetX(a.col0), V3GetX(a.col1), V3GetX(a.col2));
+	const Vec3V rowY = V3Merge(V3GetY(a.col0), V3GetY(a.col1), V3GetY(a.col2));
+	const Vec3V rowZ = V3Merge(V3GetZ(a.col0), V3GetZ(a.col1), V3GetZ(a.col2));
+	return Mat33V(rowX, rowY, rowZ);
+}
+
+//////////////////////////////////
+// MAT44V
+//////////////////////////////////
+
+// Matrix * vector: col0*b.x + col1*b.y + col2*b.z + col3*b.w.
+PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b)
+{
+	const Vec4V xTerm = V4Scale(a.col0, V4GetX(b));
+	const Vec4V yTerm = V4Scale(a.col1, V4GetY(b));
+	const Vec4V zTerm = V4Scale(a.col2, V4GetZ(b));
+	const Vec4V wTerm = V4Scale(a.col3, V4GetW(b));
+
+	// Summation order is ((x + y) + z) + w.
+	const Vec4V xyTerms = V4Add(xTerm, yTerm);
+	const Vec4V xyzTerms = V4Add(xyTerms, zTerm);
+	return V4Add(xyzTerms, wTerm);
+}
+
+// Transpose(a) * b: each result component is the dot product of one column of a with b.
+PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b)
+{
+	// One dot product per column, merged into a single vector by the array overload of V4Merge.
+	PX_ALIGN(16, FloatV) columnDots[4] = { V4Dot(a.col0, b), V4Dot(a.col1, b), V4Dot(a.col2, b), V4Dot(a.col3, b) };
+	return V4Merge(columnDots);
+}
+
+// Matrix product a * b: each result column is a multiplied by the matching column of b.
+PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b)
+{
+	const Vec4V c0 = M44MulV4(a, b.col0);
+	const Vec4V c1 = M44MulV4(a, b.col1);
+	const Vec4V c2 = M44MulV4(a, b.col2);
+	const Vec4V c3 = M44MulV4(a, b.col3);
+	return Mat44V(c0, c1, c2, c3);
+}
+
+// Column-wise sum a + b.
+PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b)
+{
+	const Vec4V c0 = V4Add(a.col0, b.col0);
+	const Vec4V c1 = V4Add(a.col1, b.col1);
+	const Vec4V c2 = V4Add(a.col2, b.col2);
+	const Vec4V c3 = V4Add(a.col3, b.col3);
+	return Mat44V(c0, c1, c2, c3);
+}
+
+// Transpose of a 4x4 matrix using two rounds of interleaving.
+PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a)
+{
+	const Vec4V low02 = _mm_unpacklo_ps(a.col0, a.col2);  // c0.x, c2.x, c0.y, c2.y
+	const Vec4V high02 = _mm_unpackhi_ps(a.col0, a.col2); // c0.z, c2.z, c0.w, c2.w
+	const Vec4V low13 = _mm_unpacklo_ps(a.col1, a.col3);  // c1.x, c3.x, c1.y, c3.y
+	const Vec4V high13 = _mm_unpackhi_ps(a.col1, a.col3); // c1.z, c3.z, c1.w, c3.w
+	const Vec4V allX = _mm_unpacklo_ps(low02, low13);
+	const Vec4V allY = _mm_unpackhi_ps(low02, low13);
+	const Vec4V allZ = _mm_unpacklo_ps(high02, high13);
+	const Vec4V allW = _mm_unpackhi_ps(high02, high13);
+	return Mat44V(allX, allY, allZ, allW);
+}
+
+// Inverse of a 4x4 matrix by cofactor expansion, fully vectorised.
+// minor0..minor3 accumulate the cofactors; they are scaled by an approximate reciprocal of the
+// determinant (_mm_rcp_ss) and the resulting matrix is transposed before being returned.
+// There is no check for a zero determinant.
+// Shuffle immediates used throughout: 0xB1 swaps lanes within each pair (y,x,w,z),
+// 0x4E swaps the two halves (z,w,x,y).
+PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a)
+{
+	__m128 minor0, minor1, minor2, minor3;
+	__m128 row0, row1, row2, row3;
+	__m128 det, tmp1;
+
+	// These three are overwritten below before being read.
+	tmp1 = V4Zero();
+	row1 = V4Zero();
+	row3 = V4Zero();
+
+	// Columns 1 and 3 are taken with their halves swapped (z,w,x,y); the steps below rely on that layout.
+	row0 = a.col0;
+	row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2));
+	row2 = a.col2;
+	row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2));
+
+	// Pair products row2 * row3 -> contributions to minor0 and minor1.
+	tmp1 = _mm_mul_ps(row2, row3);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor0 = _mm_mul_ps(row1, tmp1);
+	minor1 = _mm_mul_ps(row0, tmp1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
+	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
+	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
+
+	// Pair products row1 * row2 -> contributions to minor0 and minor3.
+	tmp1 = _mm_mul_ps(row1, row2);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
+	minor3 = _mm_mul_ps(row0, tmp1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
+	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
+	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
+
+	// Pair products (half-swapped row1) * row3 -> contributions to minor0 and minor2.
+	// row2 is replaced by its half-swapped form from here on.
+	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
+	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
+	minor2 = _mm_mul_ps(row0, tmp1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
+	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
+	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
+
+	// Pair products row0 * row1 -> contributions to minor2 and minor3.
+	tmp1 = _mm_mul_ps(row0, row1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
+	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
+	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
+
+	// Pair products row0 * row3 -> contributions to minor1 and minor2.
+	tmp1 = _mm_mul_ps(row0, row3);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
+	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
+	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
+
+	// Pair products row0 * row2 -> contributions to minor1 and minor3.
+	tmp1 = _mm_mul_ps(row0, row2);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
+	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
+	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
+
+	// Determinant = horizontal sum of row0 * minor0 (ends up in lane 0), then its reciprocal estimate.
+	det = _mm_mul_ps(row0, minor0);
+	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
+	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
+	tmp1 = _mm_rcp_ss(det);
+#if 0
+	// Disabled: one Newton-Raphson refinement step of the reciprocal estimate.
+	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
+	det = _mm_shuffle_ps(det, det, 0x00);
+#else
+	// Broadcast the unrefined reciprocal estimate to all lanes.
+	det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0));
+#endif
+
+	// Scale the cofactors by 1/det, then transpose to obtain the inverse.
+	minor0 = _mm_mul_ps(det, minor0);
+	minor1 = _mm_mul_ps(det, minor1);
+	minor2 = _mm_mul_ps(det, minor2);
+	minor3 = _mm_mul_ps(det, minor3);
+	Mat44V invTrans(minor0, minor1, minor2, minor3);
+	return M44Trnsps(invTrans);
+}
+
+// Packs four scalars into a Vec4V with x in lane 0 and w in lane 3.
+PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w)
+{
+	// _mm_setr_ps takes its arguments in low-to-high lane order.
+	return _mm_setr_ps(x, y, z, w);
+}
+
+/*
+// AP: work in progress - use proper SSE intrinsics where possible
+PX_FORCE_INLINE VecU16V V4U32PK(VecU32V a, VecU32V b)
+{
+    VecU16V result;
+    result.m128_u16[0] = PxU16(PxClamp<PxU32>((a).m128_u32[0], 0, 0xFFFF));
+    result.m128_u16[1] = PxU16(PxClamp<PxU32>((a).m128_u32[1], 0, 0xFFFF));
+    result.m128_u16[2] = PxU16(PxClamp<PxU32>((a).m128_u32[2], 0, 0xFFFF));
+    result.m128_u16[3] = PxU16(PxClamp<PxU32>((a).m128_u32[3], 0, 0xFFFF));
+    result.m128_u16[4] = PxU16(PxClamp<PxU32>((b).m128_u32[0], 0, 0xFFFF));
+    result.m128_u16[5] = PxU16(PxClamp<PxU32>((b).m128_u32[1], 0, 0xFFFF));
+    result.m128_u16[6] = PxU16(PxClamp<PxU32>((b).m128_u32[2], 0, 0xFFFF));
+    result.m128_u16[7] = PxU16(PxClamp<PxU32>((b).m128_u32[3], 0, 0xFFFF));
+    return result;
+}
+*/
+
+// Bitwise select: every bit set in c takes the bit from a, every clear bit takes it from b.
+PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b)
+{
+	const __m128i mask = m128_F2I(c);
+	const __m128i keptFromB = _mm_andnot_si128(mask, m128_F2I(b)); // (~mask) & b
+	const __m128i keptFromA = _mm_and_si128(mask, m128_F2I(a));    // mask & a
+	return m128_I2F(_mm_or_si128(keptFromB, keptFromA));
+}
+
+// Bitwise OR of the 128 bits of a and b.
+PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b)
+{
+	const __m128i lhs = m128_F2I(a);
+	const __m128i rhs = m128_F2I(b);
+	return m128_I2F(_mm_or_si128(lhs, rhs));
+}
+
+// Bitwise XOR of the 128 bits of a and b.
+PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b)
+{
+	const __m128i lhs = m128_F2I(a);
+	const __m128i rhs = m128_F2I(b);
+	return m128_I2F(_mm_xor_si128(lhs, rhs));
+}
+
+// Bitwise AND of the 128 bits of a and b.
+PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b)
+{
+	const __m128i lhs = m128_F2I(a);
+	const __m128i rhs = m128_F2I(b);
+	return m128_I2F(_mm_and_si128(lhs, rhs));
+}
+
+// "And with complement": returns a & ~b, bit by bit.
+PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b)
+{
+	const __m128i toClear = m128_F2I(b);
+	const __m128i source = m128_F2I(a);
+	// _mm_andnot_si128 complements its FIRST operand, hence the (b, a) argument order.
+	return m128_I2F(_mm_andnot_si128(toClear, source));
+}
+
+/*
+PX_FORCE_INLINE VecU16V V4U16Or(VecU16V a, VecU16V b)
+{
+    return m128_I2F(_mm_or_si128(m128_F2I(a), m128_F2I(b)));
+}
+*/
+
+/*
+PX_FORCE_INLINE VecU16V V4U16And(VecU16V a, VecU16V b)
+{
+    return m128_I2F(_mm_and_si128(m128_F2I(a), m128_F2I(b)));
+}
+*/
+
+/*
+PX_FORCE_INLINE VecU16V V4U16Andc(VecU16V a, VecU16V b)
+{
+    return m128_I2F(_mm_andnot_si128(m128_F2I(b), m128_F2I(a)));
+}
+*/
+
+// Broadcasts the 32-bit integer i into all four lanes of a VecI32V.
+PX_FORCE_INLINE VecI32V I4Load(const PxI32 i)
+{
+	// Splat directly in the integer domain. Loading the value through a PxF32* cast would read
+	// an int object via a float lvalue, which violates strict aliasing; the resulting bit
+	// pattern is the same either way.
+	return _mm_set1_epi32(i);
+}
+
+// Loads four consecutive 32-bit integers starting at i; no alignment requirement.
+PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i)
+{
+	const __m128i* const source = reinterpret_cast<const __m128i*>(i);
+	return _mm_loadu_si128(source);
+}
+
+// Loads four consecutive 32-bit integers starting at i using an aligned load
+// (i must be 16-byte aligned).
+PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i)
+{
+	const __m128i* const source = reinterpret_cast<const __m128i*>(i);
+	return _mm_load_si128(source);
+}
+
+// Lane-wise 32-bit integer addition (a + b).
+PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b)
+{
+	const VecI32V sum = _mm_add_epi32(a, b);
+	return sum;
+}
+
+// Lane-wise 32-bit integer subtraction (a - b).
+PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b)
+{
+	const VecI32V difference = _mm_sub_epi32(a, b);
+	return difference;
+}
+
+// Lane-wise signed 32-bit "a > b"; each result lane is all ones when true and zero otherwise.
+PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b)
+{
+	const __m128i mask = _mm_cmpgt_epi32(a, b);
+	return m128_I2F(mask);
+}
+
+// Lane-wise 32-bit "a == b"; each result lane is all ones when equal and zero otherwise.
+PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b)
+{
+	const __m128i mask = _mm_cmpeq_epi32(a, b);
+	return m128_I2F(mask);
+}
+
+// Bitwise select on integer vectors: bits set in c come from a, the remaining bits from b.
+PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b)
+{
+	const __m128i mask = m128_F2I(c);
+	const __m128i keptFromB = _mm_andnot_si128(mask, b); // (~mask) & b
+	const __m128i keptFromA = _mm_and_si128(mask, a);    // mask & a
+	return _mm_or_si128(keptFromB, keptFromA);
+}
+
+// Returns an integer vector with all 128 bits cleared.
+PX_FORCE_INLINE VecI32V VecI32V_Zero()
+{
+	const VecI32V zero = _mm_setzero_si128();
+	return zero;
+}
+
+// Returns an integer vector with 1 in every lane.
+PX_FORCE_INLINE VecI32V VecI32V_One()
+{
+	const VecI32V ones = I4Load(1);
+	return ones;
+}
+
+// Returns an integer vector with 2 in every lane.
+PX_FORCE_INLINE VecI32V VecI32V_Two()
+{
+	const VecI32V twos = I4Load(2);
+	return twos;
+}
+
+// Returns an integer vector with -1 in every lane.
+PX_FORCE_INLINE VecI32V VecI32V_MinusOne()
+{
+	const VecI32V minusOnes = I4Load(-1);
+	return minusOnes;
+}
+
+// Returns an unsigned vector with 0 in every lane.
+PX_FORCE_INLINE VecU32V U4Zero()
+{
+	const VecU32V zeros = U4Load(0);
+	return zeros;
+}
+
+// Returns an unsigned vector with 1 in every lane.
+PX_FORCE_INLINE VecU32V U4One()
+{
+	const VecU32V ones = U4Load(1);
+	return ones;
+}
+
+// Returns an unsigned vector with 2 in every lane.
+PX_FORCE_INLINE VecU32V U4Two()
+{
+	const VecU32V twos = U4Load(2);
+	return twos;
+}
+
+// Bitwise select on integer vectors: bits set in c come from a, the remaining bits from b.
+PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b)
+{
+	const __m128i mask = m128_F2I(c);
+	const __m128i keptFromB = _mm_andnot_si128(mask, b); // (~mask) & b
+	const __m128i keptFromA = _mm_and_si128(mask, a);    // mask & a
+	return _mm_or_si128(keptFromB, keptFromA);
+}
+
+// Wraps a shift amount for use with VecI32V_LeftShift / VecI32V_RightShift.
+// The lanes selected by BTFFF() are taken from shift, all other lanes are zeroed
+// (presumably BTFFF() is true for lane x only -- confirm against its definition).
+PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift)
+{
+	const BoolV keepMask = BTFFF();
+	VecShiftV prepared;
+	prepared.shift = VecI32V_Sel(keepMask, shift, VecI32V_Zero());
+	return prepared;
+}
+
+// Shifts every 32-bit lane of a left by the amount held in count.shift (_mm_sll_epi32).
+PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count)
+{
+	const VecI32V shifted = _mm_sll_epi32(a, count.shift);
+	return shifted;
+}
+
+// Shifts every 32-bit lane of a right by the amount held in count.shift.
+// Uses _mm_srl_epi32, i.e. a logical shift that fills with zeros (no sign extension).
+PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count)
+{
+	const VecI32V shifted = _mm_srl_epi32(a, count.shift);
+	return shifted;
+}
+
+// Bitwise AND of two integer vectors.
+PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b)
+{
+	const VecI32V bits = _mm_and_si128(a, b);
+	return bits;
+}
+
+// Bitwise OR of two integer vectors.
+PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b)
+{
+	const VecI32V bits = _mm_or_si128(a, b);
+	return bits;
+}
+
+// Broadcasts lane 0 (x) of a into all four lanes.
+PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a)
+{
+	// Integer-domain shuffle; moves the same 32-bit lanes as the float shuffle would.
+	return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+// Broadcasts lane 1 (y) of a into all four lanes.
+PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a)
+{
+	// Integer-domain shuffle; moves the same 32-bit lanes as the float shuffle would.
+	return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1));
+}
+
+// Broadcasts lane 2 (z) of a into all four lanes.
+PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a)
+{
+	// Integer-domain shuffle; moves the same 32-bit lanes as the float shuffle would.
+	return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2));
+}
+
+// Broadcasts lane 3 (w) of a into all four lanes.
+PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a)
+{
+	// Integer-domain shuffle; moves the same 32-bit lanes as the float shuffle would.
+	return _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+// Writes lane 0 (x) of a to *i.
+PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i)
+{
+	// Extract in the integer domain. Storing through a PxF32* cast would write the int object
+	// via a float lvalue, which violates strict aliasing; the stored bits are the same.
+	*i = _mm_cvtsi128_si32(a);
+}
+
+// Combines lanes from four vectors into one: the result is (x[0], y[1], z[2], w[0]).
+// NOTE(review): this yields "(x, y, z, w)" only if y carries its value in lane 1 and z in
+// lane 2 (e.g. when the inputs are splatted) -- confirm that callers guarantee this.
+PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg x, const VecI32VArg y, const VecI32VArg z, const VecI32VArg w)
+{
+	// _mm_move_ss(p, q) returns p with lane 0 replaced by q's lane 0.
+	const __m128 lowPair = _mm_move_ss(m128_I2F(y), m128_I2F(x));  // lanes: x0, y1, y2, y3
+	const __m128 highPair = _mm_move_ss(m128_I2F(z), m128_I2F(w)); // lanes: w0, z1, z2, z3
+	// Result lanes 0,1 come from lowPair[0], lowPair[1]; lanes 2,3 from highPair[2], highPair[0].
+	const __m128 merged = _mm_shuffle_ps(lowPair, highPair, _MM_SHUFFLE(0, 2, 1, 0));
+	return m128_F2I(merged);
+}
+
+// Reinterprets the bits of a BoolV as an integer vector (no value conversion).
+PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a)
+{
+	const VecI32V bits = m128_F2I(a);
+	return bits;
+}
+
+// Returns the BoolV as a VecU32V, relying on the implicit conversion between the two types.
+PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a)
+{
+	const VecU32V bits = a;
+	return bits;
+}
+
+/*
+template<int a> PX_FORCE_INLINE VecI32V V4ISplat()
+{
+    VecI32V result;
+    result.m128_i32[0] = a;
+    result.m128_i32[1] = a;
+    result.m128_i32[2] = a;
+    result.m128_i32[3] = a;
+    return result;
+}
+
+template<PxU32 a> PX_FORCE_INLINE VecU32V V4USplat()
+{
+    VecU32V result;
+    result.m128_u32[0] = a;
+    result.m128_u32[1] = a;
+    result.m128_u32[2] = a;
+    result.m128_u32[3] = a;
+    return result;
+}
+*/
+
+/*
+PX_FORCE_INLINE void V4U16StoreAligned(VecU16V val, VecU16V* address)
+{
+    *address = val;
+}
+*/
+
+// Copies val into the VecU32V object that address points to.
+PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address)
+{
+	VecU32V& destination = *address;
+	destination = val;
+}
+
+// Reads the Vec4V object that addr points to (plain dereference, so addr must be suitably aligned).
+PX_FORCE_INLINE Vec4V V4LoadAligned(Vec4V* addr)
+{
+	const Vec4V loaded = *addr;
+	return loaded;
+}
+
+// Reads four floats from addr through V4LoadU, the unaligned float loader.
+PX_FORCE_INLINE Vec4V V4LoadUnaligned(Vec4V* addr)
+{
+	float* const source = reinterpret_cast<float*>(addr);
+	return V4LoadU(source);
+}
+
+// Bitwise a & ~b on the raw bits of a float vector, delegating to V4U32Andc.
+PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b)
+{
+	const VecU32V sourceBits(a);
+	const VecU32V cleared = V4U32Andc(sourceBits, b);
+	return Vec4V(cleared);
+}
+
+// Returns the result of V4IsGrtr(a, b) typed as a VecU32V.
+PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b)
+{
+	const VecU32V mask = V4IsGrtr(a, b);
+	return mask;
+}
+
+// Reads the VecU16V object that addr points to (plain dereference, so addr must be suitably aligned).
+PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr)
+{
+	const VecU16V loaded = *addr;
+	return loaded;
+}
+
+// Reads 16 bytes from addr without requiring 16-byte alignment.
+PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr)
+{
+	// A plain "*addr" lets the compiler emit an aligned move, which faults on a misaligned
+	// address. Go through the unaligned float loader instead, as V4LoadUnaligned does; the
+	// load copies the bits unchanged.
+	return V4LoadU(reinterpret_cast<float*>(addr));
+}
+
+// Lane-wise unsigned 16-bit "a > b" over all eight lanes.
+// Each result lane holds the converted bool, i.e. 1 when true and 0 when false
+// (not an all-ones mask as produced by V4I16CompareGt).
+PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b)
+{
+	// _mm_cmpgt_epi16 compares signed values, so it cannot be used directly for unsigned lanes:
+	// return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b)));
+	VecU16V result;
+	for (PxU32 lane = 0; lane < 8; ++lane)
+	{
+		result.m128_u16[lane] = a.m128_u16[lane] > b.m128_u16[lane];
+	}
+	return result;
+}
+
+// Lane-wise signed 16-bit "a > b"; each result lane is all ones when true and zero otherwise.
+PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b)
+{
+	const __m128i lhs = m128_F2I(a);
+	const __m128i rhs = m128_F2I(b);
+	return m128_I2F(_mm_cmpgt_epi16(lhs, rhs));
+}
+
+// Converts each unsigned 32-bit lane to float by value (scalar conversion per lane).
+PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a)
+{
+	const PxF32 x = PxF32(a.m128_u32[0]);
+	const PxF32 y = PxF32(a.m128_u32[1]);
+	const PxF32 z = PxF32(a.m128_u32[2]);
+	const PxF32 w = PxF32(a.m128_u32[3]);
+	return V4LoadXYZW(x, y, z, w);
+}
+
+// Converts each signed 32-bit lane to float by value (_mm_cvtepi32_ps).
+PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V in)
+{
+	const Vec4V asFloat = _mm_cvtepi32_ps(in);
+	return asFloat;
+}
+
+// Converts each float lane to a signed 32-bit integer, truncating toward zero (_mm_cvttps_epi32).
+PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a)
+{
+	const VecI32V truncated = _mm_cvttps_epi32(a);
+	return truncated;
+}
+
+// Returns a as a Vec4V using the conversion between the two types (no numeric conversion is requested here).
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a)
+{
+	const Vec4V bits = Vec4V(a);
+	return bits;
+}
+
+// Reinterprets the bits of an integer vector as a float vector via m128_I2F.
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a)
+{
+	const Vec4V bits = m128_I2F(a);
+	return bits;
+}
+
+// Returns a as a VecU32V using the conversion between the two types (no numeric conversion is requested here).
+PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a)
+{
+	const VecU32V bits = VecU32V(a);
+	return bits;
+}
+
+// Reinterprets the bits of a float vector as an integer vector via m128_F2I.
+PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a)
+{
+	const VecI32V bits = m128_F2I(a);
+	return bits;
+}
+
+/*
+template<int index> PX_FORCE_INLINE BoolV BSplatElement(BoolV a)
+{
+    BoolV result;
+    result[0] = result[1] = result[2] = result[3] = a[index];
+    return result;
+}
+*/
+
+// Broadcasts lane 'index' (0..3) of a boolean mask into all four lanes.
+template <int index>
+PX_FORCE_INLINE BoolV BSplatElement(BoolV a)
+{
+	// Shuffle in-register so the mask bits are copied verbatim; a mask lane is a bit pattern
+	// (e.g. all ones), not a meaningful float, so it should not be passed around as a scalar float.
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(index, index, index, index));
+}
+
+// Broadcasts lane 'index' of a into all four lanes of the result.
+template <int index>
+PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a)
+{
+	const PxU32 picked = a.m128_u32[index];
+	VecU32V result;
+	result.m128_u32[3] = picked;
+	result.m128_u32[2] = picked;
+	result.m128_u32[1] = picked;
+	result.m128_u32[0] = picked;
+	return result;
+}
+
+// Broadcasts lane 'index' of a into all four lanes by reading it as a scalar and re-loading it.
+template <int index>
+PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a)
+{
+	float* lanes = reinterpret_cast<float*>(&a);
+	const float picked = lanes[index];
+	return V4Load(picked);
+}
+
+// Builds a VecU32V with x in lane 0, y in lane 1, z in lane 2 and w in lane 3.
+PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w)
+{
+	const PxU32 lanes[4] = { x, y, z, w };
+	VecU32V result;
+	for (PxU32 i = 0; i < 4; ++i)
+	{
+		result.m128_u32[i] = lanes[i];
+	}
+	return result;
+}
+
+// Applies PxCeil to each of the four lanes (scalar fallback, one call per lane).
+PX_FORCE_INLINE Vec4V V4Ceil(const Vec4V in)
+{
+	const UnionM128 lanes(in);
+	const PxF32 x = PxCeil(lanes.m128_f32[0]);
+	const PxF32 y = PxCeil(lanes.m128_f32[1]);
+	const PxF32 z = PxCeil(lanes.m128_f32[2]);
+	const PxF32 w = PxCeil(lanes.m128_f32[3]);
+	return V4LoadXYZW(x, y, z, w);
+}
+
+// Applies PxFloor to each of the four lanes (scalar fallback, one call per lane).
+PX_FORCE_INLINE Vec4V V4Floor(const Vec4V in)
+{
+	const UnionM128 lanes(in);
+	const PxF32 x = PxFloor(lanes.m128_f32[0]);
+	const PxF32 y = PxFloor(lanes.m128_f32[1]);
+	const PxF32 z = PxFloor(lanes.m128_f32[2]);
+	const PxF32 w = PxFloor(lanes.m128_f32[3]);
+	return V4LoadXYZW(x, y, z, w);
+}
+
+// Converts each float lane to PxU32 after clamping it to [0.0f, PxF32(0xFFFF0000)].
+// Only power == 0 is supported; this is asserted.
+PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate(const Vec4V in, PxU32 power)
+{
+	PX_ASSERT(power == 0 && "Non-zero power not supported in convertToU32VSaturate");
+	PX_UNUSED(power); // prevent warning in release builds
+
+	// NOTE(review): the upper bound is 0xFFFF0000 rather than 0xFFFFFFFF; presumably chosen so
+	// that the float bound stays below 2^32 -- confirm the intended saturation value.
+	const PxF32 upperBound = PxF32(0xFFFF0000);
+	const UnionM128 lanes(in);
+	VecU32V result;
+	for (PxU32 i = 0; i < 4; ++i)
+	{
+		result.m128_u32[i] = PxU32(PxClamp<PxF32>(lanes.m128_f32[i], 0.0f, upperBound));
+	}
+	return result;
+}
+
+#endif // PSFOUNDATION_PSUNIXSSE2INLINEAOS_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsAoS.h b/PxShared/src/foundation/include/windows/PsWindowsAoS.h
new file mode 100644
index 0000000..aab0712
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsAoS.h
@@ -0,0 +1,131 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSAOS_H
+#define PSFOUNDATION_PSWINDOWSAOS_H
+
+// no includes here! this file should be included from PxcVecMath.h only!!!
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+
+// On this platform every SIMD value type is an alias of __m128. The distinct names only
+// document intent (float splat, 3/4-component vector, mask, integer lanes, quaternion);
+// the compiler treats them as one and the same type.
+typedef __m128 FloatV;
+typedef __m128 Vec3V;
+typedef __m128 Vec4V;
+typedef __m128 BoolV;
+typedef __m128 VecU32V;
+typedef __m128 VecI32V;
+typedef __m128 VecU16V;
+typedef __m128 VecI16V;
+typedef __m128 QuatV;
+
+// Parameter-passing forms of the SIMD types: each expands to a reference to the value type.
+// They are written after 'const' in signatures (e.g. "const Vec3VArg a"), giving a const reference.
+#define FloatVArg FloatV &
+#define Vec3VArg Vec3V &
+#define Vec4VArg Vec4V &
+#define BoolVArg BoolV &
+#define VecU32VArg VecU32V &
+#define VecI32VArg VecI32V &
+#define VecU16VArg VecU16V &
+#define VecI16VArg VecI16V &
+#define QuatVArg QuatV &
+
+// Optimization for situations in which you cross product multiple vectors with the same vector.
+// Avoids 2X shuffles per product
+// Holds two Vec3V values prepared once for the shared operand
+// (presumably its pre-shuffled forms -- confirm against the code that fills mL1/mR1).
+struct VecCrossV
+{
+	Vec3V mL1;
+	Vec3V mR1;
+};
+
+// Wrapper around a VecI32V that carries a shift amount, giving shift counts their own type.
+struct VecShiftV
+{
+	VecI32V shift;
+};
+// Parameter-passing form of VecShiftV (a reference; written after 'const' in signatures).
+#define VecShiftVArg VecShiftV &
+
+// Matrix stored as three Vec3V columns (col0..col2), each declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat33V
+{
+	// Default constructor: leaves all columns uninitialized.
+	Mat33V()
+	{
+	}
+	// Builds the matrix from its three columns.
+	Mat33V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2) : col0(c0), col1(c1), col2(c2)
+	{
+	}
+	Vec3V PX_ALIGN(16, col0);
+	Vec3V PX_ALIGN(16, col1);
+	Vec3V PX_ALIGN(16, col2);
+} PX_ALIGN_SUFFIX(16);
+
+// Matrix stored as four Vec3V columns (col0..col3), each declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat34V
+{
+	// Default constructor: leaves all columns uninitialized.
+	Mat34V()
+	{
+	}
+	// Builds the matrix from its four columns.
+	Mat34V(const Vec3V& c0, const Vec3V& c1, const Vec3V& c2, const Vec3V& c3) : col0(c0), col1(c1), col2(c2), col3(c3)
+	{
+	}
+	Vec3V PX_ALIGN(16, col0);
+	Vec3V PX_ALIGN(16, col1);
+	Vec3V PX_ALIGN(16, col2);
+	Vec3V PX_ALIGN(16, col3);
+} PX_ALIGN_SUFFIX(16);
+
+// Matrix stored as three Vec4V columns (col0..col2), each declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat43V
+{
+	// Default constructor: leaves all columns uninitialized.
+	Mat43V()
+	{
+	}
+	// Builds the matrix from its three columns.
+	Mat43V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2) : col0(c0), col1(c1), col2(c2)
+	{
+	}
+	Vec4V PX_ALIGN(16, col0);
+	Vec4V PX_ALIGN(16, col1);
+	Vec4V PX_ALIGN(16, col2);
+} PX_ALIGN_SUFFIX(16);
+
+// Matrix stored as four Vec4V columns (col0..col3), each declared with 16-byte alignment.
+PX_ALIGN_PREFIX(16)
+struct Mat44V
+{
+	// Default constructor: leaves all columns uninitialized.
+	Mat44V()
+	{
+	}
+	// Builds the matrix from its four columns.
+	Mat44V(const Vec4V& c0, const Vec4V& c1, const Vec4V& c2, const Vec4V& c3) : col0(c0), col1(c1), col2(c2), col3(c3)
+	{
+	}
+	Vec4V PX_ALIGN(16, col0);
+	Vec4V PX_ALIGN(16, col1);
+	Vec4V PX_ALIGN(16, col2);
+	Vec4V PX_ALIGN(16, col3);
+} PX_ALIGN_SUFFIX(16);
+
+#endif // PSFOUNDATION_PSWINDOWSAOS_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsFPU.h b/PxShared/src/foundation/include/windows/PsWindowsFPU.h
new file mode 100644
index 0000000..d85e531
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsFPU.h
@@ -0,0 +1,51 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSFPU_H
+#define PSFOUNDATION_PSWINDOWSFPU_H
+
+// Saves the current SSE control/status register (MXCSR) and installs a fixed floating-point mode.
+// Compiled to an empty body when PX_ARM is set.
+PX_INLINE physx::shdfnd::SIMDGuard::SIMDGuard()
+{
+#if !PX_ARM
+	// Remember the caller's MXCSR so the destructor can restore it.
+	mControlWord = _mm_getcsr();
+
+	// New mode: all exceptions masked (_MM_MASK_MASK), flush-to-zero on (_MM_FLUSH_ZERO_ON)
+	// and denormals-are-zero on (_MM_DENORMALS_ZERO_ON, bit 6 of MXCSR).
+	const PxU32 denormalsAreZero = (1 << 6);
+	const PxU32 guardedMode = _MM_MASK_MASK | _MM_FLUSH_ZERO_ON | denormalsAreZero;
+	_mm_setcsr(guardedMode);
+#endif
+}
+
+// Restores the MXCSR value captured by the constructor, with the exception state flags cleared.
+// Compiled to an empty body when PX_ARM is set.
+PX_INLINE physx::shdfnd::SIMDGuard::~SIMDGuard()
+{
+#if !PX_ARM
+	// Clear the sticky exception flags while restoring: leaving them set would cause an
+	// exception on the first floating-point operation that follows.
+	const PxU32 restored = mControlWord & ~_MM_EXCEPT_MASK;
+	_mm_setcsr(restored);
+#endif
+}
+
+#endif // #ifndef PSFOUNDATION_PSWINDOWSFPU_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsInclude.h b/PxShared/src/foundation/include/windows/PsWindowsInclude.h
new file mode 100644
index 0000000..75962e1
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsInclude.h
@@ -0,0 +1,96 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
+#define PSFOUNDATION_PSWINDOWSINCLUDE_H
+
+#include "Ps.h"
+
+// Refuse to compile on anything that is not a Windows target.
+#ifndef _WIN32
+#error "This file should only be included by Windows builds!!"
+#endif
+
+// _WINDOWS_ is tested here as the marker that <windows.h> was already included, i.e. included
+// without the trimming macros defined below.
+#ifdef _WINDOWS_ // windows already included
+#error "Only include windows.h through this file!!"
+#endif
+
+// We only support >= Windows XP (0x0501); presumably this API level is needed for the critical
+// section functions mentioned below -- the original comment was truncated.
+#define _WIN32_WINNT 0x0501
+
+// turn off as much as we can for windows. All we really need is the thread functions(critical sections/Interlocked*
+// etc)
+#define NOGDICAPMASKS
+#define NOVIRTUALKEYCODES
+#define NOWINMESSAGES
+#define NOWINSTYLES
+#define NOSYSMETRICS
+#define NOMENUS
+#define NOICONS
+#define NOKEYSTATES
+#define NOSYSCOMMANDS
+#define NORASTEROPS
+#define NOSHOWWINDOW
+#define NOATOM
+#define NOCLIPBOARD
+#define NOCOLOR
+#define NOCTLMGR
+#define NODRAWTEXT
+#define NOGDI
+#define NOMB
+#define NOMEMMGR
+#define NOMETAFILE
+#define NOMINMAX
+#define NOOPENFILE
+#define NOSCROLL
+#define NOSERVICE
+#define NOSOUND
+#define NOTEXTMETRIC
+#define NOWH
+#define NOWINOFFSETS
+#define NOCOMM
+#define NOKANJI
+#define NOHELP
+#define NOPROFILER
+#define NODEFERWINDOWPOS
+#define NOMCX
+#define WIN32_LEAN_AND_MEAN
+#define NOUSER
+#define NONLS
+#define NOMSG
+
+// Include <windows.h> with warning 4668 suppressed; the push/pop keeps the suppression local
+// to this include.
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#include <windows.h>
+#pragma warning(pop)
+
+// SSE intrinsics are only pulled in when the build enables PX_SSE2.
+#if PX_SSE2
+#include <xmmintrin.h>
+#endif
+
+#endif // #ifndef PSFOUNDATION_PSWINDOWSINCLUDE_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h b/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h
new file mode 100644
index 0000000..14a311f
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsInlineAoS.h
@@ -0,0 +1,3119 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSINLINEAOS_H
+#define PSFOUNDATION_PSWINDOWSINLINEAOS_H
+
+#if !COMPILE_VECTOR_INTRINSICS
+#error Vector intrinsics should not be included when using scalar implementation.
+#endif
+
+// Remove this define when all platforms use simd solver.
+#define PX_SUPPORT_SIMD
+
+#include "../PsVecMathSSE.h"
+
+//////////////////////////////////////////////////////////////////////
+//Test that Vec3V and FloatV are legal
+//////////////////////////////////////////////////////////////////////
+
+#define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f
+// A FloatV is considered valid when its y, z and w lanes all match the x lane, either within
+// FLOAT_COMPONENTS_EQUAL_THRESHOLD absolutely or within that threshold relative to x.
+PX_FORCE_INLINE bool isValidFloatV(const FloatV a)
+{
+	const PxF32 x = V4ReadX(a);
+	const PxF32 y = V4ReadY(a);
+	const PxF32 z = V4ReadZ(a);
+	const PxF32 w = V4ReadW(a);
+
+	// Absolute comparison first.
+	const bool absoluteMatch = (PxAbs(x - y) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	                           (PxAbs(x - z) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	                           (PxAbs(x - w) < FLOAT_COMPONENTS_EQUAL_THRESHOLD);
+	if (absoluteMatch)
+	{
+		return true;
+	}
+
+	// Otherwise fall back to the difference relative to x (only evaluated when the absolute test fails).
+	return (PxAbs((x - y) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	       (PxAbs((x - z) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD) &&
+	       (PxAbs((x - w) / x) < FLOAT_COMPONENTS_EQUAL_THRESHOLD);
+}
+
+// A Vec3V is considered valid when its fourth (w) lane is exactly 0.0f.
+PX_FORCE_INLINE bool isValidVec3V(const Vec3V a)
+{
+	// The lanes are stored to memory and compared as scalars because
+	// using _mm_comieq_ss to do the comparison doesn't work for NaN.
+	PX_ALIGN(16, PxF32 lanes[4]);
+	V4StoreA((const Vec4V&)a, lanes);
+	const PxF32 w = lanes[3];
+	return w == 0.0f;
+}
+
+// Returns true when V4LengthSq(a) does not compare equal to zero.
+// NOTE(review): despite the name this tests for a non-zero length, not for finiteness -- confirm intent.
+PX_FORCE_INLINE bool isFiniteLength(const Vec3V a)
+{
+	const FloatV lengthSq = V4LengthSq(a);
+	const bool isZeroLength = FAllEq(lengthSq, FZero()) ? true : false;
+	return !isZeroLength;
+}
+
+// Returns true when the address a is a multiple of 16 (its low four bits are zero).
+PX_FORCE_INLINE bool isAligned16(void* a)
+{
+	const size_t address = (size_t)a;
+	const size_t lowBits = address & 0x0f;
+	return lowBits == 0;
+}
+
+//ASSERT_FINITELENGTH is deactivated because there is a lot of code that calls a simd normalisation function with zero length but then ignores the result.
+
+// Validation asserts used by the vectorised implementations. They expand to PX_ASSERT checks
+// when PX_DEBUG is set and to nothing otherwise. ASSERT_ISFINITELENGTH is disabled in both
+// configurations.
+#if PX_DEBUG
+#define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a))
+#define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a))
+// The argument is parenthesised so that the cast applies to the whole expression
+// (e.g. "p + 1") rather than to its first operand only.
+#define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16((void*)(a)))
+#define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a))
+#else
+#define ASSERT_ISVALIDVEC3V(a)
+#define ASSERT_ISVALIDFLOATV(a)
+#define ASSERT_ISALIGNED16(a)
+#define ASSERT_ISFINITELENGTH(a)
+#endif
+/////////////////////////////////////////////////////////////////////
+////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS
+/////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////
+// USED ONLY INTERNALLY
+//////////////////////////////////////////////////////////////////////
+
+// Helpers intended for use by the Windows SIMD implementation only.
+namespace internalWindowsSimd
+{
+// Reinterprets the 128 bits of an integer vector as four floats (no value conversion).
+PX_FORCE_INLINE __m128 m128_I2F(__m128i n)
+{
+	const __m128 asFloats = _mm_castsi128_ps(n);
+	return asFloats;
+}
+
+// Reinterprets the 128 bits of a float vector as an integer vector (no value conversion).
+PX_FORCE_INLINE __m128i m128_F2I(__m128 n)
+{
+	const __m128i asInts = _mm_castps_si128(n);
+	return asInts;
+}
+
+// Returns 1 when the top bit of all four lanes of a is set, otherwise 0.
+PX_FORCE_INLINE PxU32 BAllTrue4_R(const BoolV a)
+{
+	const PxI32 signBits = _mm_movemask_ps(a);
+	const bool allFour = (signBits == 0xf);
+	return PxU32(allFour);
+}
+
+// Returns 1 when the top bit of lanes x, y and z of a is set (w is ignored), otherwise 0.
+PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a)
+{
+	const PxI32 xyzBits = _mm_movemask_ps(a) & 0x7;
+	const bool allThree = (xyzBits == 0x7);
+	return PxU32(allThree);
+}
+
+// Returns 1 when the top bit of at least one of the four lanes of a is set, otherwise 0.
+PX_FORCE_INLINE PxU32 BAnyTrue4_R(const BoolV a)
+{
+	const PxI32 signBits = _mm_movemask_ps(a);
+	const bool anySet = (signBits != 0x0);
+	return PxU32(anySet);
+}
+
+// Returns 1 when the top bit of at least one of lanes x, y, z of a is set (w is ignored), otherwise 0.
+PX_FORCE_INLINE PxU32 BAnyTrue3_R(const BoolV a)
+{
+	const PxI32 xyzBits = _mm_movemask_ps(a) & 0x7;
+	const bool anySet = (xyzBits != 0x0);
+	return PxU32(anySet);
+}
+
+// Compares two masks lane by lane and returns 1 when all four lanes match, otherwise 0.
+PX_FORCE_INLINE PxU32 FiniteTestEq(const Vec4V a, const Vec4V b)
+{
+	// This is a bit of a bodge (per the original author): _mm_comieq_ss returns 1 if either
+	// value is NaN, so both inputs are first re-encoded with "true" as 1.0f and "false" as
+	// 0.0f, which keeps NaN bit patterns away from the comparison.
+	const BoolV one = FOne();
+	const BoolV zero = FZero();
+	const BoolV encodedA = V4Sel(a, one, zero);
+	const BoolV encodedB = V4Sel(b, one, zero);
+
+	// Lanes are tested in x, y, z, w order and the first mismatch ends the test.
+	if (!_mm_comieq_ss(encodedA, encodedB))
+	{
+		return 0;
+	}
+	if (!_mm_comieq_ss(_mm_shuffle_ps(encodedA, encodedA, _MM_SHUFFLE(1, 1, 1, 1)),
+	                   _mm_shuffle_ps(encodedB, encodedB, _MM_SHUFFLE(1, 1, 1, 1))))
+	{
+		return 0;
+	}
+	if (!_mm_comieq_ss(_mm_shuffle_ps(encodedA, encodedA, _MM_SHUFFLE(2, 2, 2, 2)),
+	                   _mm_shuffle_ps(encodedB, encodedB, _MM_SHUFFLE(2, 2, 2, 2))))
+	{
+		return 0;
+	}
+	if (!_mm_comieq_ss(_mm_shuffle_ps(encodedA, encodedA, _MM_SHUFFLE(3, 3, 3, 3)),
+	                   _mm_shuffle_ps(encodedB, encodedB, _MM_SHUFFLE(3, 3, 3, 3))))
+	{
+		return 0;
+	}
+	return 1;
+}
+
+// Returns true when the x lane of a compares equal to zero.
+PX_FORCE_INLINE bool hasZeroElementinFloatV(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const __m128 splatX = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+	return _mm_comieq_ss(splatX, FZero()) != 0;
+}
+
+// Returns true when any of the x, y, z lanes of a compares equal to zero.
+PX_FORCE_INLINE bool hasZeroElementInVec3V(const Vec3V a)
+{
+	// Lanes are tested in order and the first hit ends the test.
+	if (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()))
+	{
+		return true;
+	}
+	if (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()))
+	{
+		return true;
+	}
+	return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero()) != 0;
+}
+
+// Returns true when any of the four lanes of a compares equal to zero.
+PX_FORCE_INLINE bool hasZeroElementInVec4V(const Vec4V a)
+{
+	// Lanes are tested in order and the first hit ends the test.
+	if (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)), FZero()))
+	{
+		return true;
+	}
+	if (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)), FZero()))
+	{
+		return true;
+	}
+	if (_mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)), FZero()))
+	{
+		return true;
+	}
+	return _mm_comieq_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)), FZero()) != 0;
+}
+
+// Bit mask with lanes x, y, z fully set and lane w cleared; AND-ing with it zeroes w.
+const PX_ALIGN(16, PxU32 gMaskXYZ[4]) = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
+} //internalWindowsSimd
+
+// Comparison helpers for testing the vector math layer.
+namespace _VecMathTests
+{
+// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V'
+PX_FORCE_INLINE Vec3V getInvalidVec3V()
+{
+	// Every lane, including w, is set to 1.0f.
+	return _mm_set1_ps(1.0f);
+}
+
+// Returns true when the x lanes of a and b compare equal (both are asserted to be valid FloatVs).
+PX_FORCE_INLINE bool allElementsEqualFloatV(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const int xEqual = _mm_comieq_ss(a, b);
+	return xEqual != 0;
+}
+
+// Returns true when V3AllEq reports a and b as equal.
+PX_FORCE_INLINE bool allElementsEqualVec3V(const Vec3V a, const Vec3V b)
+{
+	const PxU32 equal = V3AllEq(a, b);
+	return equal != 0;
+}
+
+// Returns true when V4AllEq reports a and b as equal.
+PX_FORCE_INLINE bool allElementsEqualVec4V(const Vec4V a, const Vec4V b)
+{
+	const PxU32 equal = V4AllEq(a, b);
+	return equal != 0;
+}
+
+// Returns true when all four lanes of the two masks are equal according to VecI32V_IsEq.
+PX_FORCE_INLINE bool allElementsEqualBoolV(const BoolV a, const BoolV b)
+{
+	const BoolV laneEqual = VecI32V_IsEq(a, b);
+	return internalWindowsSimd::BAllTrue4_R(laneEqual) != 0;
+}
+
+// Returns true when all four lanes of a and b are equal according to V4IsEqU32.
+PX_FORCE_INLINE bool allElementsEqualVecU32V(const VecU32V a, const VecU32V b)
+{
+	const BoolV laneEqual = V4IsEqU32(a, b);
+	return internalWindowsSimd::BAllTrue4_R(laneEqual) != 0;
+}
+
+// Returns true when all four 32-bit integer lanes of a and b are equal.
+PX_FORCE_INLINE bool allElementsEqualVecI32V(const VecI32V a, const VecI32V b)
+{
+	const __m128i lhs = internalWindowsSimd::m128_F2I(a);
+	const __m128i rhs = internalWindowsSimd::m128_F2I(b);
+	const BoolV laneEqual = internalWindowsSimd::m128_I2F(_mm_cmpeq_epi32(lhs, rhs));
+	return internalWindowsSimd::BAllTrue4_R(laneEqual) != 0;
+}
+
+// Tolerance used by the allElementsNearEqual* helpers, with pre-built lower/upper bounds.
+#define VECMATH_AOS_EPSILON (1e-3f)
+static const FloatV minFError = FLoad(-VECMATH_AOS_EPSILON);
+static const FloatV maxFError = FLoad(VECMATH_AOS_EPSILON);
+static const Vec3V minV3Error = V3Load(-VECMATH_AOS_EPSILON);
+static const Vec3V maxV3Error = V3Load(VECMATH_AOS_EPSILON);
+static const Vec4V minV4Error = V4Load(-VECMATH_AOS_EPSILON);
+static const Vec4V maxV4Error = V4Load(VECMATH_AOS_EPSILON);
+
+// Returns true when the x lane of (a - b) lies strictly inside (-epsilon, +epsilon).
+PX_FORCE_INLINE bool allElementsNearEqualFloatV(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const FloatV difference = FSub(a, b);
+	if (!_mm_comigt_ss(difference, minFError))
+	{
+		return false;
+	}
+	return _mm_comilt_ss(difference, maxFError) != 0;
+}
+
+// Returns true when the x, y and z lanes of (a - b) all lie strictly inside (-epsilon, +epsilon).
+PX_FORCE_INLINE bool allElementsNearEqualVec3V(const Vec3V a, const Vec3V b)
+{
+	const Vec3V difference = V3Sub(a, b);
+	// Bring each lane down to lane 0, which is the only lane the comi* compares look at.
+	const __m128 dx = _mm_shuffle_ps(difference, difference, _MM_SHUFFLE(0, 0, 0, 0));
+	const __m128 dy = _mm_shuffle_ps(difference, difference, _MM_SHUFFLE(1, 1, 1, 1));
+	const __m128 dz = _mm_shuffle_ps(difference, difference, _MM_SHUFFLE(2, 2, 2, 2));
+
+	// Same test order as a single && chain: lower bound then upper bound, lane by lane.
+	if (!(_mm_comigt_ss(dx, minV3Error) && _mm_comilt_ss(dx, maxV3Error)))
+	{
+		return false;
+	}
+	if (!(_mm_comigt_ss(dy, minV3Error) && _mm_comilt_ss(dy, maxV3Error)))
+	{
+		return false;
+	}
+	return _mm_comigt_ss(dz, minV3Error) && _mm_comilt_ss(dz, maxV3Error);
+}
+
+// Returns true when all four lanes of (a - b) lie strictly inside (-epsilon, +epsilon).
+PX_FORCE_INLINE bool allElementsNearEqualVec4V(const Vec4V a, const Vec4V b)
+{
+	const Vec4V difference = V4Sub(a, b);
+	// Bring each lane down to lane 0, which is the only lane the comi* compares look at.
+	const __m128 dx = _mm_shuffle_ps(difference, difference, _MM_SHUFFLE(0, 0, 0, 0));
+	const __m128 dy = _mm_shuffle_ps(difference, difference, _MM_SHUFFLE(1, 1, 1, 1));
+	const __m128 dz = _mm_shuffle_ps(difference, difference, _MM_SHUFFLE(2, 2, 2, 2));
+	const __m128 dw = _mm_shuffle_ps(difference, difference, _MM_SHUFFLE(3, 3, 3, 3));
+
+	// Same test order as a single && chain: lower bound then upper bound, lane by lane.
+	if (!(_mm_comigt_ss(dx, minV4Error) && _mm_comilt_ss(dx, maxV4Error)))
+	{
+		return false;
+	}
+	if (!(_mm_comigt_ss(dy, minV4Error) && _mm_comilt_ss(dy, maxV4Error)))
+	{
+		return false;
+	}
+	if (!(_mm_comigt_ss(dz, minV4Error) && _mm_comilt_ss(dz, maxV4Error)))
+	{
+		return false;
+	}
+	return _mm_comigt_ss(dw, minV4Error) && _mm_comilt_ss(dw, maxV4Error);
+}
+} //_VecMathTests
+
+// Returns true when the scalar written by FStore(a, ...) passes PxIsFinite.
+PX_FORCE_INLINE bool isFiniteFloatV(const FloatV a)
+{
+	PxF32 scalar;
+	FStore(a, &scalar);
+	const bool finite = PxIsFinite(scalar);
+	return finite;
+	/*
+	const PxU32 badNumber = (_FPCLASS_SNAN | _FPCLASS_QNAN | _FPCLASS_NINF | _FPCLASS_PINF);
+	const FloatV vBadNum = FloatV_From_F32((PxF32&)badNumber);
+	const BoolV vMask = BAnd(vBadNum,  a);
+	return FiniteTestEq(vMask, BFFFF()) == 1;
+	*/
+}
+
+// Spills a to an aligned scratch array and reports whether x, y and z all pass
+// PxIsFinite(). The w lane is stored but not inspected.
+PX_FORCE_INLINE bool isFiniteVec3V(const Vec3V a)
+{
+	PX_ALIGN(16, PxF32 lanes[4]);
+	V4StoreA(reinterpret_cast<const Vec4V&>(a), lanes);
+	if(!PxIsFinite(lanes[0]))
+		return false;
+	if(!PxIsFinite(lanes[1]))
+		return false;
+	return PxIsFinite(lanes[2]);
+}
+
+// Spills a to an aligned scratch array and reports whether all four lanes pass PxIsFinite().
+PX_FORCE_INLINE bool isFiniteVec4V(const Vec4V a)
+{
+	PX_ALIGN(16, PxF32 lanes[4]);
+	V4StoreA(a, lanes);
+	if(!PxIsFinite(lanes[0]))
+		return false;
+	if(!PxIsFinite(lanes[1]))
+		return false;
+	if(!PxIsFinite(lanes[2]))
+		return false;
+	return PxIsFinite(lanes[3]);
+}
+
+/////////////////////////////////////////////////////////////////////
+////VECTORISED FUNCTION IMPLEMENTATIONS
+/////////////////////////////////////////////////////////////////////
+
+// Broadcasts the scalar f into all four lanes.
+PX_FORCE_INLINE FloatV FLoad(const PxF32 f)
+{
+	const __m128 splat = _mm_set1_ps(f);
+	return splat;
+}
+
+// Builds (f, f, f, 0): the scalar in x, y, z and zero in w.
+PX_FORCE_INLINE Vec3V V3Load(const PxF32 f)
+{
+	const __m128 xyz0 = _mm_setr_ps(f, f, f, 0.0f);
+	return xyz0;
+}
+
+// Broadcasts the scalar f into all four lanes.
+PX_FORCE_INLINE Vec4V V4Load(const PxF32 f)
+{
+	const __m128 splat = _mm_set1_ps(f);
+	return splat;
+}
+
+// Expands a bool into a four-lane mask: all bits set when f is true, all clear otherwise.
+PX_FORCE_INLINE BoolV BLoad(const bool f)
+{
+	const PxU32 laneBits = PxU32(f ? 0xFFFFFFFFu : 0u);
+	// The bit pattern is loaded as-is; it is a mask, not a numeric float.
+	return _mm_load1_ps(reinterpret_cast<const float*>(&laneBits));
+}
+
+// Aligned 16-byte load starting at f.x, then ANDed with gMaskXYZ.
+// f must be 16-byte aligned (asserted).
+PX_FORCE_INLINE Vec3V V3LoadA(const PxVec3& f)
+{
+	ASSERT_ISALIGNED16(&f);
+	const __m128 raw = _mm_load_ps(&f.x);
+	const Vec4V& xyzMask = reinterpret_cast<const Vec4V&>(internalWindowsSimd::gMaskXYZ);
+	return _mm_and_ps(raw, xyzMask);
+}
+
+// Builds (f.x, f.y, f.z, 0) component by component; no alignment requirement.
+PX_FORCE_INLINE Vec3V V3LoadU(const PxVec3& f)
+{
+	const __m128 xyz0 = _mm_setr_ps(f.x, f.y, f.z, 0.0f);
+	return xyz0;
+}
+
+// Aligned 16-byte load starting at f.x with no masking, so the w lane holds
+// whatever follows f.z in memory. f must be 16-byte aligned (asserted).
+PX_FORCE_INLINE Vec3V V3LoadUnsafeA(const PxVec3& f)
+{
+	ASSERT_ISALIGNED16(&f);
+	const __m128 raw = _mm_load_ps(&f.x);
+	return raw;
+}
+
+// Aligned load of four floats from f, passed through V4ClearW.
+// f must be 16-byte aligned (asserted).
+PX_FORCE_INLINE Vec3V V3LoadA(const PxF32* const f)
+{
+	ASSERT_ISALIGNED16(f);
+	const __m128 raw = _mm_load_ps(f);
+	return V4ClearW(raw);
+}
+
+// Builds (i[0], i[1], i[2], 0); reads exactly three floats, no alignment requirement.
+PX_FORCE_INLINE Vec3V V3LoadU(const PxF32* const i)
+{
+	const __m128 xyz0 = _mm_setr_ps(i[0], i[1], i[2], 0.0f);
+	return xyz0;
+}
+
+// Converts a Vec4V to a Vec3V by passing it through V4ClearW.
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V(Vec4V v)
+{
+	const Vec3V cleared = V4ClearW(v);
+	return cleared;
+}
+
+// Reinterprets a Vec4V as a Vec3V without touching any lane; w keeps its input value.
+PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined(const Vec4V v)
+{
+	const Vec3V unchanged = v;
+	return unchanged;
+}
+
+// Returns f unchanged; valid because Vec3V and Vec4V share the same underlying type here.
+PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V(Vec3V f)
+{
+	const Vec4V unchanged = f;
+	return unchanged;
+}
+
+// Returns f unchanged, reinterpreted as a Vec4V.
+PX_FORCE_INLINE Vec4V Vec4V_From_FloatV(FloatV f)
+{
+	const Vec4V unchanged = f;
+	return unchanged;
+}
+
+// Converts a FloatV to a Vec3V via Vec4V_From_FloatV followed by Vec3V_From_Vec4V.
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV(FloatV f)
+{
+	const Vec4V asVec4 = Vec4V_From_FloatV(f);
+	return Vec3V_From_Vec4V(asVec4);
+}
+
+// Converts a FloatV to a Vec3V via Vec4V_From_FloatV followed by
+// Vec3V_From_Vec4V_WUndefined, i.e. without clearing w.
+PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined(FloatV f)
+{
+	const Vec4V asVec4 = Vec4V_From_FloatV(f);
+	return Vec3V_From_Vec4V_WUndefined(asVec4);
+}
+
+// Builds (f.x, f.y, f.z, 0). Despite the name, this implementation writes zero to w.
+PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined(const PxVec3& f)
+{
+	const __m128 xyz0 = _mm_setr_ps(f.x, f.y, f.z, 0.0f);
+	return xyz0;
+}
+
+// Aligned load of four floats from f; f must be 16-byte aligned (asserted).
+PX_FORCE_INLINE Vec4V V4LoadA(const PxF32* const f)
+{
+	ASSERT_ISALIGNED16(f);
+	const __m128 loaded = _mm_load_ps(f);
+	return loaded;
+}
+
+// Aligned store of all four lanes of a to f; f must be 16-byte aligned (asserted).
+PX_FORCE_INLINE void V4StoreA(const Vec4V a, PxF32* f)
+{
+	ASSERT_ISALIGNED16(f);
+	PxF32* const dst = f;
+	_mm_store_ps(dst, a);
+}
+
+// Unaligned store of all four lanes of a to f.
+PX_FORCE_INLINE void V4StoreU(const Vec4V a, PxF32* f)
+{
+	PxF32* const dst = f;
+	_mm_storeu_ps(dst, a);
+}
+
+// Aligned store of the four mask lanes of a into f as raw 32-bit words.
+// f must be 16-byte aligned (asserted).
+PX_FORCE_INLINE void BStoreA(const BoolV a, PxU32* f)
+{
+	ASSERT_ISALIGNED16(f);
+	PxF32* const dst = reinterpret_cast<PxF32*>(f);
+	_mm_store_ps(dst, a);
+}
+
+// Aligned store of the four lanes of uv into u as raw 32-bit words.
+// u must be 16-byte aligned (asserted).
+PX_FORCE_INLINE void U4StoreA(const VecU32V uv, PxU32* u)
+{
+	ASSERT_ISALIGNED16(u);
+	PxF32* const dst = reinterpret_cast<PxF32*>(u);
+	_mm_store_ps(dst, uv);
+}
+
+// Aligned store of the four lanes of iv into i as raw 32-bit words.
+// i must be 16-byte aligned (asserted).
+PX_FORCE_INLINE void I4StoreA(const VecI32V iv, PxI32* i)
+{
+	ASSERT_ISALIGNED16(i);
+	PxF32* const dst = reinterpret_cast<PxF32*>(i);
+	_mm_store_ps(dst, iv);
+}
+
+// Unaligned load of four floats from f.
+PX_FORCE_INLINE Vec4V V4LoadU(const PxF32* const f)
+{
+	const __m128 loaded = _mm_loadu_ps(f);
+	return loaded;
+}
+
+// Expands four bools f[0..3] into a per-lane mask: a lane is all ones when the
+// corresponding bool is true, all zeros otherwise.
+PX_FORCE_INLINE BoolV BLoad(const bool* const f)
+{
+	// Negating 0/1 as a signed integer yields 0 / -1, i.e. 0x00000000 / 0xFFFFFFFF.
+	const PxU32 lane0 = PxU32(-(PxI32)f[0]);
+	const PxU32 lane1 = PxU32(-(PxI32)f[1]);
+	const PxU32 lane2 = PxU32(-(PxI32)f[2]);
+	const PxU32 lane3 = PxU32(-(PxI32)f[3]);
+	const PX_ALIGN(16, PxU32 laneBits[4]) = { lane0, lane1, lane2, lane3 };
+	return _mm_load_ps(reinterpret_cast<const float*>(laneBits));
+}
+
+// Writes the lowest lane of a to *f.
+PX_FORCE_INLINE void FStore(const FloatV a, PxF32* PX_RESTRICT f)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	PxF32* const dst = f;
+	_mm_store_ss(dst, a);
+}
+
+// Writes the x, y, z lanes of a into f. The vector is first spilled to an aligned
+// scratch array so only three floats are written to f. f must be 16-byte aligned (asserted).
+PX_FORCE_INLINE void V3StoreA(const Vec3V a, PxVec3& f)
+{
+	ASSERT_ISALIGNED16(&f);
+	PX_ALIGN(16, PxF32 lanes[4]);
+	_mm_store_ps(lanes, a);
+	const PxVec3 xyz(lanes[0], lanes[1], lanes[2]);
+	f = xyz;
+}
+
+// Writes the lowest mask lane of b to *b2 as a raw 32-bit word.
+PX_FORCE_INLINE void Store_From_BoolV(const BoolV b, PxU32* b2)
+{
+	PxF32* const dst = reinterpret_cast<PxF32*>(b2);
+	_mm_store_ss(dst, b);
+}
+
+// Writes the x, y, z lanes of a into f with no alignment requirement on f.
+// The vector is spilled to an aligned scratch array so only three floats reach f.
+PX_FORCE_INLINE void V3StoreU(const Vec3V a, PxVec3& f)
+{
+	PX_ALIGN(16, PxF32 lanes[4]);
+	_mm_store_ps(lanes, a);
+	const PxVec3 xyz(lanes[0], lanes[1], lanes[2]);
+	f = xyz;
+}
+
+// Converts a PxMat33 to a Mat33V by loading each column with V3LoadU.
+PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33(const PxMat33& m)
+{
+	const Vec3V c0 = V3LoadU(m.column0);
+	const Vec3V c1 = V3LoadU(m.column1);
+	const Vec3V c2 = V3LoadU(m.column2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Writes the three columns of m into out using V3StoreU.
+// out is asserted to be 16-byte aligned even though the stores are unaligned.
+PX_FORCE_INLINE void PxMat33_From_Mat33V(const Mat33V& m, PxMat33& out)
+{
+	ASSERT_ISALIGNED16(&out);
+
+	V3StoreU(m.col0, out.column0);
+
+	V3StoreU(m.col1, out.column1);
+
+	V3StoreU(m.col2, out.column2);
+}
+
+//////////////////////////////////
+// FLOATV
+//////////////////////////////////
+
+// Returns 0.0f in every lane.
+PX_FORCE_INLINE FloatV FZero()
+{
+	const __m128 zeros = _mm_setzero_ps();
+	return zeros;
+}
+
+// Returns FLoad(1.0f).
+PX_FORCE_INLINE FloatV FOne()
+{
+	const PxF32 one = 1.0f;
+	return FLoad(one);
+}
+
+// Returns FLoad(0.5f).
+PX_FORCE_INLINE FloatV FHalf()
+{
+	const PxF32 half = 0.5f;
+	return FLoad(half);
+}
+
+// Returns FLoad(PX_EPS_REAL).
+PX_FORCE_INLINE FloatV FEps()
+{
+	const FloatV eps = FLoad(PX_EPS_REAL);
+	return eps;
+}
+
+// Returns FLoad(1e-6f).
+PX_FORCE_INLINE FloatV FEps6()
+{
+	const PxF32 eps6 = 1e-6f;
+	return FLoad(eps6);
+}
+
+// Returns FLoad(PX_MAX_REAL).
+PX_FORCE_INLINE FloatV FMax()
+{
+	const FloatV largest = FLoad(PX_MAX_REAL);
+	return largest;
+}
+
+// Returns FLoad(-PX_MAX_REAL).
+PX_FORCE_INLINE FloatV FNegMax()
+{
+	const FloatV lowest = FLoad(-PX_MAX_REAL);
+	return lowest;
+}
+
+// Returns a vector whose four lanes each hold the 32-bit integer bit pattern 0.
+PX_FORCE_INLINE FloatV IZero()
+{
+	const PxU32 bits = 0;
+	// Loaded through a float pointer; the payload is an integer pattern, not a float value.
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&bits));
+}
+
+// Returns a vector whose four lanes each hold the 32-bit integer bit pattern 1.
+PX_FORCE_INLINE FloatV IOne()
+{
+	const PxU32 bits = 1;
+	// Loaded through a float pointer; the payload is an integer pattern, not a float value.
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&bits));
+}
+
+// Returns a vector whose four lanes each hold the 32-bit integer bit pattern 2.
+PX_FORCE_INLINE FloatV ITwo()
+{
+	const PxU32 bits = 2;
+	// Loaded through a float pointer; the payload is an integer pattern, not a float value.
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&bits));
+}
+
+// Returns a vector whose four lanes each hold the 32-bit integer bit pattern 3.
+PX_FORCE_INLINE FloatV IThree()
+{
+	const PxU32 bits = 3;
+	// Loaded through a float pointer; the payload is an integer pattern, not a float value.
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&bits));
+}
+
+// Returns a vector whose four lanes each hold the 32-bit integer bit pattern 4.
+PX_FORCE_INLINE FloatV IFour()
+{
+	const PxU32 bits = 4;
+	// Loaded through a float pointer; the payload is an integer pattern, not a float value.
+	return _mm_load1_ps(reinterpret_cast<const PxF32*>(&bits));
+}
+
+// Returns (0 - f) per lane.
+PX_FORCE_INLINE FloatV FNeg(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const __m128 zeros = _mm_setzero_ps();
+	return _mm_sub_ps(zeros, f);
+}
+
+// Returns a + b per lane.
+PX_FORCE_INLINE FloatV FAdd(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 sum = _mm_add_ps(a, b);
+	return sum;
+}
+
+// Returns a - b per lane.
+PX_FORCE_INLINE FloatV FSub(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 difference = _mm_sub_ps(a, b);
+	return difference;
+}
+
+// Returns a * b per lane.
+PX_FORCE_INLINE FloatV FMul(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 product = _mm_mul_ps(a, b);
+	return product;
+}
+
+// Returns a / b per lane using the full-precision divide.
+PX_FORCE_INLINE FloatV FDiv(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 quotient = _mm_div_ps(a, b);
+	return quotient;
+}
+
+// Approximates a / b as a * rcp(b), using the hardware reciprocal estimate.
+PX_FORCE_INLINE FloatV FDivFast(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 approxInvB = _mm_rcp_ps(b);
+	return _mm_mul_ps(a, approxInvB);
+}
+
+// Returns 1 / a per lane using the full-precision divide.
+PX_FORCE_INLINE FloatV FRecip(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV one = FOne();
+	return _mm_div_ps(one, a);
+}
+
+// Returns the hardware reciprocal estimate of a per lane. No validity assert is made on a.
+PX_FORCE_INLINE FloatV FRecipFast(const FloatV a)
+{
+	const __m128 approxInv = _mm_rcp_ps(a);
+	return approxInv;
+}
+
+// Returns 1 / sqrt(a) per lane using the full-precision sqrt and divide.
+PX_FORCE_INLINE FloatV FRsqrt(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV one = FOne();
+	const __m128 root = _mm_sqrt_ps(a);
+	return _mm_div_ps(one, root);
+}
+
+// Returns sqrt(a) per lane.
+PX_FORCE_INLINE FloatV FSqrt(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const __m128 root = _mm_sqrt_ps(a);
+	return root;
+}
+
+// Returns the hardware reciprocal-square-root estimate of a per lane.
+PX_FORCE_INLINE FloatV FRsqrtFast(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const __m128 approxInvRoot = _mm_rsqrt_ps(a);
+	return approxInvRoot;
+}
+
+// Returns a * b + c, computed as a separate multiply followed by an add.
+PX_FORCE_INLINE FloatV FScaleAdd(const FloatV a, const FloatV b, const FloatV c)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDFLOATV(c);
+	const FloatV product = FMul(a, b);
+	return FAdd(product, c);
+}
+
+// Returns c - a * b, computed as a separate multiply followed by a subtract.
+PX_FORCE_INLINE FloatV FNegScaleSub(const FloatV a, const FloatV b, const FloatV c)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDFLOATV(c);
+	const FloatV product = FMul(a, b);
+	return FSub(c, product);
+}
+
+// Returns |a| per lane by clearing the sign bit (AND with 0x7fffffff).
+PX_FORCE_INLINE FloatV FAbs(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	PX_ALIGN(16, const static PxU32 clearSignBits[4]) = { 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF, 0x7fFFffFF };
+	const __m128 mask = _mm_load_ps(reinterpret_cast<const PxF32*>(clearSignBits));
+	return _mm_and_ps(a, mask);
+}
+
+// Bitwise select: takes bits of a where c is set and bits of b where c is clear.
+// c is asserted to be either all-true or all-false.
+PX_FORCE_INLINE FloatV FSel(const BoolV c, const FloatV a, const FloatV b)
+{
+	PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c, BTTTT()) ||
+			  _VecMathTests::allElementsEqualBoolV(c, BFFFF()));
+	const __m128 fromB = _mm_andnot_ps(c, b);
+	const __m128 fromA = _mm_and_ps(c, a);
+	const __m128 chosen = _mm_or_ps(fromB, fromA);
+	ASSERT_ISVALIDFLOATV(chosen);
+	return chosen;
+}
+
+// Per-lane mask of a > b.
+PX_FORCE_INLINE BoolV FIsGrtr(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 mask = _mm_cmpgt_ps(a, b);
+	return mask;
+}
+
+// Per-lane mask of a >= b.
+PX_FORCE_INLINE BoolV FIsGrtrOrEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 mask = _mm_cmpge_ps(a, b);
+	return mask;
+}
+
+// Per-lane mask of a == b.
+PX_FORCE_INLINE BoolV FIsEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 mask = _mm_cmpeq_ps(a, b);
+	return mask;
+}
+
+// Per-lane maximum of a and b.
+PX_FORCE_INLINE FloatV FMax(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 larger = _mm_max_ps(a, b);
+	return larger;
+}
+
+// Per-lane minimum of a and b.
+PX_FORCE_INLINE FloatV FMin(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 smaller = _mm_min_ps(a, b);
+	return smaller;
+}
+
+// Clamps a to [minV, maxV]: the upper bound is applied first, then the lower bound.
+// Only minV and maxV are validity-asserted.
+PX_FORCE_INLINE FloatV FClamp(const FloatV a, const FloatV minV, const FloatV maxV)
+{
+	ASSERT_ISVALIDFLOATV(minV);
+	ASSERT_ISVALIDFLOATV(maxV);
+	const __m128 cappedAbove = _mm_min_ps(a, maxV);
+	return _mm_max_ps(cappedAbove, minV);
+}
+
+// Returns 1 when the lowest lane of a is greater than the lowest lane of b, else 0.
+PX_FORCE_INLINE PxU32 FAllGrtr(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const int isGreater = _mm_comigt_ss(a, b);
+	return PxU32(isGreater);
+}
+
+// Returns 1 when the lowest lane of a is greater than or equal to the lowest lane of b, else 0.
+PX_FORCE_INLINE PxU32 FAllGrtrOrEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const int isGreaterOrEqual = _mm_comige_ss(a, b);
+	return PxU32(isGreaterOrEqual);
+}
+
+// Returns 1 when the lowest lanes of a and b compare equal, else 0.
+PX_FORCE_INLINE PxU32 FAllEq(const FloatV a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const int isEqual = _mm_comieq_ss(a, b);
+	return PxU32(isEqual);
+}
+
+// Rounds a per lane without SSE4.1 _mm_round_ps: adds 0.5, subtracts 1 when the
+// integer conversion of a is negative, then truncates toward zero.
+PX_FORCE_INLINE FloatV FRound(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	const FloatV half = FLoad(0.5f);
+
+	// 1.0f in lanes whose converted integer has its sign bit set, 0.0f elsewhere.
+	const __m128i asInt = _mm_cvtps_epi32(a);
+	const __m128i signAsInt = _mm_srli_epi32(asInt, 31);
+	const __m128 negativeBias = _mm_cvtepi32_ps(signAsInt);
+
+	const FloatV shifted = FAdd(a, half);
+	const FloatV biased = FSub(shifted, negativeBias);
+
+	// Truncate to integer and convert back to float.
+	const __m128i truncated = _mm_cvttps_epi32(biased);
+	return _mm_cvtepi32_ps(truncated);
+}
+
+// Approximates sin(a) with an odd-power polynomial up to the 23rd power.
+// The angle is first reduced by subtracting g_PXTwoPi * round(a * g_PXReciprocalTwoPi).
+PX_FORCE_INLINE FloatV FSin(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+
+	// Range reduction: x = a - twoPi * round(a / twoPi).
+	const FloatV invTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const FloatV fullTurn = V4LoadA(g_PXTwoPi.f);
+	const FloatV turns = FRound(FMul(a, invTwoPi));
+	const FloatV x = FNegScaleSub(fullTurn, turns, a);
+	const FloatV xSq = FMul(x, x);
+
+	// Series coefficients; lane X of the first table is not used here.
+	const Vec4V coeffs0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V coeffs1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V coeffs2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	// sin(x) ~= x + S1*x^3 + S2*x^5 + ... + S11*x^23.
+	// Each odd power is the previous one multiplied by x^2.
+	FloatV power = FMul(xSq, x); // x^3
+	FloatV sum = FScaleAdd(V4GetY(coeffs0), power, x);
+
+	power = FMul(power, xSq); // x^5
+	sum = FScaleAdd(V4GetZ(coeffs0), power, sum);
+
+	power = FMul(power, xSq); // x^7
+	sum = FScaleAdd(V4GetW(coeffs0), power, sum);
+
+	power = FMul(power, xSq); // x^9
+	sum = FScaleAdd(V4GetX(coeffs1), power, sum);
+
+	power = FMul(power, xSq); // x^11
+	sum = FScaleAdd(V4GetY(coeffs1), power, sum);
+
+	power = FMul(power, xSq); // x^13
+	sum = FScaleAdd(V4GetZ(coeffs1), power, sum);
+
+	power = FMul(power, xSq); // x^15
+	sum = FScaleAdd(V4GetW(coeffs1), power, sum);
+
+	power = FMul(power, xSq); // x^17
+	sum = FScaleAdd(V4GetX(coeffs2), power, sum);
+
+	power = FMul(power, xSq); // x^19
+	sum = FScaleAdd(V4GetY(coeffs2), power, sum);
+
+	power = FMul(power, xSq); // x^21
+	sum = FScaleAdd(V4GetZ(coeffs2), power, sum);
+
+	power = FMul(power, xSq); // x^23
+	sum = FScaleAdd(V4GetW(coeffs2), power, sum);
+
+	return sum;
+}
+
+// Approximates cos(a) with an even-power polynomial up to the 22nd power.
+// The angle is first reduced by subtracting g_PXTwoPi * round(a * g_PXReciprocalTwoPi).
+PX_FORCE_INLINE FloatV FCos(const FloatV a)
+{
+	ASSERT_ISVALIDFLOATV(a);
+
+	// Range reduction: x = a - twoPi * round(a / twoPi).
+	const FloatV invTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const FloatV fullTurn = V4LoadA(g_PXTwoPi.f);
+	const FloatV turns = FRound(FMul(a, invTwoPi));
+	const FloatV x = FNegScaleSub(fullTurn, turns, a);
+
+	// Even powers of x, each built from two lower powers.
+	const FloatV x2 = FMul(x, x);
+	const FloatV x4 = FMul(x2, x2);
+	const FloatV x6 = FMul(x4, x2);
+	const FloatV x8 = FMul(x4, x4);
+	const FloatV x10 = FMul(x6, x4);
+	const FloatV x12 = FMul(x6, x6);
+	const FloatV x14 = FMul(x8, x6);
+	const FloatV x16 = FMul(x8, x8);
+	const FloatV x18 = FMul(x10, x8);
+	const FloatV x20 = FMul(x10, x10);
+	const FloatV x22 = FMul(x12, x10);
+
+	// Series coefficients; lane X of the first table is not used here.
+	const Vec4V coeffs0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V coeffs1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V coeffs2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	// cos(x) ~= 1 + C1*x^2 + C2*x^4 + ... + C11*x^22, accumulated lowest power first.
+	FloatV sum = FScaleAdd(V4GetY(coeffs0), x2, V4One());
+	sum = FScaleAdd(V4GetZ(coeffs0), x4, sum);
+	sum = FScaleAdd(V4GetW(coeffs0), x6, sum);
+	sum = FScaleAdd(V4GetX(coeffs1), x8, sum);
+	sum = FScaleAdd(V4GetY(coeffs1), x10, sum);
+	sum = FScaleAdd(V4GetZ(coeffs1), x12, sum);
+	sum = FScaleAdd(V4GetW(coeffs1), x14, sum);
+	sum = FScaleAdd(V4GetX(coeffs2), x16, sum);
+	sum = FScaleAdd(V4GetY(coeffs2), x18, sum);
+	sum = FScaleAdd(V4GetZ(coeffs2), x20, sum);
+	sum = FScaleAdd(V4GetW(coeffs2), x22, sum);
+
+	return sum;
+}
+
+// Returns 1 when the combined mask (a > max) | (min > a) is not all-false, else 0.
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(min);
+	ASSERT_ISVALIDFLOATV(max);
+	const BoolV aboveMax = FIsGrtr(a, max);
+	const BoolV belowMin = FIsGrtr(min, a);
+	const BoolV outside = BOr(aboveMax, belowMin);
+	return PxU32(!BAllEqFFFF(outside));
+}
+
+// Returns BAllEqTTTT of the mask (a >= min) & (max >= a), i.e. the inclusive range test.
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV min, const FloatV max)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(min);
+	ASSERT_ISVALIDFLOATV(max);
+	const BoolV atLeastMin = FIsGrtrOrEq(a, min);
+	const BoolV atMostMax = FIsGrtrOrEq(max, a);
+	const BoolV inside = BAnd(atLeastMin, atMostMax);
+	return BAllEqTTTT(inside);
+}
+
+// Symmetric form: tests a against the range [-bounds, bounds] via the three-argument overload.
+PX_FORCE_INLINE PxU32 FOutOfBounds(const FloatV a, const FloatV bounds)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(bounds);
+	const FloatV lower = FNeg(bounds);
+	return FOutOfBounds(a, lower, bounds);
+}
+
+// Symmetric form: tests a against the range [-bounds, bounds] via the three-argument overload.
+PX_FORCE_INLINE PxU32 FInBounds(const FloatV a, const FloatV bounds)
+{
+	ASSERT_ISVALIDFLOATV(a);
+	ASSERT_ISVALIDFLOATV(bounds);
+	const FloatV lower = FNeg(bounds);
+	return FInBounds(a, lower, bounds);
+}
+
+//////////////////////////////////
+// VEC3V
+//////////////////////////////////
+
+// Builds a Vec3V with f in x, y, z and zero in w.
+PX_FORCE_INLINE Vec3V V3Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const __m128 zeros = V3Zero();
+	// Replace lane 0 with zero -> (0, f.y, f.z, f.w) ...
+	const __m128 zeroInX = _mm_move_ss(f, zeros);
+	// ... then reverse the lane order so the zero ends up in w.
+	return _mm_shuffle_ps(zeroInX, zeroInX, _MM_SHUFFLE(0, 1, 2, 3));
+}
+
+// Packs three FloatV values into one Vec3V (x, y, z, 0).
+// The x component is read from lane 1 of the x argument, which holds the same
+// value as lane 0 for a FloatV whose lanes are all equal.
+PX_FORCE_INLINE Vec3V V3Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+	// Kept as a plain local: a static zero caused a compiler crash on x64 debug_opt.
+	const __m128 zeros = V3Zero();
+	const __m128 yThenX = _mm_move_ss(x, y);        // (y0, x1, x2, x3)
+	const __m128 zThenZero = _mm_move_ss(zeros, z); // (z0, 0, 0, 0)
+
+	// Result lanes: yThenX[1], yThenX[0], zThenZero[0], zThenZero[1].
+	return _mm_shuffle_ps(yThenX, zThenZero, _MM_SHUFFLE(1, 0, 0, 1));
+}
+
+// Returns the unit X axis (1, 0, 0, 0).
+PX_FORCE_INLINE Vec3V V3UnitX()
+{
+	const __m128 axis = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f);
+	return axis;
+}
+
+// Returns the unit Y axis (0, 1, 0, 0).
+PX_FORCE_INLINE Vec3V V3UnitY()
+{
+	const __m128 axis = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f);
+	return axis;
+}
+
+// Returns the unit Z axis (0, 0, 1, 0).
+PX_FORCE_INLINE Vec3V V3UnitZ()
+{
+	const __m128 axis = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
+	return axis;
+}
+
+// Broadcasts the x component of f to all four lanes.
+PX_FORCE_INLINE FloatV V3GetX(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	const __m128 xxxx = _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
+	return xxxx;
+}
+
+// Broadcasts the y component of f to all four lanes.
+PX_FORCE_INLINE FloatV V3GetY(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	const __m128 yyyy = _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
+	return yyyy;
+}
+
+// Broadcasts the z component of f to all four lanes.
+PX_FORCE_INLINE FloatV V3GetZ(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	const __m128 zzzz = _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+	return zzzz;
+}
+
+// Returns v with its x component replaced by f: V4Sel with the BFTTT mask keeps
+// v on the true lanes (y, z, w) — assuming V4Sel(c, a, b) picks a where c is true.
+PX_FORCE_INLINE Vec3V V3SetX(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	const BoolV keepFromV = BFTTT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its y component replaced by f: V4Sel with the BTFTT mask keeps
+// v on the true lanes (x, z, w) — assuming V4Sel(c, a, b) picks a where c is true.
+PX_FORCE_INLINE Vec3V V3SetY(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	const BoolV keepFromV = BTFTT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Returns v with its z component replaced by f: V4Sel with the BTTFT mask keeps
+// v on the true lanes (x, y, w) — assuming V4Sel(c, a, b) picks a where c is true.
+PX_FORCE_INLINE Vec3V V3SetZ(const Vec3V v, const FloatV f)
+{
+	ASSERT_ISVALIDVEC3V(v);
+	ASSERT_ISVALIDFLOATV(f);
+	const BoolV keepFromV = BTTFT();
+	return V4Sel(keepFromV, v, f);
+}
+
+// Gathers the x components of a, b, c into one vector: (a.x, b.x, c.x, c.w).
+PX_FORCE_INLINE Vec3V V3ColX(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	// (a.x, a.w, c.x, c.w); the y lane is a placeholder overwritten below.
+	const Vec3V outer = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 0, 3, 0));
+	const FloatV middle = V3GetX(b);
+	return V3SetY(outer, middle);
+}
+
+// Gathers the y components of a, b, c into one vector: (a.y, b.y, c.y, c.w).
+PX_FORCE_INLINE Vec3V V3ColY(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	// (a.y, a.w, c.y, c.w); the y lane is a placeholder overwritten below.
+	const Vec3V outer = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 1, 3, 1));
+	const FloatV middle = V3GetY(b);
+	return V3SetY(outer, middle);
+}
+
+// Gathers the z components of a, b, c into one vector: (a.z, b.z, c.z, c.w).
+PX_FORCE_INLINE Vec3V V3ColZ(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	// (a.z, a.w, c.z, c.w); the y lane is a placeholder overwritten below.
+	const Vec3V outer = _mm_shuffle_ps(a, c, _MM_SHUFFLE(3, 2, 3, 2));
+	const FloatV middle = V3GetZ(b);
+	return V3SetY(outer, middle);
+}
+
+// Returns the zero vector (0, 0, 0, 0).
+PX_FORCE_INLINE Vec3V V3Zero()
+{
+	const __m128 zeros = _mm_setzero_ps();
+	return zeros;
+}
+
+// Returns V3Load(1.0f).
+PX_FORCE_INLINE Vec3V V3One()
+{
+	const PxF32 one = 1.0f;
+	return V3Load(one);
+}
+
+// Returns V3Load(PX_EPS_REAL).
+PX_FORCE_INLINE Vec3V V3Eps()
+{
+	const Vec3V eps = V3Load(PX_EPS_REAL);
+	return eps;
+}
+
+// Returns (0 - f) per lane.
+PX_FORCE_INLINE Vec3V V3Neg(const Vec3V f)
+{
+	ASSERT_ISVALIDVEC3V(f);
+	const __m128 zeros = _mm_setzero_ps();
+	return _mm_sub_ps(zeros, f);
+}
+
+// Returns a + b per lane.
+PX_FORCE_INLINE Vec3V V3Add(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 sum = _mm_add_ps(a, b);
+	return sum;
+}
+
+// Returns a - b per lane.
+PX_FORCE_INLINE Vec3V V3Sub(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 difference = _mm_sub_ps(a, b);
+	return difference;
+}
+
+// Multiplies each lane of a by the corresponding lane of the scalar vector b.
+PX_FORCE_INLINE Vec3V V3Scale(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 scaled = _mm_mul_ps(a, b);
+	return scaled;
+}
+
+// Component-wise product a * b.
+PX_FORCE_INLINE Vec3V V3Mul(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 product = _mm_mul_ps(a, b);
+	return product;
+}
+
+// Divides each lane of a by the corresponding lane of the scalar vector b (full-precision
+// divide). The w lane is divided as well and is not cleared afterwards.
+PX_FORCE_INLINE Vec3V V3ScaleInv(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 quotient = _mm_div_ps(a, b);
+	return quotient;
+}
+
+// Component-wise a / b, with the result passed through V4ClearW.
+PX_FORCE_INLINE Vec3V V3Div(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 quotient = _mm_div_ps(a, b);
+	return V4ClearW(quotient);
+}
+
+// Approximates a / b as a * rcp(b), using the hardware reciprocal estimate.
+PX_FORCE_INLINE Vec3V V3ScaleInvFast(const Vec3V a, const FloatV b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 approxInvB = _mm_rcp_ps(b);
+	return _mm_mul_ps(a, approxInvB);
+}
+
+// Approximates component-wise a / b as a * rcp(b), with the result passed through V4ClearW.
+PX_FORCE_INLINE Vec3V V3DivFast(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 approxInvB = _mm_rcp_ps(b);
+	const __m128 quotient = _mm_mul_ps(a, approxInvB);
+	return V4ClearW(quotient);
+}
+
+// Component-wise 1 / a (full-precision divide); the w lane is replaced via V4Sel with
+// the BTTTF mask and a zero vector.
+PX_FORCE_INLINE Vec3V V3Recip(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zeros = V3Zero();
+	const __m128 xyzMask = BTTTF();
+	const __m128 inverse = _mm_div_ps(V3One(), a);
+	return V4Sel(xyzMask, inverse, zeros);
+}
+
+// Component-wise hardware reciprocal estimate of a; the w lane is replaced via V4Sel
+// with the BTTTF mask and a zero vector.
+PX_FORCE_INLINE Vec3V V3RecipFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zeros = V3Zero();
+	const __m128 xyzMask = BTTTF();
+	const __m128 approxInverse = _mm_rcp_ps(a);
+	return V4Sel(xyzMask, approxInverse, zeros);
+}
+
+// Component-wise 1 / sqrt(a) (full-precision sqrt and divide); the w lane is replaced
+// via V4Sel with the BTTTF mask and a zero vector.
+PX_FORCE_INLINE Vec3V V3Rsqrt(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zeros = V3Zero();
+	const __m128 xyzMask = BTTTF();
+	const __m128 inverseRoot = _mm_div_ps(V3One(), _mm_sqrt_ps(a));
+	return V4Sel(xyzMask, inverseRoot, zeros);
+}
+
+// Component-wise hardware reciprocal-square-root estimate of a; the w lane is replaced
+// via V4Sel with the BTTTF mask and a zero vector.
+PX_FORCE_INLINE Vec3V V3RsqrtFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zeros = V3Zero();
+	const __m128 xyzMask = BTTTF();
+	const __m128 approxInverseRoot = _mm_rsqrt_ps(a);
+	return V4Sel(xyzMask, approxInverseRoot, zeros);
+}
+
+// Returns a * b + c, where b is a scalar vector; separate multiply then add.
+PX_FORCE_INLINE Vec3V V3ScaleAdd(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDVEC3V(c);
+	const Vec3V scaled = V3Scale(a, b);
+	return V3Add(scaled, c);
+}
+
+// Returns c - a * b, where b is a scalar vector; separate multiply then subtract.
+PX_FORCE_INLINE Vec3V V3NegScaleSub(const Vec3V a, const FloatV b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDFLOATV(b);
+	ASSERT_ISVALIDVEC3V(c);
+	const Vec3V scaled = V3Scale(a, b);
+	return V3Sub(c, scaled);
+}
+
+// Returns the component-wise a * b + c; separate multiply then add.
+PX_FORCE_INLINE Vec3V V3MulAdd(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	const Vec3V product = V3Mul(a, b);
+	return V3Add(product, c);
+}
+
+// Returns the component-wise c - a * b; separate multiply then subtract.
+PX_FORCE_INLINE Vec3V V3NegMulSub(const Vec3V a, const Vec3V b, const Vec3V c)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	ASSERT_ISVALIDVEC3V(c);
+	const Vec3V product = V3Mul(a, b);
+	return V3Sub(c, product);
+}
+
+// Component-wise absolute value, computed as max(a, -a).
+PX_FORCE_INLINE Vec3V V3Abs(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V negated = V3Neg(a);
+	return V3Max(a, negated);
+}
+
+// Dot product of a and b, broadcast to all four lanes. The sum runs over all four
+// lane products, so it equals the 3D dot product only when a.w * b.w is zero.
+PX_FORCE_INLINE FloatV V3Dot(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+
+	// Lane products: (ax*bx, ay*by, az*bz, aw*bw).
+	const __m128 products = _mm_mul_ps(a, b);
+	// Swap the low and high halves: (az*bz, aw*bw, ax*bx, ay*by).
+	const __m128 halvesSwapped = _mm_shuffle_ps(products, products, _MM_SHUFFLE(1, 0, 3, 2));
+	// Pair sums: (x+z, y+w, z+x, w+y).
+	const __m128 pairSums = _mm_add_ps(products, halvesSwapped);
+	// Swap neighbours inside each half: (y+w, x+z, w+y, z+x).
+	const __m128 neighboursSwapped = _mm_shuffle_ps(pairSums, pairSums, _MM_SHUFFLE(2, 3, 0, 1));
+	// Every lane now holds the sum of all four products.
+	return _mm_add_ps(neighboursSwapped, pairSums);
+}
+
+// Cross product a x b, computed as a.yzx * b.zxy - a.zxy * b.yzx.
+// The w lane of the result is a.w*b.w - a.w*b.w.
+PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 aZXY = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
+	const __m128 bYZX = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+	const __m128 aYZX = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+	const __m128 bZXY = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));
+	const __m128 positive = _mm_mul_ps(aYZX, bZXY);
+	const __m128 negative = _mm_mul_ps(aZXY, bYZX);
+	return _mm_sub_ps(positive, negative);
+}
+
+// Precomputes the two permutations of a used by the V3Cross overloads:
+// mR1 = a.zxy and mL1 = a.yzx (w stays in place in both).
+PX_FORCE_INLINE VecCrossV V3PrepareCross(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	VecCrossV prepared;
+	prepared.mR1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	prepared.mL1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	return prepared;
+}
+
+// Cross product a x b where a was prepared by V3PrepareCross:
+// a.mL1 (a.yzx) * b.zxy - a.mR1 (a.zxy) * b.yzx.
+PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 bYZX = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
+	const __m128 bZXY = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));
+	const __m128 positive = _mm_mul_ps(a.mL1, bZXY);
+	const __m128 negative = _mm_mul_ps(a.mR1, bYZX);
+	return _mm_sub_ps(positive, negative);
+}
+
+// Cross product a x b where b was prepared by V3PrepareCross:
+// b.mR1 (b.zxy) * a.yzx - b.mL1 (b.yzx) * a.zxy.
+PX_FORCE_INLINE Vec3V V3Cross(const Vec3V a, const VecCrossV& b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 aYZX = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+	const __m128 aZXY = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
+	const __m128 positive = _mm_mul_ps(b.mR1, aYZX);
+	const __m128 negative = _mm_mul_ps(b.mL1, aZXY);
+	return _mm_sub_ps(positive, negative);
+}
+
+// Cross product of two operands that were both prepared by V3PrepareCross:
+// a.mL1 * b.mR1 - a.mR1 * b.mL1.
+PX_FORCE_INLINE Vec3V V3Cross(const VecCrossV& a, const VecCrossV& b)
+{
+	const __m128 positive = _mm_mul_ps(a.mL1, b.mR1);
+	const __m128 negative = _mm_mul_ps(a.mR1, b.mL1);
+	return _mm_sub_ps(positive, negative);
+}
+
+// Length of a: sqrt(V3Dot(a, a)), broadcast to all lanes.
+PX_FORCE_INLINE FloatV V3Length(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const FloatV lengthSq = V3Dot(a, a);
+	return _mm_sqrt_ps(lengthSq);
+}
+
+// Squared length of a: V3Dot(a, a).
+PX_FORCE_INLINE FloatV V3LengthSq(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const FloatV lengthSq = V3Dot(a, a);
+	return lengthSq;
+}
+
+// Returns a divided by its length (full-precision sqrt and divide).
+// There is no guard against a zero-length input.
+PX_FORCE_INLINE Vec3V V3Normalize(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISFINITELENGTH(a);
+	const FloatV lengthSq = V3Dot(a, a);
+	const __m128 length = _mm_sqrt_ps(lengthSq);
+	return V3ScaleInv(a, length);
+}
+
+// Returns a scaled by the hardware reciprocal-square-root estimate of its squared length.
+// There is no guard against a zero-length input.
+PX_FORCE_INLINE Vec3V V3NormalizeFast(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISFINITELENGTH(a);
+	const FloatV lengthSq = V3Dot(a, a);
+	const __m128 approxInvLength = _mm_rsqrt_ps(lengthSq);
+	return V3Scale(a, approxInvLength);
+}
+
+// Returns a / |a| when |a| is greater than FEps(), otherwise unsafeReturnValue.
+// The division is evaluated in both cases; V3Sel picks the result afterwards.
+PX_FORCE_INLINE Vec3V V3NormalizeSafe(const Vec3V a, const Vec3V unsafeReturnValue)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 threshold = FEps();
+	const __m128 len = V3Length(a);
+	const __m128 lengthIsUsable = FIsGrtr(len, threshold);
+	const Vec3V normalized = V3ScaleInv(a, len);
+	return V3Sel(lengthIsUsable, normalized, unsafeReturnValue);
+}
+
+// Bitwise select: takes bits of a where c is set and bits of b where c is clear.
+PX_FORCE_INLINE Vec3V V3Sel(const BoolV c, const Vec3V a, const Vec3V b)
+{
+	const __m128 fromB = _mm_andnot_ps(c, b);
+	const __m128 fromA = _mm_and_ps(c, a);
+	const __m128 chosen = _mm_or_ps(fromB, fromA);
+	ASSERT_ISVALIDVEC3V(chosen);
+	return chosen;
+}
+
+// Per-lane mask of a > b.
+PX_FORCE_INLINE BoolV V3IsGrtr(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 mask = _mm_cmpgt_ps(a, b);
+	return mask;
+}
+
+// Per-lane mask of a >= b.
+PX_FORCE_INLINE BoolV V3IsGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 mask = _mm_cmpge_ps(a, b);
+	return mask;
+}
+
+// Per-lane mask of a == b.
+PX_FORCE_INLINE BoolV V3IsEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 mask = _mm_cmpeq_ps(a, b);
+	return mask;
+}
+
+// Component-wise maximum of a and b.
+PX_FORCE_INLINE Vec3V V3Max(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 larger = _mm_max_ps(a, b);
+	return larger;
+}
+
+// Component-wise minimum of a and b.
+PX_FORCE_INLINE Vec3V V3Min(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const __m128 smaller = _mm_min_ps(a, b);
+	return smaller;
+}
+
+// Largest of a.x, a.y, a.z, broadcast to all four lanes. The w lane is ignored.
+PX_FORCE_INLINE FloatV V3ExtractMax(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 xxxx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+	const __m128 yyyy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1));
+	const __m128 zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+	const __m128 maxXY = _mm_max_ps(xxxx, yyyy);
+	return _mm_max_ps(maxXY, zzzz);
+}
+
+// Smallest of a.x, a.y, a.z, broadcast to all four lanes. The w lane is ignored.
+PX_FORCE_INLINE FloatV V3ExtractMin(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 xxxx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+	const __m128 yyyy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1));
+	const __m128 zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+	const __m128 minXY = _mm_min_ps(xxxx, yyyy);
+	return _mm_min_ps(minXY, zzzz);
+}
+
+//// if(a > 0.0f) return 1.0f; else if a == 0.f return 0.f, else return -1.f;
+// PX_FORCE_INLINE Vec3V V3MathSign(const Vec3V a)
+//{
+//	VECMATHAOS_ASSERT(isValidVec3V(a));
+//
+//	const __m128i ai = _mm_cvtps_epi32(a);
+//	const __m128i bi = _mm_cvtps_epi32(V3Neg(a));
+//	const __m128  aa = _mm_cvtepi32_ps(_mm_srai_epi32(ai, 31));
+//	const __m128  bb = _mm_cvtepi32_ps(_mm_srai_epi32(bi, 31));
+//	return _mm_or_ps(aa, bb);
+//}
+
+// Per component: V3One() where a >= 0, otherwise its negation.
+PX_FORCE_INLINE Vec3V V3Sign(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 zeros = V3Zero();
+	const __m128 plusOne = V3One();
+	const __m128 minusOne = V3Neg(plusOne);
+	const BoolV nonNegative = V3IsGrtrOrEq(a, zeros);
+	return V3Sel(nonNegative, plusOne, minusOne);
+}
+
+// Component-wise clamp of a to [minV, maxV]: the upper bound is applied first, then the lower.
+PX_FORCE_INLINE Vec3V V3Clamp(const Vec3V a, const Vec3V minV, const Vec3V maxV)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(minV);
+	ASSERT_ISVALIDVEC3V(maxV);
+	const Vec3V cappedAbove = V3Min(a, maxV);
+	return V3Max(cappedAbove, minV);
+}
+
+// Reduces the per-lane a > b mask with internalWindowsSimd::BAllTrue3_R.
+PX_FORCE_INLINE PxU32 V3AllGrtr(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const BoolV mask = V4IsGrtr(a, b);
+	return internalWindowsSimd::BAllTrue3_R(mask);
+}
+
+// Reduces the per-lane a >= b mask with internalWindowsSimd::BAllTrue3_R.
+PX_FORCE_INLINE PxU32 V3AllGrtrOrEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const BoolV mask = V4IsGrtrOrEq(a, b);
+	return internalWindowsSimd::BAllTrue3_R(mask);
+}
+
+// Reduces the per-lane a == b mask with internalWindowsSimd::BAllTrue3_R.
+PX_FORCE_INLINE PxU32 V3AllEq(const Vec3V a, const Vec3V b)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(b);
+	const BoolV mask = V4IsEq(a, b);
+	return internalWindowsSimd::BAllTrue3_R(mask);
+}
+
+// Rounds a per component without SSE4.1 _mm_round_ps: adds 0.5, subtracts 1 when the
+// integer conversion of a is negative, then truncates toward zero.
+PX_FORCE_INLINE Vec3V V3Round(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	const Vec3V half = V3Load(0.5f);
+
+	// 1.0f in lanes whose converted integer has its sign bit set, 0.0f elsewhere.
+	const __m128i asInt = _mm_cvtps_epi32(a);
+	const __m128i signAsInt = _mm_srli_epi32(asInt, 31);
+	const __m128 negativeBias = _mm_cvtepi32_ps(signAsInt);
+
+	const Vec3V shifted = V3Add(a, half);
+	const Vec3V biased = V3Sub(shifted, negativeBias);
+
+	// Truncate to integer and convert back to float.
+	const __m128i truncated = _mm_cvttps_epi32(biased);
+	return _mm_cvtepi32_ps(truncated);
+}
+
+// Component-wise sine approximation using an odd-power polynomial up to the 23rd power.
+// Each angle is first reduced by subtracting g_PXTwoPi * round(a * g_PXReciprocalTwoPi).
+PX_FORCE_INLINE Vec3V V3Sin(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	// Range reduction: x = a - round(a / twoPi) * twoPi.
+	const Vec4V invTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V fullTurn = V4LoadA(g_PXTwoPi.f);
+	const Vec3V turns = V3Round(V3Scale(a, invTwoPi));
+	const Vec3V x = V3NegScaleSub(turns, fullTurn, a);
+	const Vec3V xSq = V3Mul(x, x);
+
+	// Series coefficients; lane X of the first table is not used here.
+	const Vec4V coeffs0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V coeffs1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V coeffs2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	// sin(x) ~= x + S1*x^3 + S2*x^5 + ... + S11*x^23.
+	// Each odd power is the previous one multiplied by x^2.
+	Vec3V power = V3Mul(xSq, x); // x^3
+	Vec3V sum = V3ScaleAdd(power, V4GetY(coeffs0), x);
+
+	power = V3Mul(power, xSq); // x^5
+	sum = V3ScaleAdd(power, V4GetZ(coeffs0), sum);
+
+	power = V3Mul(power, xSq); // x^7
+	sum = V3ScaleAdd(power, V4GetW(coeffs0), sum);
+
+	power = V3Mul(power, xSq); // x^9
+	sum = V3ScaleAdd(power, V4GetX(coeffs1), sum);
+
+	power = V3Mul(power, xSq); // x^11
+	sum = V3ScaleAdd(power, V4GetY(coeffs1), sum);
+
+	power = V3Mul(power, xSq); // x^13
+	sum = V3ScaleAdd(power, V4GetZ(coeffs1), sum);
+
+	power = V3Mul(power, xSq); // x^15
+	sum = V3ScaleAdd(power, V4GetW(coeffs1), sum);
+
+	power = V3Mul(power, xSq); // x^17
+	sum = V3ScaleAdd(power, V4GetX(coeffs2), sum);
+
+	power = V3Mul(power, xSq); // x^19
+	sum = V3ScaleAdd(power, V4GetY(coeffs2), sum);
+
+	power = V3Mul(power, xSq); // x^21
+	sum = V3ScaleAdd(power, V4GetZ(coeffs2), sum);
+
+	power = V3Mul(power, xSq); // x^23
+	sum = V3ScaleAdd(power, V4GetW(coeffs2), sum);
+
+	ASSERT_ISVALIDVEC3V(sum);
+	return sum;
+}
+
+// Component-wise cosine approximation using an even-power polynomial up to the 22nd power.
+// Each angle is first reduced by subtracting g_PXTwoPi * round(a * g_PXReciprocalTwoPi).
+PX_FORCE_INLINE Vec3V V3Cos(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+
+	// Range reduction: x = a - round(a / twoPi) * twoPi.
+	const Vec4V invTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V fullTurn = V4LoadA(g_PXTwoPi.f);
+	const Vec3V turns = V3Round(V3Scale(a, invTwoPi));
+	const Vec3V x = V3NegScaleSub(turns, fullTurn, a);
+
+	// Even powers of x, each built from two lower powers.
+	const Vec3V x2 = V3Mul(x, x);
+	const Vec3V x4 = V3Mul(x2, x2);
+	const Vec3V x6 = V3Mul(x4, x2);
+	const Vec3V x8 = V3Mul(x4, x4);
+	const Vec3V x10 = V3Mul(x6, x4);
+	const Vec3V x12 = V3Mul(x6, x6);
+	const Vec3V x14 = V3Mul(x8, x6);
+	const Vec3V x16 = V3Mul(x8, x8);
+	const Vec3V x18 = V3Mul(x10, x8);
+	const Vec3V x20 = V3Mul(x10, x10);
+	const Vec3V x22 = V3Mul(x12, x10);
+
+	// Series coefficients; lane X of the first table is not used here.
+	const Vec4V coeffs0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V coeffs1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V coeffs2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	// cos(x) ~= 1 + C1*x^2 + C2*x^4 + ... + C11*x^22, accumulated lowest power first.
+	Vec3V sum = V3ScaleAdd(x2, V4GetY(coeffs0), V3One());
+	sum = V3ScaleAdd(x4, V4GetZ(coeffs0), sum);
+	sum = V3ScaleAdd(x6, V4GetW(coeffs0), sum);
+	sum = V3ScaleAdd(x8, V4GetX(coeffs1), sum);
+	sum = V3ScaleAdd(x10, V4GetY(coeffs1), sum);
+	sum = V3ScaleAdd(x12, V4GetZ(coeffs1), sum);
+	sum = V3ScaleAdd(x14, V4GetW(coeffs1), sum);
+	sum = V3ScaleAdd(x16, V4GetX(coeffs2), sum);
+	sum = V3ScaleAdd(x18, V4GetY(coeffs2), sum);
+	sum = V3ScaleAdd(x20, V4GetZ(coeffs2), sum);
+	sum = V3ScaleAdd(x22, V4GetW(coeffs2), sum);
+
+	ASSERT_ISVALIDVEC3V(sum);
+	return sum;
+}
+
+// Returns (a.y, a.z, a.z); the w lane of a is carried over unchanged.
+PX_FORCE_INLINE Vec3V V3PermYZZ(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V yzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 2, 1));
+	return yzz;
+}
+
+// Returns (a.x, a.y, a.x); the w lane of a is carried over unchanged.
+PX_FORCE_INLINE Vec3V V3PermXYX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V xyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 1, 0));
+	return xyx;
+}
+
+// Returns (a.y, a.z, a.x); the w lane of a is carried over unchanged.
+PX_FORCE_INLINE Vec3V V3PermYZX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+	return yzx;
+}
+
+// Returns (a.z, a.x, a.y); the w lane of a is carried over unchanged.
+PX_FORCE_INLINE Vec3V V3PermZXY(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
+	return zxy;
+}
+
+// Returns (a.z, a.z, a.y); the w lane of a is carried over unchanged.
+PX_FORCE_INLINE Vec3V V3PermZZY(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V zzy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 2, 2));
+	return zzy;
+}
+
+// Returns (a.y, a.x, a.x); the w lane of a is carried over unchanged.
+PX_FORCE_INLINE Vec3V V3PermYXX(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const Vec3V yxx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 0, 1));
+	return yxx;
+}
+
+// Builds (v1.w, v1.z, v0.y) with v0.w in the fourth lane.
+// The "Zero" in the name comes from v1.w, which the validity assert presumably requires to be 0.
+PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+	// Low two lanes come from the first operand (v1), high two lanes from the second (v0).
+	const Vec3V mixed = _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3, 1, 2, 3));
+	return mixed;
+}
+
+// Builds (v0.z, v0.w, v1.x) with v1.w in the fourth lane.
+// The "Zero" in the name comes from v0.w, which the validity assert presumably requires to be 0.
+PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+	// Low two lanes come from the first operand (v0), high two lanes from the second (v1).
+	const Vec3V mixed = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(3, 0, 3, 2));
+	return mixed;
+}
+
+// Builds (v1.y, v0.x, 0) by writing the two picked components into a zero vector.
+PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero(const Vec3V v0, const Vec3V v1)
+{
+	ASSERT_ISVALIDVEC3V(v0);
+	ASSERT_ISVALIDVEC3V(v1);
+	// A single shuffle cannot do this: the x and y lanes of a shuffle result always come from
+	// the same operand, so the vector is assembled with two lane writes instead.
+	const FloatV fromV1 = V3GetY(v1);
+	const FloatV fromV0 = V3GetX(v0);
+	const Vec3V withX = V3SetX(V3Zero(), fromV1);
+	return V3SetY(withX, fromV0);
+}
+
+// Returns a.x + a.y + a.z replicated into every lane.
+PX_FORCE_INLINE FloatV V3SumElems(const Vec3V a)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	const __m128 xxxx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)); // x in all lanes
+	const __m128 yyyy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)); // y in all lanes
+	const __m128 zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); // z in all lanes
+	const __m128 xPlusY = _mm_add_ps(xxxx, yyyy);
+	return _mm_add_ps(xPlusY, zzzz);
+}
+
+// Returns 1 if any component of a is greater than max or less than min, otherwise 0.
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV aboveMax = V3IsGrtr(a, max);
+	const BoolV belowMin = V3IsGrtr(min, a);
+	const BoolV outside = BOr(aboveMax, belowMin);
+	// Out of bounds as soon as at least one lane of the combined mask is set.
+	return PxU32(!BAllEqFFFF(outside));
+}
+
+// Returns non-zero when min <= a <= max holds in every lane of the comparison masks.
+// All four lanes are tested, so this relies on the w lanes comparing equal
+// (presumably all 0 for valid Vec3V values).
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V min, const Vec3V max)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(min);
+	ASSERT_ISVALIDVEC3V(max);
+	const BoolV atLeastMin = V3IsGrtrOrEq(a, min);
+	const BoolV atMostMax = V3IsGrtrOrEq(max, a);
+	const BoolV inside = BAnd(atLeastMin, atMostMax);
+	return BAllEqTTTT(inside);
+}
+
+// Symmetric variant: tests a against the box [-bounds, bounds].
+PX_FORCE_INLINE PxU32 V3OutOfBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+	const Vec3V lower = V3Neg(bounds);
+	return V3OutOfBounds(a, lower, bounds);
+}
+
+// Symmetric variant: tests a against the box [-bounds, bounds].
+PX_FORCE_INLINE PxU32 V3InBounds(const Vec3V a, const Vec3V bounds)
+{
+	ASSERT_ISVALIDVEC3V(a);
+	ASSERT_ISVALIDVEC3V(bounds);
+	const Vec3V lower = V3Neg(bounds);
+	return V3InBounds(a, lower, bounds);
+}
+
+// Transposes the 3x3 matrix held in col0..col2 in place.
+// A zero vector stands in for the missing fourth column, so the w lane of every output is 0.
+PX_FORCE_INLINE void V3Transpose(Vec3V& col0, Vec3V& col1, Vec3V& col2)
+{
+	ASSERT_ISVALIDVEC3V(col0);
+	ASSERT_ISVALIDVEC3V(col1);
+	ASSERT_ISVALIDVEC3V(col2);
+	const Vec3V zeroCol = _mm_setzero_ps();
+	// Interleave all inputs before any of them is overwritten.
+	const Vec3V lo01 = _mm_unpacklo_ps(col0, col1);    // c0.x, c1.x, c0.y, c1.y
+	const Vec3V hi01 = _mm_unpackhi_ps(col0, col1);    // c0.z, c1.z, c0.w, c1.w
+	const Vec3V lo2z = _mm_unpacklo_ps(col2, zeroCol); // c2.x, 0, c2.y, 0
+	const Vec3V hi2z = _mm_unpackhi_ps(col2, zeroCol); // c2.z, 0, c2.w, 0
+	col0 = _mm_movelh_ps(lo01, lo2z); // c0.x, c1.x, c2.x, 0
+	col1 = _mm_movehl_ps(lo2z, lo01); // c0.y, c1.y, c2.y, 0
+	col2 = _mm_movelh_ps(hi01, hi2z); // c0.z, c1.z, c2.z, 0
+}
+
+//////////////////////////////////
+// VEC4V
+//////////////////////////////////
+
+// Converts a FloatV to a Vec4V.
+// The register is returned as-is; presumably a valid FloatV already holds its value in every
+// lane, which is why no broadcast shuffle is performed.
+PX_FORCE_INLINE Vec4V V4Splat(const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	const Vec4V splat = f;
+	return splat;
+}
+
+// Builds (f[0].x, f[1].y, f[2].z, f[3].x) from an array of four FloatV values, i.e. the
+// vector (f0, f1, f2, f3) when each FloatV holds its value in every lane.
+PX_FORCE_INLINE Vec4V V4Merge(const FloatV* const floatVArray)
+{
+	ASSERT_ISVALIDFLOATV(floatVArray[0]);
+	ASSERT_ISVALIDFLOATV(floatVArray[1]);
+	ASSERT_ISVALIDFLOATV(floatVArray[2]);
+	ASSERT_ISVALIDFLOATV(floatVArray[3]);
+	const FloatV f0 = floatVArray[0];
+	const FloatV f1 = floatVArray[1];
+	const FloatV f2 = floatVArray[2];
+	const FloatV f3 = floatVArray[3];
+	const __m128 lowPair = _mm_move_ss(f1, f0);  // lanes: f0, f1, f1, f1
+	const __m128 highPair = _mm_move_ss(f2, f3); // lanes: f3, f2, f2, f2
+	// Lanes 0,1 are taken from lowPair and lanes 2,0 from highPair.
+	return _mm_shuffle_ps(lowPair, highPair, _MM_SHUFFLE(0, 2, 1, 0));
+}
+
+// Builds the vector (x, y, z, w) from four FloatV values.
+PX_FORCE_INLINE Vec4V V4Merge(const FloatVArg x, const FloatVArg y, const FloatVArg z, const FloatVArg w)
+{
+	ASSERT_ISVALIDFLOATV(x);
+	ASSERT_ISVALIDFLOATV(y);
+	ASSERT_ISVALIDFLOATV(z);
+	ASSERT_ISVALIDFLOATV(w);
+	const __m128 lowPair = _mm_move_ss(y, x);  // lanes: x, y, y, y
+	const __m128 highPair = _mm_move_ss(z, w); // lanes: w, z, z, z
+	// Lanes 0,1 are taken from lowPair and lanes 2,0 from highPair.
+	const Vec4V merged = _mm_shuffle_ps(lowPair, highPair, _MM_SHUFFLE(0, 2, 1, 0));
+	return merged;
+}
+
+// Gathers the w components of the four inputs: returns (x.w, y.w, z.w, w.w).
+PX_FORCE_INLINE Vec4V V4MergeW(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V hiXZ = _mm_unpackhi_ps(x, z); // x.z, z.z, x.w, z.w
+	const Vec4V hiYW = _mm_unpackhi_ps(y, w); // y.z, w.z, y.w, w.w
+	const Vec4V gathered = _mm_unpackhi_ps(hiXZ, hiYW);
+	return gathered;
+}
+
+// Gathers the z components of the four inputs: returns (x.z, y.z, z.z, w.z).
+PX_FORCE_INLINE Vec4V V4MergeZ(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V hiXZ = _mm_unpackhi_ps(x, z); // x.z, z.z, x.w, z.w
+	const Vec4V hiYW = _mm_unpackhi_ps(y, w); // y.z, w.z, y.w, w.w
+	const Vec4V gathered = _mm_unpacklo_ps(hiXZ, hiYW);
+	return gathered;
+}
+
+// Gathers the y components of the four inputs: returns (x.y, y.y, z.y, w.y).
+PX_FORCE_INLINE Vec4V V4MergeY(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V loXZ = _mm_unpacklo_ps(x, z); // x.x, z.x, x.y, z.y
+	const Vec4V loYW = _mm_unpacklo_ps(y, w); // y.x, w.x, y.y, w.y
+	const Vec4V gathered = _mm_unpackhi_ps(loXZ, loYW);
+	return gathered;
+}
+
+// Gathers the x components of the four inputs: returns (x.x, y.x, z.x, w.x).
+PX_FORCE_INLINE Vec4V V4MergeX(const Vec4VArg x, const Vec4VArg y, const Vec4VArg z, const Vec4VArg w)
+{
+	const Vec4V loXZ = _mm_unpacklo_ps(x, z); // x.x, z.x, x.y, z.y
+	const Vec4V loYW = _mm_unpacklo_ps(y, w); // y.x, w.x, y.y, w.y
+	const Vec4V gathered = _mm_unpacklo_ps(loXZ, loYW);
+	return gathered;
+}
+
+// Interleaves the low halves of a and b: returns (a.x, b.x, a.y, b.y).
+PX_FORCE_INLINE Vec4V V4UnpackXY(const Vec4VArg a, const Vec4VArg b)
+{
+	const Vec4V interleaved = _mm_unpacklo_ps(a, b);
+	return interleaved;
+}
+
+// Interleaves the high halves of a and b: returns (a.z, b.z, a.w, b.w).
+PX_FORCE_INLINE Vec4V V4UnpackZW(const Vec4VArg a, const Vec4VArg b)
+{
+	const Vec4V interleaved = _mm_unpackhi_ps(a, b);
+	return interleaved;
+}
+
+// Swaps components pairwise: returns (a.y, a.x, a.w, a.z).
+PX_FORCE_INLINE Vec4V V4PermYXWZ(const Vec4V a)
+{
+	const Vec4V yxwz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+	return yxwz;
+}
+
+// Returns (a.x, a.z, a.x, a.z).
+PX_FORCE_INLINE Vec4V V4PermXZXZ(const Vec4V a)
+{
+	const Vec4V xzxz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 0, 2, 0));
+	return xzxz;
+}
+
+// Returns (a.y, a.w, a.y, a.w).
+PX_FORCE_INLINE Vec4V V4PermYWYW(const Vec4V a)
+{
+	const Vec4V ywyw = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 3, 1));
+	return ywyw;
+}
+
+// Rotates the first three components: returns (a.y, a.z, a.x, a.w).
+PX_FORCE_INLINE Vec4V V4PermYZXW(const Vec4V a)
+{
+	const Vec4V yzxw = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
+	return yzxw;
+}
+
+// Generic compile-time permutation: output lane i takes the source lane whose index is given
+// by the i-th template argument, i.e. the result is (a[x], a[y], a[z], a[w]).
+template <PxU8 x, PxU8 y, PxU8 z, PxU8 w>
+PX_FORCE_INLINE Vec4V V4Perm(const Vec4V a)
+{
+	// _MM_SHUFFLE lists the selectors from the highest lane down to the lowest.
+	const Vec4V permuted = _mm_shuffle_ps(a, a, _MM_SHUFFLE(w, z, y, x));
+	return permuted;
+}
+
+// Returns the unit vector (0, 0, 0, 1).
+PX_FORCE_INLINE Vec4V V4UnitW()
+{
+	// The aligned load requires 16-byte aligned storage for the constant.
+	const PX_ALIGN(16, PxF32 w[4]) = { 0.0f, 0.0f, 0.0f, 1.0f };
+	return _mm_load_ps(w);
+}
+
+// Returns the unit vector (1, 0, 0, 0).
+PX_FORCE_INLINE Vec4V V4UnitX()
+{
+	// The aligned load requires 16-byte aligned storage for the constant.
+	const PX_ALIGN(16, PxF32 x[4]) = { 1.0f, 0.0f, 0.0f, 0.0f };
+	return _mm_load_ps(x);
+}
+
+// Returns the unit vector (0, 1, 0, 0).
+PX_FORCE_INLINE Vec4V V4UnitY()
+{
+	// The aligned load requires 16-byte aligned storage for the constant.
+	const PX_ALIGN(16, PxF32 y[4]) = { 0.0f, 1.0f, 0.0f, 0.0f };
+	return _mm_load_ps(y);
+}
+
+// Returns the unit vector (0, 0, 1, 0).
+PX_FORCE_INLINE Vec4V V4UnitZ()
+{
+	// The aligned load requires 16-byte aligned storage for the constant.
+	const PX_ALIGN(16, PxF32 z[4]) = { 0.0f, 0.0f, 1.0f, 0.0f };
+	return _mm_load_ps(z);
+}
+
+// Extracts f.w as a FloatV by broadcasting it to all four lanes.
+PX_FORCE_INLINE FloatV V4GetW(const Vec4V f)
+{
+	const FloatV wwww = _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3));
+	return wwww;
+}
+
+// Extracts f.x as a FloatV by broadcasting it to all four lanes.
+PX_FORCE_INLINE FloatV V4GetX(const Vec4V f)
+{
+	const FloatV xxxx = _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
+	return xxxx;
+}
+
+// Extracts f.y as a FloatV by broadcasting it to all four lanes.
+PX_FORCE_INLINE FloatV V4GetY(const Vec4V f)
+{
+	const FloatV yyyy = _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
+	return yyyy;
+}
+
+// Extracts f.z as a FloatV by broadcasting it to all four lanes.
+PX_FORCE_INLINE FloatV V4GetZ(const Vec4V f)
+{
+	const FloatV zzzz = _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+	return zzzz;
+}
+
+// Returns v with its w lane replaced by the w lane of f; x, y and z are kept.
+PX_FORCE_INLINE Vec4V V4SetW(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// True lanes select from v, the single false lane (w) selects from f.
+	const BoolV keepXYZ = BTTTF();
+	return V4Sel(keepXYZ, v, f);
+}
+
+// Bitwise-ANDs v with internalWindowsSimd::gMaskXYZ.
+// NOTE(review): gMaskXYZ is defined elsewhere; presumably all bits set in x, y, z and none in w,
+// which would zero the w lane as the function name suggests - confirm against its definition.
+PX_FORCE_INLINE Vec4V V4ClearW(const Vec4V v)
+{
+	const Vec4V masked = _mm_and_ps(v, (VecI32V&)internalWindowsSimd::gMaskXYZ);
+	return masked;
+}
+
+// Returns v with its x lane replaced by the x lane of f; y, z and w are kept.
+PX_FORCE_INLINE Vec4V V4SetX(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// True lanes select from v, the single false lane (x) selects from f.
+	const BoolV keepYZW = BFTTT();
+	return V4Sel(keepYZW, v, f);
+}
+
+// Returns v with its y lane replaced by the y lane of f; x, z and w are kept.
+PX_FORCE_INLINE Vec4V V4SetY(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// True lanes select from v, the single false lane (y) selects from f.
+	const BoolV keepXZW = BTFTT();
+	return V4Sel(keepXZW, v, f);
+}
+
+// Returns v with its z lane replaced by the z lane of f; x, y and w are kept.
+PX_FORCE_INLINE Vec4V V4SetZ(const Vec4V v, const FloatV f)
+{
+	ASSERT_ISVALIDFLOATV(f);
+	// True lanes select from v, the single false lane (z) selects from f.
+	const BoolV keepXYW = BTTFT();
+	return V4Sel(keepXYW, v, f);
+}
+
+// Returns the vector (0, 0, 0, 0).
+PX_FORCE_INLINE Vec4V V4Zero()
+{
+	const Vec4V zero = _mm_setzero_ps();
+	return zero;
+}
+
+// Returns V4Load(1.0f), i.e. the constant 1 loaded through the scalar V4Load overload.
+PX_FORCE_INLINE Vec4V V4One()
+{
+	const Vec4V one = V4Load(1.0f);
+	return one;
+}
+
+// Returns V4Load(PX_EPS_REAL), i.e. the epsilon constant loaded through the scalar V4Load overload.
+PX_FORCE_INLINE Vec4V V4Eps()
+{
+	const Vec4V eps = V4Load(PX_EPS_REAL);
+	return eps;
+}
+
+// Per-component negation, computed as 0 - f.
+PX_FORCE_INLINE Vec4V V4Neg(const Vec4V f)
+{
+	const Vec4V zero = _mm_setzero_ps();
+	return _mm_sub_ps(zero, f);
+}
+
+// Per-component sum a + b.
+PX_FORCE_INLINE Vec4V V4Add(const Vec4V a, const Vec4V b)
+{
+	const Vec4V sum = _mm_add_ps(a, b);
+	return sum;
+}
+
+// Per-component difference a - b.
+PX_FORCE_INLINE Vec4V V4Sub(const Vec4V a, const Vec4V b)
+{
+	const Vec4V difference = _mm_sub_ps(a, b);
+	return difference;
+}
+
+// Multiplies a by the FloatV b lane by lane (a uniform scale when b holds one value in all lanes).
+PX_FORCE_INLINE Vec4V V4Scale(const Vec4V a, const FloatV b)
+{
+	const Vec4V scaled = _mm_mul_ps(a, b);
+	return scaled;
+}
+
+// Per-component product a * b.
+PX_FORCE_INLINE Vec4V V4Mul(const Vec4V a, const Vec4V b)
+{
+	const Vec4V product = _mm_mul_ps(a, b);
+	return product;
+}
+
+// Divides a by the FloatV b lane by lane using a full-precision division.
+PX_FORCE_INLINE Vec4V V4ScaleInv(const Vec4V a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V quotient = _mm_div_ps(a, b);
+	return quotient;
+}
+
+// Per-component quotient a / b using a full-precision division.
+PX_FORCE_INLINE Vec4V V4Div(const Vec4V a, const Vec4V b)
+{
+	const Vec4V quotient = _mm_div_ps(a, b);
+	return quotient;
+}
+
+// Fast variant of V4ScaleInv: multiplies a by the approximate reciprocal of b (_mm_rcp_ps),
+// trading precision for speed.
+PX_FORCE_INLINE Vec4V V4ScaleInvFast(const Vec4V a, const FloatV b)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const __m128 approxInvB = _mm_rcp_ps(b);
+	return _mm_mul_ps(a, approxInvB);
+}
+
+// Fast variant of V4Div: multiplies a by the approximate reciprocal of b (_mm_rcp_ps),
+// trading precision for speed.
+PX_FORCE_INLINE Vec4V V4DivFast(const Vec4V a, const Vec4V b)
+{
+	const __m128 approxInvB = _mm_rcp_ps(b);
+	return _mm_mul_ps(a, approxInvB);
+}
+
+// Per-component reciprocal 1 / a using a full-precision division.
+PX_FORCE_INLINE Vec4V V4Recip(const Vec4V a)
+{
+	const Vec4V one = V4One();
+	return _mm_div_ps(one, a);
+}
+
+// Per-component approximate reciprocal (_mm_rcp_ps); faster but less precise than V4Recip.
+PX_FORCE_INLINE Vec4V V4RecipFast(const Vec4V a)
+{
+	const Vec4V approxInv = _mm_rcp_ps(a);
+	return approxInv;
+}
+
+// Per-component reciprocal square root, computed as 1 / sqrt(a) with a full-precision
+// square root and division.
+PX_FORCE_INLINE Vec4V V4Rsqrt(const Vec4V a)
+{
+	const Vec4V one = V4One();
+	const __m128 root = _mm_sqrt_ps(a);
+	return _mm_div_ps(one, root);
+}
+
+// Per-component approximate reciprocal square root (_mm_rsqrt_ps); faster but less precise
+// than V4Rsqrt.
+PX_FORCE_INLINE Vec4V V4RsqrtFast(const Vec4V a)
+{
+	const Vec4V approxInvRoot = _mm_rsqrt_ps(a);
+	return approxInvRoot;
+}
+
+// Per-component square root.
+PX_FORCE_INLINE Vec4V V4Sqrt(const Vec4V a)
+{
+	const Vec4V root = _mm_sqrt_ps(a);
+	return root;
+}
+
+// Returns a * b + c, computed as a separate multiply followed by an add.
+PX_FORCE_INLINE Vec4V V4ScaleAdd(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V scaled = V4Scale(a, b);
+	return V4Add(scaled, c);
+}
+
+// Returns c - a * b, computed as a separate multiply followed by a subtract.
+PX_FORCE_INLINE Vec4V V4NegScaleSub(const Vec4V a, const FloatV b, const Vec4V c)
+{
+	ASSERT_ISVALIDFLOATV(b);
+	const Vec4V scaled = V4Scale(a, b);
+	return V4Sub(c, scaled);
+}
+
+// Returns a * b + c per component, computed as a separate multiply followed by an add.
+PX_FORCE_INLINE Vec4V V4MulAdd(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	const Vec4V product = V4Mul(a, b);
+	return V4Add(product, c);
+}
+
+// Returns c - a * b per component, computed as a separate multiply followed by a subtract.
+PX_FORCE_INLINE Vec4V V4NegMulSub(const Vec4V a, const Vec4V b, const Vec4V c)
+{
+	const Vec4V product = V4Mul(a, b);
+	return V4Sub(c, product);
+}
+
+// Per-component absolute value, computed as max(a, -a).
+PX_FORCE_INLINE Vec4V V4Abs(const Vec4V a)
+{
+	const Vec4V negated = V4Neg(a);
+	return V4Max(a, negated);
+}
+
+// Returns (a.x + a.z) + (a.y + a.w) as a FloatV.
+PX_FORCE_INLINE FloatV V4SumElements(const Vec4V a)
+{
+	const Vec4V xxyy = V4UnpackXY(a, a);       // x, x, y, y
+	const Vec4V zzww = V4UnpackZW(a, a);       // z, z, w, w
+	const Vec4V pairSums = V4Add(xxyy, zzww);  // x+z, x+z, y+w, y+w
+	const FloatV sumXZ = V4GetX(pairSums);
+	const FloatV sumYW = V4GetZ(pairSums);
+	return FAdd(sumXZ, sumYW);
+}
+
+// Four-component dot product of a and b, with the sum available in every lane.
+// The products are rotated by one, two and three lanes and added pairwise.
+PX_FORCE_INLINE FloatV V4Dot(const Vec4V a, const Vec4V b)
+{
+	const __m128 prod = _mm_mul_ps(a, b);                                    // x,y,z,w
+	const __m128 rot1 = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+	const __m128 rot2 = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+	const __m128 rot3 = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+	const __m128 sumRot23 = _mm_add_ps(rot2, rot3);
+	const __m128 sumProdRot1 = _mm_add_ps(prod, rot1);
+	return _mm_add_ps(sumRot23, sumProdRot1);
+
+	// PT: this version has two less instructions but we should check its accuracy
+	// aw*bw | az*bz | ay*by | ax*bx
+	// const __m128 t0 = _mm_mul_ps(a, b);
+	// ay*by | ax*bx | aw*bw | az*bz
+	// const __m128 t1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(1,0,3,2));
+	// ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx
+	// const __m128 t2 = _mm_add_ps(t0, t1);
+	// ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by
+	// const __m128 t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1));
+	// ax*bx + az*bz + ay*by + aw*bw
+	// return _mm_add_ps(t3, t2);
+	// ay*by + aw*bw + ax*bx + az*bz
+	// az*bz + ax*bx + aw*bw + ay*by
+	// aw*bw + ay*by + az*bz + ax*bx
+}
+
+// Three-component dot product ax*bx + ay*by + az*bz, replicated into every lane.
+// The w components of a and b do not contribute to the result.
+PX_FORCE_INLINE FloatV V4Dot3(const Vec4V a, const Vec4V b)
+{
+	const __m128 prod = _mm_mul_ps(a, b);
+	const __m128 xTerm = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 0, 0)); // ax*bx in all lanes
+	const __m128 yTerm = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(1, 1, 1, 1)); // ay*by in all lanes
+	const __m128 zTerm = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 2, 2, 2)); // az*bz in all lanes
+	const __m128 xPlusY = _mm_add_ps(xTerm, yTerm);
+	return _mm_add_ps(xPlusY, zTerm);
+}
+
+// Cross product of the xyz parts of a and b.
+// The w lane of the result is a.w*b.w - a.w*b.w (0 for finite inputs).
+PX_FORCE_INLINE Vec4V V4Cross(const Vec4V a, const Vec4V b)
+{
+	const __m128 aYZX = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	const __m128 bZXY = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	const __m128 aZXY = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)); // z,x,y,w
+	const __m128 bYZX = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)); // y,z,x,w
+	const __m128 positive = _mm_mul_ps(aYZX, bZXY);
+	const __m128 negative = _mm_mul_ps(aZXY, bYZX);
+	return _mm_sub_ps(positive, negative);
+}
+
+// Four-component Euclidean length of a: sqrt(dot(a, a)).
+PX_FORCE_INLINE FloatV V4Length(const Vec4V a)
+{
+	const FloatV lengthSq = V4Dot(a, a);
+	return _mm_sqrt_ps(lengthSq);
+}
+
+// Squared four-component length of a: dot(a, a).
+PX_FORCE_INLINE FloatV V4LengthSq(const Vec4V a)
+{
+	const FloatV lengthSq = V4Dot(a, a);
+	return lengthSq;
+}
+
+// Returns a divided by its four-component length. There is no guard against a zero length;
+// see V4NormalizeSafe for a guarded variant.
+PX_FORCE_INLINE Vec4V V4Normalize(const Vec4V a)
+{
+	ASSERT_ISFINITELENGTH(a);
+	const FloatV length = _mm_sqrt_ps(V4Dot(a, a));
+	return V4ScaleInv(a, length);
+}
+
+// Like V4Normalize, but divides through V4ScaleInvFast (approximate reciprocal), so the
+// result is less precise. There is no guard against a zero length.
+PX_FORCE_INLINE Vec4V V4NormalizeFast(const Vec4V a)
+{
+	ASSERT_ISFINITELENGTH(a);
+	const FloatV length = _mm_sqrt_ps(V4Dot(a, a));
+	return V4ScaleInvFast(a, length);
+}
+
+// Normalizes a, falling back to unsafeReturnValue when the length of a is not greater than
+// the epsilon constant.
+//   a                 - vector to normalize (all four components contribute to the length)
+//   unsafeReturnValue - value returned when a is too short to normalize reliably
+// The threshold comes from V4Eps() so that all four lanes of the comparison mask use the same
+// epsilon. The earlier code used the Vec3V constant V3Eps(); NOTE(review): if the w lane of
+// V3Eps() is 0, lengths in (0, eps] produced a mixed result whose w came from the division
+// and whose xyz came from unsafeReturnValue - confirm against V3Eps/V3Load.
+PX_FORCE_INLINE Vec4V V4NormalizeSafe(const Vec4V a, const Vec4V unsafeReturnValue)
+{
+	const __m128 eps = V4Eps();
+	const __m128 length = V4Length(a);
+	const __m128 isGreaterThanZero = V4IsGrtr(length, eps);
+	return V4Sel(isGreaterThanZero, V4ScaleInv(a, length), unsafeReturnValue);
+}
+
+// Bitwise select: for every bit set in c the result takes the bit from a, otherwise from b.
+// With all-ones / all-zeros lane masks this picks whole lanes from a (true) or b (false).
+PX_FORCE_INLINE Vec4V V4Sel(const BoolV c, const Vec4V a, const Vec4V b)
+{
+	const __m128 fromB = _mm_andnot_ps(c, b); // ~c & b
+	const __m128 fromA = _mm_and_ps(c, a);    //  c & a
+	return _mm_or_ps(fromB, fromA);
+}
+
+// Per-lane comparison a > b; each lane of the result is all ones when true, all zeros otherwise.
+PX_FORCE_INLINE BoolV V4IsGrtr(const Vec4V a, const Vec4V b)
+{
+	const BoolV mask = _mm_cmpgt_ps(a, b);
+	return mask;
+}
+
+// Per-lane comparison a >= b; each lane of the result is all ones when true, all zeros otherwise.
+PX_FORCE_INLINE BoolV V4IsGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV mask = _mm_cmpge_ps(a, b);
+	return mask;
+}
+
+// Per-lane floating-point comparison a == b; each lane of the result is all ones when true,
+// all zeros otherwise.
+PX_FORCE_INLINE BoolV V4IsEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV mask = _mm_cmpeq_ps(a, b);
+	return mask;
+}
+
+// Per-lane 32-bit integer equality of a and b (bit patterns are compared, not float values).
+// Each lane of the result is all ones when equal, all zeros otherwise.
+PX_FORCE_INLINE BoolV V4IsEqU32(const VecU32V a, const VecU32V b)
+{
+	const __m128i lhsBits = internalWindowsSimd::m128_F2I(a);
+	const __m128i rhsBits = internalWindowsSimd::m128_F2I(b);
+	const __m128i equalBits = _mm_cmpeq_epi32(lhsBits, rhsBits);
+	return internalWindowsSimd::m128_I2F(equalBits);
+}
+
+// Per-component maximum of a and b (_mm_max_ps).
+PX_FORCE_INLINE Vec4V V4Max(const Vec4V a, const Vec4V b)
+{
+	const Vec4V larger = _mm_max_ps(a, b);
+	return larger;
+}
+
+// Per-component minimum of a and b (_mm_min_ps).
+PX_FORCE_INLINE Vec4V V4Min(const Vec4V a, const Vec4V b)
+{
+	const Vec4V smaller = _mm_min_ps(a, b);
+	return smaller;
+}
+
+// Returns the largest of the four components of a, available in every lane.
+// The vector is compared against its three lane rotations.
+PX_FORCE_INLINE FloatV V4ExtractMax(const Vec4V a)
+{
+	const __m128 rot1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+	const __m128 rot2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+	const __m128 rot3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+
+	const __m128 maxA1 = _mm_max_ps(a, rot1);
+	const __m128 max23 = _mm_max_ps(rot2, rot3);
+	return _mm_max_ps(maxA1, max23);
+}
+
+// Returns the smallest of the four components of a, available in every lane.
+// The vector is compared against its three lane rotations.
+PX_FORCE_INLINE FloatV V4ExtractMin(const Vec4V a)
+{
+	const __m128 rot1 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
+	const __m128 rot2 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
+	const __m128 rot3 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
+
+	const __m128 minA1 = _mm_min_ps(a, rot1);
+	const __m128 min23 = _mm_min_ps(rot2, rot3);
+	return _mm_min_ps(minA1, min23);
+}
+
+// Clamps a per component: first limits it from above by maxV, then from below by minV.
+PX_FORCE_INLINE Vec4V V4Clamp(const Vec4V a, const Vec4V minV, const Vec4V maxV)
+{
+	const Vec4V cappedAbove = V4Min(a, maxV);
+	return V4Max(cappedAbove, minV);
+}
+
+// Reduces the per-lane test a > b through internalWindowsSimd::BAllTrue4_R
+// (defined elsewhere; presumably non-zero only when all four lanes are true).
+PX_FORCE_INLINE PxU32 V4AllGrtr(const Vec4V a, const Vec4V b)
+{
+	const BoolV greater = V4IsGrtr(a, b);
+	return internalWindowsSimd::BAllTrue4_R(greater);
+}
+
+// Reduces the per-lane test a >= b through internalWindowsSimd::BAllTrue4_R
+// (defined elsewhere; presumably non-zero only when all four lanes are true).
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterOrEqual = V4IsGrtrOrEq(a, b);
+	return internalWindowsSimd::BAllTrue4_R(greaterOrEqual);
+}
+
+// Reduces the per-lane test a >= b through internalWindowsSimd::BAllTrue3_R
+// (defined elsewhere; presumably considers only the x, y and z lanes).
+PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3(const Vec4V a, const Vec4V b)
+{
+	const BoolV greaterOrEqual = V4IsGrtrOrEq(a, b);
+	return internalWindowsSimd::BAllTrue3_R(greaterOrEqual);
+}
+
+// Reduces the per-lane test a == b through internalWindowsSimd::BAllTrue4_R
+// (defined elsewhere; presumably non-zero only when all four lanes are true).
+PX_FORCE_INLINE PxU32 V4AllEq(const Vec4V a, const Vec4V b)
+{
+	const BoolV equal = V4IsEq(a, b);
+	return internalWindowsSimd::BAllTrue4_R(equal);
+}
+
+// Reduces the per-lane test a > b through internalWindowsSimd::BAnyTrue3_R
+// (defined elsewhere; presumably non-zero when any of the x, y, z lanes is true).
+PX_FORCE_INLINE PxU32 V4AnyGrtr3(const Vec4V a, const Vec4V b)
+{
+	const BoolV greater = V4IsGrtr(a, b);
+	return internalWindowsSimd::BAnyTrue3_R(greater);
+}
+
+// Rounds each component of a to an integer value without using SSE4.1 _mm_round_ps.
+// Steps: add 0.5, subtract 1 for lanes whose integer conversion is negative, then truncate
+// toward zero. The value passes through 32-bit integers, so inputs outside the int32 range
+// are not handled.
+PX_FORCE_INLINE Vec4V V4Round(const Vec4V a)
+{
+	const Vec4V half = V4Load(0.5f);
+	// Converting to int and shifting the sign bit down yields 1 for negative lanes, 0 otherwise.
+	const __m128i asInt = _mm_cvtps_epi32(a);
+	const __m128i signAsInt = _mm_srli_epi32(asInt, 31);
+	const __m128 negativeOne = _mm_cvtepi32_ps(signAsInt);
+	const Vec4V biased = V4Sub(V4Add(a, half), negativeOne);
+	// Truncating conversion back to an integer-valued float.
+	const __m128i truncated = _mm_cvttps_epi32(biased);
+	return _mm_cvtepi32_ps(truncated);
+}
+
+// Per-component sine of the angles stored in a.
+// The input is first wrapped by subtracting the nearest multiple of 2*PI, then the sine is
+// evaluated with the odd power series
+//   sin(v) ~= v + S1*v^3 + S2*v^5 + ... + S11*v^23
+// whose coefficients S1..S11 are read from the g_PXSinCoefficients0..2 tables.
+PX_FORCE_INLINE Vec4V V4Sin(const Vec4V a)
+{
+	// Range reduction: wrapped = a - 2PI * round(a / 2PI).
+	const Vec4V invTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	const Vec4V fullTurn = V4LoadA(g_PXTwoPi.f);
+	const Vec4V turns = V4Round(V4Mul(a, invTwoPi));
+	const Vec4V wrapped = V4NegMulSub(fullTurn, turns, a);
+
+	// Coefficient tables; lane x of the first table is not used by the series.
+	const Vec4V tab0 = V4LoadA(g_PXSinCoefficients0.f);
+	const Vec4V tab1 = V4LoadA(g_PXSinCoefficients1.f);
+	const Vec4V tab2 = V4LoadA(g_PXSinCoefficients2.f);
+
+	// Each odd power is obtained from the previous one by multiplying with wrapped^2, and is
+	// folded into the running sum as soon as it is available.
+	const Vec4V sq = V4Mul(wrapped, wrapped);
+
+	Vec4V power = V4Mul(sq, wrapped); // v^3
+	Vec4V series = V4MulAdd(V4GetY(tab0), power, wrapped);
+	power = V4Mul(power, sq); // v^5
+	series = V4MulAdd(V4GetZ(tab0), power, series);
+	power = V4Mul(power, sq); // v^7
+	series = V4MulAdd(V4GetW(tab0), power, series);
+	power = V4Mul(power, sq); // v^9
+	series = V4MulAdd(V4GetX(tab1), power, series);
+	power = V4Mul(power, sq); // v^11
+	series = V4MulAdd(V4GetY(tab1), power, series);
+	power = V4Mul(power, sq); // v^13
+	series = V4MulAdd(V4GetZ(tab1), power, series);
+	power = V4Mul(power, sq); // v^15
+	series = V4MulAdd(V4GetW(tab1), power, series);
+	power = V4Mul(power, sq); // v^17
+	series = V4MulAdd(V4GetX(tab2), power, series);
+	power = V4Mul(power, sq); // v^19
+	series = V4MulAdd(V4GetY(tab2), power, series);
+	power = V4Mul(power, sq); // v^21
+	series = V4MulAdd(V4GetZ(tab2), power, series);
+	power = V4Mul(power, sq); // v^23
+	series = V4MulAdd(V4GetW(tab2), power, series);
+
+	return series;
+}
+
+// Per-component cosine of the angles stored in a.
+// The input is first wrapped by subtracting the nearest multiple of 2*PI, then the cosine is
+// evaluated with the even power series
+//   cos(v) ~= 1 + C1*v^2 + C2*v^4 + ... + C11*v^22
+// whose coefficients C1..C11 are read from the g_PXCosCoefficients0..2 tables.
+PX_FORCE_INLINE Vec4V V4Cos(const Vec4V a)
+{
+	// Range reduction: V1 = a - 2PI * round(a / 2PI).
+	const Vec4V recipTwoPi = V4LoadA(g_PXReciprocalTwoPi.f);
+	// Declared as Vec4V (it is a full four-lane load), matching the declaration in V4Sin.
+	const Vec4V twoPi = V4LoadA(g_PXTwoPi.f);
+	const Vec4V tmp = V4Mul(a, recipTwoPi);
+	const Vec4V b = V4Round(tmp);
+	const Vec4V V1 = V4NegMulSub(twoPi, b, a);
+
+	// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
+	//           V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
+	const Vec4V V2 = V4Mul(V1, V1);
+	const Vec4V V4 = V4Mul(V2, V2);
+	const Vec4V V6 = V4Mul(V4, V2);
+	const Vec4V V8 = V4Mul(V4, V4);
+	const Vec4V V10 = V4Mul(V6, V4);
+	const Vec4V V12 = V4Mul(V6, V6);
+	const Vec4V V14 = V4Mul(V8, V6);
+	const Vec4V V16 = V4Mul(V8, V8);
+	const Vec4V V18 = V4Mul(V10, V8);
+	const Vec4V V20 = V4Mul(V10, V10);
+	const Vec4V V22 = V4Mul(V12, V10);
+
+	const Vec4V cosCoefficients0 = V4LoadA(g_PXCosCoefficients0.f);
+	const Vec4V cosCoefficients1 = V4LoadA(g_PXCosCoefficients1.f);
+	const Vec4V cosCoefficients2 = V4LoadA(g_PXCosCoefficients2.f);
+
+	// Lane x of the first table is not used by the series.
+	const FloatV C1 = V4GetY(cosCoefficients0);
+	const FloatV C2 = V4GetZ(cosCoefficients0);
+	const FloatV C3 = V4GetW(cosCoefficients0);
+	const FloatV C4 = V4GetX(cosCoefficients1);
+	const FloatV C5 = V4GetY(cosCoefficients1);
+	const FloatV C6 = V4GetZ(cosCoefficients1);
+	const FloatV C7 = V4GetW(cosCoefficients1);
+	const FloatV C8 = V4GetX(cosCoefficients2);
+	const FloatV C9 = V4GetY(cosCoefficients2);
+	const FloatV C10 = V4GetZ(cosCoefficients2);
+	const FloatV C11 = V4GetW(cosCoefficients2);
+
+	// Accumulate the series starting from the constant term 1, lowest order first.
+	Vec4V Result;
+	Result = V4MulAdd(C1, V2, V4One());
+	Result = V4MulAdd(C2, V4, Result);
+	Result = V4MulAdd(C3, V6, Result);
+	Result = V4MulAdd(C4, V8, Result);
+	Result = V4MulAdd(C5, V10, Result);
+	Result = V4MulAdd(C6, V12, Result);
+	Result = V4MulAdd(C7, V14, Result);
+	Result = V4MulAdd(C8, V16, Result);
+	Result = V4MulAdd(C9, V18, Result);
+	Result = V4MulAdd(C10, V20, Result);
+	Result = V4MulAdd(C11, V22, Result);
+
+	return Result;
+}
+
+// Transposes the 4x4 matrix held in col0..col3 in place.
+PX_FORCE_INLINE void V4Transpose(Vec4V& col0, Vec4V& col1, Vec4V& col2, Vec4V& col3)
+{
+	// Interleave all inputs before any of them is overwritten.
+	const Vec4V lo01 = _mm_unpacklo_ps(col0, col1); // c0.x, c1.x, c0.y, c1.y
+	const Vec4V hi01 = _mm_unpackhi_ps(col0, col1); // c0.z, c1.z, c0.w, c1.w
+	const Vec4V lo23 = _mm_unpacklo_ps(col2, col3); // c2.x, c3.x, c2.y, c3.y
+	const Vec4V hi23 = _mm_unpackhi_ps(col2, col3); // c2.z, c3.z, c2.w, c3.w
+	col0 = _mm_movelh_ps(lo01, lo23); // all x components
+	col1 = _mm_movehl_ps(lo23, lo01); // all y components
+	col2 = _mm_movelh_ps(hi01, hi23); // all z components
+	col3 = _mm_movehl_ps(hi23, hi01); // all w components
+}
+
+//////////////////////////////////
+// BoolV
+//////////////////////////////////
+
+// Mask constant with all four lanes false (all bits zero).
+PX_FORCE_INLINE BoolV BFFFF()
+{
+	const BoolV allFalse = _mm_setzero_ps();
+	return allFalse;
+}
+
+// Mask constant (x, y, z, w) = (false, false, false, true).
+PX_FORCE_INLINE BoolV BFFFT()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, 0, 0, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (false, false, true, false).
+PX_FORCE_INLINE BoolV BFFTF()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, -1, 0, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (false, false, true, true).
+PX_FORCE_INLINE BoolV BFFTT()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, -1, 0, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (false, true, false, false).
+PX_FORCE_INLINE BoolV BFTFF()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, 0, -1, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (false, true, false, true).
+PX_FORCE_INLINE BoolV BFTFT()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, 0, -1, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (false, true, true, false).
+PX_FORCE_INLINE BoolV BFTTF()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, -1, -1, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (false, true, true, true).
+PX_FORCE_INLINE BoolV BFTTT()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, -1, -1, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (true, false, false, false).
+PX_FORCE_INLINE BoolV BTFFF()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, 0, 0, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (true, false, false, true).
+PX_FORCE_INLINE BoolV BTFFT()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, 0, 0, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (true, false, true, false).
+PX_FORCE_INLINE BoolV BTFTF()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, -1, 0, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (true, false, true, true).
+PX_FORCE_INLINE BoolV BTFTT()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, -1, 0, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (true, true, false, false).
+PX_FORCE_INLINE BoolV BTTFF()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, 0, -1, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (true, true, false, true).
+PX_FORCE_INLINE BoolV BTTFT()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, 0, -1, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant (x, y, z, w) = (true, true, true, false).
+PX_FORCE_INLINE BoolV BTTTF()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, -1, -1, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask constant with all four lanes true (all bits set).
+PX_FORCE_INLINE BoolV BTTTT()
+{
+	const __m128i bits = _mm_set_epi32(-1, -1, -1, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask selecting only the x lane; same bit pattern as BTFFF().
+PX_FORCE_INLINE BoolV BXMask()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, 0, 0, -1);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask selecting only the y lane; same bit pattern as BFTFF().
+PX_FORCE_INLINE BoolV BYMask()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, 0, -1, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask selecting only the z lane; same bit pattern as BFFTF().
+PX_FORCE_INLINE BoolV BZMask()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(0, -1, 0, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Mask selecting only the w lane; same bit pattern as BFFFT().
+PX_FORCE_INLINE BoolV BWMask()
+{
+	// _mm_set_epi32 lists its arguments from the highest lane (w) down to the lowest (x).
+	const __m128i bits = _mm_set_epi32(-1, 0, 0, 0);
+	return internalWindowsSimd::m128_I2F(bits);
+}
+
+// Broadcasts the x lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetX(const BoolV f)
+{
+	const BoolV xxxx = _mm_shuffle_ps(f, f, _MM_SHUFFLE(0, 0, 0, 0));
+	return xxxx;
+}
+
+// Broadcasts the y lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetY(const BoolV f)
+{
+	const BoolV yyyy = _mm_shuffle_ps(f, f, _MM_SHUFFLE(1, 1, 1, 1));
+	return yyyy;
+}
+
+// Broadcasts the z lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetZ(const BoolV f)
+{
+	const BoolV zzzz = _mm_shuffle_ps(f, f, _MM_SHUFFLE(2, 2, 2, 2));
+	return zzzz;
+}
+
+// Broadcasts the w lane of the mask f to all four lanes.
+PX_FORCE_INLINE BoolV BGetW(const BoolV f)
+{
+	const BoolV wwww = _mm_shuffle_ps(f, f, _MM_SHUFFLE(3, 3, 3, 3));
+	return wwww;
+}
+
+// Returns v with its x lane replaced by the x lane of f; y, z and w are kept.
+PX_FORCE_INLINE BoolV BSetX(const BoolV v, const BoolV f)
+{
+	// True lanes select from v, the single false lane (x) selects from f.
+	const BoolV keepYZW = BFTTT();
+	return V4Sel(keepYZW, v, f);
+}
+
+// Returns v with its y lane replaced by the y lane of f; x, z and w are kept.
+PX_FORCE_INLINE BoolV BSetY(const BoolV v, const BoolV f)
+{
+	// True lanes select from v, the single false lane (y) selects from f.
+	const BoolV keepXZW = BTFTT();
+	return V4Sel(keepXZW, v, f);
+}
+
+// Returns v with its z lane replaced by the z lane of f; x, y and w are kept.
+PX_FORCE_INLINE BoolV BSetZ(const BoolV v, const BoolV f)
+{
+	// True lanes select from v, the single false lane (z) selects from f.
+	const BoolV keepXYW = BTTFT();
+	return V4Sel(keepXYW, v, f);
+}
+
+// Returns v with its w lane replaced by the w lane of f; x, y and z are kept.
+PX_FORCE_INLINE BoolV BSetW(const BoolV v, const BoolV f)
+{
+	// True lanes select from v, the single false lane (w) selects from f.
+	const BoolV keepXYZ = BTTTF();
+	return V4Sel(keepXYZ, v, f);
+}
+
+// Broadcasts lane `index` (0 = x ... 3 = w) of the mask a to all four lanes, using the
+// integer shuffle on the reinterpreted bits.
+template <int index>
+BoolV BSplatElement(BoolV a)
+{
+	const __m128i bits = internalWindowsSimd::m128_F2I(a);
+	const __m128i splatBits = _mm_shuffle_epi32(bits, _MM_SHUFFLE(index, index, index, index));
+	return internalWindowsSimd::m128_I2F(splatBits);
+}
+
+// Bitwise AND of two masks.
+PX_FORCE_INLINE BoolV BAnd(const BoolV a, const BoolV b)
+{
+	const BoolV both = _mm_and_ps(a, b);
+	return both;
+}
+
+// Bitwise NOT of a mask, computed as a XOR all-ones.
+PX_FORCE_INLINE BoolV BNot(const BoolV a)
+{
+	const BoolV allOnes = BTTTT();
+	const BoolV inverted = _mm_xor_ps(a, allOnes);
+	return inverted;
+}
+
+// Returns a & ~b.
+PX_FORCE_INLINE BoolV BAndNot(const BoolV a, const BoolV b)
+{
+	// _mm_andnot_ps negates its FIRST operand, hence the swapped argument order.
+	const BoolV aWithoutB = _mm_andnot_ps(b, a);
+	return aWithoutB;
+}
+
+// Bitwise OR of two masks.
+PX_FORCE_INLINE BoolV BOr(const BoolV a, const BoolV b)
+{
+	const BoolV either = _mm_or_ps(a, b);
+	return either;
+}
+
+// Returns a mask holding x & y & z & w of a in every lane.
+PX_FORCE_INLINE BoolV BAllTrue4(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV wzwz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3));
+	const BoolV pairs = _mm_and_ps(yxyx, wzwz); // y&w, x&z, y&w, x&z
+	const BoolV first = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV second = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_and_ps(first, second);
+}
+
+// Returns a mask holding x | y | z | w of a in every lane.
+PX_FORCE_INLINE BoolV BAnyTrue4(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV wzwz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 2, 3));
+	const BoolV pairs = _mm_or_ps(yxyx, wzwz); // y|w, x|z, y|w, x|z
+	const BoolV first = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV second = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_or_ps(first, second);
+}
+
+// Returns a mask holding x & y & z of a in every lane; the w lane of a is ignored.
+PX_FORCE_INLINE BoolV BAllTrue3(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+	const BoolV pairs = _mm_and_ps(yxyx, zzzz); // y&z, x&z, y&z, x&z
+	const BoolV first = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV second = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_and_ps(first, second);
+}
+
+// Returns a mask holding x | y | z of a in every lane; the w lane of a is ignored.
+PX_FORCE_INLINE BoolV BAnyTrue3(const BoolV a)
+{
+	const BoolV yxyx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 0, 1));
+	const BoolV zzzz = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+	const BoolV pairs = _mm_or_ps(yxyx, zzzz); // y|z, x|z, y|z, x|z
+	const BoolV first = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 0, 0, 0));
+	const BoolV second = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
+	return _mm_or_ps(first, second);
+}
+
+// Compares the two masks lane by lane as 32-bit integers and reduces the result through
+// internalWindowsSimd::BAllTrue4_R (defined elsewhere; presumably non-zero only when all
+// four lanes are equal).
+PX_FORCE_INLINE PxU32 BAllEq(const BoolV a, const BoolV b)
+{
+	const __m128i lhsBits = internalWindowsSimd::m128_F2I(a);
+	const __m128i rhsBits = internalWindowsSimd::m128_F2I(b);
+	const BoolV lanesEqual = internalWindowsSimd::m128_I2F(_mm_cmpeq_epi32(lhsBits, rhsBits));
+	return internalWindowsSimd::BAllTrue4_R(lanesEqual);
+}
+
+// Returns 1 when the sign bit of every lane of a is set, otherwise 0.
+PX_FORCE_INLINE PxU32 BAllEqTTTT(const BoolV a)
+{
+	// _mm_movemask_ps packs the four sign bits into bits 0..3; 15 means all four are set.
+	const int signBits = _mm_movemask_ps(a);
+	return PxU32(signBits == 15);
+}
+
+// Returns 1 when the sign bit of every lane of a is clear, otherwise 0.
+PX_FORCE_INLINE PxU32 BAllEqFFFF(const BoolV a)
+{
+	// _mm_movemask_ps packs the four sign bits into bits 0..3; 0 means none is set.
+	const int signBits = _mm_movemask_ps(a);
+	return PxU32(signBits == 0);
+}
+
+// Packs the sign bits of the four lanes of a into bits 0 (x) .. 3 (w) of the result.
+PX_FORCE_INLINE PxU32 BGetBitMask(const BoolV a)
+{
+	const int signBits = _mm_movemask_ps(a);
+	return PxU32(signBits);
+}
+
+//////////////////////////////////
+// MAT33V
+//////////////////////////////////
+
+// Matrix-vector product a * b, formed as a linear combination of the columns of a:
+// (col0 * b.x + col1 * b.y) + col2 * b.z.
+PX_FORCE_INLINE Vec3V M33MulV3(const Mat33V& a, const Vec3V b)
+{
+	const Vec3V xTerm = V3Scale(a.col0, V3GetX(b));
+	const Vec3V yTerm = V3Scale(a.col1, V3GetY(b));
+	const Vec3V zTerm = V3Scale(a.col2, V3GetZ(b));
+	return V3Add(V3Add(xTerm, yTerm), zTerm);
+}
+
+// Product of the transpose of a with b: component i of the result is dot(a.col_i, b).
+PX_FORCE_INLINE Vec3V M33TrnspsMulV3(const Mat33V& a, const Vec3V b)
+{
+	// Per-column component products; transposing them lines up the terms of each dot product
+	// so that three vector adds finish all three sums at once.
+	Vec3V prod0 = V3Mul(a.col0, b);
+	Vec3V prod1 = V3Mul(a.col1, b);
+	Vec3V prod2 = V3Mul(a.col2, b);
+	V3Transpose(prod0, prod1, prod2);
+	const Vec3V partial = V3Add(prod0, prod1);
+	return V3Add(partial, prod2);
+}
+
+// Returns A * b + c, accumulated column by column starting from c:
+// ((c + col0 * b.x) + col1 * b.y) + col2 * b.z.
+PX_FORCE_INLINE Vec3V M33MulV3AddV3(const Mat33V& A, const Vec3V b, const Vec3V c)
+{
+	const FloatV bx = V3GetX(b);
+	const FloatV by = V3GetY(b);
+	const FloatV bz = V3GetZ(b);
+	const Vec3V afterX = V3ScaleAdd(A.col0, bx, c);
+	const Vec3V afterY = V3ScaleAdd(A.col1, by, afterX);
+	return V3ScaleAdd(A.col2, bz, afterY);
+}
+
+// Matrix product a * b: every column of b is transformed by a.
+PX_FORCE_INLINE Mat33V M33MulM33(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = M33MulV3(a, b.col0);
+	const Vec3V c1 = M33MulV3(a, b.col1);
+	const Vec3V c2 = M33MulV3(a, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Column-wise matrix sum a + b.
+PX_FORCE_INLINE Mat33V M33Add(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = V3Add(a.col0, b.col0);
+	const Vec3V c1 = V3Add(a.col1, b.col1);
+	const Vec3V c2 = V3Add(a.col2, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Scales every column of a by the FloatV b.
+PX_FORCE_INLINE Mat33V M33Scale(const Mat33V& a, const FloatV& b)
+{
+	const Vec3V c0 = V3Scale(a.col0, b);
+	const Vec3V c1 = V3Scale(a.col1, b);
+	const Vec3V c2 = V3Scale(a.col2, b);
+	return Mat33V(c0, c1, c2);
+}
+
+// Column-wise matrix difference a - b.
+PX_FORCE_INLINE Mat33V M33Sub(const Mat33V& a, const Mat33V& b)
+{
+	const Vec3V c0 = V3Sub(a.col0, b.col0);
+	const Vec3V c1 = V3Sub(a.col1, b.col1);
+	const Vec3V c2 = V3Sub(a.col2, b.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Negates every column of a.
+PX_FORCE_INLINE Mat33V M33Neg(const Mat33V& a)
+{
+	const Vec3V c0 = V3Neg(a.col0);
+	const Vec3V c1 = V3Neg(a.col1);
+	const Vec3V c2 = V3Neg(a.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Applies V3Abs to every column of a.
+PX_FORCE_INLINE Mat33V M33Abs(const Mat33V& a)
+{
+	const Vec3V c0 = V3Abs(a.col0);
+	const Vec3V c1 = V3Abs(a.col1);
+	const Vec3V c2 = V3Abs(a.col2);
+	return Mat33V(c0, c1, c2);
+}
+
+// Inverse of a 3x3 matrix via the cross-product (adjugate) formula.
+// With r0 = col1 x col2, r1 = col2 x col0, r2 = col0 x col1 and det = dot(r2, col2), the
+// rows of the inverse are r0/det, r1/det, r2/det. The code assembles the COLUMNS of that
+// matrix with shuffles and then scales them by 1/det.
+// The reciprocal of the determinant comes from _mm_rcp_ps, which is an approximation, so the
+// result has reduced precision. There is no check for a zero determinant.
+PX_FORCE_INLINE Mat33V M33Inverse(const Mat33V& a)
+{
+	const BoolV tfft = BTFFT();
+	const BoolV tttf = BTTTF();
+	const Vec3V zero = V3Zero();
+
+	const Vec3V r2 = V3Cross(a.col0, a.col1);
+	const Vec3V r0 = V3Cross(a.col1, a.col2);
+	const Vec3V r1 = V3Cross(a.col2, a.col0);
+
+	const FloatV det = V3Dot(r2, a.col2);
+	const FloatV invDet = _mm_rcp_ps(det);
+
+	const Vec3V lo02 = _mm_unpacklo_ps(r0, r2); // r0.x, r2.x, r0.y, r2.y
+	const Vec3V hi02 = _mm_unpackhi_ps(r0, r2); // r0.z, r2.z, r0.w, r2.w
+
+	// Column 0: (r0.x, r1.x, r2.x), with the stray w lane forced to zero.
+	const Vec3V col0Raw = _mm_unpacklo_ps(lo02, r1); // r0.x, r1.x, r2.x, r1.y
+	const Vec3V colInv0 = V4Sel(tttf, col0Raw, zero);
+
+	// Column 1: x and w from the first shuffle, y and z from the second -> (r0.y, r1.y, r2.y, r1.w).
+	const Vec3V col1XW = _mm_shuffle_ps(lo02, r1, _MM_SHUFFLE(3, 0, 0, 2)); // r0.y, r0.x, r1.x, r1.w
+	const Vec3V col1YZ = _mm_shuffle_ps(r1, lo02, _MM_SHUFFLE(3, 3, 1, 0)); // r1.x, r1.y, r2.y, r2.y
+	const Vec3V colInv1 = V4Sel(tfft, col1XW, col1YZ);
+
+	// Column 2: x and w from the first shuffle, y and z from the second -> (r0.z, r1.z, r2.z, r1.w).
+	const Vec3V col2XW = _mm_shuffle_ps(hi02, r1, _MM_SHUFFLE(3, 0, 0, 0)); // r0.z, r0.z, r1.x, r1.w
+	const Vec3V col2YZ = _mm_shuffle_ps(r1, hi02, _MM_SHUFFLE(3, 1, 2, 0)); // r1.x, r1.z, r2.z, r2.w
+	const Vec3V colInv2 = V4Sel(tfft, col2XW, col2YZ);
+
+	return Mat33V(_mm_mul_ps(colInv0, invDet), _mm_mul_ps(colInv1, invDet), _mm_mul_ps(colInv2, invDet));
+}
+
+// Returns the transpose of a; the input matrix is not modified.
+PX_FORCE_INLINE Mat33V M33Trnsps(const Mat33V& a)
+{
+	// V3Transpose works in place, so operate on local copies of the columns.
+	Vec3V t0 = a.col0;
+	Vec3V t1 = a.col1;
+	Vec3V t2 = a.col2;
+	V3Transpose(t0, t1, t2);
+	return Mat33V(t0, t1, t2);
+}
+
+PX_FORCE_INLINE Mat33V M33Identity() // returns the 3x3 matrix whose columns are the unit X, Y and Z vectors
+{
+	return Mat33V(V3UnitX(), V3UnitY(), V3UnitZ());
+}
+
+PX_FORCE_INLINE Mat33V M33Diagonal(const Vec3VArg d)
+{
+	// Column i is d multiplied lane-wise by unit axis i, so for finite d the
+	// result is the diagonal matrix carrying d.x, d.y, d.z on its diagonal.
+	return Mat33V(V3Mul(V3UnitX(), d),
+	              V3Mul(V3UnitY(), d), V3Mul(V3UnitZ(), d));
+}
+
+//////////////////////////////////
+// MAT34V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec3V M34MulV3(const Mat34V& a, const Vec3V b)
+{
+	// Full 3x4 transform of b: the three basis columns are weighted by the
+	// components of b and the fourth column (col3) is added on top.
+	const Vec3V xTerm = V3Scale(a.col0, V3GetX(b));
+	const Vec3V yTerm = V3Scale(a.col1, V3GetY(b));
+	const Vec3V zTerm = V3Scale(a.col2, V3GetZ(b));
+	// The terms are accumulated strictly left to right, ((x + y) + z) + col3,
+	// so the floating-point rounding sequence stays the same.
+	const Vec3V linearPart = V3Add(V3Add(xTerm, yTerm), zTerm);
+	return V3Add(linearPart, a.col3);
+}
+
+PX_FORCE_INLINE Vec3V M34Mul33V3(const Mat34V& a, const Vec3V b)
+{
+	// Multiplies b by the 3x3 part of a only: col0..col2 are weighted by the
+	// components of b, and col3 is never read.
+	const Vec3V xTerm = V3Scale(a.col0, V3GetX(b));
+	const Vec3V yTerm = V3Scale(a.col1, V3GetY(b));
+	const Vec3V zTerm = V3Scale(a.col2, V3GetZ(b));
+	// Accumulated left to right, (x + y) + z, so the floating-point rounding
+	// sequence stays the same.
+	return V3Add(V3Add(xTerm, yTerm), zTerm);
+}
+
+PX_FORCE_INLINE Vec3V M34TrnspsMul33V3(const Mat34V& a, const Vec3V b) // multiplies b by the transpose of a's 3x3 part: lane i ends up as dot(a.col_i, b)
+{
+	Vec3V v0 = V3Mul(a.col0, b); // lane-wise products; their sum is dot(col0, b)
+	Vec3V v1 = V3Mul(a.col1, b);
+	Vec3V v2 = V3Mul(a.col2, b);
+	V3Transpose(v0, v1, v2); // regroup so that adding v0 + v1 + v2 sums each product triple into one lane
+	return V3Add(V3Add(v0, v1), v2);
+}
+
+PX_FORCE_INLINE Mat34V M34MulM34(const Mat34V& a, const Mat34V& b) // composes two 3x4 transforms, a applied to b
+{
+	return Mat34V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2), M34MulV3(a, b.col3)); // b.col0..col2 use only a's 3x3 part; b.col3 additionally gets a.col3 added
+}
+
+PX_FORCE_INLINE Mat33V M34MulM33(const Mat34V& a, const Mat33V& b) // multiplies the 3x3 part of a by b; a.col3 is not used
+{
+	return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); // each column of b is transformed independently
+}
+
+PX_FORCE_INLINE Mat33V M34Mul33MM34(const Mat34V& a, const Mat34V& b) // multiplies the 3x3 parts of a and b; neither col3 is used
+{
+	return Mat33V(M34Mul33V3(a, b.col0), M34Mul33V3(a, b.col1), M34Mul33V3(a, b.col2)); // same body as M34MulM33, only the type of b differs
+}
+
+PX_FORCE_INLINE Mat34V M34Add(const Mat34V& a, const Mat34V& b) // returns the sum a + b over all four columns
+{
+	return Mat34V(V3Add(a.col0, b.col0), V3Add(a.col1, b.col1), V3Add(a.col2, b.col2), V3Add(a.col3, b.col3)); // col3 is summed like the other columns
+}
+
+PX_FORCE_INLINE Mat34V M34Inverse(const Mat34V& a) // inverse of the 3x4 transform a; the 3x3 part is inverted as in M33Inverse, with no singularity check
+{
+	Mat34V aInv;
+	const BoolV tfft = BTFFT();
+	const BoolV tttf = BTTTF();
+	const FloatV zero = V3Zero();
+	const Vec3V cross01 = V3Cross(a.col0, a.col1);
+	const Vec3V cross12 = V3Cross(a.col1, a.col2);
+	const Vec3V cross20 = V3Cross(a.col2, a.col0);
+	const FloatV dot = V3Dot(cross01, a.col2); // scalar triple product (col0 x col1) . col2, i.e. the determinant of the 3x3 part
+	const FloatV invDet = _mm_rcp_ps(dot); // hardware reciprocal estimate rather than an exact division
+	const Vec3V mergeh = _mm_unpacklo_ps(cross12, cross01); // interleaves the two low lanes of cross12 and cross01
+	const Vec3V mergel = _mm_unpackhi_ps(cross12, cross01); // interleaves the two high lanes of cross12 and cross01
+	Vec3V colInv0 = _mm_unpacklo_ps(mergeh, cross20); // NOTE(review): the unpack/shuffle steps appear to transpose the cross products into columns - confirm lane by lane
+	colInv0 = _mm_or_ps(_mm_andnot_ps(tttf, zero), _mm_and_ps(tttf, colInv0)); // select: colInv0 where tttf is set, zero elsewhere
+	const Vec3V zppd = _mm_shuffle_ps(mergeh, cross20, _MM_SHUFFLE(3, 0, 0, 2));
+	const Vec3V pbwp = _mm_shuffle_ps(cross20, mergeh, _MM_SHUFFLE(3, 3, 1, 0));
+	const Vec3V colInv1 = _mm_or_ps(_mm_andnot_ps(BTFFT(), pbwp), _mm_and_ps(BTFFT(), zppd)); // select: zppd where the BTFFT mask is set, pbwp elsewhere
+	const Vec3V xppd = _mm_shuffle_ps(mergel, cross20, _MM_SHUFFLE(3, 0, 0, 0));
+	const Vec3V pcyp = _mm_shuffle_ps(cross20, mergel, _MM_SHUFFLE(3, 1, 2, 0));
+	const Vec3V colInv2 = _mm_or_ps(_mm_andnot_ps(tfft, pcyp), _mm_and_ps(tfft, xppd)); // select: xppd where tfft is set, pcyp elsewhere
+	aInv.col0 = _mm_mul_ps(colInv0, invDet); // scale every column by the reciprocal determinant
+	aInv.col1 = _mm_mul_ps(colInv1, invDet);
+	aInv.col2 = _mm_mul_ps(colInv2, invDet);
+	aInv.col3 = M34Mul33V3(aInv, V3Neg(a.col3)); // inverse translation = inverted 3x3 part applied to -col3; M34Mul33V3 reads only col0..col2, so the unset aInv.col3 is not read
+	return aInv;
+}
+
+PX_FORCE_INLINE Mat33V M34Trnsps33(const Mat34V& a) // returns the transpose of a's 3x3 part; a.col3 is ignored
+{
+	Vec3V col0 = a.col0, col1 = a.col1, col2 = a.col2; // working copies that V3Transpose is given to operate on
+	V3Transpose(col0, col1, col2);
+	return Mat33V(col0, col1, col2);
+}
+
+//////////////////////////////////
+// MAT44V
+//////////////////////////////////
+
+PX_FORCE_INLINE Vec4V M44MulV4(const Mat44V& a, const Vec4V b)
+{
+	// Matrix * vector as a weighted sum of columns: each column of a is
+	// scaled by the matching component of b.
+	const Vec4V xTerm = V4Scale(a.col0, V4GetX(b));
+	const Vec4V yTerm = V4Scale(a.col1, V4GetY(b));
+	const Vec4V zTerm = V4Scale(a.col2, V4GetZ(b));
+	const Vec4V wTerm = V4Scale(a.col3, V4GetW(b));
+
+	// The four terms are accumulated strictly left to right,
+	// ((x + y) + z) + w, so the floating-point rounding sequence
+	// stays the same.
+	const Vec4V xyzSum = V4Add(V4Add(xTerm, yTerm), zTerm);
+	return V4Add(xyzSum, wTerm);
+}
+
+PX_FORCE_INLINE Vec4V M44TrnspsMulV4(const Mat44V& a, const Vec4V b) // multiplies b by the transpose of a: lane i ends up as dot(a.col_i, b)
+{
+	Vec4V v0 = V4Mul(a.col0, b); // lane-wise products; their sum is dot(col0, b)
+	Vec4V v1 = V4Mul(a.col1, b);
+	Vec4V v2 = V4Mul(a.col2, b);
+	Vec4V v3 = V4Mul(a.col3, b);
+	V4Transpose(v0, v1, v2, v3); // regroup so that adding the four vectors sums each product set into one lane
+	return V4Add(V4Add(v0, v1), V4Add(v2, v3)); // pairwise sum: (v0 + v1) + (v2 + v3)
+}
+
+PX_FORCE_INLINE Mat44V M44MulM44(const Mat44V& a, const Mat44V& b) // returns the matrix product a * b
+{
+	return Mat44V(M44MulV4(a, b.col0), M44MulV4(a, b.col1), M44MulV4(a, b.col2), M44MulV4(a, b.col3)); // each column of b is transformed by a
+}
+
+PX_FORCE_INLINE Mat44V M44Add(const Mat44V& a, const Mat44V& b) // returns the matrix sum a + b
+{
+	return Mat44V(V4Add(a.col0, b.col0), V4Add(a.col1, b.col1), V4Add(a.col2, b.col2), V4Add(a.col3, b.col3)); // added column by column
+}
+
+PX_FORCE_INLINE Mat44V M44Trnsps(const Mat44V& a) // returns the transpose of a; a itself is not modified
+{
+	Vec4V col0 = a.col0, col1 = a.col1, col2 = a.col2, col3 = a.col3; // working copies that V4Transpose is given to operate on
+	V4Transpose(col0, col1, col2, col3);
+	return Mat44V(col0, col1, col2, col3);
+}
+
+PX_FORCE_INLINE Mat44V M44Inverse(const Mat44V& a) // 4x4 inverse via SIMD cofactor accumulation scaled by an estimated 1/det; no singularity check is made
+{
+	__m128 minor0, minor1, minor2, minor3; // accumulators for the four cofactor vectors
+	__m128 row0, row1, row2, row3;
+	__m128 det, tmp1;
+
+	tmp1 = V4Zero(); // tmp1, row1 and row3 are all overwritten below before being read
+	row1 = V4Zero();
+	row3 = V4Zero();
+
+	row0 = a.col0;
+	row1 = _mm_shuffle_ps(a.col1, a.col1, _MM_SHUFFLE(1, 0, 3, 2)); // col1 with its low and high lane pairs swapped
+	row2 = a.col2;
+	row3 = _mm_shuffle_ps(a.col3, a.col3, _MM_SHUFFLE(1, 0, 3, 2)); // col3 with its low and high lane pairs swapped
+
+	tmp1 = _mm_mul_ps(row2, row3); // each group below: lane-wise pair products, permuted with 0xB1 / 0x4E, then added to or subtracted from the minors
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); // 0xB1 swaps neighbouring lanes (0<->1, 2<->3)
+	minor0 = _mm_mul_ps(row1, tmp1);
+	minor1 = _mm_mul_ps(row0, tmp1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); // 0x4E swaps the low and high lane pairs
+	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
+	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
+	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
+
+	tmp1 = _mm_mul_ps(row1, row2);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
+	minor3 = _mm_mul_ps(row0, tmp1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
+	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
+	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
+
+	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	row2 = _mm_shuffle_ps(row2, row2, 0x4E); // from here on row2 holds col2 with its lane pairs swapped
+	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
+	minor2 = _mm_mul_ps(row0, tmp1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
+	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
+	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
+
+	tmp1 = _mm_mul_ps(row0, row1);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
+	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
+	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
+
+	tmp1 = _mm_mul_ps(row0, row3);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
+	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
+	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
+
+	tmp1 = _mm_mul_ps(row0, row2);
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
+	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
+	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
+	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
+	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
+	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
+
+	det = _mm_mul_ps(row0, minor0); // determinant = horizontal sum of row0 * minor0, reduced by the two shuffle+add steps below
+	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
+	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
+	tmp1 = _mm_rcp_ss(det); // hardware reciprocal estimate of the determinant (lane 0 only)
+#if 0
+	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); // disabled Newton-Raphson refinement: 2*r - det*r*r
+	det = _mm_shuffle_ps(det, det, 0x00);
+#else
+	det = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(0, 0, 0, 0)); // active path: broadcast the unrefined estimate to all four lanes
+#endif
+
+	minor0 = _mm_mul_ps(det, minor0); // scale all cofactor vectors by 1/det
+	minor1 = _mm_mul_ps(det, minor1);
+	minor2 = _mm_mul_ps(det, minor2);
+	minor3 = _mm_mul_ps(det, minor3);
+	Mat44V invTrans(minor0, minor1, minor2, minor3); // the scaled minors form the transpose of the inverse
+	return M44Trnsps(invTrans);
+}
+
+PX_FORCE_INLINE Vec4V V4LoadXYZW(const PxF32& x, const PxF32& y, const PxF32& z, const PxF32& w) // builds the vector (x, y, z, w) from four scalars
+{
+	return _mm_set_ps(w, z, y, x); // _mm_set_ps takes its arguments from the highest lane down, hence the reversed order
+}
+
+PX_FORCE_INLINE VecU32V V4U32Sel(const BoolV c, const VecU32V a, const VecU32V b) // bitwise select: bits of a where c is set, bits of b where c is clear
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_or_si128(_mm_andnot_si128(internalWindowsSimd::m128_F2I(c), internalWindowsSimd::m128_F2I(b)), // (~c & b) ...
+	                 _mm_and_si128(internalWindowsSimd::m128_F2I(c), internalWindowsSimd::m128_F2I(a)))); // ... | (c & a)
+}
+
+PX_FORCE_INLINE VecU32V V4U32or(VecU32V a, VecU32V b) // bitwise OR of the 128-bit values a and b
+{
+	return internalWindowsSimd::m128_I2F(_mm_or_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b)));
+}
+
+PX_FORCE_INLINE VecU32V V4U32xor(VecU32V a, VecU32V b) // bitwise XOR of the 128-bit values a and b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_xor_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b)));
+}
+
+PX_FORCE_INLINE VecU32V V4U32and(VecU32V a, VecU32V b) // bitwise AND of the 128-bit values a and b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_and_si128(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b)));
+}
+
+PX_FORCE_INLINE VecU32V V4U32Andc(VecU32V a, VecU32V b) // AND with complement: a & ~b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_andnot_si128(internalWindowsSimd::m128_F2I(b), internalWindowsSimd::m128_F2I(a))); // _mm_andnot_si128 inverts its FIRST operand, so b is passed first
+}
+
+PX_FORCE_INLINE VecI32V U4Load(const PxU32 i) // broadcasts the 32-bit pattern of i to all four lanes; NOTE(review): returns VecI32V while U4LoadU/U4LoadA return VecU32V - confirm intended
+{
+	return _mm_load1_ps((PxF32*)&i); // the bits are reinterpreted as a float for the load, not numerically converted
+}
+
+PX_FORCE_INLINE VecU32V U4LoadU(const PxU32* i) // loads four consecutive PxU32 values starting at i; no alignment requirement
+{
+	return _mm_loadu_ps((PxF32*)i); // unaligned load; the bits are reinterpreted, not converted
+}
+
+PX_FORCE_INLINE VecU32V U4LoadA(const PxU32* i) // loads four consecutive PxU32 values starting at i, which must be 16-byte aligned
+{
+	ASSERT_ISALIGNED16(i);
+	return _mm_load_ps((PxF32*)i); // aligned load; the bits are reinterpreted, not converted
+}
+
+PX_FORCE_INLINE VecI32V I4Load(const PxI32 i) // broadcasts the 32-bit pattern of i to all four lanes
+{
+	return _mm_load1_ps((PxF32*)&i); // the bits are reinterpreted as a float for the load, not numerically converted
+}
+
+PX_FORCE_INLINE VecI32V I4LoadU(const PxI32* i) // loads four consecutive PxI32 values starting at i; no alignment requirement
+{
+	return _mm_loadu_ps((PxF32*)i); // unaligned load; the bits are reinterpreted, not converted
+}
+
+PX_FORCE_INLINE VecI32V I4LoadA(const PxI32* i) // loads four consecutive PxI32 values starting at i, which must be 16-byte aligned
+{
+	ASSERT_ISALIGNED16(i);
+	return _mm_load_ps((PxF32*)i); // aligned load; the bits are reinterpreted, not converted
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Add(const VecI32VArg a, const VecI32VArg b) // lane-wise 32-bit integer addition a + b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_add_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b)));
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Sub(const VecI32VArg a, const VecI32VArg b) // lane-wise 32-bit integer subtraction a - b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_sub_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b)));
+}
+
+PX_FORCE_INLINE BoolV VecI32V_IsGrtr(const VecI32VArg a, const VecI32VArg b) // lane-wise signed 32-bit test a > b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_cmpgt_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); // each lane becomes all ones where true, zero where false
+}
+
+PX_FORCE_INLINE BoolV VecI32V_IsEq(const VecI32VArg a, const VecI32VArg b) // lane-wise 32-bit integer test a == b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_cmpeq_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); // each lane becomes all ones where true, zero where false
+}
+
+PX_FORCE_INLINE VecI32V V4I32Sel(const BoolV c, const VecI32V a, const VecI32V b) // bitwise select: a where c is set, b elsewhere
+{
+	return V4U32Sel(c, a, b); // forwards to the unsigned variant, which operates on raw bits
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Zero() // integer zero vector, obtained by forwarding to V4Zero()
+{
+	return V4Zero();
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_One() // every lane holds the integer 1
+{
+	return I4Load(1);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Two() // every lane holds the integer 2
+{
+	return I4Load(2);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_MinusOne() // every lane holds the integer -1
+{
+	return I4Load(-1);
+}
+
+PX_FORCE_INLINE VecU32V U4Zero() // every lane holds the unsigned integer 0
+{
+	return U4Load(0);
+}
+
+PX_FORCE_INLINE VecU32V U4One() // every lane holds the unsigned integer 1
+{
+	return U4Load(1);
+}
+
+PX_FORCE_INLINE VecU32V U4Two() // every lane holds the unsigned integer 2
+{
+	return U4Load(2);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Sel(const BoolV c, const VecI32VArg a, const VecI32VArg b) // whole-vector select: returns a when c is all-true, b when c is all-false
+{
+	PX_ASSERT(_VecMathTests::allElementsEqualBoolV(c, BTTTT()) || // a mixed per-lane mask is rejected by this assert
+			  _VecMathTests::allElementsEqualBoolV(c, BFFFF()));
+	return _mm_or_ps(_mm_andnot_ps(c, b), _mm_and_ps(c, a)); // (~c & b) | (c & a)
+}
+
+PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift(const VecI32VArg shift) // packs a shift count into the VecShiftV form taken by VecI32V_LeftShift / VecI32V_RightShift
+{
+	VecShiftV preparedShift;
+	preparedShift.shift = _mm_or_ps(_mm_andnot_ps(BTFFF(), VecI32V_Zero()), _mm_and_ps(BTFFF(), shift)); // keeps shift where BTFFF is set (presumably lane x only - confirm) and zeroes the rest
+	return preparedShift;
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_LeftShift(const VecI32VArg a, const VecShiftVArg count) // shifts every 32-bit lane of a left by the same count
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_sll_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(count.shift))); // one shared count for all lanes, taken from count.shift
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_RightShift(const VecI32VArg a, const VecShiftVArg count) // shifts every 32-bit lane of a right by the same count
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_srl_epi32(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(count.shift))); // _mm_srl_epi32 is a logical shift: zeros are shifted in, the sign bit is not replicated
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_And(const VecI32VArg a, const VecI32VArg b) // bitwise AND of a and b
+{
+	return _mm_and_ps(a, b);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Or(const VecI32VArg a, const VecI32VArg b) // bitwise OR of a and b
+{
+	return _mm_or_ps(a, b);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetX(const VecI32VArg a) // broadcasts lane 0 (x) of a to all four lanes
+{
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetY(const VecI32VArg a) // broadcasts lane 1 (y) of a to all four lanes
+{
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1));
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetZ(const VecI32VArg a) // broadcasts lane 2 (z) of a to all four lanes
+{
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_GetW(const VecI32VArg a) // broadcasts lane 3 (w) of a to all four lanes
+{
+	return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+PX_FORCE_INLINE void PxI32_From_VecI32V(const VecI32VArg a, PxI32* i) // writes lane 0 of a to *i
+{
+	_mm_store_ss((PxF32*)i, a); // stores the raw 32 bits of the lowest lane; no numeric conversion
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_From_BoolV(const BoolVArg a) // returns the mask a unchanged, typed as VecI32V
+{
+	return a;
+}
+
+PX_FORCE_INLINE VecU32V VecU32V_From_BoolV(const BoolVArg a) // returns the mask a unchanged, typed as VecU32V
+{
+	return a;
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_Merge(const VecI32VArg a, const VecI32VArg b, const VecI32VArg c, const VecI32VArg d) // result lanes are (a[0], b[1], c[2], d[0])
+{
+	const __m128 xw = _mm_move_ss(b, a); // lane 0 from a, lanes 1..3 from b (the original note "y, y, y, x" presumably assumes splatted inputs)
+	const __m128 yz = _mm_move_ss(c, d); // lane 0 from d, lanes 1..3 from c (the original note "z, z, z, w" presumably assumes splatted inputs)
+	return _mm_shuffle_ps(xw, yz, _MM_SHUFFLE(0, 2, 1, 0)); // picks xw[0], xw[1], yz[2], yz[0]
+}
+
+PX_FORCE_INLINE void V4U32StoreAligned(VecU32V val, VecU32V* address) // stores val to *address
+{
+	*address = val; // plain assignment through a vector pointer; no explicit alignment assert is made here
+}
+
+PX_FORCE_INLINE Vec4V V4Andc(const Vec4V a, const VecU32V b)
+{
+	// Bitwise a & ~b: the float lanes of a are viewed as raw 32-bit words,
+	// combined through V4U32Andc, and handed back as a float vector.
+	return Vec4V(V4U32Andc(VecU32V(a), b));
+}
+
+PX_FORCE_INLINE VecU32V V4IsGrtrV32u(const Vec4V a, const Vec4V b) // returns the result of V4IsGrtr(a, b) typed as VecU32V
+{
+	return V4IsGrtr(a, b);
+}
+
+PX_FORCE_INLINE VecU16V V4U16LoadAligned(VecU16V* addr) // loads the vector stored at addr
+{
+	return *addr; // plain dereference of a vector pointer; no explicit alignment assert is made here
+}
+
+PX_FORCE_INLINE VecU16V V4U16LoadUnaligned(VecU16V* addr) // loads the 128 bits stored at addr, which does not have to be 16-byte aligned
+{
+	return _mm_loadu_ps((PxF32*)addr); // explicit unaligned load: a plain *addr lets the compiler emit an aligned load that faults on unaligned addresses
+}
+
+// unsigned compares are not supported on x86
+PX_FORCE_INLINE VecU16V V4U16CompareGt(VecU16V a, VecU16V b) // unsigned 16-bit a > b, computed one lane at a time in scalar code
+{
+	// _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately
+	// return m128_I2F(_mm_cmpgt_epi16(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b)));
+	VecU16V result;
+	result.m128_u16[0] = PxU16((a).m128_u16[0] > (b).m128_u16[0]); // each lane becomes 1 or 0, unlike the all-ones mask V4I16CompareGt gets from _mm_cmpgt_epi16 - NOTE(review): confirm callers expect 1/0
+	result.m128_u16[1] = PxU16((a).m128_u16[1] > (b).m128_u16[1]);
+	result.m128_u16[2] = PxU16((a).m128_u16[2] > (b).m128_u16[2]);
+	result.m128_u16[3] = PxU16((a).m128_u16[3] > (b).m128_u16[3]);
+	result.m128_u16[4] = PxU16((a).m128_u16[4] > (b).m128_u16[4]);
+	result.m128_u16[5] = PxU16((a).m128_u16[5] > (b).m128_u16[5]);
+	result.m128_u16[6] = PxU16((a).m128_u16[6] > (b).m128_u16[6]);
+	result.m128_u16[7] = PxU16((a).m128_u16[7] > (b).m128_u16[7]);
+	return result;
+}
+
+PX_FORCE_INLINE VecU16V V4I16CompareGt(VecU16V a, VecU16V b) // lane-wise signed 16-bit test a > b
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_cmpgt_epi16(internalWindowsSimd::m128_F2I(a), internalWindowsSimd::m128_F2I(b))); // each 16-bit lane becomes all ones where true, zero where false
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V(VecU32V a) // numeric conversion of four unsigned 32-bit lanes to floats
+{
+	Vec4V result = V4LoadXYZW(PxF32(a.m128_u32[0]), PxF32(a.m128_u32[1]), PxF32(a.m128_u32[2]), PxF32(a.m128_u32[3])); // converted lane by lane with scalar casts
+	return result;
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V(VecI32V a) // numeric conversion of four signed 32-bit lanes to floats
+{
+	return _mm_cvtepi32_ps(internalWindowsSimd::m128_F2I(a));
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V(Vec4V a) // numeric conversion of four floats to signed 32-bit integers
+{
+	return internalWindowsSimd::m128_I2F(_mm_cvttps_epi32(a)); // _mm_cvttps_epi32 truncates toward zero
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V(VecU32V a) // returns a retyped as Vec4V through a plain cast; no numeric conversion call is made
+{
+	return Vec4V(a);
+}
+
+PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V(VecI32V a) // returns a retyped as Vec4V through a plain cast; no numeric conversion call is made
+{
+	return Vec4V(a);
+}
+
+PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V(Vec4V a) // returns a retyped as VecU32V through a plain cast; no numeric conversion call is made
+{
+	return VecU32V(a);
+}
+
+PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V(Vec4V a) // returns a retyped as VecI32V through a plain cast; no numeric conversion call is made
+{
+	return VecI32V(a);
+}
+
+template <int index> // index: the lane (0..3) to replicate, fixed at compile time
+PX_FORCE_INLINE VecU32V V4U32SplatElement(VecU32V a) // broadcasts lane `index` of a to all four lanes
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index)));
+}
+
+template <int index> // index: the lane (0..3) to replicate, fixed at compile time
+PX_FORCE_INLINE Vec4V V4SplatElement(Vec4V a) // broadcasts lane `index` of a to all four lanes
+{
+	return internalWindowsSimd::m128_I2F(
+	    _mm_shuffle_epi32(internalWindowsSimd::m128_F2I(a), _MM_SHUFFLE(index, index, index, index))); // done with the integer shuffle on the raw bits
+}
+
+PX_FORCE_INLINE VecU32V U4LoadXYZW(PxU32 x, PxU32 y, PxU32 z, PxU32 w) // builds the integer vector (x, y, z, w)
+{
+	VecU32V result;
+	result.m128_u32[0] = x; // lanes are written one by one through the m128_u32 member
+	result.m128_u32[1] = y;
+	result.m128_u32[2] = z;
+	result.m128_u32[3] = w;
+	return result;
+}
+
+PX_FORCE_INLINE Vec4V V4ConvertFromI32V(const VecI32V in) // numeric conversion of four signed 32-bit lanes to floats (same body as Vec4V_From_VecI32V)
+{
+	return _mm_cvtepi32_ps(internalWindowsSimd::m128_F2I(in));
+}
+
+#endif // PSFOUNDATION_PSWINDOWSINLINEAOS_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h b/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h
new file mode 100644
index 0000000..ca1e9c5
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsIntrinsics.h
@@ -0,0 +1,190 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSINTRINSICS_H
+#define PSFOUNDATION_PSWINDOWSINTRINSICS_H
+
+#include "Ps.h"
+#include "foundation/PxAssert.h"
+
+// this file is for internal intrinsics - that is, intrinsics that are used in
+// cross platform code but do not appear in the API
+
+#if !PX_WINDOWS_FAMILY
+#error "This file should only be included by Windows builds!!"
+#endif
+
+#pragma warning(push)
+//'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4668)
+#if PX_VC == 10
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#endif
+#include <intrin.h>
+#pragma warning(pop)
+
+#pragma warning(push)
+#pragma warning(disable : 4985) // 'symbol name': attributes not present on previous declaration
+#include <math.h>
+#pragma warning(pop)
+
+#include <float.h>
+#include <mmintrin.h>
+
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+
+namespace physx
+{
+namespace shdfnd
+{
+
+/*
+* Implements a memory barrier
+*/
+PX_FORCE_INLINE void memoryBarrier() // issues the MSVC _ReadWriteBarrier intrinsic
+{
+	_ReadWriteBarrier(); // NOTE(review): Microsoft documents this as a compiler-reordering barrier only, not a CPU fence - confirm that is sufficient for callers
+	/* long Barrier;
+	__asm {
+	    xchg Barrier, eax
+	}*/
+}
+
+/*!
+Returns the index of the highest set bit. Not valid for zero arg.
+*/
+PX_FORCE_INLINE uint32_t highestSetBitUnsafe(uint32_t v) // v must be non-zero
+{
+	unsigned long retval; // left uninitialised; _BitScanReverse fills it in when v != 0
+	_BitScanReverse(&retval, v); // scans from the most significant bit downwards
+	return retval;
+}
+
+/*!
+Returns the index of the lowest set bit. Undefined for zero arg.
+*/
+PX_FORCE_INLINE uint32_t lowestSetBitUnsafe(uint32_t v) // v must be non-zero
+{
+	unsigned long retval; // left uninitialised; _BitScanForward fills it in when v != 0
+	_BitScanForward(&retval, v); // scans from the least significant bit upwards
+	return retval;
+}
+
+/*!
+Returns the number of leading zeros in v. Returns 32 for v=0.
+*/
+PX_FORCE_INLINE uint32_t countLeadingZeros(uint32_t v)
+{
+	// Zero has no set bit to scan for, so it is answered directly
+	// without touching the bit-scan intrinsic.
+	if(!v)
+		return 32;
+
+	unsigned long topBit = (unsigned long)-1;
+	_BitScanReverse(&topBit, v);
+	return 31 - topBit;
+}
+
+/*!
+Prefetch the cache line containing \c ptr+offset.
+*/
+#if !PX_ARM
+PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) // x86/x64 variant: issues a single prefetch for the address ptr + offset (in bytes)
+{
+	// cache line on X86/X64 is 64-bytes so a 128-byte prefetch would require 2 prefetches.
+	// However, we can only dispatch a limited number of prefetch instructions so we opt to prefetch just 1 cache line
+	/*_mm_prefetch(((const char*)ptr + offset), _MM_HINT_T0);*/
+	// We get slightly better performance prefetching to non-temporal addresses instead of all cache levels
+	_mm_prefetch(((const char*)ptr + offset), _MM_HINT_NTA); // non-temporal hint, chosen over the commented-out _MM_HINT_T0 variant above
+}
+#else
+PX_FORCE_INLINE void prefetchLine(const void* ptr, uint32_t offset = 0) // ARM variant: issues a single prefetch for the address ptr + offset (in bytes)
+{
+	// ARM is treated as having a 32-byte cache line
+	__prefetch(((const char*)ptr + offset));
+}
+#endif
+
+/*!
+Prefetch \c count bytes starting at \c ptr.
+*/
+#if !PX_ARM
+PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) // x86/x64: prefetches every 64-byte line overlapped by [ptr, ptr + count); count == 0 prefetches nothing
+{
+	const char* cp = (const char*)ptr;
+	const uint64_t p = size_t(ptr);
+	const uint64_t startLine = p >> 6, endLine = (p + count - 1) >> 6; // indices of the first and last 64-byte line; only meaningful when count != 0
+	// Guard count == 0: on a line-aligned ptr the line count came out as 0 and the former do/while(--lines) wrapped around to ~2^64 iterations.
+	for(uint64_t lines = count ? endLine - startLine + 1 : 0; lines != 0; --lines)
+	{
+		prefetchLine(cp);
+		cp += 64; // advance by one cache line
+	}
+}
+#else
+PX_FORCE_INLINE void prefetch(const void* ptr, uint32_t count = 1) // ARM: prefetches every 32-byte line overlapped by [ptr, ptr + count); count == 0 prefetches nothing
+{
+	const char* cp = (const char*)ptr;
+	const uint32_t p = size_t(ptr);
+	const uint32_t startLine = p >> 5, endLine = (p + count - 1) >> 5; // indices of the first and last 32-byte line; only meaningful when count != 0
+	// Guard count == 0: on a line-aligned ptr the line count came out as 0 and the former do/while(--lines) wrapped around to ~2^32 iterations.
+	for(uint32_t lines = count ? endLine - startLine + 1 : 0; lines != 0; --lines)
+	{
+		prefetchLine(cp);
+		cp += 32; // advance by one cache line
+	}
+}
+#endif
+
+//! \brief platform-specific reciprocal
+PX_CUDA_CALLABLE PX_FORCE_INLINE float recipFast(float a) // returns 1/a
+{
+	return 1.0f / a; // implemented here as an ordinary division, not an estimate instruction
+}
+
+//! \brief platform-specific fast reciprocal square root
+PX_CUDA_CALLABLE PX_FORCE_INLINE float recipSqrtFast(float a) // returns 1/sqrt(a)
+{
+	return 1.0f / ::sqrtf(a); // implemented here with the C library sqrtf and an ordinary division
+}
+
+//! \brief platform-specific floor
+PX_CUDA_CALLABLE PX_FORCE_INLINE float floatFloor(float x) // returns floorf(x)
+{
+	return ::floorf(x); // forwards to the C library floorf
+}
+
+#define NS_EXPECT_TRUE(x) x
+#define NS_EXPECT_FALSE(x) x
+
+} // namespace shdfnd
+} // namespace physx
+
+#endif // #ifndef PSFOUNDATION_PSWINDOWSINTRINSICS_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h b/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h
new file mode 100644
index 0000000..adfd8e4
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsLoadLibrary.h
@@ -0,0 +1,72 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PS_WINDOWS_FOUNDATION_LOADLIBRARY_H
+#define PS_WINDOWS_FOUNDATION_LOADLIBRARY_H
+
+#include "foundation/PxPreprocessor.h"
+#include "windows/PsWindowsInclude.h"
+#include "foundation/windows/PxWindowsFoundationDelayLoadHook.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+	EXTERN_C IMAGE_DOS_HEADER __ImageBase;
+
+	PX_INLINE FARPROC WINAPI foundationDliNotePreLoadLibrary(const char* libraryName, const physx::PxFoundationDelayLoadHook* delayLoadHook)
+	{
+		// Without a hook the requested library is loaded under exactly the name given.
+		if(!delayLoadHook)
+			return (FARPROC)::LoadLibraryA(libraryName);
+
+		// With a hook only PxFoundation libraries are redirected; for any other
+		// name nothing is loaded and NULL is returned.
+		if(!strstr(libraryName, "PxFoundation"))
+			return NULL;
+
+		// The configuration tag inside the requested name picks the hook's DLL name.
+		// Tags are tested in the order DEBUG, CHECKED, PROFILE; the first match wins.
+		if(strstr(libraryName, "DEBUG"))
+			return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationDEBUGDllName());
+		if(strstr(libraryName, "CHECKED"))
+			return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationCHECKEDDllName());
+		if(strstr(libraryName, "PROFILE"))
+			return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationPROFILEDllName());
+
+		// A name without any of the tags above maps to the hook's default
+		// (untagged) PxFoundation DLL name.
+		return (FARPROC)::LoadLibraryA(delayLoadHook->getPxFoundationDllName());
+	}
+} // namespace shdfnd
+} // namespace physx
+
+
+#endif	// PS_WINDOWS_FOUNDATION_LOADLIBRARY_H
diff --git a/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h b/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h
new file mode 100644
index 0000000..dda3e3b
--- /dev/null
+++ b/PxShared/src/foundation/include/windows/PsWindowsTrigConstants.h
@@ -0,0 +1,87 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PSFOUNDATION_PSWINDOWSTRIGCONSTANTS_H
+#define PSFOUNDATION_PSWINDOWSTRIGCONSTANTS_H
+
+#define PX_GLOBALCONST extern const __declspec(selectany) // selectany allows the initialised constants below to be defined in a header included by several translation units
+
+__declspec(align(16)) struct PX_VECTORF32 // four floats declared with 16-byte alignment
+{
+	float f[4]; // the four coefficients, in the order written in each initialiser
+};
+
+//#define PX_PI               3.141592654f
+//#define PX_2PI              6.283185307f
+//#define PX_1DIVPI           0.318309886f
+//#define PX_1DIV2PI          0.159154943f
+//#define PX_PIDIV2           1.570796327f
+//#define PX_PIDIV4           0.785398163f
+
+PX_GLOBALCONST PX_VECTORF32 g_PXSinCoefficients0 = { { 1.0f, -0.166666667f, 8.333333333e-3f, -1.984126984e-4f } }; // values match the sine Taylor terms 1, -1/3!, 1/5!, -1/7!
+PX_GLOBALCONST PX_VECTORF32
+g_PXSinCoefficients1 = { { 2.755731922e-6f, -2.505210839e-8f, 1.605904384e-10f, -7.647163732e-13f } }; // continues the series: 1/9!, -1/11!, ...
+PX_GLOBALCONST PX_VECTORF32
+g_PXSinCoefficients2 = { { 2.811457254e-15f, -8.220635247e-18f, 1.957294106e-20f, -3.868170171e-23f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXCosCoefficients0 = { { 1.0f, -0.5f, 4.166666667e-2f, -1.388888889e-3f } }; // values match the cosine Taylor terms 1, -1/2!, 1/4!, -1/6!
+PX_GLOBALCONST PX_VECTORF32
+g_PXCosCoefficients1 = { { 2.480158730e-5f, -2.755731922e-7f, 2.087675699e-9f, -1.147074560e-11f } }; // continues the series: 1/8!, -1/10!, ...
+PX_GLOBALCONST PX_VECTORF32
+g_PXCosCoefficients2 = { { 4.779477332e-14f, -1.561920697e-16f, 4.110317623e-19f, -8.896791392e-22f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXTanCoefficients0 = { { 1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f } }; // values match the tangent Taylor terms 1, 1/3, 2/15, 17/315
+PX_GLOBALCONST PX_VECTORF32
+g_PXTanCoefficients1 = { { 2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXTanCoefficients2 = { { 5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinCoefficients0 = { { -0.05806367563904f, -0.41861972469416f, 0.22480114791621f, 2.17337241360606f } }; // presumably fitted polynomial coefficients for asin - derivation not shown here
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinCoefficients1 = { { 0.61657275907170f, 4.29696498283455f, -1.18942822255452f, -6.53784832094831f } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinCoefficients2 = { { -1.36926553863413f, -4.48179294237210f, 1.41810672941833f, 5.48179257935713f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXATanCoefficients0 = { { 1.0f, 0.333333334f, 0.2f, 0.142857143f } }; // magnitudes 1, 1/3, 1/5, 1/7 of the arctangent series; signs are not stored
+PX_GLOBALCONST PX_VECTORF32
+g_PXATanCoefficients1 = { { 1.111111111e-1f, 9.090909091e-2f, 7.692307692e-2f, 6.666666667e-2f } }; // 1/9, 1/11, 1/13, 1/15
+PX_GLOBALCONST PX_VECTORF32
+g_PXATanCoefficients2 = { { 5.882352941e-2f, 5.263157895e-2f, 4.761904762e-2f, 4.347826087e-2f } }; // 1/17, 1/19, 1/21, 1/23
+PX_GLOBALCONST PX_VECTORF32
+g_PXSinEstCoefficients = { { 1.0f, -1.66521856991541e-1f, 8.199913018755e-3f, -1.61475937228e-4f } }; // shorter coefficient sets for the "Est" (estimate) variants
+PX_GLOBALCONST PX_VECTORF32
+g_PXCosEstCoefficients = { { 1.0f, -4.95348008918096e-1f, 3.878259962881e-2f, -9.24587976263e-4f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXTanEstCoefficients = { { 2.484f, -1.954923183e-1f, 2.467401101f, PxInvPi } }; // PxInvPi, PxPiDivTwo, PxPi, PxTwoPi and PxInvTwoPi are not defined in this header and must be visible before it is included
+PX_GLOBALCONST PX_VECTORF32
+g_PXATanEstCoefficients = { { 7.689891418951e-1f, 1.104742493348f, 8.661844266006e-1f, PxPiDivTwo } };
+PX_GLOBALCONST PX_VECTORF32
+g_PXASinEstCoefficients = { { -1.36178272886711f, 2.37949493464538f, -8.08228565650486e-1f, 2.78440142746736e-1f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXASinEstConstants = { { 1.00000011921f, PxPiDivTwo, 0.0f, 0.0f } };
+PX_GLOBALCONST PX_VECTORF32 g_PXPiConstants0 = { { PxPi, PxTwoPi, PxInvPi, PxInvTwoPi } }; // pi, 2*pi, 1/pi, 1/(2*pi) packed into one vector
+PX_GLOBALCONST PX_VECTORF32 g_PXReciprocalTwoPi = { { PxInvTwoPi, PxInvTwoPi, PxInvTwoPi, PxInvTwoPi } }; // 1/(2*pi) in every lane
+PX_GLOBALCONST PX_VECTORF32 g_PXTwoPi = { { PxTwoPi, PxTwoPi, PxTwoPi, PxTwoPi } }; // 2*pi in every lane
+
+#endif
diff --git a/PxShared/src/foundation/src/PsAllocator.cpp b/PxShared/src/foundation/src/PsAllocator.cpp
new file mode 100644
index 0000000..3952b27
--- /dev/null
+++ b/PxShared/src/foundation/src/PsAllocator.cpp
@@ -0,0 +1,124 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsFoundation.h"
+#include "PsAllocator.h"
+#include "PsHashMap.h"
+#include "PsArray.h"
+#include "PsMutex.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+#if PX_USE_NAMED_ALLOCATOR
+namespace
+{
+// Associates each registered NamedAllocator instance (by address) with its name string.
+typedef HashMap<const NamedAllocator*, const char*, Hash<const NamedAllocator*>, NonTrackingAllocator> AllocNameMap;
+// Fetches the allocator-name map from the foundation object.
+PX_INLINE AllocNameMap& getMap()
+{
+	return getFoundation().getNamedAllocMap();
+}
+// Fetches the mutex from the foundation object; the NamedAllocator members in this file
+// lock it before every access to the map.
+PX_INLINE Foundation::Mutex& getMutex()
+{
+	return getFoundation().getNamedAllocMutex();
+}
+}
+
+// Registers this allocator in the name map with a null name.
+NamedAllocator::NamedAllocator(const PxEMPTY)
+{
+	Foundation::Mutex::ScopedLock lock(getMutex());
+	getMap().insert(this, 0);
+}
+
+// Registers this allocator in the name map under `name`.
+// Only the pointer is stored; the string itself is not copied.
+NamedAllocator::NamedAllocator(const char* name)
+{
+	Foundation::Mutex::ScopedLock lock(getMutex());
+	getMap().insert(this, name);
+}
+
+// Registers this allocator under the same name pointer that `other` is registered with.
+// `other` is expected to be present in the map (checked with PX_ASSERT only).
+NamedAllocator::NamedAllocator(const NamedAllocator& other)
+{
+	Foundation::Mutex::ScopedLock lock(getMutex());
+	const AllocNameMap::Entry* e = getMap().find(&other);
+	PX_ASSERT(e);
+	const char* name = e->second; // The copy is important because insert might invalidate the referenced hash entry
+	getMap().insert(this, name);
+}
+
+// Removes this allocator's entry from the name map.
+NamedAllocator::~NamedAllocator()
+{
+	Foundation::Mutex::ScopedLock lock(getMutex());
+	bool erased = getMap().erase(this);
+	// Keeps `erased` referenced when PX_ASSERT expands to nothing (presumably in non-debug builds).
+	PX_UNUSED(erased);
+	PX_ASSERT(erased);
+}
+
+// Replaces the name registered for this allocator with the one registered for `other`.
+// `other` is expected to be present in the map (checked with PX_ASSERT only).
+// Returns *this.
+NamedAllocator& NamedAllocator::operator=(const NamedAllocator& other)
+{
+	Foundation::Mutex::ScopedLock lock(getMutex());
+	const AllocNameMap::Entry* e = getMap().find(&other);
+	PX_ASSERT(e);
+	// Read the name before touching the map again: operator[] may insert, an insert might
+	// invalidate the entry `e` points to, and the evaluation order of `map[this] = e->second`
+	// is unspecified before C++17. This mirrors what the copy constructor does.
+	const char* name = e->second;
+	getMap()[this] = name;
+	return *this;
+}
+
+// Allocates `size` bytes through the foundation allocator, tagging the request with the
+// name registered for this allocator plus the caller's file/line.
+// Returns a null pointer for zero-sized requests.
+void* NamedAllocator::allocate(size_t size, const char* filename, int line)
+{
+	if(size == 0)
+		return 0;
+
+	// The lock is taken for the lookup and stays held until the function returns,
+	// i.e. across the call into the underlying allocator as well.
+	Foundation::Mutex::ScopedLock lock(getMutex());
+	const AllocNameMap::Entry* entry = getMap().find(this);
+	PX_ASSERT(entry);
+	const char* allocName = entry->second;
+	return getAllocator().allocate(size, allocName, filename, line);
+}
+
+// Returns `ptr` to the foundation allocator; a null pointer is ignored.
+// Does not touch the name map and therefore takes no lock.
+void NamedAllocator::deallocate(void* ptr)
+{
+	if(ptr)
+		getAllocator().deallocate(ptr);
+}
+
+#endif // PX_USE_NAMED_ALLOCATOR
+
+// Allocates `size` bytes through the foundation allocator with an empty allocation name.
+// Returns a null pointer for zero-sized requests.
+void* Allocator::allocate(size_t size, const char* file, int line)
+{
+	if(!size)
+		return 0;
+	return getAllocator().allocate(size, "", file, line);
+}
+// Returns `ptr` to the foundation allocator; a null pointer is ignored.
+void Allocator::deallocate(void* ptr)
+{
+	if(ptr)
+		getAllocator().deallocate(ptr);
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/PsAssert.cpp b/PxShared/src/foundation/src/PsAssert.cpp
new file mode 100644
index 0000000..3070383
--- /dev/null
+++ b/PxShared/src/foundation/src/PsAssert.cpp
@@ -0,0 +1,90 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxAssert.h"
+
+#include <stdio.h>
+#include "PsString.h"
+
+#if PX_WINDOWS_FAMILY
+#include <crtdbg.h>
+#elif PX_NX
+#include "nx/PsNXAbort.h"
+#endif
+
+namespace
+{
+// Built-in assert handler, installed until PxSetAssertHandler replaces it.
+// Formats "<file>(<line>) : Assertion failed: <expr>", prints it, then breaks or aborts
+// depending on the platform/configuration.
+class DefaultAssertHandler : public physx::PxAssertHandler
+{
+	// expr   - text of the failed expression
+	// file   - source file of the assertion
+	// line   - source line of the assertion
+	// ignore - only written in the Windows debug configuration: set to true when the CRT
+	//          report runs in window mode and returns 0
+	virtual void operator()(const char* expr, const char* file, int line, bool& ignore)
+	{
+		PX_UNUSED(ignore); // is used only in debug windows config
+		char buffer[1024];
+		// Bounded formatting on every platform: long file names or expressions are truncated
+		// to the buffer size instead of overflowing the stack buffer.
+		physx::shdfnd::snprintf(buffer, sizeof(buffer), "%s(%d) : Assertion failed: %s\n", file, line, expr);
+		physx::shdfnd::printString(buffer);
+#if PX_WINDOWS_FAMILY&& PX_DEBUG
+		// _CrtDbgReport returns -1 on error, 1 on 'retry', 0 otherwise including 'ignore'.
+		// Hitting 'abort' will terminate the process immediately.
+		int result = _CrtDbgReport(_CRT_ASSERT, file, line, NULL, "%s", buffer);
+		int mode = _CrtSetReportMode(_CRT_ASSERT, _CRTDBG_REPORT_MODE);
+		ignore = _CRTDBG_MODE_WNDW == mode && result == 0;
+		if(ignore)
+			return;
+		__debugbreak();
+#elif PX_WINDOWS_FAMILY&& PX_CHECKED
+		__debugbreak();
+#elif PX_NX
+		abort(buffer);
+#else
+		abort();
+#endif
+	}
+};
+
+DefaultAssertHandler sAssertHandler;
+// Currently installed handler; starts out pointing at the built-in one.
+physx::PxAssertHandler* sAssertHandlerPtr = &sAssertHandler;
+}
+
+namespace physx
+{
+
+// Returns the currently installed assert handler. Until PxSetAssertHandler is called this
+// is the built-in DefaultAssertHandler instance.
+PxAssertHandler& PxGetAssertHandler()
+{
+	return *sAssertHandlerPtr;
+}
+
+// Installs `handler` as the assert handler. Only its address is stored, so the object
+// must stay alive for as long as it remains installed. No locking is performed here.
+void PxSetAssertHandler(PxAssertHandler& handler)
+{
+	sAssertHandlerPtr = &handler;
+}
+} // end of physx namespace
diff --git a/PxShared/src/foundation/src/PsFoundation.cpp b/PxShared/src/foundation/src/PsFoundation.cpp
new file mode 100644
index 0000000..e559b57
--- /dev/null
+++ b/PxShared/src/foundation/src/PsFoundation.cpp
@@ -0,0 +1,278 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxProfiler.h"
+#include "foundation/PxErrorCallback.h"
+#include "foundation/PxFoundationVersion.h"
+#include "PsFoundation.h"
+#include "PsString.h"
+#include "PsAllocator.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Stores references to the user-supplied callbacks, builds the broadcasting allocator and
+// error wrappers around them and constructs the three internal mutexes.
+//   errc  - error callback supplied by the user
+//   alloc - allocator callback supplied by the user
+Foundation::Foundation(PxErrorCallback& errc, PxAllocatorCallback& alloc)
+: mAllocatorCallback(alloc)
+, mErrorCallback(errc)
+, mBroadcastingAllocator(alloc, errc)
+, mBroadcastingError(errc)
+,
+// Allocation-name reporting starts enabled only when PX_CHECKED is set.
+#if PX_CHECKED
+    mReportAllocationNames(true)
+,
+#else
+    mReportAllocationNames(false)
+,
+#endif
+    // All bits set: errorImpl() tests `e & mErrorMask`, so initially no error code is filtered out.
+    mErrorMask(PxErrorCode::Enum(~0))
+, mErrorMutex(PX_DEBUG_EXP("Foundation::mErrorMutex"))
+, mNamedAllocMutex(PX_DEBUG_EXP("Foundation::mNamedAllocMutex"))
+, mTempAllocMutex(PX_DEBUG_EXP("Foundation::mTempAllocMutex"))
+{
+}
+
+// Frees every chunk still linked into the temp-allocator free table, then resets the table.
+Foundation::~Foundation()
+{
+	Allocator alloc;
+	for(PxU32 bucket = 0; bucket < mTempAllocFreeTable.size(); ++bucket)
+	{
+		// Each table slot heads a singly linked list; fetch the successor before
+		// the current chunk is handed back to the allocator.
+		TempAllocatorChunk* chunk = mTempAllocFreeTable[bucket];
+		while(chunk)
+		{
+			TempAllocatorChunk* const following = chunk->mNext;
+			alloc.deallocate(chunk);
+			chunk = following;
+		}
+	}
+	mTempAllocFreeTable.reset();
+}
+
+// Returns the singleton. The instance must already exist; this is only checked with
+// PX_ASSERT, so a missing instance results in a null dereference when asserts are disabled.
+Foundation& Foundation::getInstance()
+{
+	PX_ASSERT(mInstance);
+	return *mInstance;
+}
+
+// Returns the static warn-once timestamp, which createInstance() advances each time a
+// foundation object is successfully created.
+PxU32 Foundation::getWarnOnceTimestamp()
+{
+	PX_ASSERT(mInstance != NULL);
+	return mWarnOnceTimestap;
+}
+
+// printf-style front end for errorImpl(): packs the variadic arguments into a va_list
+// and forwards everything unchanged.
+//   c          - error code
+//   file, line - location to report
+//   messageFmt - printf-style format string followed by its arguments
+void Foundation::error(PxErrorCode::Enum c, const char* file, int line, const char* messageFmt, ...)
+{
+	va_list va;
+	va_start(va, messageFmt);
+	errorImpl(c, file, line, messageFmt, va);
+	va_end(va);
+}
+
+void Foundation::errorImpl(PxErrorCode::Enum e, const char* file, int line, const char* messageFmt, va_list va)
+{
+	PX_ASSERT(messageFmt);
+	if(e & mErrorMask)
+	{
+		// this function is reentrant but user's error callback may not be, so...
+		Mutex::ScopedLock lock(mErrorMutex);
+
+		// using a static fixed size buffer here because:
+		// 1. vsnprintf return values differ between platforms
+		// 2. va_start is only usable in functions with ellipses
+		// 3. ellipses (...) cannot be passed to called function
+		// which would be necessary to dynamically grow the buffer here
+
+		static const size_t bufSize = 1024;
+		char stringBuffer[bufSize];
+		shdfnd::vsnprintf(stringBuffer, bufSize, messageFmt, va);
+
+		mBroadcastingError.reportError(e, stringBuffer, file, line);
+	}
+}
+
+// Creates the process-wide Foundation singleton.
+//   version - must equal PX_FOUNDATION_VERSION
+//   errc    - error callback; also used to report failures of this function
+//   alloc   - allocator callback; provides the memory for the Foundation object itself
+// Returns the new instance, or 0 when the version does not match, an instance already
+// exists, or the allocation fails. Each failure is reported through `errc`.
+Foundation* Foundation::createInstance(PxU32 version, PxErrorCallback& errc, PxAllocatorCallback& alloc)
+{
+	if(version != PX_FOUNDATION_VERSION)
+	{
+		// Stack buffer: the text is only needed for the duration of the reportError call
+		// (errorImpl() hands the callback a stack buffer as well), so nothing is leaked.
+		char buffer[256];
+		physx::shdfnd::snprintf(buffer, sizeof(buffer), "Wrong version: foundation version is 0x%08x, tried to create 0x%08x",
+		                        PX_FOUNDATION_VERSION, version);
+		errc.reportError(PxErrorCode::eINVALID_PARAMETER, buffer, __FILE__, __LINE__);
+		return 0;
+	}
+
+	if(!mInstance)
+	{
+		// if we don't assign this here, the Foundation object can't create member
+		// subobjects which require the allocator
+
+		mInstance = reinterpret_cast<Foundation*>(alloc.allocate(sizeof(Foundation), "Foundation", __FILE__, __LINE__));
+
+		if(mInstance)
+		{
+			PX_PLACEMENT_NEW(mInstance, Foundation)(errc, alloc);
+
+			PX_ASSERT(mRefCount == 0);
+			mRefCount = 1;
+
+			// skip 0 which marks uninitialized timestamps in PX_WARN_ONCE
+			mWarnOnceTimestap = (mWarnOnceTimestap == PX_MAX_U32) ? 1 : mWarnOnceTimestap + 1;
+
+			return mInstance;
+		}
+		else
+		{
+			errc.reportError(PxErrorCode::eINTERNAL_ERROR, "Memory allocation for foundation object failed.", __FILE__,
+			                 __LINE__);
+		}
+	}
+	else
+	{
+		errc.reportError(PxErrorCode::eINVALID_OPERATION,
+		                 "Foundation object exists already. Only one instance per process can be created.", __FILE__,
+		                 __LINE__);
+	}
+
+	return 0;
+}
+
+// Destroys the singleton when exactly one reference is left; otherwise reports an
+// eINVALID_OPERATION error through the instance and leaves everything untouched.
+void Foundation::destroyInstance()
+{
+	PX_ASSERT(mInstance != NULL);
+
+	if(mRefCount != 1)
+	{
+		mInstance->error(PxErrorCode::eINVALID_OPERATION, __FILE__, __LINE__,
+		                 "Foundation destruction failed due to pending module references. Close/release all depending "
+		                 "modules first.");
+		return;
+	}
+
+	// Fetch the allocator callback first: it is reached through the object that is about
+	// to be destroyed and is still needed to release the object's storage afterwards.
+	PxAllocatorCallback& alloc = mInstance->getAllocatorCallback();
+	mInstance->~Foundation();
+	alloc.deallocate(mInstance);
+	mInstance = 0;
+	mRefCount = 0;
+}
+
+// Adds one reference to the singleton. A reference count of zero means there is nothing
+// valid to register with; that case is reported as eINVALID_OPERATION instead.
+void Foundation::incRefCount()
+{
+	PX_ASSERT(mInstance != NULL);
+
+	if(mRefCount == 0)
+	{
+		mInstance->error(PxErrorCode::eINVALID_OPERATION, __FILE__, __LINE__,
+		                 "Foundation: Invalid registration detected.");
+		return;
+	}
+
+	++mRefCount;
+}
+
+// Drops one reference from the singleton. When the count is already zero the call is
+// reported as eINVALID_OPERATION and the count is left unchanged.
+void Foundation::decRefCount()
+{
+	PX_ASSERT(mInstance != NULL);
+
+	if(mRefCount == 0)
+	{
+		mInstance->error(PxErrorCode::eINVALID_OPERATION, __FILE__, __LINE__,
+		                 "Foundation: Invalid deregistration detected.");
+		return;
+	}
+
+	--mRefCount;
+}
+
+// Forwards to the static destroyInstance().
+void Foundation::release()
+{
+	Foundation::destroyInstance();
+}
+
+// Free-function shortcut for the allocator returned by the foundation object.
+PxAllocatorCallback& getAllocator()
+{
+	return getFoundation().getAllocator();
+}
+
+// Static singleton state. All three start out zeroed; createInstance() and
+// destroyInstance() update the instance pointer and the reference count.
+Foundation* Foundation::mInstance = NULL;
+PxU32 Foundation::mRefCount = 0;
+PxU32 Foundation::mWarnOnceTimestap = 0;
+
+// The four functions below add or remove listeners on the broadcasting allocator and the
+// broadcasting error callback. All of them hold mListenerMutex for the duration of the call.
+
+// Adds `listener` to the broadcasting allocator.
+void Foundation::registerAllocationListener(physx::shdfnd::AllocationListener& listener)
+{
+	Mutex::ScopedLock lock(mListenerMutex);
+	mBroadcastingAllocator.registerListener(listener);
+}
+
+// Removes `listener` from the broadcasting allocator.
+void Foundation::deregisterAllocationListener(physx::shdfnd::AllocationListener& listener)
+{
+	Mutex::ScopedLock lock(mListenerMutex);
+	mBroadcastingAllocator.deregisterListener(listener);
+}
+
+// Adds `callback` to the broadcasting error callback.
+void Foundation::registerErrorCallback(PxErrorCallback& callback)
+{
+	Mutex::ScopedLock lock(mListenerMutex);
+	mBroadcastingError.registerListener(callback);
+}
+
+// Removes `callback` from the broadcasting error callback.
+void Foundation::deregisterErrorCallback(PxErrorCallback& callback)
+{
+	Mutex::ScopedLock lock(mListenerMutex);
+	mBroadcastingError.deregisterListener(callback);
+}
+
+// Global profiler callback pointer; null until PxSetProfilerCallback stores one.
+physx::PxProfilerCallback* gProfilerCallback = NULL;
+
+} // namespace shdfnd
+} // namespace physx
+
+// Global entry point: forwards to Foundation::createInstance. Note the argument order
+// differs - createInstance takes the error callback before the allocator.
+physx::PxFoundation* PxCreateFoundation(physx::PxU32 version, physx::PxAllocatorCallback& allocator,
+                                        physx::PxErrorCallback& errorCallback)
+{
+	return physx::shdfnd::Foundation::createInstance(version, errorCallback, allocator);
+}
+
+// Returns the foundation singleton; see Foundation::getInstance() for the precondition.
+physx::PxFoundation& PxGetFoundation()
+{
+	return physx::shdfnd::Foundation::getInstance();
+}
+
+// Returns the profiler callback last stored by PxSetProfilerCallback, or null if none was set.
+physx::PxProfilerCallback* PxGetProfilerCallback()
+{
+	return physx::shdfnd::gProfilerCallback;
+}
+
+// Stores `profiler` in the global profiler pointer (may be null). No locking is performed.
+void PxSetProfilerCallback(physx::PxProfilerCallback* profiler)
+{
+	physx::shdfnd::gProfilerCallback = profiler;
+}
diff --git a/PxShared/src/foundation/src/PsMathUtils.cpp b/PxShared/src/foundation/src/PsMathUtils.cpp
new file mode 100644
index 0000000..b900fdd
--- /dev/null
+++ b/PxShared/src/foundation/src/PsMathUtils.cpp
@@ -0,0 +1,212 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMat33.h"
+#include "foundation/PxMathUtils.h"
+#include "foundation/PxVec4.h"
+#include "foundation/PxAssert.h"
+#include "PsMathUtils.h"
+#include "PsUtilities.h"
+#include "PsBasicTemplates.h"
+
+using namespace physx;
+using namespace physx::shdfnd;
+using namespace physx::intrinsics;
+
+// Builds a normalized quaternion from the cross and dot product of v0 and v1.
+// NOTE(review): presumably both inputs are expected to be unit length - confirm with callers.
+PX_FOUNDATION_API PxQuat physx::PxShortestRotation(const PxVec3& v0, const PxVec3& v1)
+{
+	const PxReal dotProduct = v0.dot(v1);
+	const PxVec3 axis = v0.cross(v1);
+
+	// Regular case: vector part from the cross product, scalar part 1 + dot.
+	if(dotProduct > -1)
+		return PxQuat(axis.x, axis.y, axis.z, 1 + dotProduct).getNormalized();
+
+	// Remaining case (dot <= -1): use a quaternion with zero scalar part whose vector part
+	// is perpendicular to v0. Which perpendicular is chosen depends on the size of v0.x.
+	if(PxAbs(v0.x) < 0.1f)
+		return PxQuat(0.0f, v0.z, -v0.y, 0.0f).getNormalized();
+
+	return PxQuat(v0.y, -v0.x, 0.0f, 0.0f).getNormalized();
+}
+
+namespace
+{
+// indexed rotation around axis, with sine and cosine of half-angle
+//   axis - index (0, 1 or 2) of the vector component that receives `s`; not range-checked
+//   s, c - become the selected vector component and the scalar part of the quaternion
+PxQuat indexedRotation(PxU32 axis, PxReal s, PxReal c)
+{
+	PxReal v[3] = { 0, 0, 0 };
+	v[axis] = s;
+	return PxQuat(v[0], v[1], v[2], c);
+}
+}
+
+// Iteratively rotates `m` towards diagonal form.
+//   m         - input matrix (presumably symmetric - TODO confirm with callers)
+//   massFrame - receives the accumulated rotation quaternion
+// Returns the diagonal entries of the rotated matrix `d` from the last executed iteration.
+// NOTE(review): when the loop ends by reaching MAX_ITERS rather than by `break`, `d` was
+// computed before the final update of `q`, so the returned diagonal and massFrame are one
+// rotation step apart - confirm whether this is intended.
+PX_FOUNDATION_API PxVec3 physx::PxDiagonalize(const PxMat33& m, PxQuat& massFrame)
+{
+	// jacobi rotation using quaternions (from an idea of Stan Melax, with fix for precision issues)
+
+	const PxU32 MAX_ITERS = 24;
+
+	PxQuat q = PxQuat(PxIdentity);
+
+	PxMat33 d;
+	for(PxU32 i = 0; i < MAX_ITERS; i++)
+	{
+		// Express m in the frame given by the current rotation estimate.
+		PxMat33 axes(q);
+		d = axes.getTranspose() * m * axes;
+
+		PxReal d0 = PxAbs(d[1][2]), d1 = PxAbs(d[0][2]), d2 = PxAbs(d[0][1]);
+		PxU32 a = PxU32(d0 > d1 && d0 > d2 ? 0 : d1 > d2 ? 1 : 2); // rotation axis index, from largest off-diagonal
+		// element
+
+		// Stop when the selected off-diagonal element is exactly zero, or is negligible
+		// relative to the difference of the two diagonal entries it couples.
+		PxU32 a1 = shdfnd::getNextIndex3(a), a2 = shdfnd::getNextIndex3(a1);
+		if(d[a1][a2] == 0.0f || PxAbs(d[a1][a1] - d[a2][a2]) > 2e6f * PxAbs(2.0f * d[a1][a2]))
+			break;
+
+		PxReal w = (d[a1][a1] - d[a2][a2]) / (2.0f * d[a1][a2]); // cot(2 * phi), where phi is the rotation angle
+		PxReal absw = PxAbs(w);
+
+		PxQuat r;
+		if(absw > 1000)
+			r = indexedRotation(a, 1 / (4 * w), 1.f); // h will be very close to 1, so use small angle approx instead
+		else
+		{
+			PxReal t = 1 / (absw + PxSqrt(w * w + 1)); // absolute value of tan phi
+			PxReal h = 1 / PxSqrt(t * t + 1);          // absolute value of cos phi
+
+			PX_ASSERT(h != 1); // |w|<1000 guarantees this with typical IEEE754 machine eps (approx 6e-8)
+			// Half-angle sine/cosine derived from cos phi; the sine takes the sign of w.
+			r = indexedRotation(a, PxSqrt((1 - h) / 2) * PxSign(w), PxSqrt((1 + h) / 2));
+		}
+
+		// Accumulate the step and renormalize.
+		q = (q * r).getNormalized();
+	}
+
+	massFrame = q;
+	return PxVec3(d.column0.x, d.column1.y, d.column2.z);
+}
+
+/**
+\brief computes an oriented bounding box around the scaled basis.
+\param basis Input = skewed basis, Output = (normalized) orthogonal basis.
+\return Bounding box extent.
+*/
+PxVec3 physx::shdfnd::optimizeBoundingBox(PxMat33& basis)
+{
+	PxVec3* PX_RESTRICT vec = &basis[0]; // PT: don't copy vectors if not needed...
+
+	// PT: since we store the magnitudes to memory, we can avoid the FCMPs afterwards
+	PxVec3 magnitude(vec[0].magnitudeSquared(), vec[1].magnitudeSquared(), vec[2].magnitudeSquared());
+
+	// find indices sorted by magnitude: i = largest, j = middle, k = smallest
+	unsigned int i = magnitude[1] > magnitude[0] ? 1 : 0u;
+	unsigned int j = magnitude[2] > magnitude[1 - i] ? 2 : 1 - i;
+	const unsigned int k = 3 - i - j;
+
+	if(magnitude[i] < magnitude[j])
+		swap(i, j);
+
+	PX_ASSERT(magnitude[i] >= magnitude[j] && magnitude[i] >= magnitude[k] && magnitude[j] >= magnitude[k]);
+
+	// ortho-normalize basis
+
+	PxReal invSqrt = PxRecipSqrt(magnitude[i]);
+	magnitude[i] *= invSqrt; // squared length times 1/length gives the length
+	vec[i] *= invSqrt; // normalize the first axis
+	PxReal dotij = vec[i].dot(vec[j]);
+	PxReal dotik = vec[i].dot(vec[k]);
+	magnitude[i] += PxAbs(dotij) + PxAbs(dotik); // elongate the axis by projection of the other two
+	vec[j] -= vec[i] * dotij;                    // orthogonalize the two remaining axes relative to vec[i]
+	vec[k] -= vec[i] * dotik;
+
+	// normalize() is used here for both effects: it normalizes in place and its return value becomes the extent
+	magnitude[j] = vec[j].normalize();
+	PxReal dotjk = vec[j].dot(vec[k]);
+	magnitude[j] += PxAbs(dotjk); // elongate the axis by projection of the other one
+	vec[k] -= vec[j] * dotjk;     // orthogonalize vec[k] relative to vec[j]
+
+	magnitude[k] = vec[k].normalize();
+
+	return magnitude;
+}
+
+// Spherical linear interpolation between two quaternions.
+//   t     - interpolation parameter; t = 0 weights `left` only, t = 1 weights `right` only
+//   left  - start quaternion
+//   right - end quaternion
+// When the two quaternions are (anti)parallel to within the epsilon below, `left` is
+// returned unchanged. The result is not renormalized here.
+PxQuat physx::shdfnd::slerp(const PxReal t, const PxQuat& left, const PxQuat& right)
+{
+	const PxReal quatEpsilon = (PxReal(1.0e-8f));
+
+	// Work with the absolute value of the dot product and remember whether it was negated,
+	// so the weight of `right` can be flipped accordingly.
+	const PxReal rawDot = left.dot(right);
+	const bool negated = rawDot < 0;
+	const PxReal cosine = negated ? -rawDot : rawDot;
+	const PxReal sign = negated ? PxReal(-1) : PxReal(1);
+
+	const PxReal sineSquared = PxReal(1) - cosine * cosine;
+
+	// Written as a negated >= so that a NaN takes the early-out path as well.
+	if(!(sineSquared >= quatEpsilon * quatEpsilon))
+		return left;
+
+	const PxReal sine = PxSqrt(sineSquared);
+	const PxReal angle = PxAtan2(sine, cosine);
+	const PxReal invSine = PxReal(1) / sine;
+
+	const PxReal leftWeight = PxSin(angle * (PxReal(1) - t)) * invSine;
+	const PxReal rightWeight = PxSin(angle * t) * invSine * sign;
+
+	return left * leftWeight + right * rightWeight;
+}
+
+// Advances a transform by one time step.
+//   curTrans - transform at the start of the step
+//   linvel   - linear velocity, applied as p + linvel * timeStep
+//   angvel   - angular velocity; its magnitude is the rotation rate
+//   timeStep - step length
+//   result   - receives the integrated transform; may alias curTrans for the orientation
+//              update (a temporary is used for that)
+// The resulting quaternion is not renormalized here.
+void physx::shdfnd::integrateTransform(const PxTransform& curTrans, const PxVec3& linvel, const PxVec3& angvel,
+                                       PxReal timeStep, PxTransform& result)
+{
+	result.p = curTrans.p + linvel * timeStep;
+
+	// from void DynamicsContext::integrateAtomPose(PxsRigidBody* atom, Cm::BitMap &shapeChangedMap) const:
+	// Integrate the rotation using closed form quaternion integrator
+	PxReal w = angvel.magnitudeSquared();
+
+	if(w != 0.0f)
+	{
+		w = PxSqrt(w);
+		// re-check after the square root, since w is used as a divisor below
+		if(w != 0.0f)
+		{
+			const PxReal v = timeStep * w * 0.5f; // half of the rotation angle for this step
+			const PxReal q = PxCos(v);
+			const PxReal s = PxSin(v) / w; // dividing by w normalizes angvel in the next line
+
+			const PxVec3 pqr = angvel * s;
+			const PxQuat quatVel(pqr.x, pqr.y, pqr.z, 0);
+			PxQuat out; // need to have temporary, otherwise we may overwrite input if &curTrans == &result.
+			// quatVel * q + cos(v) * q, i.e. curTrans.q multiplied on the left by the
+			// quaternion (pqr, cos(v))
+			out = quatVel * curTrans.q;
+			out.x += curTrans.q.x * q;
+			out.y += curTrans.q.y * q;
+			out.z += curTrans.q.z * q;
+			out.w += curTrans.q.w * q;
+			result.q = out;
+			return;
+		}
+	}
+	// no angular velocity: orientation stays the same
+	result.q = curTrans.q;
+}
diff --git a/PxShared/src/foundation/src/PsString.cpp b/PxShared/src/foundation/src/PsString.cpp
new file mode 100644
index 0000000..adb29d6
--- /dev/null
+++ b/PxShared/src/foundation/src/PsString.cpp
@@ -0,0 +1,185 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsString.h"
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+
+#if PX_WINDOWS_FAMILY
+#pragma warning(push)
+#pragma warning(disable : 4996) // unsafe string functions
+#endif
+
+#if PX_PS4 || PX_APPLE_FAMILY
+#pragma clang diagnostic push
+// error : format string is not a string literal
+#pragma clang diagnostic ignored "-Wformat-nonliteral"
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+// cross-platform implementations
+
+// Thin wrapper around the C library ::strcmp.
+int32_t strcmp(const char* str1, const char* str2)
+{
+	return ::strcmp(str1, str2);
+}
+
+// Thin wrapper around the C library ::strncmp; compares at most `count` characters.
+int32_t strncmp(const char* str1, const char* str2, size_t count)
+{
+	return ::strncmp(str1, str2, count);
+}
+
+// printf-style formatting into `dst` (capacity `dstSize`). Packs the variadic arguments
+// into a va_list and forwards to shdfnd::vsnprintf, whose result is returned unchanged.
+int32_t snprintf(char* dst, size_t dstSize, const char* format, ...)
+{
+	va_list arg;
+	va_start(arg, format);
+	int32_t r = shdfnd::vsnprintf(dst, dstSize, format, arg);
+	va_end(arg);
+	return r;
+}
+
+// scanf-style parsing of `buffer` according to `format`.
+// Returns the value reported by the underlying C library call.
+int32_t sscanf(const char* buffer, const char* format, ...)
+{
+	va_list arg;
+	va_start(arg, format);
+#if PX_VC && (PX_VC < 12)
+	// Legacy path, kept unchanged and limited to genuinely old MSVC versions. The previous
+	// condition was also true for every non-MSVC, non-Linux target (where PX_VC is 0), which
+	// then passed a va_list to the variadic ::sscanf.
+	// NOTE(review): handing a va_list to ::sscanf is not a correct substitute for vsscanf;
+	// this branch should be revisited if those compilers are still supported.
+	int32_t r = ::sscanf(buffer, format, arg);
+#else
+	int32_t r = ::vsscanf(buffer, format, arg);
+#endif
+	va_end(arg);
+
+	return r;
+}
+
+// Copies `src` into `dst`, writing at most dstSize - 1 characters plus a terminating zero.
+// Nothing is written when `dst` is null or `dstSize` is zero.
+// Returns the full length of `src`, so a result >= dstSize indicates truncation.
+size_t strlcpy(char* dst, size_t dstSize, const char* src)
+{
+	size_t copied = 0;
+	if(dst && dstSize)
+	{
+		const size_t capacity = dstSize - 1; // one slot is reserved for the terminator
+		while(copied < capacity && src[copied])
+		{
+			dst[copied] = src[copied];
+			++copied;
+		}
+		dst[copied] = 0; // always null-terminate
+	}
+
+	// Walk the rest of the source to determine its total length.
+	size_t srcLength = copied;
+	while(src[srcLength])
+		++srcLength;
+
+	return srcLength;
+}
+
+// Appends `src` to the zero-terminated string in `dst`, keeping the total (including the
+// terminator) within `dstSize`. Nothing is written when `dst` is null or `dstSize` is zero.
+// Returns the length of the existing `dst` contents plus the full length of `src`
+// (just the length of `src` when nothing was written).
+size_t strlcat(char* dst, size_t dstSize, const char* src)
+{
+	size_t appended = 0, dstLength = 0;
+	if(dst && dstSize)
+	{
+		dstLength = strlen(dst);
+		char* tail = dst + dstLength;
+		// Append while the combined length still leaves room for the terminator.
+		while(dstLength + appended + 1 < dstSize && src[appended])
+		{
+			tail[appended] = src[appended];
+			++appended;
+		}
+		tail[appended] = 0; // always null-terminate
+	}
+
+	// Walk the rest of the source to determine its total length.
+	size_t srcLength = appended;
+	while(src[srcLength])
+		++srcLength;
+
+	return dstLength + srcLength;
+}
+
+// Converts the characters 'A'..'Z' of the zero-terminated string to lower case in place.
+// All other characters are left untouched.
+void strlwr(char* str)
+{
+	for(char* cursor = str; *cursor; ++cursor)
+	{
+		if('A' <= *cursor && *cursor <= 'Z')
+			*cursor = char(*cursor + 32);
+	}
+}
+
+// Converts the characters 'a'..'z' of the zero-terminated string to upper case in place.
+// All other characters are left untouched.
+void strupr(char* str)
+{
+	for(char* cursor = str; *cursor; ++cursor)
+	{
+		if('a' <= *cursor && *cursor <= 'z')
+			*cursor = char(*cursor - 32);
+	}
+}
+
+// va_list formatting into `dst` (capacity `dstSize`).
+// On MSVC the wrapper additionally forces zero termination on truncation and, when the
+// CRT call reports a negative result, returns the length the full output would have had.
+// On other compilers the call is forwarded to ::vsnprintf unchanged.
+int32_t vsnprintf(char* dst, size_t dstSize, const char* src, va_list arg)
+{
+
+#if PX_VC // MSVC is not C99-compliant...
+	int32_t result = dst ? ::vsnprintf(dst, dstSize, src, arg) : -1;
+	// The dstSize check keeps the terminator write in bounds: with a zero-sized buffer
+	// `dst[dstSize - 1]` would address the byte in front of the buffer.
+	if(dst && dstSize && (result == int32_t(dstSize) || result < 0))
+		dst[dstSize - 1] = 0; // string was truncated or there wasn't room for the NULL
+	if(result < 0)
+		result = _vscprintf(src, arg); // work out how long the answer would have been.
+#else
+	int32_t result = ::vsnprintf(dst, dstSize, src, arg);
+#endif
+	return result;
+}
+
+// Case-insensitive string comparison: ::_stricmp on MSVC, ::strcasecmp elsewhere.
+int32_t stricmp(const char* str, const char* str1)
+{
+#if PX_VC
+	return (::_stricmp(str, str1));
+#else
+	return (::strcasecmp(str, str1));
+#endif
+}
+
+// Case-insensitive comparison of at most `n` characters: ::_strnicmp on MSVC,
+// ::strncasecmp elsewhere.
+int32_t strnicmp(const char* str, const char* str1, size_t n)
+{
+#if PX_VC
+	return (::_strnicmp(str, str1, n));
+#else
+	return (::strncasecmp(str, str1, n));
+#endif
+}
+
+// printf-style output through printString(). The message is formatted into a stack buffer
+// of MAX_PRINTFORMATTED_LENGTH bytes, so longer output is cut off at that size.
+void printFormatted(const char* format, ...)
+{
+	char buf[MAX_PRINTFORMATTED_LENGTH];
+
+	va_list arg;
+	va_start(arg, format);
+	// unqualified call: resolves to the vsnprintf wrapper defined in this namespace
+	vsnprintf(buf, MAX_PRINTFORMATTED_LENGTH, format, arg);
+	va_end(arg);
+
+	printString(buf);
+}
+}
+}
+
+#if PX_PS4 || PX_APPLE_FAMILY
+#pragma clang diagnostic pop
+#endif
+
+#if PX_WINDOWS_FAMILY
+#pragma warning(pop)
+#endif
diff --git a/PxShared/src/foundation/src/PsTempAllocator.cpp b/PxShared/src/foundation/src/PsTempAllocator.cpp
new file mode 100644
index 0000000..c917a65
--- /dev/null
+++ b/PxShared/src/foundation/src/PsTempAllocator.cpp
@@ -0,0 +1,129 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMath.h"
+
+#include "PsFoundation.h"
+#include "PsTempAllocator.h"
+#include "PsArray.h"
+#include "PsMutex.h"
+#include "PsAtomic.h"
+#include "PsIntrinsics.h"
+#include "PsBitUtils.h"
+
+#if PX_VC
+#pragma warning(disable : 4706) // assignment within conditional expression
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+// File-local helpers and tuning constants for TempAllocator.
+namespace
+{
+typedef TempAllocatorChunk Chunk;
+typedef Array<Chunk*, NonTrackingAllocator> AllocFreeTable;
+
+// Returns the free-list table owned by the Foundation singleton. Entry i is
+// used by TempAllocator as the head of the free list for size class
+// (i + sMinIndex).
+PX_INLINE Foundation::AllocFreeTable& getFreeTable()
+{
+	return getFoundation().getTempAllocFreeTable();
+}
+// Returns the Foundation-owned mutex that TempAllocator takes around every
+// access to the free-list table.
+PX_INLINE Foundation::Mutex& getMutex()
+{
+	return getFoundation().getTempAllocMutex();
+}
+
+// Size classes are expressed as bit indices: requests are clamped up to
+// sMinIndex, and anything at or above sMaxIndex bypasses the free lists.
+const PxU32 sMinIndex = 8;  // 256B min
+const PxU32 sMaxIndex = 17; // 128kB max
+}
+
+// Allocates `size` bytes of temporary memory.
+// Requests whose size class is below sMaxIndex are served from per-size-class
+// free lists (guarded by the Foundation temp-alloc mutex), falling back to a
+// fresh power-of-two chunk; larger requests go straight to NonTrackingAllocator.
+// A Chunk header holding the size class precedes the returned pointer so that
+// deallocate() can route the memory back. Returns 0 for size == 0.
+// filename/line are forwarded to the underlying allocator.
+void* TempAllocator::allocate(size_t size, const char* filename, int line)
+{
+	if(!size)
+		return 0;
+
+	// Compute the header-inclusive size in 64 bits. Narrowing it to 32 bits
+	// before picking the size class would map a request of 4GB or more onto a
+	// tiny chunk, so such requests are sent to the base allocator instead.
+	const uint64_t needed = uint64_t(size) + sizeof(Chunk) - 1;
+	uint32_t index = (needed >> 32) ? sMaxIndex : PxMax(highestSetBit(uint32_t(needed)), sMinIndex);
+
+	Chunk* chunk = 0;
+	if(index < sMaxIndex)
+	{
+		Foundation::Mutex::ScopedLock lock(getMutex());
+
+		// look for a free chunk in this size class or one of the next two,
+		// never reading past the end of the table
+		Chunk** it = getFreeTable().begin() + index - sMinIndex;
+		Chunk** end = PxMin(it + 3, getFreeTable().end());
+		while(it < end && !(*it))
+			++it;
+
+		if(it < end)
+		{
+			// pop top off freelist
+			chunk = *it;
+			*it = chunk->mNext;
+			// remember the class the chunk really belongs to
+			index = uint32_t(it - getFreeTable().begin() + sMinIndex);
+		}
+		else
+			// create new chunk
+			chunk = reinterpret_cast<Chunk*>(NonTrackingAllocator().allocate(size_t(2 << index), filename, line));
+	}
+	else
+	{
+		// too big for temp allocation, forward to base allocator
+		chunk = reinterpret_cast<Chunk*>(NonTrackingAllocator().allocate(size + sizeof(Chunk), filename, line));
+	}
+
+	chunk->mIndex = index;
+	void* ret = chunk + 1;
+	PX_ASSERT((size_t(ret) & 0xf) == 0); // SDK types require at minimum 16 byte alignment.
+	return ret;
+}
+
+// Releases memory obtained from TempAllocator::allocate(). Null is ignored.
+// Chunks whose stored size class is at or above sMaxIndex are returned to
+// NonTrackingAllocator; all others are pushed onto the free list of their size
+// class (growing the table on demand) for reuse by later allocations.
+void TempAllocator::deallocate(void* ptr)
+{
+	if(!ptr)
+		return;
+
+	// the Chunk header sits directly in front of the user pointer
+	Chunk* chunk = reinterpret_cast<Chunk*>(ptr) - 1;
+	const uint32_t sizeClass = chunk->mIndex;
+
+	if(sizeClass >= sMaxIndex)
+	{
+		NonTrackingAllocator().deallocate(chunk);
+		return;
+	}
+
+	Foundation::Mutex::ScopedLock lock(getMutex());
+
+	const uint32_t slot = sizeClass - sMinIndex;
+	Foundation::AllocFreeTable& table = getFreeTable();
+	if(table.size() <= slot)
+		table.resize(slot + 1);
+
+	// push onto the front of the per-class list
+	chunk->mNext = table[slot];
+	table[slot] = chunk;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/PsUtilities.cpp b/PxShared/src/foundation/src/PsUtilities.cpp
new file mode 100644
index 0000000..c6a6dc4
--- /dev/null
+++ b/PxShared/src/foundation/src/PsUtilities.cpp
@@ -0,0 +1,73 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxMat33.h"
+#include "foundation/PxQuat.h"
+#include "foundation/PxTransform.h"
+#include "PsUtilities.h"
+#include "PsUserAllocated.h"
+#include "PsFPU.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// A float is valid when PxIsFinite accepts it.
+bool checkValid(const float& f)
+{
+	const bool finite = PxIsFinite(f);
+	return finite;
+}
+// A vector is valid when each of x, y and z passes PxIsFinite.
+bool checkValid(const PxVec3& v)
+{
+	if(!PxIsFinite(v.x))
+		return false;
+	if(!PxIsFinite(v.y))
+		return false;
+	return PxIsFinite(v.z);
+}
+
+// A transform is valid when both its position (p) and rotation (q) are valid.
+bool checkValid(const PxTransform& t)
+{
+	if(!checkValid(t.p))
+		return false;
+	return checkValid(t.q);
+}
+
+// A quaternion is valid when each of x, y, z and w passes PxIsFinite.
+// No normalization check is performed here.
+bool checkValid(const PxQuat& q)
+{
+	if(!PxIsFinite(q.x))
+		return false;
+	if(!PxIsFinite(q.y))
+		return false;
+	if(!PxIsFinite(q.z))
+		return false;
+	return PxIsFinite(q.w);
+}
+// A 3x3 matrix is valid when all nine elements pass PxIsFinite.
+// Elements are addressed as m(row, col) with both indices in 0..2; the
+// previous code used column index 3, which reads past the matrix and never
+// looked at the third column.
+bool checkValid(const PxMat33& m)
+{
+	for(PxU32 col = 0; col < 3; col++)
+	{
+		for(PxU32 row = 0; row < 3; row++)
+		{
+			if(!PxIsFinite(m(row, col)))
+				return false;
+		}
+	}
+	return true;
+}
+// A string is valid when a terminating NUL is found within the first
+// maxLength (4096) characters; strnlen stops scanning at that bound.
+bool checkValid(const char* string)
+{
+	static const PxU32 maxLength = 4096;
+	const bool hitLimit = (strnlen(string, maxLength) == maxLength);
+	return !hitLimit;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/nx/PsNXAtomic.cpp b/PxShared/src/foundation/src/nx/PsNXAtomic.cpp
new file mode 100644
index 0000000..1ab5e6b
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXAtomic.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#define PAUSE() asm ("nop")
+
+#include "PsAtomic.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Atomically stores val2 into *val and returns the value that was replaced.
+// Implemented as a compare-and-swap retry loop.
+PxI32 atomicExchange(volatile PxI32* val,PxI32 val2)
+{
+	for(;;)
+	{
+		PAUSE();
+		const PxI32 seen = *val;
+		// succeeds only if nobody changed *val since we read it
+		if(atomicCompareExchange(val, val2, seen) == seen)
+			return seen;
+	}
+}
+
+// If *dest equals comp, stores exch into *dest. Returns the value *dest held
+// before the operation. Note the builtin takes (ptr, expected, desired).
+PxI32 atomicCompareExchange(volatile PxI32* dest, PxI32 exch, PxI32 comp)
+{
+	const PxI32 previous = __sync_val_compare_and_swap(dest, comp, exch);
+	return previous;
+}
+
+// Pointer flavour of atomicCompareExchange: if *dest equals comp, stores exch.
+// Returns the pointer *dest held before the operation.
+void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp)
+{
+	void** target = (void**)dest; // builtin is called on a plain void**
+	return __sync_val_compare_and_swap(target, comp, exch);
+}
+
+// Atomically adds one to *val and returns the incremented value.
+PxI32 atomicIncrement(volatile PxI32* val)
+{
+	const PxI32 updated = __sync_add_and_fetch(val, 1);
+	return updated;
+}
+
+// Atomically subtracts one from *val and returns the decremented value.
+PxI32 atomicDecrement(volatile PxI32* val)
+{
+	const PxI32 updated = __sync_sub_and_fetch(val, 1);
+	return updated;
+}
+
+// Atomically adds delta to *val and returns the resulting value.
+PxI32 atomicAdd(volatile PxI32* val, PxI32 delta)
+{
+	const PxI32 updated = __sync_add_and_fetch(val, delta);
+	return updated;
+}
+
+// Atomically raises *val to at least val2 and returns the value this call
+// established (the maximum of val2 and the value it observed).
+// Implemented as a compare-and-swap retry loop.
+PxI32 atomicMax(volatile PxI32* val, PxI32 val2)
+{
+	PxI32 oldVal, newVal;
+
+	do
+	{
+		PAUSE();
+		oldVal = *val;
+		newVal = (val2 > oldVal) ? val2 : oldVal;
+	}
+	while (atomicCompareExchange(val, newVal, oldVal) != oldVal);
+
+	// Return the value that was installed by the successful CAS. Re-reading
+	// *val here would race with other threads and could report a value this
+	// call never produced.
+	return newVal;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/nx/PsNXCpu.cpp b/PxShared/src/foundation/src/nx/PsNXCpu.cpp
new file mode 100644
index 0000000..47c1354
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXCpu.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "PsCpu.h"
+
+#define cpuid(op, reg) reg[0]=reg[1]=reg[2]=reg[3]=0;
+
+namespace physx { namespace shdfnd {
+
+	// Returns the top byte of the second cpuid word. On this platform the
+	// cpuid() macro above merely zero-fills the array, so the result is 0.
+	physx::PxU8 Cpu::getCpuId()
+	{
+		PxU32 regs[4];
+		cpuid(1, regs);
+		const PxU32 apicId = regs[1] >> 24; // APIC Physical ID
+		return static_cast<physx::PxU8>(apicId);
+	}
+}}
diff --git a/PxShared/src/foundation/src/nx/PsNXFPU.cpp b/PxShared/src/foundation/src/nx/PsNXFPU.cpp
new file mode 100644
index 0000000..a94354f
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXFPU.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+#include "PsFPU.h"
+
+#include <cfenv>
+
+// Saves the current floating-point environment into mControlWords, then
+// switches to the default environment with round-to-nearest. The saved state
+// is restored by the destructor.
+physx::shdfnd::FPUGuard::FPUGuard()
+{
+	// the opaque storage must be big enough to hold an fenv_t
+	PX_COMPILE_TIME_ASSERT(sizeof(fenv_t) <= sizeof(mControlWords));
+
+	fenv_t* savedEnv = reinterpret_cast<fenv_t*>(mControlWords);
+	fegetenv(savedEnv);
+	fesetenv(FE_DFL_ENV);
+
+	// NX does not seem to support fedisableexcept, so exceptions are not
+	// explicitly disabled here.
+
+	fesetround(FE_TONEAREST);  // since this does not seem to be the default mode
+}
+
+// Restores the floating-point environment captured by the constructor.
+physx::shdfnd::FPUGuard::~FPUGuard()
+{
+	const fenv_t* savedEnv = reinterpret_cast<fenv_t*>(mControlWords);
+	fesetenv(savedEnv);
+}
+
+// Intentionally a no-op on this platform: the calls that would enable
+// floating-point exceptions are left commented out below.
+PX_FOUNDATION_API void physx::shdfnd::enableFPExceptions()
+{
+	// NX does not seem to support feenableexcept
+	//feclearexcept(FE_ALL_EXCEPT);
+	//feenableexcept(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW);	
+}
+
+// Intentionally a no-op on this platform: the call that would disable
+// floating-point exceptions is left commented out below.
+PX_FOUNDATION_API void physx::shdfnd::disableFPExceptions()
+{
+	// NX does not seem to support fedisableexcept
+	//fedisableexcept(FE_ALL_EXCEPT);
+}
diff --git a/PxShared/src/foundation/src/nx/PsNXMutex.cpp b/PxShared/src/foundation/src/nx/PsNXMutex.cpp
new file mode 100644
index 0000000..6d3334a
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXMutex.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include <atomic>
+#include "nn/os/os_Mutex.h"
+#include "PsFoundation.h"
+#include "PsAllocator.h"
+#include "PsMutex.h"
+#include "PsThread.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+	// Concrete layout placed in the storage of a MutexImpl
+	// (MutexImpl::getSize() reports sizeof(MutexNXImpl)).
+	struct MutexNXImpl
+	{
+		nn::os::MutexType lock;  // native mutex object
+		Thread::Id owner;        // last thread that acquired the lock (written in PX_DEBUG code paths)
+	};
+
+	// Reinterprets the opaque MutexImpl storage as the platform struct.
+	MutexNXImpl* getMutex(MutexImpl* impl)
+	{
+		MutexNXImpl* native = reinterpret_cast<MutexNXImpl*>(impl);
+		return native;
+	}
+}
+
+// Initializes the native mutex. The second argument requests a recursive
+// mutex; the meaning of the trailing 0 is not visible here (TODO confirm
+// against the SDK documentation).
+MutexImpl::MutexImpl()
+{
+	MutexNXImpl* self = getMutex(this);
+	nn::os::InitializeMutex(&self->lock, true, 0);
+}
+
+// Finalizes the native mutex created by the constructor.
+MutexImpl::~MutexImpl()
+{
+	MutexNXImpl* self = getMutex(this);
+	nn::os::FinalizeMutex(&self->lock);
+}
+
+// Blocks until the native mutex is acquired. When PX_DEBUG is defined the
+// acquiring thread is recorded so unlock() can verify ownership.
+void MutexImpl::lock()
+{
+	MutexNXImpl* self = getMutex(this);
+	nn::os::LockMutex(&self->lock);
+
+#ifdef PX_DEBUG
+	self->owner = Thread::getId();
+#endif
+}
+
+// Attempts to acquire the native mutex without blocking and returns whether
+// it succeeded. When PX_DEBUG is defined a successful acquire records the
+// owning thread.
+bool MutexImpl::trylock()
+{
+	MutexNXImpl* self = getMutex(this);
+	const bool acquired = nn::os::TryLockMutex(&self->lock);
+#ifdef PX_DEBUG
+	if (acquired)
+		self->owner = Thread::getId();
+#endif
+	return acquired;
+}
+
+// Releases the native mutex. When PX_DEBUG is defined, a call from a thread
+// other than the recorded owner is reported through the foundation error
+// callback and the mutex is left locked.
+void MutexImpl::unlock()
+{
+	MutexNXImpl* self = getMutex(this);
+
+#ifdef PX_DEBUG
+	// ensure we are already holding the lock
+	const bool heldByCaller = (self->owner == Thread::getId());
+	if (!heldByCaller)
+	{
+		getFoundation().error(PxErrorCode::eINVALID_OPERATION, __FILE__, __LINE__, "Mutex must be unlocked only by thread that has already acquired lock");
+		return;
+	}
+#endif
+
+	nn::os::UnlockMutex(&self->lock);
+}
+
+// Number of bytes of storage a MutexImpl needs on this platform.
+static const PxU32 gSize = sizeof(MutexNXImpl);
+
+// Returns a reference to the size of the platform mutex structure.
+const PxU32& MutexImpl::getSize()  { return gSize; }
+
+// State behind ReadWriteLock: a mutex taken briefly by readers while they
+// register and held by a writer for the duration of the write, plus a counter
+// of readers currently inside the lock.
+class ReadWriteLockImpl
+{
+	PX_NOCOPY(ReadWriteLockImpl)
+public:
+	ReadWriteLockImpl() : readerCount(0) {}
+	Mutex				mutex;
+	std::atomic<int>	readerCount;  // incremented by lockReader(), decremented by unlockReader(); lockWriter() spins until it reaches zero
+};
+
+// Allocates storage for the implementation object through PX_ALLOC and
+// constructs it in place.
+ReadWriteLock::ReadWriteLock()
+{
+	void* storage = PX_ALLOC(sizeof(ReadWriteLockImpl), PX_DEBUG_EXP("ReadWriteLockImpl"));
+	mImpl = reinterpret_cast<ReadWriteLockImpl*>(storage);
+	PX_PLACEMENT_NEW(mImpl, ReadWriteLockImpl);
+}
+
+// Destroys the implementation object in place, then releases its storage;
+// mirrors the allocate-then-placement-new sequence of the constructor.
+ReadWriteLock::~ReadWriteLock()
+{
+	mImpl->~ReadWriteLockImpl();
+    PX_FREE( mImpl );
+}
+
+// Registers the caller as a reader. The mutex is taken only while the reader
+// count is bumped, so a reader blocks here while a writer holds the mutex.
+void ReadWriteLock::lockReader()
+{
+	ReadWriteLockImpl& impl = *mImpl;
+
+	impl.mutex.lock();
+	++impl.readerCount;
+	impl.mutex.unlock();
+}
+
+// Acquires the lock for writing: takes the mutex (keeping new readers and
+// other writers out) and then busy-waits until all registered readers have
+// left. The mutex stays held until unlockWriter().
+void ReadWriteLock::lockWriter()
+{
+	ReadWriteLockImpl& impl = *mImpl;
+
+	impl.mutex.lock();
+
+	// spin lock until no readers
+	while (impl.readerCount.load() != 0)
+	{
+	}
+}
+
+// Unregisters the caller as a reader by decrementing the reader count; the
+// mutex is not touched.
+void ReadWriteLock::unlockReader()
+{
+	--mImpl->readerCount;
+}
+
+// Ends a write section by releasing the mutex taken in lockWriter().
+void ReadWriteLock::unlockWriter()
+{
+	mImpl->mutex.unlock();
+}
+
+} // namespace shdfnd
+} // namespace physx
+
diff --git a/PxShared/src/foundation/src/nx/PsNXPrintString.cpp b/PxShared/src/foundation/src/nx/PsNXPrintString.cpp
new file mode 100644
index 0000000..be5f5ec
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXPrintString.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "PsString.h"
+
+#include <cstdio>
+#include <cstdarg>
+
+#include "nn/nn_Log.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Writes str followed by a newline to the platform log.
+// The text is passed as an argument to a fixed "%s" format rather than as the
+// format itself, so '%' characters in str are printed verbatim instead of
+// being interpreted as conversion specifiers.
+// NOTE(review): relies on NN_LOG accepting printf-style formats - confirm
+// against the SDK documentation.
+void printString(const char* str)
+{
+	NN_LOG("%s\n", str);
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/nx/PsNXSList.cpp b/PxShared/src/foundation/src/nx/PsNXSList.cpp
new file mode 100644
index 0000000..b6258cf
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXSList.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "PsThread.h"  // for PxSpinLockPause()
+#include "PsSList.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+	namespace
+	{
+		// Scope guard around a spin lock stored in a PxI32 (0 == unlocked):
+		// the constructor spins until the flag is acquired, the destructor
+		// releases it.
+		struct ScopedSpinLock
+		{
+		public:
+			// !!!pthread version need to check
+			PX_FORCE_INLINE ScopedSpinLock(volatile PxI32& lock): mLock(lock)
+			{
+				while (__sync_lock_test_and_set(&mLock, 1))
+				{
+					// spinning without atomics is usually
+					// causing less bus traffic. -> only one
+					// CPU is modifying the cache line.
+					while(mLock)
+						PxSpinLockPause();
+				}
+			}
+
+			PX_FORCE_INLINE ~ScopedSpinLock()
+			{
+				__sync_lock_release(&mLock);
+			}
+
+		private:
+			PX_NOCOPY(ScopedSpinLock)
+
+			volatile PxI32& mLock;
+		};
+
+		// Concrete layout placed in the storage of an SListImpl.
+		struct SListDetail
+		{
+			SListEntry* head;     // first entry of the list, NULL when empty
+			volatile PxI32 lock;  // spin-lock flag guarding head
+		};
+
+		// Reinterprets the opaque implementation storage as SListDetail.
+		template <typename T>
+		SListDetail* getDetail(T* impl)
+		{
+			return reinterpret_cast<SListDetail*>(impl);
+		}
+	}
+		
+	// Starts with an empty list and an unlocked spin lock.
+	SListImpl::SListImpl()
+	{
+		SListDetail* detail = getDetail(this);
+		detail->head = NULL;
+		detail->lock = 0; // 0 == unlocked
+	}
+	
+	// Nothing to release: the list does not own its entries and the lock is a
+	// plain integer.
+	SListImpl::~SListImpl()
+	{
+	}
+	
+	// Links entry in as the new head of the list while holding the spin lock.
+	void SListImpl::push(SListEntry* entry)
+	{
+		SListDetail* detail = getDetail(this);
+		ScopedSpinLock guard(detail->lock);
+		entry->mNext = detail->head;
+		detail->head = entry;
+	}
+	
+	// Unlinks and returns the head entry while holding the spin lock, or
+	// returns NULL when the list is empty.
+	SListEntry* SListImpl::pop()
+	{
+		SListDetail* detail = getDetail(this);
+		ScopedSpinLock guard(detail->lock);
+		SListEntry* top = detail->head;
+		if( top == NULL )
+			return top;
+		detail->head = top->mNext;
+		return top;
+	}
+	
+	// Detaches the whole chain of entries while holding the spin lock and
+	// returns its former head (NULL when the list was empty); the list is
+	// empty afterwards.
+	SListEntry* SListImpl::flush()
+	{
+		SListDetail* detail = getDetail(this);
+		ScopedSpinLock guard(detail->lock);
+		SListEntry* chain = detail->head;
+		detail->head = NULL;
+		return chain;
+	}
+	
+	// Number of bytes of storage an SListImpl needs on this platform.
+	static const PxU32 gSize = sizeof(SListDetail);
+
+	// Returns a reference to the size of the platform list structure.
+	const PxU32& SListImpl::getSize()
+	{
+		return gSize;
+	}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/nx/PsNXSocket.cpp b/PxShared/src/foundation/src/nx/PsNXSocket.cpp
new file mode 100644
index 0000000..735ab20
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXSocket.cpp
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "PsFoundation.h"
+#include "PsSocket.h"
+#include "PsThread.h"
+#include "PsArray.h"
+#include "foundation/PxMemory.h"
+
+#include <nn/socket.h>
+#include <nn/nn_Log.h>
+
+#define SOCKET_ERROR -1
+
+namespace physx
+{
+namespace shdfnd
+{
+
+const PxU32 Socket::DEFAULT_BUFFER_SIZE = 32768;
+
+// Unbuffered TCP client socket built on nn::socket. Only outgoing connections
+// are supported; listen() and accept() always fail on this platform.
+class SocketImpl
+{
+public:
+	SocketImpl(bool isBlocking);
+	virtual ~SocketImpl();
+
+	// Creates the underlying socket; returns false on failure.
+	bool	init();
+	// Connects to host:port; timeout is handed to the poll call that waits for the connection.
+	bool	connect(const char* host, PxU16 port, PxU32 timeout);
+	// Shuts down (if connected) and closes the socket, resetting host/port.
+	void	disconnect();
+	bool	listen(PxU16 /*port*/)
+	{
+		return false; // not implemented on this platform
+	}
+	bool	accept(bool /*block*/)
+	{
+		return false; // not implemented on this platform
+	}
+
+	// Switches the socket between blocking and non-blocking mode.
+	void	setBlocking(bool blocking);
+
+	// Virtual so BufferedSocketImpl can add buffering on top.
+	virtual PxU32	write(const PxU8* data, PxU32 length);
+	virtual bool	flush();
+
+	PxU32			read(PxU8* data, PxU32 length);
+
+	// Raw memory helpers forwarding to PX_ALLOC / PX_FREE.
+	static void* allocate(size_t size) { return PX_ALLOC(size, "Socket"); }
+	static void deallocate(void* mem, size_t) { PX_FREE(mem); }
+
+	PX_FORCE_INLINE	bool		isBlocking() const	{ return mIsBlocking; }
+	PX_FORCE_INLINE	bool		isConnected() const	{ return mIsConnected; }
+	PX_FORCE_INLINE	const char*	getHost() const		{ return mHost; }
+	PX_FORCE_INLINE	PxU16		getPort() const		{ return mPort; }
+
+protected:
+	int				mSocket;        // socket descriptor, SOCKET_ERROR when not created
+	const char*		mHost;          // caller-supplied host string (pointer is stored, not copied)
+	PxU16			mPort;          // port of the current connection, 0 when disconnected
+	bool			mIsConnected;   // set by connect(), cleared on disconnect or I/O failure
+	bool			mIsBlocking;    // blocking mode as tracked by setBlocking()
+};
+
+
+// SocketImpl variant that collects outgoing data in a fixed buffer of
+// Socket::DEFAULT_BUFFER_SIZE bytes and sends it when the buffer fills up or
+// flush() is called.
+class BufferedSocketImpl: public SocketImpl
+{
+public:
+	BufferedSocketImpl(bool isBlocking);
+	virtual ~BufferedSocketImpl() {};
+	// Sends the buffered bytes; returns true when everything was written.
+	bool flush();
+	// Appends data to the buffer, flushing whenever it becomes full.
+	PxU32 write(const PxU8* data, PxU32 length);
+
+private:
+	PxU32	mBufferPos;                             // number of valid bytes currently in mBuffer
+	PxU8	mBuffer[Socket::DEFAULT_BUFFER_SIZE];   // pending outgoing data
+};
+
+// Starts with an empty send buffer; the blocking flag is passed on to the
+// base class.
+BufferedSocketImpl::BufferedSocketImpl(bool isBlocking) :
+	SocketImpl(isBlocking),
+	mBufferPos(0)
+{
+}
+
+
+// Creates an unconnected socket object: no descriptor yet, no host/port, and
+// the requested blocking mode remembered for later.
+SocketImpl::SocketImpl(bool isBlocking) :
+	mSocket(SOCKET_ERROR),
+	mHost(NULL),
+	mPort(0),
+	mIsConnected(false),
+	mIsBlocking(isBlocking)
+{
+}
+
+
+// Does not close the socket itself; Socket::~Socket calls disconnect() before
+// destroying the implementation.
+SocketImpl::~SocketImpl()
+{
+}
+
+
+// Creates an IPv4 TCP stream socket and stores its descriptor in mSocket.
+// Returns false when the socket could not be created.
+bool SocketImpl::init()
+{
+	mSocket = nn::socket::Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	const bool created = (mSocket != SOCKET_ERROR);
+	return created;
+}
+
+
+// Switches the socket between blocking and non-blocking mode by toggling
+// O_NONBLOCK in its file status flags. Does nothing when the requested mode
+// equals the tracked one; mIsBlocking is only updated when the flag change
+// succeeded.
+void SocketImpl::setBlocking(bool blocking)
+{
+	if (blocking == mIsBlocking)
+		return;
+
+	const int currentFlags = nn::socket::Fcntl(mSocket, F_GETFL, 0);
+	const int wantedFlags = blocking ? (currentFlags & ~O_NONBLOCK) : (currentFlags | O_NONBLOCK);
+
+	if (nn::socket::Fcntl(mSocket, F_SETFL, wantedFlags) != SOCKET_ERROR)
+		mIsBlocking = blocking;
+}
+
+
+// The unbuffered socket has nothing pending, so flushing always succeeds.
+bool SocketImpl::flush()
+{
+	const bool nothingPending = true;
+	return nothingPending;
+}
+
+
+// Opens a TCP connection to `host` (an IPv4 address string parsed with
+// InetPton) on `port`.
+// The connect itself is done in non-blocking mode so that `timeout` (handed to
+// the poll call) bounds the wait; afterwards the blocking mode requested for
+// this socket is applied. Returns true on success. On failure the socket is
+// closed, the object is left disconnected and false is returned.
+// Note: the host pointer is stored as-is (not copied) for getHost().
+bool SocketImpl::connect(const char* host, PxU16 port, PxU32 timeout)
+{
+	if (!init())
+		return false;
+
+	// setBlocking() skips the system call when the requested mode equals
+	// mIsBlocking and overwrites mIsBlocking on success. Remember the mode the
+	// user asked for and mark the fresh socket as blocking (presumably the
+	// default for new sockets - confirm) so the switch to non-blocking is
+	// really applied and the requested mode can be restored below.
+	const bool requestedBlocking = mIsBlocking;
+	mIsBlocking = true;
+	setBlocking(false);
+
+	sockaddr_in socketAddress = sockaddr_in(); // zero all fields, including padding
+	socketAddress.sin_family = AF_INET;
+	socketAddress.sin_port = nn::socket::InetHtons(port);
+
+	bool connected = false;
+
+	// get host part; anything but a positive result means the address could
+	// not be parsed and sin_addr holds nothing usable
+	// NOTE(review): assumes InetPton follows inet_pton (1 on success) - confirm
+	if (nn::socket::InetPton(AF_INET, host, &socketAddress.sin_addr) > 0)
+	{
+		if (nn::socket::Connect(mSocket, (sockaddr*)&socketAddress, sizeof(socketAddress)) >= 0)
+		{
+			connected = true;
+		}
+		else if (nn::socket::GetLastErrno() == EINPROGRESS)
+		{
+			//Use poll function call to monitor the connect call.
+			pollfd socketDesc;
+			socketDesc.fd = mSocket;
+			socketDesc.events = POLLOUT;
+			socketDesc.revents = 0;
+			const int pollret = nn::socket::Poll(&socketDesc, 1, static_cast<int>(timeout));
+			if (pollret == 1 && !(socketDesc.revents & POLLERR) && (socketDesc.revents & POLLOUT))
+			{
+				// check if we are really connected, above code seems to return
+				// true if host is a unix machine even if the connection was
+				// not accepted.
+				char buffer;
+				connected = nn::socket::Recv(mSocket, (void*)&buffer, 0, 0) >= 0 ||
+				            nn::socket::GetLastErrno() == EWOULDBLOCK;
+			}
+		}
+	}
+
+	if (!connected)
+	{
+		disconnect();
+		mIsBlocking = requestedBlocking; // keep the user's choice for a later attempt
+		return false;
+	}
+
+	setBlocking(requestedBlocking);
+	mIsConnected = true;
+	mPort = port;
+	mHost = host;
+
+	return true;
+}
+
+
+// Tears the connection down: a connected socket is switched to blocking mode
+// and shut down in both directions, then the descriptor is closed. Afterwards
+// the object is back in its unconnected state (no descriptor, host or port).
+// Safe to call when no socket has been created.
+void SocketImpl::disconnect()
+{
+	if (mSocket != SOCKET_ERROR)
+	{
+		if (mIsConnected)
+		{
+			setBlocking(true);
+			const int shutdownResult = nn::socket::Shutdown(mSocket, SHUT_RDWR);
+			PX_UNUSED(shutdownResult);
+			PX_ASSERT(shutdownResult != SOCKET_ERROR);
+		}
+
+		const int closeResult = nn::socket::Close(mSocket);
+		PX_UNUSED(closeResult);
+		PX_ASSERT(closeResult != SOCKET_ERROR);
+		mSocket = SOCKET_ERROR;
+	}
+
+	mHost = NULL;
+	mPort = 0;
+	mIsConnected = false;
+}
+
+
+// Sends up to `length` bytes and returns the number actually sent. While the
+// send reports EWOULDBLOCK it is retried immediately; any other error marks
+// the socket as disconnected and yields 0.
+PxU32 SocketImpl::write(const PxU8* data, PxU32 length)
+{
+	for (;;)
+	{
+		const int sent = nn::socket::Send(mSocket, (const void*)data, (size_t)length, 0);
+		if (sent != SOCKET_ERROR)
+			return (PxU32)sent;
+
+		if (nn::socket::GetLastErrno() != EWOULDBLOCK)
+		{
+			mIsConnected = false;
+			return 0;
+		}
+	}
+}
+
+
+// Receives up to `length` bytes into data and returns the number received.
+// When the receive fails with ENOBUFS the socket receive buffer is resized to
+// `length` and the call is retried. A result of zero or an error marks the
+// socket as disconnected and yields 0.
+PxU32 SocketImpl::read(PxU8* data, PxU32 length)
+{
+	int received = 0;
+
+	for (;;)
+	{
+		received = nn::socket::Recv(mSocket, (void*)data, (size_t)length, 0);
+		if (received != SOCKET_ERROR || nn::socket::GetLastErrno() != ENOBUFS)
+			break;
+
+		// If out of receive buffer, increase it
+		int wantedSize = (int)length;
+
+		// terminate the loop if we cannot increase the buffer size
+		if (nn::socket::SetSockOpt(mSocket, SOL_SOCKET, SO_RCVBUF, (void*)&wantedSize, sizeof(int)) != 0)
+			break;
+	}
+
+	if (received <= 0)
+	{
+		mIsConnected = false;
+		return PxU32(0);
+	}
+
+	return PxU32(received);
+}
+
+
+// Sends the buffered bytes through the unbuffered SocketImpl::write, repeating
+// until everything is out or a write makes no progress. The buffer is emptied
+// in either case; returns true only when all buffered bytes were sent.
+bool BufferedSocketImpl::flush()
+{
+	PxU32 sentTotal = 0;
+	while (sentTotal < mBufferPos)
+	{
+		const PxI32 sent = SocketImpl::write(mBuffer + sentTotal, mBufferPos - sentTotal);
+		if (sent <= 0)
+			break;
+		sentTotal += sent;
+	}
+
+	const bool complete = (sentTotal == mBufferPos);
+	mBufferPos = 0;
+	return complete;
+}
+
+
+// Copies data into the send buffer, flushing each time the buffer becomes
+// full. Returns the number of bytes accepted. If a flush fails the socket is
+// disconnected and the count accepted before the failing chunk (first loop)
+// or the full count (final flush) is returned.
+PxU32 BufferedSocketImpl::write(const PxU8* data, PxU32 length)
+{
+	PxU32 accepted = 0;
+
+	// fill and flush whole buffers while the remainder does not fit
+	while (length > (Socket::DEFAULT_BUFFER_SIZE - mBufferPos))
+	{
+		const PxU32 room = Socket::DEFAULT_BUFFER_SIZE - mBufferPos;
+		PxMemCopy(mBuffer + mBufferPos, data + accepted, room);
+		mBufferPos = Socket::DEFAULT_BUFFER_SIZE;
+		if (!flush())
+		{
+			disconnect();
+			return accepted;
+		}
+		accepted += room;
+		length -= room;
+	}
+
+	// stash whatever is left
+	if (length > 0)
+	{
+		PxMemCopy(mBuffer + mBufferPos, data + accepted, length);
+		mBufferPos += length;
+		accepted += length;
+	}
+
+	// an exactly full buffer is sent right away
+	if (mBufferPos == Socket::DEFAULT_BUFFER_SIZE && !flush())
+		disconnect();
+
+	return accepted;
+}
+
+
+// Creates the implementation object in memory obtained from PX_ALLOC: a
+// BufferedSocketImpl when buffering is requested, a plain SocketImpl otherwise.
+Socket::Socket(bool inEnableBuffering, bool blocking)
+{
+	if (!inEnableBuffering)
+	{
+		void* storage = PX_ALLOC(sizeof(SocketImpl), PX_DEBUG_EXP("SocketImpl"));
+		mImpl = PX_PLACEMENT_NEW(storage, SocketImpl)(blocking);
+		return;
+	}
+
+	void* storage = PX_ALLOC(sizeof(BufferedSocketImpl), PX_DEBUG_EXP("BufferedSocketImpl"));
+	mImpl = PX_PLACEMENT_NEW(storage, BufferedSocketImpl)(blocking);
+}
+
+
+// Flushes pending data, closes the connection, then destroys the
+// implementation in place (through its virtual destructor) and frees its
+// storage.
+Socket::~Socket()
+{
+	mImpl->flush();
+	mImpl->disconnect();
+	mImpl->~SocketImpl();
+	PX_FREE(mImpl);
+}
+
+
+// Connects to host:port; forwards to the implementation and returns its result.
+bool Socket::connect(const char* host, PxU16 port, PxU32 timeout)
+{
+	return mImpl->connect(host, port, timeout);
+}
+
+
+// Forwards to the implementation, which always returns false on this platform.
+bool Socket::listen(PxU16 port)
+{
+	return mImpl->listen(port);
+}
+
+
+// Forwards to the implementation, which always returns false on this platform.
+bool Socket::accept(bool block)
+{
+	return mImpl->accept(block);
+}
+
+
+// Closes the connection; forwards to the implementation.
+void Socket::disconnect()
+{
+	mImpl->disconnect();
+}
+
+
+// Reports the implementation's connection flag.
+bool Socket::isConnected() const
+{
+	return mImpl->isConnected();
+}
+
+
+// Returns the host string stored by the last successful connect, or NULL
+// after a disconnect.
+const char* Socket::getHost() const
+{
+	return mImpl->getHost();
+}
+
+
+// Returns the port stored by the last successful connect, or 0 after a
+// disconnect.
+PxU16 Socket::getPort() const
+{
+	return mImpl->getPort();
+}
+
+
+// Flushes pending output. Fails immediately when the socket is not connected.
+bool Socket::flush()
+{
+	return mImpl->isConnected() ? mImpl->flush() : false;
+}
+
+
+// Writes `length` bytes through the implementation and returns the number
+// accepted; returns 0 without touching the data when not connected.
+PxU32 Socket::write(const PxU8* data, PxU32 length)
+{
+	if(mImpl->isConnected())
+		return mImpl->write(data, length);
+	return 0;
+}
+
+
+// Reads up to `length` bytes through the implementation and returns the
+// number received; returns 0 when not connected.
+PxU32 Socket::read(PxU8* data, PxU32 length)
+{
+	if(mImpl->isConnected())
+		return mImpl->read(data, length);
+	return 0;
+}
+
+
+// Changes the blocking mode of a connected socket; ignored while disconnected.
+void Socket::setBlocking(bool blocking)
+{
+	if(mImpl->isConnected())
+		mImpl->setBlocking(blocking);
+}
+
+
+// Reports the blocking mode of a connected socket; a disconnected socket is
+// always reported as blocking.
+bool Socket::isBlocking() const
+{
+	return mImpl->isConnected() ? mImpl->isBlocking() : true;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/nx/PsNXSync.cpp b/PxShared/src/foundation/src/nx/PsNXSync.cpp
new file mode 100644
index 0000000..0a9566d
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXSync.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "nn/os/os_ConditionVariable.h"
+#include "nn/os/os_Tick.h"
+#include "nn/nn_TimeSpan.h"
+#include "nn/os/os_Mutex.h"
+#include "foundation/PxAssert.h"
+#include "PsSync.h"
+
+
+namespace physx
+{
+namespace shdfnd
+{
+	namespace
+	{
+		// Concrete layout placed in the storage of a SyncImpl: a condition
+		// variable plus mutex, the current signalled state, and a counter of
+		// set() transitions so waiters can detect a set that was already reset.
+		class _SyncImpl
+		{
+		public:
+			nn::os::ConditionVariableType syncCondVar;  // note: nn::os::EventType is not used because nn::os::ClearEvent() is not atomic
+			nn::os::MutexType syncMutex;                // guards setCounter and is_set
+			volatile PxI32 setCounter;                  // incremented on every unset -> set transition
+			volatile bool is_set;                       // current signalled state
+		};
+		
+		// Reinterprets the opaque SyncImpl storage as the platform struct.
+		_SyncImpl* getSync(SyncImpl* impl)
+		{
+			return reinterpret_cast<_SyncImpl*>(impl);
+		}
+	}
+	
+	// Number of bytes of storage a SyncImpl needs on this platform.
+	static const PxU32 gSize = sizeof(_SyncImpl);
+	// Returns a reference to the size of the platform sync structure.
+	const PxU32& SyncImpl::getSize()  { return gSize; }
+
+	// Scope guard for a native mutex: locks it on construction and unlocks it
+	// on destruction.
+	struct PxNXScopedLock
+	{
+	public:
+		PxNXScopedLock(nn::os::MutexType& lock) : mLock(lock)
+		{
+			nn::os::LockMutex(&mLock);
+		}
+
+		~PxNXScopedLock()
+		{
+			nn::os::UnlockMutex(&mLock);
+		}
+
+	private:
+		PX_NOCOPY(PxNXScopedLock)
+
+		nn::os::MutexType& mLock;
+	};
+
+	// Creates the sync object in the unset state with a zero set counter.
+	SyncImpl::SyncImpl()
+	{
+		_SyncImpl* self = getSync(this);
+
+		self->is_set = false;
+		self->setCounter = 0;
+
+		// non-recursive is correct even if it might be slightly confusing the way it is used but nn::os::WaitConditionVariable() unlocks and locks again
+		nn::os::InitializeMutex(&self->syncMutex, false, 0);
+		nn::os::InitializeConditionVariable(&self->syncCondVar);
+	}
+
+	// Finalizes the native objects in reverse order of their initialization:
+	// condition variable first, then the mutex.
+	SyncImpl::~SyncImpl()
+	{
+		_SyncImpl* self = getSync(this);
+		nn::os::FinalizeConditionVariable(&self->syncCondVar);
+		nn::os::FinalizeMutex(&self->syncMutex);
+	}
+
+	// Clears the signalled state under the sync mutex. The set counter is left
+	// untouched.
+	void SyncImpl::reset()
+	{
+		_SyncImpl* self = getSync(this);
+		PxNXScopedLock guard(self->syncMutex);
+		self->is_set = false;
+	}
+
+	// Signals the sync object. On an unset -> set transition the set counter
+	// is bumped and all waiters are woken; setting an already set object does
+	// nothing.
+	void SyncImpl::set()
+	{
+		_SyncImpl* self = getSync(this);
+		PxNXScopedLock guard(self->syncMutex);
+
+		if(self->is_set)
+			return;
+
+		self->setCounter++;
+		self->is_set = true;
+		nn::os::BroadcastConditionVariable(&self->syncCondVar);
+	}
+
+	// Waits until the sync object is signalled.
+	// milliseconds == PxU32(-1) waits without a time limit; any other value
+	// bounds the wait. Returns true when the object is set on return or was
+	// set at some point during the wait (detected through the set counter,
+	// so a set() followed by a quick reset() is not missed); returns false on
+	// timeout.
+	bool SyncImpl::wait(PxU32 milliseconds)
+	{
+		_SyncImpl* syncImpl = getSync(this);
+		PxNXScopedLock lock(syncImpl->syncMutex);
+		const PxI32 lastSetCounter = syncImpl->setCounter;
+		if(!syncImpl->is_set)
+		{
+			if(milliseconds == static_cast<PxU32>(-1))
+			{
+				// have to loop here and check is_set since WaitConditionVariable can return even 
+				// if it was not signaled by BroadcastConditionVariable
+				while((!syncImpl->is_set) && (lastSetCounter == syncImpl->setCounter))
+					nn::os::WaitConditionVariable(&syncImpl->syncCondVar, &syncImpl->syncMutex);
+				PX_ASSERT(syncImpl->is_set || (lastSetCounter != syncImpl->setCounter));
+			}
+			else
+			{
+				const int64_t ticksToWait = (static_cast<int64_t>(milliseconds) * nn::os::GetSystemTickFrequency()) / 1000;
+				const int64_t targetCounter = nn::os::GetSystemTick().GetInt64Value() + ticksToWait;
+				// allow for a bit of error in the wait time (around 6%). The margin must be
+				// derived from the wait duration: taking it from the absolute tick value
+				// moves the limit far into the past and ends the wait on the first
+				// spurious wake-up.
+				const int64_t targetCounterWithMargin = targetCounter - (ticksToWait >> 4) + 1;
+				
+				// have to loop here and check is_set since TimedWaitConditionVariable can return even 
+				// if it was not signaled by BroadcastConditionVariable. Note: to keep it simple, the elapsed
+				// time is not taken into account in such a case and the original wait time will be used again.
+				bool timeLimitOk = true;
+				while((!syncImpl->is_set) && (lastSetCounter == syncImpl->setCounter) && timeLimitOk)
+				{
+					nn::os::TimedWaitConditionVariable(&syncImpl->syncCondVar, &syncImpl->syncMutex, nn::os::ConvertToTimeSpan(nn::os::Tick(ticksToWait)));
+					timeLimitOk = nn::os::GetSystemTick().GetInt64Value() <= targetCounterWithMargin;
+				}
+				PX_ASSERT(syncImpl->is_set || (lastSetCounter != syncImpl->setCounter) || (!timeLimitOk));
+			}
+		}
+
+		return syncImpl->is_set || (lastSetCounter != syncImpl->setCounter);
+	}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/nx/PsNXThread.cpp b/PxShared/src/foundation/src/nx/PsNXThread.cpp
new file mode 100644
index 0000000..a792a66
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXThread.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include <atomic>
+#include "PsFoundation.h"
+#include "nn/os/os_Thread.h"
+#include "nn/nn_TimeSpan.h"
+
+#include "PsBitUtils.h"
+#include "PsThread.h"
+#include "foundation/PxAssert.h"
+
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+	// Platform-specific thread record. The public ThreadImpl object is reinterpreted as this
+	// type (see getThread), and ThreadImpl::getSize() reports sizeof(_ThreadImpl).
+	class _ThreadImpl
+	{
+		PX_NOCOPY(_ThreadImpl)
+
+	public:
+		// Lifecycle of the thread as tracked by ThreadImpl::start() and ThreadImpl::quit().
+		enum State
+		{
+			NotStarted,
+			Started,
+			Stopped
+		};
+
+		// native thread object handed to the nn::os thread API
+		nn::os::ThreadType	nativeThread;
+
+		// stack buffer obtained from allocateStackMemory() in start(); released in the destructor
+		PxU8*				stackMemory;
+		// core mask applied in start(); sInvalidAffinityMask means "no mask requested"
+		int					threadAffinity;
+
+		// incremented by signalQuit(); any non-zero value makes quitIsSignalled() return true
+		std::atomic<int>	quitNow;
+		State				state;
+
+		// entry point and argument used by ThreadStart(); when fn is NULL, arg is treated as a Runnable*
+		ThreadImpl::ExecuteFn	fn;
+		void*				arg;
+
+		// sentinel stored in threadAffinity while no affinity mask has been set
+		static const int sInvalidAffinityMask = 0xffffffff;
+	};
+
+	// Views the opaque public ThreadImpl object as the platform-specific record stored in it.
+	_ThreadImpl* getThread(ThreadImpl* impl)
+	{
+		_ThreadImpl* const platformImpl = reinterpret_cast<_ThreadImpl*>(impl);
+		return platformImpl;
+	}
+
+	// Puts a freshly constructed thread record into its "not started, nothing configured" state.
+	PX_FORCE_INLINE void initThreadImpl(_ThreadImpl* threadImpl)
+	{
+		_ThreadImpl& t = *threadImpl;
+
+		// native thread defaults (priority and name are cached here until the thread is created)
+		t.nativeThread._basePriority = nn::os::DefaultThreadPriority;
+		t.nativeThread._namePointer = NULL;
+
+		// bookkeeping
+		t.state = _ThreadImpl::NotStarted;
+		t.quitNow = 0;
+		t.threadAffinity = _ThreadImpl::sInvalidAffinityMask;
+
+		// nothing to execute and no stack yet
+		t.fn = NULL;
+		t.arg = NULL;
+		t.stackMemory = NULL;
+	}
+
+	// Native thread entry point. ptrArg is the ThreadImpl passed to nn::os::CreateThread.
+	// Runs the user supplied function if there is one, otherwise treats arg as a Runnable.
+	void ThreadStart(void* ptrArg)
+	{
+		_ThreadImpl* const thread = getThread(reinterpret_cast<ThreadImpl*>(ptrArg));
+
+		if(thread->fn)
+		{
+			(*thread->fn)(thread->arg);
+			return;
+		}
+
+		if(thread->arg)
+			static_cast<Runnable*>(thread->arg)->execute();
+	}
+
+	// Allocates 'size' bytes with ::malloc and returns a pointer aligned to
+	// nn::os::StackRegionAlignment, or NULL if malloc fails.
+	// The distance from the malloc'ed base to the returned pointer is stored in the size_t
+	// directly in front of the returned pointer so that freeStackMemory() can recover the base.
+	PX_FORCE_INLINE void* allocateStackMemory(size_t size)
+	{
+		// worst case alignment slack plus room for the stored offset
+		size_t pad = (nn::os::StackRegionAlignment - 1) + sizeof(size_t); // store offset for delete.
+		PxU8* base = reinterpret_cast<PxU8*>(::malloc(size + pad));
+		if (!base)
+			return NULL;
+
+		// round (base + pad) down to the alignment; this leaves at least sizeof(size_t) bytes in front of ptr
+		// NOTE(review): the mask assumes StackRegionAlignment is a power of two - confirm against the SDK
+		PxU8* ptr = reinterpret_cast<PxU8*>(size_t(base + pad) & ~(nn::os::StackRegionAlignment - 1)); // aligned pointer
+		(reinterpret_cast<size_t*>(ptr))[-1] = static_cast<size_t>(ptr - base); // store offset
+
+		return ptr;
+	}
+
+	// Releases a buffer returned by allocateStackMemory(). NULL is accepted and ignored.
+	PX_FORCE_INLINE void freeStackMemory(void* ptr)
+	{
+		if (ptr != NULL)
+		{
+			// the size_t in front of the aligned pointer holds the distance back to the malloc'ed base
+			const size_t offsetToBase = *(reinterpret_cast<size_t*>(ptr) - 1);
+			::free(reinterpret_cast<PxU8*>(ptr) - offsetToBase);
+		}
+	}
+}
+
+// Size in bytes of the platform-specific thread record; returned by reference from getSize().
+static const PxU32 gSize = sizeof(_ThreadImpl);
+
+const PxU32& ThreadImpl::getSize()
+{
+	return gSize;
+}
+
+
+// Returns an id for the calling thread: the address of its native thread object.
+ThreadImpl::Id ThreadImpl::getId()
+{
+	nn::os::ThreadType* const current = nn::os::GetCurrentThread();
+	return reinterpret_cast<Id>(current);
+}
+
+// Default constructor: only resets the record, no native thread is created until start().
+ThreadImpl::ThreadImpl()
+{
+	_ThreadImpl* const tImpl = getThread(this);
+	initThreadImpl(tImpl);
+}
+
+// Constructs the thread around a plain function and starts it immediately
+// with the default stack size.
+ThreadImpl::ThreadImpl(ExecuteFn fn, void* arg)
+{
+	_ThreadImpl* const record = getThread(this);
+	initThreadImpl(record);
+
+	// remember what ThreadStart() should call
+	record->arg = arg;
+	record->fn = fn;
+
+	start(0, NULL);
+}
+
+// Destroys the native thread and releases its stack, but only if start() actually
+// got as far as owning a stack buffer.
+ThreadImpl::~ThreadImpl()
+{
+	_ThreadImpl* const record = getThread(this);
+
+	if (record->state == _ThreadImpl::NotStarted)
+		return;
+
+	if (record->stackMemory == NULL)
+		return;
+
+	nn::os::DestroyThread(&record->nativeThread);
+	freeStackMemory(reinterpret_cast<void*>(record->stackMemory));
+}
+
+// Stack size used by start() when the caller passes 0 (512 KiB).
+PxU32 ThreadImpl::getDefaultStackSize()
+{
+	const PxU32 defaultSize = 524288;
+
+	// the stack handed to the native thread API has to be a multiple of the stack region alignment
+	PX_COMPILE_TIME_ASSERT((defaultSize % nn::os::StackRegionAlignment) == 0);
+
+	return defaultSize;
+}
+
+// Creates and launches the native thread.
+//
+// stackSize: stack size in bytes, 0 selects getDefaultStackSize(). Must be a multiple of
+//            nn::os::StackRegionAlignment.
+// runnable:  object to execute; only used if neither a function nor an argument has been
+//            set already (see ThreadStart).
+//
+// Does nothing if the thread is not in the NotStarted state. If the stack cannot be allocated
+// or the native thread cannot be created, the record is rolled back to NotStarted so that
+// waitForQuit(), setName() and setPriority() never operate on a native thread that was not created.
+void ThreadImpl::start(PxU32 stackSize, Runnable* runnable)
+{
+	_ThreadImpl* tImpl = getThread(this);
+	if(tImpl->state != _ThreadImpl::NotStarted)
+		return;
+
+	// Set the state before the native thread exists: quit() writes Stopped and may presumably be
+	// called from the new thread before this function returns, so it must not be overwritten afterwards.
+	tImpl->state = _ThreadImpl::Started;
+
+	const PxU32 newStackSize = (stackSize != 0) ? stackSize : getDefaultStackSize();
+
+	PX_ASSERT((newStackSize % nn::os::StackRegionAlignment) == 0);
+
+	// need to provide stack memory as well
+	// (for other platforms, the system allocates the stack memory, hence it seems ok to not use the user allocator for this)
+	PxU8* mem = reinterpret_cast<PxU8*>(allocateStackMemory(newStackSize));
+
+	if (!mem)
+	{
+		// no native thread was created, so the thread must not be reported as started
+		tImpl->state = _ThreadImpl::NotStarted;
+		return;
+	}
+
+	tImpl->stackMemory = mem;
+
+	const bool adoptRunnable = runnable && !tImpl->arg && !tImpl->fn;
+	if(adoptRunnable)
+		tImpl->arg = runnable;
+
+	const int priority = tImpl->nativeThread._basePriority;
+	PX_ASSERT((priority <= nn::os::LowestThreadPriority) && (priority >= nn::os::HighestThreadPriority));
+
+	nn::Result result = nn::os::CreateThread(&tImpl->nativeThread, ThreadStart, this, mem, newStackSize, priority);
+
+	if (result.IsSuccess())
+	{
+		// apply the settings that were cached while the thread did not exist yet
+		if (tImpl->threadAffinity != _ThreadImpl::sInvalidAffinityMask)
+			nn::os::SetThreadCoreMask(&tImpl->nativeThread, nn::os::IdealCoreDontCare, static_cast<nn::Bit64>(tImpl->threadAffinity));
+
+		if (tImpl->nativeThread._namePointer)
+			nn::os::SetThreadNamePointer(&tImpl->nativeThread, tImpl->nativeThread._namePointer);
+
+		nn::os::StartThread(&tImpl->nativeThread);
+	}
+	else
+	{
+		// undo everything this call changed so the object is back in its pre-start state
+		freeStackMemory(reinterpret_cast<void*>(mem));
+		tImpl->stackMemory = NULL;
+		if(adoptRunnable)
+			tImpl->arg = NULL;
+		tImpl->state = _ThreadImpl::NotStarted;
+		PX_ALWAYS_ASSERT();
+	}
+}
+
+// Raises the quit request counter; quitIsSignalled() reports true from now on.
+void ThreadImpl::signalQuit()
+{
+	_ThreadImpl* const record = getThread(this);
+	record->quitNow.fetch_add(1);
+}
+
+// Blocks until the native thread has exited.
+// Returns false without waiting if the thread was never started, true otherwise.
+bool ThreadImpl::waitForQuit()
+{
+	_ThreadImpl* const record = getThread(this);
+
+	const bool wasStarted = (record->state != _ThreadImpl::NotStarted);
+	if(wasStarted)
+		nn::os::WaitThread(&record->nativeThread);
+
+	return wasStarted;
+}
+
+// Returns true once signalQuit() has been called at least once.
+bool ThreadImpl::quitIsSignalled()
+{
+	// Exchanging 0 with 0 leaves the counter untouched; the call succeeds only while
+	// the counter is still zero, i.e. while no quit has been requested.
+	int expectedZero = 0;
+	const bool counterIsZero = getThread(this)->quitNow.compare_exchange_strong(expectedZero, 0);
+	return !counterIsZero;
+}
+
+// Marks the thread as stopped.
+void ThreadImpl::quit()
+{
+	// No native call is needed: once the thread function returns, everybody waiting on
+	// the thread gets informed.
+	_ThreadImpl* const record = getThread(this);
+	record->state = _ThreadImpl::Stopped;
+}
+
+// Not supported on this platform; asserts when called.
+void ThreadImpl::kill()
+{
+	// nn::os::DestroyThread() is not a substitute: it waits for the thread to exit,
+	// which does seem the wrong behavior for kill()
+	PX_ASSERT(!"kill() is not implemented for this platform");
+}
+
+// Suspends the calling thread for the given number of milliseconds.
+void ThreadImpl::sleep(PxU32 ms)
+{
+	const nn::TimeSpan duration = nn::TimeSpan::FromMilliSeconds(ms);
+	nn::os::SleepThread(duration);
+}
+
+// Gives up the remainder of the calling thread's time slice.
+void ThreadImpl::yield()
+{
+	nn::os::YieldThread();
+}
+
+// Sets the core affinity mask and returns the previous one.
+// Before the thread is started the mask is only cached (and 0 is returned if none was cached
+// before); afterwards it is applied to the native thread right away.
+PxU32 ThreadImpl::setAffinityMask(PxU32 mask)
+{
+	PX_ASSERT((mask & (~nn::os::GetThreadAvailableCoreMask())) == 0);
+
+	_ThreadImpl* tImpl = getThread(this);
+
+	if (tImpl->state != _ThreadImpl::NotStarted)
+	{
+		// native thread exists: query the current mask, then replace it
+		nn::Bit64 currentMask;
+		nn::os::GetThreadCoreMask(NULL, &currentMask, &tImpl->nativeThread);
+
+		nn::os::SetThreadCoreMask(&tImpl->nativeThread, nn::os::IdealCoreDontCare, static_cast<nn::Bit64>(mask));
+
+		return static_cast<PxU32>(currentMask);
+	}
+
+	// not started yet: remember the mask for start()
+	const int cachedMask = tImpl->threadAffinity;
+	tImpl->threadAffinity = static_cast<int>(mask);
+
+	if (cachedMask != _ThreadImpl::sInvalidAffinityMask)
+		return static_cast<PxU32>(cachedMask);
+
+	return 0;
+}
+
+// Sets the thread name. Only the pointer is stored, so the string memory has to be
+// allocated and kept alive by the caller.
+void ThreadImpl::setName(const char* name)
+{
+	_ThreadImpl* const record = getThread(this);
+
+	if(record->state != _ThreadImpl::Started)
+	{
+		// cache the pointer; start() forwards it once the native thread has been created
+		record->nativeThread._namePointer = name;
+		return;
+	}
+
+	nn::os::SetThreadNamePointer(&record->nativeThread, name);
+}
+
+// Maps the platform independent priority to a native priority and applies it.
+// If the thread is running the native priority is changed immediately, otherwise the value is
+// cached in the native thread record and picked up by start().
+void ThreadImpl::setPriority(ThreadPriority::Enum prio)
+{
+	_ThreadImpl* tImpl = getThread(this);
+
+	// The compiler complains if no default is set (even though all values are covered in the switch
+	// statement). The fallback has to be a native priority value: the ThreadPriority enumerators
+	// live in a different value domain and must not be passed to the native API.
+	int convertedPriority = nn::os::DefaultThreadPriority;
+	switch(prio)
+	{
+	case ThreadPriority::eHIGH:
+		convertedPriority = nn::os::HighestThreadPriority;
+		break;
+	case ThreadPriority::eABOVE_NORMAL:
+		// halfway between default and highest
+		convertedPriority = (nn::os::HighestThreadPriority + nn::os::DefaultThreadPriority) / 2;
+		break;
+	case ThreadPriority::eNORMAL:
+		convertedPriority = nn::os::DefaultThreadPriority;
+		break;
+	case ThreadPriority::eBELOW_NORMAL:
+		// halfway between default and lowest
+		convertedPriority = (nn::os::LowestThreadPriority + nn::os::DefaultThreadPriority) / 2;
+		break;
+	case ThreadPriority::eLOW:
+		convertedPriority = nn::os::LowestThreadPriority;
+		break;
+	case ThreadPriority::eFORCE_DWORD:
+		// not a real priority, only present to force the enum size
+		PX_ALWAYS_ASSERT();
+		convertedPriority = nn::os::DefaultThreadPriority;
+		break;
+	}
+
+	if (tImpl->state == _ThreadImpl::Started)
+		nn::os::ChangeThreadPriority(&tImpl->nativeThread, convertedPriority);
+	else
+		tImpl->nativeThread._basePriority = convertedPriority;
+}
+
+// Translates the current native priority of the given thread back into the platform
+// independent enumeration. Native values that setPriority() never produces trigger an
+// assert and are reported as eNORMAL.
+ThreadPriority::Enum ThreadImpl::getPriority(Id tID)
+{
+	const nn::os::ThreadType* nativeThread = reinterpret_cast<const nn::os::ThreadType*>(tID);
+	const int nativePrio = nn::os::GetThreadCurrentPriority(nativeThread);
+
+	if(nativePrio == nn::os::HighestThreadPriority)
+		return ThreadPriority::eHIGH;
+
+	if(nativePrio == ((nn::os::HighestThreadPriority + nn::os::DefaultThreadPriority) / 2))
+		return ThreadPriority::eABOVE_NORMAL;
+
+	if(nativePrio == nn::os::DefaultThreadPriority)
+		return ThreadPriority::eNORMAL;
+
+	if(nativePrio == ((nn::os::LowestThreadPriority + nn::os::DefaultThreadPriority) / 2))
+		return ThreadPriority::eBELOW_NORMAL;
+
+	if(nativePrio == nn::os::LowestThreadPriority)
+		return ThreadPriority::eLOW;
+
+	// no matching enumerator
+	PX_ALWAYS_ASSERT();
+	return ThreadPriority::eNORMAL;
+}
+
+// Number of cores available to threads: the number of bits set in the
+// (lower 32 bits of the) available core mask.
+PxU32 ThreadImpl::getNbPhysicalCores()
+{
+	const nn::Bit64 availableCores = nn::os::GetThreadAvailableCoreMask();
+	PX_ASSERT(availableCores > 0);
+
+	const uint32_t lowBits = static_cast<const uint32_t>(availableCores);
+	const uint32_t coreCount = bitCount(lowBits);
+	return coreCount;
+}
+
+
+// Allocates a thread local storage slot and returns its index.
+// Asserts and returns 0xffffffff if no slot could be allocated.
+PxU32 TlsAlloc()
+{
+	// note: only nn::os::TlsSlotCountMax Tls slots are available (which used to be 16)
+
+	nn::os::TlsSlot slot;
+	const nn::Result allocResult = nn::os::AllocateTlsSlot(&slot, NULL);
+
+	if (!allocResult.IsSuccess())
+	{
+		PX_ALWAYS_ASSERT();
+		return 0xffffffff;
+	}
+
+	return static_cast<PxU32>(slot._innerValue);
+}
+
+// Releases the thread local storage slot with the given index.
+void TlsFree(PxU32 index)
+{
+	// rebuild the native slot handle from the index handed out by TlsAlloc()
+	nn::os::TlsSlot slot;
+	slot._innerValue = static_cast<uint32_t>(index);
+	nn::os::FreeTlsSlot(slot);
+}
+
+// Returns the calling thread's value stored in the given thread local storage slot.
+void* TlsGet(PxU32 index)
+{
+	nn::os::TlsSlot slot;
+	slot._innerValue = static_cast<uint32_t>(index);
+
+	void* const stored = reinterpret_cast<void*>(nn::os::GetTlsValue(slot));
+	return stored;
+}
+
+// Stores 'value' for the calling thread in the given thread local storage slot.
+// Always returns 1.
+PxU32 TlsSet(PxU32 index, void* value)
+{
+	nn::os::TlsSlot slot;
+	slot._innerValue = static_cast<uint32_t>(index);
+
+	const uintptr_t rawValue = reinterpret_cast<uintptr_t>(value);
+	nn::os::SetTlsValue(slot, rawValue);
+
+	return 1;
+}
+
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/nx/PsNXTime.cpp b/PxShared/src/foundation/src/nx/PsNXTime.cpp
new file mode 100644
index 0000000..977e6ce
--- /dev/null
+++ b/PxShared/src/foundation/src/nx/PsNXTime.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "PsTime.h"
+#include "nn/os/os_Tick.h"
+
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+	// Seconds per system tick, computed once during static initialization.
+	static double sRecipFrequency = 1.0 / static_cast<double>(nn::os::GetSystemTickFrequency());
+}
+
+// Counter frequency conversion captured once during static initialization.
+static const CounterFrequencyToTensOfNanos gCounterFreq = Time::getCounterFrequency();
+
+// Returns the conversion captured at startup instead of querying the system again.
+const CounterFrequencyToTensOfNanos& Time::getBootCounterFrequency()
+{
+	const CounterFrequencyToTensOfNanos& bootFrequency = gCounterFreq;
+	return bootFrequency;
+}
+
+// Builds the ratio used to convert system ticks into tens of nanoseconds
+// from the current system tick frequency.
+CounterFrequencyToTensOfNanos Time::getCounterFrequency()
+{
+	const PxU64 ticksPerSecond = static_cast<PxU64>(nn::os::GetSystemTickFrequency());
+	return CounterFrequencyToTensOfNanos(Time::sNumTensOfNanoSecondsInASecond, ticksPerSecond);
+}
+
+
+// Returns the raw system tick counter.
+PxU64 Time::getCurrentCounterValue()
+{
+	const int64_t rawTicks = nn::os::GetSystemTick().GetInt64Value();
+	return static_cast<PxU64>(rawTicks);
+}
+
+// Starts the timer: the first getElapsedSeconds() call stores the current tick count
+// as the reference point, its return value is not needed here.
+Time::Time()
+: mTickCount(0)
+{
+	getElapsedSeconds();
+}
+
+// Returns the seconds elapsed since the previous call (or since construction)
+// and makes "now" the new reference point.
+Time::Second Time::getElapsedSeconds()
+{
+	const PxI64 previousTicks = mTickCount;
+
+	mTickCount = static_cast<PxI64>(nn::os::GetSystemTick().GetInt64Value());
+
+	return (mTickCount - previousTicks) * sRecipFrequency;
+}
+
+// Returns the seconds elapsed since the reference point without moving it.
+Time::Second Time::peekElapsedSeconds()
+{
+	const PxI64 nowTicks = static_cast<PxI64>(nn::os::GetSystemTick().GetInt64Value());
+	return (nowTicks - mTickCount) * sRecipFrequency;
+}
+
+// Returns the stored reference point converted from ticks to seconds.
+Time::Second Time::getLastTime() const
+{
+	const Time::Second referenceSeconds = mTickCount * sRecipFrequency;
+	return referenceSeconds;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixAtomic.cpp b/PxShared/src/foundation/src/unix/PsUnixAtomic.cpp
new file mode 100644
index 0000000..d4e933b
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixAtomic.cpp
@@ -0,0 +1,102 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+#include "Ps.h"
+#include "PsAtomic.h"
+
+// PAUSE(): executed at the top of every iteration of the compare-and-swap retry loops below.
+// Expands to a single "nop" instruction, or to nothing when building for Emscripten.
+#if ! PX_EMSCRIPTEN
+#define PAUSE() asm("nop")
+#else
+#define PAUSE()
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// If *dest equals comp, atomically replaces it with exch.
+// Returns the value *dest held before the operation.
+void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp)
+{
+	void** const target = const_cast<void**>(dest);
+	return __sync_val_compare_and_swap(target, comp, exch);
+}
+
+// If *dest equals comp, atomically replaces it with exch.
+// Returns the value *dest held before the operation.
+int32_t atomicCompareExchange(volatile int32_t* dest, int32_t exch, int32_t comp)
+{
+	const int32_t previous = __sync_val_compare_and_swap(dest, comp, exch);
+	return previous;
+}
+
+// Atomically adds one to *val and returns the incremented value.
+int32_t atomicIncrement(volatile int32_t* val)
+{
+	const int32_t incremented = __sync_add_and_fetch(val, 1);
+	return incremented;
+}
+
+// Atomically subtracts one from *val and returns the decremented value.
+int32_t atomicDecrement(volatile int32_t* val)
+{
+	const int32_t decremented = __sync_sub_and_fetch(val, 1);
+	return decremented;
+}
+
+// Atomically adds delta to *val and returns the resulting value.
+int32_t atomicAdd(volatile int32_t* val, int32_t delta)
+{
+	const int32_t sum = __sync_add_and_fetch(val, delta);
+	return sum;
+}
+
+// Atomically replaces *val with max(*val, val2) and returns the value that was stored.
+int32_t atomicMax(volatile int32_t* val, int32_t val2)
+{
+	int32_t oldVal, newVal;
+
+	// compare-and-swap retry loop: recompute the maximum until no other thread
+	// modified *val between the read and the exchange
+	do
+	{
+		PAUSE();
+		oldVal = *val;
+		newVal = (val2 > oldVal) ? val2 : oldVal;
+	} while(atomicCompareExchange(val, newVal, oldVal) != oldVal);
+
+	// Return the value this call installed. Reading *val again at this point could pick up
+	// a later store from another thread and report a value this operation never produced.
+	return newVal;
+}
+
+// Atomically stores val2 in *val and returns the value it replaced.
+int32_t atomicExchange(volatile int32_t* val, int32_t val2)
+{
+	int32_t replacement, observed;
+
+	// compare-and-swap retry loop: try again whenever another thread changed *val
+	// between the read and the exchange
+	for(;;)
+	{
+		PAUSE();
+		observed = *val;
+		replacement = val2;
+
+		if(atomicCompareExchange(val, replacement, observed) == observed)
+			break;
+	}
+
+	return observed;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixCpu.cpp b/PxShared/src/foundation/src/unix/PsUnixCpu.cpp
new file mode 100644
index 0000000..0139fe4
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixCpu.cpp
@@ -0,0 +1,58 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxSimpleTypes.h"
+#include "PsCpu.h"
+
+// cpuid(op, reg): executes the CPUID instruction with 'op' in eax and stores the resulting
+// eax, ebx, ecx and edx values in reg[0..3].
+// %ebx is saved and restored around the instruction and its result is copied out through a
+// scratch register instead of being declared as an output.
+// NOTE(review): presumably done because %ebx is reserved in position independent 32 bit code - confirm.
+// When not building for x86, or when building for Emscripten, all four values are set to 0.
+#if PX_X86 && !defined(__EMSCRIPTEN__)
+#define cpuid(op, reg)                                                                                                 \
+	__asm__ __volatile__("pushl %%ebx      \n\t" /* save %ebx */                                                       \
+	                     "cpuid            \n\t"                                                                       \
+	                     "movl %%ebx, %1   \n\t" /* save what cpuid just put in %ebx */                                \
+	                     "popl %%ebx       \n\t" /* restore the old %ebx */                                            \
+	                     : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])                                      \
+	                     : "a"(op)                                                                                     \
+	                     : "cc")
+#else
+#define cpuid(op, reg) reg[0] = reg[1] = reg[2] = reg[3] = 0;
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Returns the top byte of ebx from CPUID leaf 1 (the APIC physical id).
+// On targets where the cpuid macro is stubbed out this is always 0.
+uint8_t Cpu::getCpuId()
+{
+	uint32_t regs[4];
+	cpuid(1, regs);
+
+	const uint32_t ebx = regs[1];
+	return static_cast<uint8_t>(ebx >> 24); // APIC Physical ID
+}
+}
+}
diff --git a/PxShared/src/foundation/src/unix/PsUnixFPU.cpp b/PxShared/src/foundation/src/unix/PsUnixFPU.cpp
new file mode 100644
index 0000000..e12fa5f
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixFPU.cpp
@@ -0,0 +1,117 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+#include "PsFPU.h"
+
+#if !(defined(__CYGWIN__) || PX_ANDROID || PX_PS4)
+#include <fenv.h>
+PX_COMPILE_TIME_ASSERT(8 * sizeof(uint32_t) >= sizeof(fenv_t));
+#endif
+
+#if PX_OSX
+// osx defines SIMD as standard for floating point operations.
+#include <xmmintrin.h>
+#endif
+
+// Saves the current floating point environment in mControlWords and switches to the default
+// environment for the lifetime of the guard. Nothing is saved or changed on Cygwin, Android,
+// PS4 and Emscripten.
+physx::shdfnd::FPUGuard::FPUGuard()
+{
+#if defined(__CYGWIN__)
+#pragma message "FPUGuard::FPUGuard() is not implemented"
+#elif PX_ANDROID
+// not supported unless ARM_HARD_FLOAT is enabled.
+#elif PX_PS4
+	// not supported
+	PX_UNUSED(mControlWords);
+#elif PX_OSX
+	// only the SSE control/status register is saved on OSX
+	mControlWords[0] = _mm_getcsr();
+	// set default (disable exceptions: _MM_MASK_MASK) and FTZ (_MM_FLUSH_ZERO_ON), DAZ (_MM_DENORMALS_ZERO_ON: (1<<6))
+	_mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | (1 << 6));
+#elif defined(__EMSCRIPTEN__)
+// not supported
+#else
+	// the complete fenv_t is stored inside mControlWords, so it has to fit
+	PX_COMPILE_TIME_ASSERT(sizeof(fenv_t) <= sizeof(mControlWords));
+
+	fegetenv(reinterpret_cast<fenv_t*>(mControlWords));
+	fesetenv(FE_DFL_ENV);
+
+#if PX_LINUX
+	// need to explicitly disable exceptions because fesetenv does not modify
+	// the sse control word on 32bit linux (64bit is fine, but do it here just to be sure)
+	fedisableexcept(FE_ALL_EXCEPT);
+#endif
+
+#endif
+}
+
+// Restores the floating point environment that the constructor saved in mControlWords.
+// Does nothing on the platforms where the constructor does not save anything.
+physx::shdfnd::FPUGuard::~FPUGuard()
+{
+#if defined(__CYGWIN__)
+#pragma message "FPUGuard::~FPUGuard() is not implemented"
+#elif PX_ANDROID
+// not supported unless ARM_HARD_FLOAT is enabled.
+#elif PX_PS4
+// not supported
+#elif PX_OSX
+	// restore control word and clear exception flags
+	// (setting exception state flags cause exceptions on the first following fp operation)
+	_mm_setcsr(mControlWords[0] & ~_MM_EXCEPT_MASK);
+#elif defined(__EMSCRIPTEN__)
+// not supported
+#else
+	// mControlWords holds the fenv_t captured by fegetenv() in the constructor
+	fesetenv(reinterpret_cast<fenv_t*>(mControlWords));
+#endif
+}
+
+// Enables floating point exceptions for invalid operation, division by zero and overflow.
+// Implemented for Linux (except Emscripten) and OSX only; a no-op everywhere else.
+PX_FOUNDATION_API void physx::shdfnd::enableFPExceptions()
+{
+#if PX_LINUX && !defined(__EMSCRIPTEN__)
+	// drop pending exception flags first, then unmask the exceptions of interest
+	feclearexcept(FE_ALL_EXCEPT);
+	feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+#elif PX_OSX
+	// clear any pending exceptions
+	// (setting exception state flags cause exceptions on the first following fp operation)
+	const uint32_t csrWithoutFlags = _mm_getcsr() & ~_MM_EXCEPT_MASK;
+
+	// enable all fp exceptions except inexact and underflow (common, benign)
+	// note: denorm has to be disabled as well because underflow can create denorms
+	const uint32_t unmasked = csrWithoutFlags & ~_MM_MASK_MASK;
+	_mm_setcsr(unmasked | _MM_MASK_INEXACT | _MM_MASK_UNDERFLOW | _MM_MASK_DENORM);
+
+#endif
+}
+
+// Masks all floating point exceptions.
+// Implemented for Linux (except Emscripten) and OSX only; a no-op everywhere else.
+PX_FOUNDATION_API void physx::shdfnd::disableFPExceptions()
+{
+#if PX_LINUX && !defined(__EMSCRIPTEN__)
+	fedisableexcept(FE_ALL_EXCEPT);
+#elif PX_OSX
+	// clear any pending exceptions
+	// (setting exception state flags cause exceptions on the first following fp operation)
+	const uint32_t csrWithoutFlags = _mm_getcsr() & ~_MM_EXCEPT_MASK;
+
+	// set every exception mask bit
+	_mm_setcsr(csrWithoutFlags | _MM_MASK_MASK);
+#endif
+}
diff --git a/PxShared/src/foundation/src/unix/PsUnixMutex.cpp b/PxShared/src/foundation/src/unix/PsUnixMutex.cpp
new file mode 100644
index 0000000..092ec1e
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixMutex.cpp
@@ -0,0 +1,171 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxErrorCallback.h"
+
+#include "Ps.h"
+#include "PsFoundation.h"
+#include "PsUserAllocated.h"
+#include "PsMutex.h"
+#include "PsAtomic.h"
+#include "PsThread.h"
+
+#include <pthread.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+// Platform-specific mutex record stored inside the public MutexImpl object.
+struct MutexUnixImpl
+{
+	// the underlying pthread mutex
+	pthread_mutex_t lock;
+	// id of the thread that locked the mutex last; only maintained in PX_DEBUG builds
+	Thread::Id owner;
+};
+
+// Views the opaque public MutexImpl object as the platform-specific record.
+MutexUnixImpl* getMutex(MutexImpl* impl)
+{
+	MutexUnixImpl* const unixImpl = reinterpret_cast<MutexUnixImpl*>(impl);
+	return unixImpl;
+}
+}
+
+// Creates a recursive pthread mutex (with priority inheritance everywhere except Android).
+MutexImpl::MutexImpl()
+{
+	MutexUnixImpl* const self = getMutex(this);
+
+	pthread_mutexattr_t mutexAttr;
+	pthread_mutexattr_init(&mutexAttr);
+
+	// the same thread may lock the mutex repeatedly
+	pthread_mutexattr_settype(&mutexAttr, PTHREAD_MUTEX_RECURSIVE);
+#if !PX_ANDROID
+	// mimic default windows behavior where applicable
+	pthread_mutexattr_setprotocol(&mutexAttr, PTHREAD_PRIO_INHERIT);
+#endif
+
+	pthread_mutex_init(&self->lock, &mutexAttr);
+
+	// the attribute object is no longer needed once the mutex has been initialized
+	pthread_mutexattr_destroy(&mutexAttr);
+}
+
+// Destroys the underlying pthread mutex.
+MutexImpl::~MutexImpl()
+{
+	MutexUnixImpl* const self = getMutex(this);
+	pthread_mutex_destroy(&self->lock);
+}
+
+// Blocks until the mutex is acquired. Debug builds also record the calling thread as owner
+// so that unlock() can verify it.
+void MutexImpl::lock()
+{
+	MutexUnixImpl* const self = getMutex(this);
+
+	const int lockError = pthread_mutex_lock(&self->lock);
+	PX_ASSERT(!lockError);
+	PX_UNUSED(lockError);
+
+#if PX_DEBUG
+	self->owner = Thread::getId();
+#endif
+}
+
+// Tries to acquire the mutex without blocking. Returns true if the lock was taken.
+bool MutexImpl::trylock()
+{
+	MutexUnixImpl* const self = getMutex(this);
+
+	// pthread_mutex_trylock returns 0 on success
+	const bool acquired = (pthread_mutex_trylock(&self->lock) == 0);
+#if PX_DEBUG
+	if(acquired)
+		self->owner = Thread::getId();
+#endif
+	return acquired;
+}
+
+void MutexImpl::unlock()
+{
+#if PX_DEBUG
+	if(getMutex(this)->owner != Thread::getId())
+	{
+		shdfnd::getFoundation().error(PxErrorCode::eINVALID_OPERATION, __FILE__, __LINE__,
+		                              "Mutex must be unlocked only by thread that has already acquired lock");
+		return;
+	}
+#endif
+
+	int err = pthread_mutex_unlock(&getMutex(this)->lock);
+	PX_ASSERT(!err);
+	PX_UNUSED(err);
+}
+
+// Size in bytes of the platform-specific mutex record; returned by reference from getSize().
+const uint32_t gSize = sizeof(MutexUnixImpl);
+
+const uint32_t& MutexImpl::getSize()
+{
+	const uint32_t& implSize = gSize;
+	return implSize;
+}
+
+// State behind ReadWriteLock.
+class ReadWriteLockImpl
+{
+  public:
+	// held by a writer for the whole write section and by a reader only while it registers itself
+	Mutex mutex;
+	// number of readers currently inside the lock; modified with atomicIncrement/atomicDecrement
+	volatile int readerCounter;
+};
+
+// Allocates the implementation through the foundation allocator, constructs it in place
+// and starts with no registered readers.
+ReadWriteLock::ReadWriteLock()
+{
+	void* const storage = PX_ALLOC(sizeof(ReadWriteLockImpl), "ReadWriteLockImpl");
+	mImpl = reinterpret_cast<ReadWriteLockImpl*>(storage);
+	PX_PLACEMENT_NEW(mImpl, ReadWriteLockImpl);
+
+	mImpl->readerCounter = 0;
+}
+
+// Runs the implementation's destructor explicitly (it was placement-constructed)
+// and returns its memory to the foundation allocator.
+ReadWriteLock::~ReadWriteLock()
+{
+	ReadWriteLockImpl* const impl = mImpl;
+	impl->~ReadWriteLockImpl();
+	PX_FREE(mImpl);
+}
+
+// Registers a reader. The mutex is held only while the counter is incremented, so a reader
+// waits here for as long as a writer owns the mutex.
+void ReadWriteLock::lockReader()
+{
+	ReadWriteLockImpl& impl = *mImpl;
+
+	impl.mutex.lock();
+	atomicIncrement(&impl.readerCounter);
+	impl.mutex.unlock();
+}
+
+// Acquires the lock for writing: takes the mutex (keeping new readers and writers out)
+// and then spins until every registered reader has left. The mutex stays locked until
+// unlockWriter() is called.
+void ReadWriteLock::lockWriter()
+{
+	mImpl->mutex.lock();
+
+	// busy-wait for the remaining readers
+	for(;;)
+	{
+		if(mImpl->readerCounter == 0)
+			break;
+	}
+}
+
+// Unregisters a reader. No mutex is needed, the counter is decremented atomically.
+void ReadWriteLock::unlockReader()
+{
+	ReadWriteLockImpl& impl = *mImpl;
+	atomicDecrement(&impl.readerCounter);
+}
+
+// Ends the write section by releasing the mutex taken in lockWriter().
+void ReadWriteLock::unlockWriter()
+{
+	ReadWriteLockImpl& impl = *mImpl;
+	impl.mutex.unlock();
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixPrintString.cpp b/PxShared/src/foundation/src/unix/PsUnixPrintString.cpp
new file mode 100644
index 0000000..3c937a4
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixPrintString.cpp
@@ -0,0 +1,52 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsString.h"
+#include <stdio.h>
+
+#if PX_ANDROID
+#include <android/log.h>
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Prints the string: to the Android log (info level, tag "PsPrintString") on Android,
+// otherwise to stdout followed by a newline.
+void printString(const char* str)
+{
+#if !PX_ANDROID
+	puts(str);
+#else
+	__android_log_print(ANDROID_LOG_INFO, "PsPrintString", "%s", str);
+#endif
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixSList.cpp b/PxShared/src/foundation/src/unix/PsUnixSList.cpp
new file mode 100644
index 0000000..5dd8ac3
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixSList.cpp
@@ -0,0 +1,158 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsAllocator.h"
+#include "PsAtomic.h"
+#include "PsSList.h"
+#include "PsThread.h"
+#include <pthread.h>
+
+#if PX_IOS || PX_EMSCRIPTEN
+#define USE_MUTEX
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace
+{
+#if defined(USE_MUTEX)
+// Scope guard for a pthread mutex: locks in the constructor, unlocks in the destructor.
+class ScopedMutexLock
+{
+  public:
+	PX_INLINE ScopedMutexLock(pthread_mutex_t& m) : mMutex(m)
+	{
+		pthread_mutex_lock(&mMutex);
+	}
+
+	PX_INLINE ~ScopedMutexLock()
+	{
+		pthread_mutex_unlock(&mMutex);
+	}
+
+  private:
+	pthread_mutex_t& mMutex; // mutex owned by the caller; only referenced here
+};
+
+typedef ScopedMutexLock ScopedLock;
+#else
+// Scope guard for a spin lock kept in an int32 flag (0 == unlocked, non-zero == held).
+// The flag is taken in the constructor and released in the destructor.
+struct ScopedSpinLock
+{
+	PX_FORCE_INLINE ScopedSpinLock(volatile int32_t& lock) : mLock(lock)
+	{
+		for(;;)
+		{
+			// atomic exchange: a previous value of 0 means the lock is now ours
+			if(!__sync_lock_test_and_set(&mLock, 1))
+				break;
+
+			// contended: poll with plain reads before retrying the exchange.
+			// Spinning without atomics usually causes less bus traffic,
+			// because only one CPU is modifying the cache line.
+			while(mLock)
+				PxSpinLockPause();
+		}
+	}
+
+	PX_FORCE_INLINE ~ScopedSpinLock()
+	{
+		__sync_lock_release(&mLock);
+	}
+
+  private:
+	volatile int32_t& mLock; // flag owned by the caller; only referenced here
+};
+
+typedef ScopedSpinLock ScopedLock;
+#endif
+
+// Storage that SListImpl is reinterpreted as: the list head plus the lock guarding it.
+// The lock is a pthread mutex when USE_MUTEX is defined, otherwise a spin-lock flag.
+struct SListDetail
+{
+	SListEntry* head;
+#if !defined(USE_MUTEX)
+	volatile int32_t lock;
+#else
+	pthread_mutex_t lock;
+#endif
+};
+
+// Views an implementation pointer as the SListDetail record stored at that address.
+template <typename T>
+SListDetail* getDetail(T* impl)
+{
+	SListDetail* detail = reinterpret_cast<SListDetail*>(impl);
+	return detail;
+}
+}
+
+// Creates an empty list and initializes whichever lock flavor is compiled in.
+SListImpl::SListImpl()
+{
+	SListDetail* detail = getDetail(this);
+	detail->head = NULL;
+
+#if !defined(USE_MUTEX)
+	detail->lock = 0; // 0 == unlocked
+#else
+	pthread_mutex_init(&detail->lock, NULL);
+#endif
+}
+
+// Destroys the mutex in USE_MUTEX builds; the spin-lock flag needs no teardown.
+// Entries still linked into the list are not touched.
+SListImpl::~SListImpl()
+{
+#if defined(USE_MUTEX)
+	SListDetail* detail = getDetail(this);
+	pthread_mutex_destroy(&detail->lock);
+#endif
+}
+
+// Links entry in as the new head of the list while holding the list lock.
+void SListImpl::push(SListEntry* entry)
+{
+	SListDetail* detail = getDetail(this);
+	ScopedLock guard(detail->lock);
+	entry->mNext = detail->head;
+	detail->head = entry;
+}
+
+// Unlinks and returns the current head under the list lock; returns NULL when the list is empty.
+SListEntry* SListImpl::pop()
+{
+	SListDetail* detail = getDetail(this);
+	ScopedLock guard(detail->lock);
+	SListEntry* top = detail->head;
+	if(top == NULL)
+		return NULL;
+	detail->head = top->mNext;
+	return top;
+}
+
+// Detaches the whole chain under the list lock: the list becomes empty and the
+// former head (still linked through mNext) is returned, or NULL if it was empty.
+SListEntry* SListImpl::flush()
+{
+	SListDetail* detail = getDetail(this);
+	ScopedLock guard(detail->lock);
+	SListEntry* chain = detail->head;
+	detail->head = NULL;
+	return chain;
+}
+
+static const uint32_t gSize = sizeof(SListDetail); // byte size of the concrete storage behind SListImpl
+
+const uint32_t& SListImpl::getSize() // returns a reference to the file-static size constant above
+{
+	return gSize;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixSocket.cpp b/PxShared/src/foundation/src/unix/PsUnixSocket.cpp
new file mode 100644
index 0000000..bc4cb5c
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixSocket.cpp
@@ -0,0 +1,483 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxIntrinsics.h"
+
+#include "PsSocket.h"
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#if !PX_PS4
+#include <netdb.h>
+#include <arpa/inet.h>
+#else
+#include <ps4/PsPS4Socket.h>
+#endif
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+
+#define INVALID_SOCKET -1
+
+#ifndef SOMAXCONN
+#define SOMAXCONN 5
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+const uint32_t Socket::DEFAULT_BUFFER_SIZE = 32768;
+
+class SocketImpl // unbuffered TCP socket backend; BufferedSocketImpl derives from it
+{
+  public:
+	SocketImpl(bool isBlocking);
+	virtual ~SocketImpl();
+
+	bool connect(const char* host, uint16_t port, uint32_t timeout); // timeout is in milliseconds
+	bool listen(uint16_t port); // opens mListenSocket bound to INADDR_ANY:port
+	bool accept(bool block); // accepts one client on mListenSocket into mSocket
+	void disconnect(); // closes both sockets and resets the connection state
+
+	void setBlocking(bool blocking);
+
+	virtual uint32_t write(const uint8_t* data, uint32_t length); // returns bytes sent, 0 on failure/would-block
+	virtual bool flush(); // no-op here; overridden by the buffered variant
+	uint32_t read(uint8_t* data, uint32_t length); // returns bytes received, 0 on failure/would-block
+
+	PX_FORCE_INLINE bool isBlocking() const
+	{
+		return mIsBlocking;
+	}
+	PX_FORCE_INLINE bool isConnected() const
+	{
+		return mIsConnected;
+	}
+	PX_FORCE_INLINE const char* getHost() const
+	{
+		return mHost;
+	}
+	PX_FORCE_INLINE uint16_t getPort() const
+	{
+		return mPort;
+	}
+
+  protected:
+	bool nonBlockingTimeout() const; // true when a non-blocking call failed with EWOULDBLOCK
+
+	int32_t mSocket; // connected data socket, INVALID_SOCKET when closed
+	int32_t mListenSocket; // listening socket, INVALID_SOCKET when closed
+	const char* mHost; // pointer passed to connect(); the string is not copied
+	uint16_t mPort; // port passed to connect(), 0 when disconnected
+	bool mIsConnected;
+	bool mIsBlocking; // blocking mode requested by the user for mSocket
+	bool mListenMode; // set by listen(), cleared by disconnect()
+};
+
+void socketSetBlockingInternal(int32_t socket, bool blocking);
+
+SocketImpl::SocketImpl(bool isBlocking) // starts disconnected; no OS socket is created here
+: mSocket(INVALID_SOCKET)
+, mListenSocket(INVALID_SOCKET)
+, mHost(NULL)
+, mPort(0)
+, mIsConnected(false)
+, mIsBlocking(isBlocking)
+, mListenMode(false)
+{
+}
+
+SocketImpl::~SocketImpl() // does not close anything itself; Socket::~Socket calls disconnect() first
+{
+}
+
+bool SocketImpl::connect(const char* host, uint16_t port, uint32_t timeout) // timeout in ms; returns true once connected
+{
+	sockaddr_in socketAddress;
+	intrinsics::memSet(&socketAddress, 0, sizeof(sockaddr_in));
+	socketAddress.sin_family = AF_INET;
+	socketAddress.sin_port = htons(port);
+
+#if PX_PS4
+	socketAddress.sin_addr.s_addr = resolveName(host, timeout);
+#else
+	// resolve the host: by name first, then by treating `host` as a dotted IPv4 address
+	hostent* hp = gethostbyname(host);
+	if(!hp)
+	{
+		in_addr a;
+		a.s_addr = inet_addr(host);
+		hp = gethostbyaddr(reinterpret_cast<const char*>(&a), sizeof(in_addr), AF_INET);
+		if(!hp)
+			return false;
+	}
+	intrinsics::memCopy(&socketAddress.sin_addr, hp->h_addr_list[0], hp->h_length); // first resolved address only
+#endif
+	// create the socket and connect in non-blocking mode so the timeout can be enforced
+	mSocket = socket(AF_INET, SOCK_STREAM, 0);
+	if(mSocket == INVALID_SOCKET)
+		return false;
+
+	socketSetBlockingInternal(mSocket, false);
+
+	int connectRet = ::connect(mSocket, reinterpret_cast<sockaddr*>(&socketAddress), sizeof(socketAddress));
+	if(connectRet < 0)
+	{
+		if(errno != EINPROGRESS) // anything but "still connecting" is a hard failure
+		{
+			disconnect();
+			return false;
+		}
+
+		// Setup select function call to monitor the connect call.
+		fd_set writefs;
+		fd_set exceptfs;
+		FD_ZERO(&writefs);
+		FD_ZERO(&exceptfs);
+		FD_SET(mSocket, &writefs);
+		FD_SET(mSocket, &exceptfs);
+		timeval timeout_;
+		timeout_.tv_sec = timeout / 1000; // ms -> whole seconds
+		timeout_.tv_usec = (timeout % 1000) * 1000; // leftover ms -> microseconds
+		int selret = ::select(mSocket + 1, NULL, &writefs, &exceptfs, &timeout_);
+		int excepted = FD_ISSET(mSocket, &exceptfs);
+		int canWrite = FD_ISSET(mSocket, &writefs);
+		if(selret != 1 || excepted || !canWrite) // timeout, select error, or exceptional condition
+		{
+			disconnect();
+			return false;
+		}
+
+		// check if we are really connected, above code seems to return
+		// true if host is a unix machine even if the connection was
+		// not accepted.
+		char buffer;
+		if(recv(mSocket, &buffer, 0, 0) < 0) // zero-length probe read
+		{
+			if(errno != EWOULDBLOCK) // would-block just means no data yet, which is fine
+			{
+				disconnect();
+				return false;
+			}
+		}
+	}
+
+	socketSetBlockingInternal(mSocket, mIsBlocking); // restore the mode the user asked for
+
+#if PX_APPLE_FAMILY
+	int noSigPipe = 1;
+	setsockopt(mSocket, SOL_SOCKET, SO_NOSIGPIPE, &noSigPipe, sizeof(int));
+#endif
+
+	mIsConnected = true;
+	mPort = port;
+	mHost = host; // pointer is stored as-is; the string is not copied
+	return true;
+}
+
+// Opens a TCP listen socket bound to INADDR_ANY on the given port.
+// Returns true when the socket is bound and listening. On any failure the
+// descriptor is closed again, mListenSocket is reset to INVALID_SOCKET and
+// mListenMode stays false, so a failed call leaks nothing and accept() is
+// rejected until a later listen() succeeds.
+bool SocketImpl::listen(uint16_t port)
+{
+	mListenSocket = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if(mListenSocket == INVALID_SOCKET)
+		return false;
+
+	// enable address reuse: avoids the "Address already in use" error message
+	int yes = 1;
+	bool ok = setsockopt(mListenSocket, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(int)) != -1;
+
+	if(ok)
+	{
+		sockaddr_in addr;
+		intrinsics::memSet(&addr, 0, sizeof(addr)); // also clears sin_zero and any platform-specific fields
+		addr.sin_family = AF_INET;
+		addr.sin_port = htons(port);
+		addr.sin_addr.s_addr = INADDR_ANY;
+
+		ok = bind(mListenSocket, reinterpret_cast<sockaddr*>(&addr), sizeof(addr)) != -1 &&
+		     ::listen(mListenSocket, SOMAXCONN) != -1;
+	}
+
+	if(!ok)
+	{
+		// do not leave a half-configured descriptor behind
+		close(mListenSocket);
+		mListenSocket = INVALID_SOCKET;
+	}
+
+	mListenMode = ok;
+	return ok;
+}
+
+// Accepts one pending client on the listen socket. `block` selects whether the
+// accept call itself blocks. Returns true once a client socket has been adopted.
+bool SocketImpl::accept(bool block)
+{
+	// only valid on an unconnected socket that is in listen mode
+	if(!mListenMode || mIsConnected)
+		return false;
+
+	// apply the requested blocking mode to the listen socket for this call
+	socketSetBlockingInternal(mListenSocket, block);
+	const int32_t incoming = ::accept(mListenSocket, 0, 0);
+	if(incoming == INVALID_SOCKET)
+		return false;
+
+	mSocket = incoming;
+	mIsConnected = true;
+	socketSetBlockingInternal(mSocket, mIsBlocking); // force the mode to whatever the user set
+
+	return true;
+}
+
+// Closes the listen socket and the data socket (if open) and resets all
+// connection state. A connected data socket is switched to blocking mode and
+// shut down in both directions before it is closed.
+void SocketImpl::disconnect()
+{
+	const bool hasListenSocket = (mListenSocket != INVALID_SOCKET);
+	if(hasListenSocket)
+	{
+		close(mListenSocket);
+		mListenSocket = INVALID_SOCKET;
+	}
+
+	const bool hasDataSocket = (mSocket != INVALID_SOCKET);
+	if(hasDataSocket)
+	{
+		if(mIsConnected)
+		{
+			socketSetBlockingInternal(mSocket, true);
+			shutdown(mSocket, SHUT_RDWR);
+		}
+		close(mSocket);
+		mSocket = INVALID_SOCKET;
+	}
+
+	mHost = NULL;
+	mPort = 0;
+	mListenMode = false;
+	mIsConnected = false;
+}
+
+// True when the socket is non-blocking and errno currently reads EWOULDBLOCK,
+// i.e. the last call simply had nothing to do yet.
+bool SocketImpl::nonBlockingTimeout() const
+{
+	if(mIsBlocking)
+		return false;
+	return errno == EWOULDBLOCK;
+}
+
+#if !PX_PS4
+// Sets or clears O_NONBLOCK on the descriptor, leaving its other status flags as read by F_GETFL.
+void socketSetBlockingInternal(int32_t socket, bool blocking)
+{
+	const int current = fcntl(socket, F_GETFL, 0);
+	const int updated = blocking ? (current & ~O_NONBLOCK) : (current | O_NONBLOCK);
+	fcntl(socket, F_SETFL, updated);
+}
+#endif
+
+// should be cross-platform from here down
+
+// Records the requested blocking mode and, if a connection is open, applies it
+// to the data socket right away. A request for the current mode is a no-op.
+void SocketImpl::setBlocking(bool blocking)
+{
+	if(blocking == mIsBlocking)
+		return;
+
+	mIsBlocking = blocking;
+	if(isConnected())
+		socketSetBlockingInternal(mSocket, blocking);
+}
+
+bool SocketImpl::flush() // unbuffered socket: nothing is pending, so this always succeeds
+{
+	return true;
+}
+
+// Sends up to `length` bytes with a single send() call and returns the number
+// of bytes actually sent (0 if none). A failed or zero-byte send disconnects
+// the socket unless it is a would-block on a non-blocking socket.
+uint32_t SocketImpl::write(const uint8_t* data, uint32_t length)
+{
+	if(!length)
+		return 0;
+
+	const int sent = send(mSocket, reinterpret_cast<const char*>(data), int32_t(length), 0);
+	if(sent > 0)
+		return uint32_t(sent);
+
+	if(!nonBlockingTimeout())
+		disconnect();
+
+	return 0;
+}
+
+// Receives up to `length` bytes with a single recv() call and returns the
+// number of bytes read (0 if none). A failed or zero-byte recv disconnects the
+// socket unless it is a would-block on a non-blocking socket.
+uint32_t SocketImpl::read(uint8_t* data, uint32_t length)
+{
+	if(!length)
+		return 0;
+
+	const int32_t received = recv(mSocket, reinterpret_cast<char*>(data), int32_t(length), 0);
+	if(received > 0)
+		return uint32_t(received);
+
+	if(!nonBlockingTimeout())
+		disconnect();
+
+	return 0;
+}
+
+class BufferedSocketImpl : public SocketImpl // SocketImpl that stages outgoing bytes in a fixed-size buffer
+{
+  public:
+	BufferedSocketImpl(bool isBlocking) : SocketImpl(isBlocking), mBufferPos(0)
+	{
+	}
+	virtual ~BufferedSocketImpl()
+	{
+	}
+	bool flush(); // pushes the staged bytes to the socket
+	uint32_t write(const uint8_t* data, uint32_t length); // stages data, sending whenever the buffer fills
+
+  private:
+	uint32_t mBufferPos; // number of valid bytes currently staged in mBuffer
+	uint8_t mBuffer[Socket::DEFAULT_BUFFER_SIZE];
+};
+
+// Sends the staged bytes, retrying until everything is written or the socket
+// disconnects. The buffer is emptied either way; returns true only if every
+// staged byte was sent.
+bool BufferedSocketImpl::flush()
+{
+	uint32_t written = 0;
+
+	while(written < mBufferPos && mIsConnected)
+	{
+		const uint32_t step = SocketImpl::write(mBuffer + written, mBufferPos - written);
+		written += step;
+	}
+
+	const bool complete = (written == mBufferPos);
+	mBufferPos = 0;
+	return complete;
+}
+
+uint32_t BufferedSocketImpl::write(const uint8_t* data, uint32_t length) // returns the number of input bytes consumed
+{
+	uint32_t bytesWritten = 0;
+	while(mBufferPos + length >= Socket::DEFAULT_BUFFER_SIZE) // input fills the buffer at least once more
+	{
+		uint32_t currentChunk = Socket::DEFAULT_BUFFER_SIZE - mBufferPos; // free space left in the buffer
+		intrinsics::memCopy(mBuffer + mBufferPos, data + bytesWritten, currentChunk);
+		bytesWritten += uint32_t(currentChunk); // for the user, this is consumed even if we fail to shove it down a
+		// non-blocking socket
+
+		uint32_t sent = SocketImpl::write(mBuffer, Socket::DEFAULT_BUFFER_SIZE); // try to send the full buffer
+		mBufferPos = Socket::DEFAULT_BUFFER_SIZE - sent; // bytes that remain staged
+
+		if(sent < Socket::DEFAULT_BUFFER_SIZE) // non-blocking or error
+		{
+			if(sent) // we can reasonably hope this is rare
+				intrinsics::memMove(mBuffer, mBuffer + sent, mBufferPos); // slide the unsent tail to the front
+
+			return bytesWritten; // stop early: the rest of the input is not consumed
+		}
+		length -= currentChunk;
+	}
+
+	if(length > 0) // remainder fits without filling the buffer: just stage it
+	{
+		intrinsics::memCopy(mBuffer + mBufferPos, data + bytesWritten, length);
+		bytesWritten += length;
+		mBufferPos += length;
+	}
+
+	return bytesWritten;
+}
+
+// Allocates the backing implementation through PX_ALLOC and constructs it in
+// place: a BufferedSocketImpl when buffering is requested, a plain SocketImpl otherwise.
+Socket::Socket(bool inIsBuffering, bool isBlocking)
+{
+	if(!inIsBuffering)
+	{
+		void* storage = PX_ALLOC(sizeof(SocketImpl), "SocketImpl");
+		mImpl = PX_PLACEMENT_NEW(storage, SocketImpl)(isBlocking);
+	}
+	else
+	{
+		void* storage = PX_ALLOC(sizeof(BufferedSocketImpl), "BufferedSocketImpl");
+		mImpl = PX_PLACEMENT_NEW(storage, BufferedSocketImpl)(isBlocking);
+	}
+}
+
+Socket::~Socket() // flushes pending output, disconnects, then destroys and frees the implementation
+{
+	mImpl->flush();
+	mImpl->disconnect();
+	mImpl->~SocketImpl(); // explicit destructor call: the object was placement-constructed
+	PX_FREE(mImpl);
+}
+
+bool Socket::connect(const char* host, uint16_t port, uint32_t timeout) // forwards to the implementation; timeout in ms
+{
+	return mImpl->connect(host, port, timeout);
+}
+
+bool Socket::listen(uint16_t port) // forwards to the implementation
+{
+	return mImpl->listen(port);
+}
+
+bool Socket::accept(bool block) // forwards to the implementation
+{
+	return mImpl->accept(block);
+}
+
+void Socket::disconnect() // forwards to the implementation
+{
+	mImpl->disconnect();
+}
+
+bool Socket::isConnected() const // forwards to the implementation
+{
+	return mImpl->isConnected();
+}
+
+const char* Socket::getHost() const // host pointer given to connect(), NULL when disconnected
+{
+	return mImpl->getHost();
+}
+
+uint16_t Socket::getPort() const // port given to connect(), 0 when disconnected
+{
+	return mImpl->getPort();
+}
+
+// Flushes pending output; reports failure without touching the implementation when not connected.
+bool Socket::flush()
+{
+	return mImpl->isConnected() && mImpl->flush();
+}
+
+// Writes through the implementation; returns 0 without writing when not connected.
+uint32_t Socket::write(const uint8_t* data, uint32_t length)
+{
+	return mImpl->isConnected() ? mImpl->write(data, length) : uint32_t(0);
+}
+
+// Reads through the implementation; returns 0 without reading when not connected.
+uint32_t Socket::read(uint8_t* data, uint32_t length)
+{
+	return mImpl->isConnected() ? mImpl->read(data, length) : uint32_t(0);
+}
+
+void Socket::setBlocking(bool blocking) // forwards to the implementation
+{
+	mImpl->setBlocking(blocking);
+}
+
+bool Socket::isBlocking() const // forwards to the implementation
+{
+	return mImpl->isBlocking();
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixSync.cpp b/PxShared/src/foundation/src/unix/PsUnixSync.cpp
new file mode 100644
index 0000000..aedbbe0
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixSync.cpp
@@ -0,0 +1,165 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxAssert.h"
+
+#include "Ps.h"
+#include "PsUserAllocated.h"
+#include "PsSync.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <time.h>
+#include <sys/time.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+// Storage that SyncImpl is reinterpreted as: a condition variable, its mutex,
+// the event flag, and a counter that set() bumps each time it raises the flag.
+struct _SyncImpl
+{
+	pthread_mutex_t mutex;
+	pthread_cond_t cond;
+	volatile int setCounter;
+	volatile bool is_set;
+};
+
+// Views a SyncImpl pointer as the _SyncImpl record stored at that address.
+_SyncImpl* getSync(SyncImpl* impl)
+{
+	_SyncImpl* sync = reinterpret_cast<_SyncImpl*>(impl);
+	return sync;
+}
+}
+
+static const uint32_t gSize = sizeof(_SyncImpl); // byte size of the concrete storage behind SyncImpl
+const uint32_t& SyncImpl::getSize() // returns a reference to the file-static size constant above
+{
+	return gSize;
+}
+
+// Scope guard for a pthread mutex: locks in the constructor, unlocks in the destructor.
+struct PxUnixScopeLock
+{
+	PxUnixScopeLock(pthread_mutex_t& mutex) : mMutex(mutex)
+	{
+		pthread_mutex_lock(&mMutex);
+	}
+
+	~PxUnixScopeLock()
+	{
+		pthread_mutex_unlock(&mMutex);
+	}
+
+  private:
+	pthread_mutex_t& mMutex; // mutex owned by the caller; only referenced here
+};
+
+// Initializes the mutex and condition variable with default attributes and
+// starts with the event cleared and the set counter at zero.
+SyncImpl::SyncImpl()
+{
+	_SyncImpl* sync = getSync(this);
+
+	int status = pthread_mutex_init(&sync->mutex, 0);
+	PX_ASSERT(!status);
+	status = pthread_cond_init(&sync->cond, 0);
+	PX_ASSERT(!status);
+	PX_UNUSED(status);
+
+	sync->is_set = false;
+	sync->setCounter = 0;
+}
+
+// Destroys the condition variable first, then the mutex.
+SyncImpl::~SyncImpl()
+{
+	_SyncImpl* sync = getSync(this);
+	pthread_cond_destroy(&sync->cond);
+	pthread_mutex_destroy(&sync->mutex);
+}
+
+// Clears the event flag under the mutex; the set counter is left unchanged.
+void SyncImpl::reset()
+{
+	_SyncImpl* sync = getSync(this);
+	PxUnixScopeLock guard(sync->mutex);
+	sync->is_set = false;
+}
+
+// Raises the event under the mutex. If it was not already set, bumps the set
+// counter and wakes every thread waiting on the condition variable.
+void SyncImpl::set()
+{
+	_SyncImpl* sync = getSync(this);
+	PxUnixScopeLock guard(sync->mutex);
+	if(sync->is_set)
+		return;
+
+	sync->is_set = true;
+	sync->setCounter++;
+	pthread_cond_broadcast(&sync->cond);
+}
+
+// Waits until the event is set, or until `ms` milliseconds have elapsed
+// (uint32_t(-1) waits without a timeout). Returns true if the event is set on
+// exit or if set() ran at least once since the wait began (detected through the
+// set counter, so a set() followed by a quick reset() is still observed).
+bool SyncImpl::wait(uint32_t ms)
+{
+	_SyncImpl* sync = getSync(this);
+	PxUnixScopeLock guard(sync->mutex);
+	const int lastSetCounter = sync->setCounter;
+
+	if(!sync->is_set)
+	{
+		if(ms != uint32_t(-1))
+		{
+			// build the absolute deadline from the current wall-clock time
+			timeval now;
+			gettimeofday(&now, NULL);
+
+			const uint32_t wholeSec = ms / 1000;
+			// microseconds may accumulate to a second; normalize, otherwise
+			// pthread_cond_timedwait complains on osx.
+			const uint32_t totalUsec = uint32_t(now.tv_usec + (ms % 1000) * 1000);
+			const uint32_t carrySec = totalUsec / 1000000;
+
+			timespec deadline;
+			deadline.tv_sec = now.tv_sec + wholeSec + carrySec;
+			deadline.tv_nsec = (totalUsec % 1000000) * 1000;
+
+			// loop and re-check the state: pthread_cond_timedwait can return
+			// successfully even if it was not signaled by pthread_cond_broadcast
+			int status = 0;
+			while(!status && !sync->is_set && (lastSetCounter == sync->setCounter))
+				status = pthread_cond_timedwait(&sync->cond, &sync->mutex, &deadline);
+			PX_ASSERT((!status && sync->is_set) || (status == ETIMEDOUT) || (lastSetCounter != sync->setCounter));
+		}
+		else
+		{
+			// loop and re-check the state: pthread_cond_wait can return
+			// successfully even if it was not signaled by pthread_cond_broadcast
+			int status = 0;
+			while(!status && !sync->is_set && (lastSetCounter == sync->setCounter))
+				status = pthread_cond_wait(&sync->cond, &sync->mutex);
+			PX_ASSERT((!status && sync->is_set) || (lastSetCounter != sync->setCounter));
+		}
+	}
+
+	return sync->is_set || (lastSetCounter != sync->setCounter);
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixThread.cpp b/PxShared/src/foundation/src/unix/PsUnixThread.cpp
new file mode 100644
index 0000000..cb369e0
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixThread.cpp
@@ -0,0 +1,472 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxAssert.h"
+#include "foundation/PxErrorCallback.h"
+
+#include "Ps.h"
+#include "PsFoundation.h"
+#include "PsAtomic.h"
+#include "PsThread.h"
+
+#include <math.h>
+#if !PX_APPLE_FAMILY && !defined(ANDROID) && !defined(__CYGWIN__) && !PX_PS4 && !PX_EMSCRIPTEN
+#include <bits/local_lim.h> // PTHREAD_STACK_MIN
+#endif
+#include <stdio.h>
+#include <pthread.h>
+#include <unistd.h>
+#if !PX_PS4
+#include <sys/syscall.h>
+#if !PX_APPLE_FAMILY && !PX_EMSCRIPTEN
+#include <asm/unistd.h>
+#include <sys/resource.h>
+#endif
+#endif
+
+#if PX_APPLE_FAMILY
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <TargetConditionals.h>
+#include <pthread.h>
+#endif
+
+// fwd
+#if defined(ANDROID)
+extern "C" {
+int android_getCpuCount(void);
+}
+#endif
+
+#define PxSpinLockPause() asm("nop")
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+
+typedef enum // lifecycle values stored in _ThreadImpl::state
+{
+	_PxThreadNotStarted, // initial value set by the constructors
+	_PxThreadStarted, // set by PxThreadStart when the new thread begins running
+	_PxThreadStopped // set by quit() and kill()
+} PxThreadState;
+
+class _ThreadImpl // storage that ThreadImpl is reinterpreted as
+{
+  public:
+	ThreadImpl::ExecuteFn fn; // entry function; when NULL, arg is treated as a Runnable
+	void* arg; // argument for fn, or the Runnable to execute
+	volatile int32_t quitNow; // incremented by signalQuit(), polled by quitIsSignalled()
+	volatile int32_t threadStarted; // startup handshake flag that start() spins on
+	volatile int32_t state; // a PxThreadState value
+
+	pthread_t thread;
+	pid_t tid; // OS thread id written by setTid() from inside the thread
+
+	uint32_t affinityMask; // last mask requested; re-applied by start()
+};
+
+// Views a ThreadImpl pointer as the _ThreadImpl record stored at that address.
+_ThreadImpl* getThread(ThreadImpl* impl)
+{
+	_ThreadImpl* thread = reinterpret_cast<_ThreadImpl*>(impl);
+	return thread;
+}
+
+static void setTid(_ThreadImpl& threadImpl) // run inside the new thread: stores its id, then releases start()
+{
+// query the id of the calling thread (platform specific)
+#if PX_PS4 || (defined (TARGET_OS_TV) && TARGET_OS_TV)
+// AM: TODO: neither of the below are implemented
+#elif PX_APPLE_FAMILY
+	threadImpl.tid = syscall(SYS_gettid);
+#elif PX_EMSCRIPTEN
+	threadImpl.tid = pthread_self();
+#else
+	threadImpl.tid = syscall(__NR_gettid);
+#endif
+
+	// notify/unblock parent thread (presumably swaps threadStarted 0 -> 1; start() spins until it reads non-zero)
+	atomicCompareExchange(&(threadImpl.threadStarted), 1, 0);
+}
+
+// pthread entry point. `arg` is the ThreadImpl being started. Marks the thread
+// as started, records its id, then runs the user function if one was supplied,
+// otherwise executes the Runnable stored in `arg` (if any). Always returns 0.
+void* PxThreadStart(void* arg)
+{
+	_ThreadImpl* self = getThread(reinterpret_cast<ThreadImpl*>(arg));
+	self->state = _PxThreadStarted;
+
+	// must run in the new thread's context so the recorded id is this thread's
+	setTid(*self);
+
+	if(self->fn)
+	{
+		(*self->fn)(self->arg);
+		return 0;
+	}
+
+	if(self->arg)
+		(reinterpret_cast<Runnable*>(self->arg))->execute();
+	return 0;
+}
+}
+
+static const uint32_t gSize = sizeof(_ThreadImpl); // byte size of the concrete storage behind ThreadImpl
+const uint32_t& ThreadImpl::getSize() // returns a reference to the file-static size constant above
+{
+	return gSize;
+}
+
+ThreadImpl::Id ThreadImpl::getId() // id of the calling thread, converted from pthread_self()
+{
+	return Id(pthread_self());
+}
+
+// Puts the object in the not-started state with no entry function, no
+// argument and no affinity mask. No OS thread is created here.
+ThreadImpl::ThreadImpl()
+{
+	_ThreadImpl* self = getThread(this);
+	self->thread = 0;
+	self->tid = 0;
+	self->state = _PxThreadNotStarted;
+	self->quitNow = 0;
+	self->threadStarted = 0;
+	self->fn = NULL;
+	self->arg = NULL;
+	self->affinityMask = 0;
+}
+
+// Stores the entry function and its argument, then starts the thread
+// immediately with the default stack size.
+ThreadImpl::ThreadImpl(ThreadImpl::ExecuteFn fn, void* arg)
+{
+	_ThreadImpl* self = getThread(this);
+	self->thread = 0;
+	self->tid = 0;
+	self->state = _PxThreadNotStarted;
+	self->quitNow = 0;
+	self->threadStarted = 0;
+	self->fn = fn;
+	self->arg = arg;
+	self->affinityMask = 0;
+
+	start(0, NULL);
+}
+
+ThreadImpl::~ThreadImpl() // a thread still marked as started is killed on destruction
+{
+	if(getThread(this)->state == _PxThreadStarted)
+		kill();
+}
+
+// Creates the OS thread and blocks until it has started and recorded its id.
+//
+// stackSize: requested stack size in bytes; 0 selects getDefaultStackSize().
+//            Values below PTHREAD_STACK_MIN are raised to it (with a warning)
+//            on platforms where that macro is used.
+// runnable:  executed by the thread when neither an entry function nor an
+//            argument was supplied at construction.
+//
+// Does nothing if the thread was already started. If pthread_create fails, the
+// error is reported and the call returns with the object still in the
+// not-started state, instead of waiting forever for a thread that never runs.
+void ThreadImpl::start(uint32_t stackSize, Runnable* runnable)
+{
+	if(getThread(this)->state != _PxThreadNotStarted)
+		return;
+
+	if(stackSize == 0)
+		stackSize = getDefaultStackSize();
+
+#if defined(PTHREAD_STACK_MIN) && !defined(ANDROID)
+	if(stackSize < PTHREAD_STACK_MIN)
+	{
+		shdfnd::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__,
+		                              "ThreadImpl::start(): stack size was set below PTHREAD_STACK_MIN");
+		stackSize = PTHREAD_STACK_MIN;
+	}
+#endif
+
+	if(runnable && !getThread(this)->arg && !getThread(this)->fn)
+		getThread(this)->arg = runnable;
+
+	pthread_attr_t attr;
+	int status = pthread_attr_init(&attr);
+	PX_ASSERT(!status);
+	PX_UNUSED(status);
+
+	status = pthread_attr_setstacksize(&attr, stackSize);
+	PX_ASSERT(!status);
+	status = pthread_create(&getThread(this)->thread, &attr, PxThreadStart, this);
+	PX_ASSERT(!status);
+
+	if(status != 0)
+	{
+		// no thread exists, so nobody will ever set threadStarted: bail out
+		// instead of spinning in the handshake loop below
+		pthread_attr_destroy(&attr);
+		shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__,
+		                              "ThreadImpl::start(): pthread_create failed");
+		return;
+	}
+
+	// wait for thread to startup and write out TID
+	// otherwise TID dependent calls like setAffinity will fail.
+	while(atomicCompareExchange(&(getThread(this)->threadStarted), 1, 1) == 0)
+		yield();
+
+	// here we are sure that getThread(this)->state >= _PxThreadStarted
+
+	status = pthread_attr_destroy(&attr);
+	PX_ASSERT(!status);
+
+	// apply stored affinity mask
+	if(getThread(this)->affinityMask)
+		setAffinityMask(getThread(this)->affinityMask);
+}
+
+void ThreadImpl::signalQuit() // raises the quit request that quitIsSignalled() polls; does not stop the thread itself
+{
+	atomicIncrement(&(getThread(this)->quitNow));
+}
+
+// Joins the thread. Returns false without joining if the thread was never started.
+bool ThreadImpl::waitForQuit()
+{
+	_ThreadImpl* self = getThread(this);
+	const bool wasStarted = (self->state != _PxThreadNotStarted);
+
+	// works also with a stopped/exited thread if the handle is still valid
+	if(wasStarted)
+		pthread_join(self->thread, NULL);
+
+	return wasStarted;
+}
+
+bool ThreadImpl::quitIsSignalled() // true once quitNow is non-zero; the compare-exchange (0 for 0) serves as an atomic read
+{
+	return atomicCompareExchange(&(getThread(this)->quitNow), 0, 0) != 0;
+}
+
+#if defined(PX_GCC_FAMILY)
+__attribute__((noreturn))
+#endif
+    void ThreadImpl::quit() // marks the thread stopped and terminates the calling thread
+{
+	getThread(this)->state = _PxThreadStopped;
+	pthread_exit(0); // does not return
+}
+
+// Cancels a started thread with pthread_cancel and marks it stopped.
+// On Android this only reports a warning, because cancellation is not implemented there.
+void ThreadImpl::kill()
+{
+#if defined(ANDROID)
+	shdfnd::getFoundation().error(PxErrorCode::eDEBUG_WARNING, __FILE__, __LINE__,
+	                              "ThreadImpl::kill() called, but is not implemented");
+#else
+	_ThreadImpl* self = getThread(this);
+	if(self->state == _PxThreadStarted)
+		pthread_cancel(self->thread);
+	self->state = _PxThreadStopped;
+#endif
+}
+
+// Suspends the calling thread for `ms` milliseconds.
+// The duration is split into whole seconds and leftover nanoseconds, as
+// timespec requires. If nanosleep is interrupted it writes the unslept
+// remainder back into sleepTime, so the retry loop sleeps only the rest.
+void ThreadImpl::sleep(uint32_t ms)
+{
+	timespec sleepTime;
+	sleepTime.tv_sec = ms / 1000;                // whole seconds (was wrongly set to a millisecond count)
+	sleepTime.tv_nsec = (ms % 1000) * 1000000L; // leftover milliseconds -> nanoseconds, always < 1e9
+
+	while(nanosleep(&sleepTime, &sleepTime) == -1)
+		continue;
+}
+
+void ThreadImpl::yield() // gives up the rest of the calling thread's time slice
+{
+	sched_yield();
+}
+
+#if PX_PS4
+uint32_t setAffinityMaskPS4(pthread_t, uint32_t);
+#endif
+
+uint32_t ThreadImpl::setAffinityMask(uint32_t mask) // returns the previous mask (low 32 bits), or 0 on failure / when not applied
+{
+	// Same as windows impl if mask is zero
+	if(!mask)
+		return 0;
+
+	getThread(this)->affinityMask = mask; // remembered so start() can apply it once the thread exists
+
+	uint64_t prevMask = 0;
+
+	if(getThread(this)->state == _PxThreadStarted) // only a running thread has a tid to apply the mask to
+	{
+#if PX_PS4
+		prevMask = setAffinityMaskPS4(getThread(this)->thread, mask);
+#elif PX_EMSCRIPTEN
+		// not supported
+#elif !PX_APPLE_FAMILY // Apple doesn't support syscall with getaffinity and setaffinity
+		int32_t errGet = syscall(__NR_sched_getaffinity, getThread(this)->tid, sizeof(prevMask), &prevMask);
+		if(errGet < 0)
+			return 0;
+
+		int32_t errSet = syscall(__NR_sched_setaffinity, getThread(this)->tid, sizeof(mask), &mask);
+		if(errSet != 0)
+			return 0;
+#endif
+	}
+
+	return uint32_t(prevMask); // truncated to the low 32 bits
+}
+
+void ThreadImpl::setName(const char* name) // names the thread on Android (API level > 8); a no-op elsewhere
+{
+#if(defined(ANDROID) && (__ANDROID_API__ > 8))
+	pthread_setname_np(getThread(this)->thread, name);
+#else
+	// not implemented because most unix APIs expect setName()
+	// to be called from the thread's context. Example see next comment:
+
+	// this works only with the current thread and can rename
+	// the main process if used in the wrong context:
+	// prctl(PR_SET_NAME, reinterpret_cast<unsigned long>(name) ,0,0,0);
+	PX_UNUSED(name);
+#endif
+}
+
+#if !PX_APPLE_FAMILY
+// Maps a scheduler priority for the given policy onto the ThreadPriority scale,
+// where eHIGH is 0 and eLOW is the largest value. The mapping is linear: the
+// policy's maximum priority becomes eHIGH and its minimum becomes eLOW.
+// Policies with a single priority level map to eNORMAL.
+static ThreadPriority::Enum convertPriorityFromLinux(uint32_t inPrio, int policy)
+{
+	PX_COMPILE_TIME_ASSERT(ThreadPriority::eLOW > ThreadPriority::eHIGH);
+	PX_COMPILE_TIME_ASSERT(ThreadPriority::eHIGH == 0);
+
+	const int highest = sched_get_priority_max(policy);
+	const int lowest = sched_get_priority_min(policy);
+	const int linuxSpan = highest - lowest;
+	const int nvSpan = ThreadPriority::eLOW - ThreadPriority::eHIGH;
+
+	// case for default scheduler policy
+	if(linuxSpan == 0)
+		return ThreadPriority::eNORMAL;
+
+	const float scaled = (float(highest - inPrio) * float(nvSpan)) / float(linuxSpan);
+	return ThreadPriority::Enum(int(roundf(scaled)));
+}
+
+// Maps a ThreadPriority value onto the priority range of the given scheduler
+// policy. The mapping is linear: eLOW becomes the policy's minimum and eHIGH its
+// maximum. Policies with a single priority level yield 0.
+static int convertPriorityToLinux(ThreadPriority::Enum inPrio, int policy)
+{
+	const int highest = sched_get_priority_max(policy);
+	const int lowest = sched_get_priority_min(policy);
+	const int linuxSpan = highest - lowest;
+	const int nvSpan = ThreadPriority::eLOW - ThreadPriority::eHIGH;
+
+	// case for default scheduler policy
+	if(linuxSpan == 0)
+		return 0;
+
+	const float scaled = (float(ThreadPriority::eLOW - inPrio) * float(linuxSpan)) / float(nvSpan);
+	return lowest + int(roundf(scaled));
+}
+#endif
+
+// Applies a ThreadPriority to this thread, keeping its current scheduling
+// policy. Compiled to a no-op on Apple platforms.
+void ThreadImpl::setPriority(ThreadPriority::Enum val)
+{
+	PX_UNUSED(val);
+#if !PX_APPLE_FAMILY
+	pthread_t handle = getThread(this)->thread;
+
+	int policy;
+	sched_param param;
+	pthread_getschedparam(handle, &policy, &param);
+
+	param.sched_priority = convertPriorityToLinux(val, policy);
+	pthread_setschedparam(handle, policy, &param);
+#endif
+}
+
+// Returns the ThreadPriority of the thread identified by `pthread`.
+// Yields eNORMAL on Apple platforms and whenever the scheduling parameters
+// cannot be queried.
+ThreadPriority::Enum ThreadImpl::getPriority(Id pthread)
+{
+	PX_UNUSED(pthread);
+#if PX_APPLE_FAMILY
+	return ThreadPriority::eNORMAL;
+#else
+	int policy;
+	sched_param param;
+	if(pthread_getschedparam(pthread_t(pthread), &policy, &param) != 0)
+		return ThreadPriority::eNORMAL;
+
+	return convertPriorityFromLinux(param.sched_priority, policy);
+#endif
+}
+
+uint32_t ThreadImpl::getNbPhysicalCores() // core count from the best source per platform; 0 if it cannot be determined
+{
+#if PX_APPLE_FAMILY
+	int count;
+	size_t size = sizeof(count);
+	return sysctlbyname("hw.physicalcpu", &count, &size, NULL, 0) ? 0 : count; // non-zero sysctl result means failure
+#elif defined(ANDROID)
+	return android_getCpuCount();
+#else
+	// Linux exposes CPU topology using /sys/devices/system/cpu
+	// https://www.kernel.org/doc/Documentation/cputopology.txt
+	if(FILE* f = fopen("/sys/devices/system/cpu/possible", "r"))
+	{
+		int minIndex, maxIndex;
+		int n = fscanf(f, "%d-%d", &minIndex, &maxIndex); // content is either a range "a-b" or a single index "a"
+		fclose(f);
+
+		if(n == 2)
+			return (maxIndex - minIndex) + 1;
+		else if(n == 1)
+			return minIndex + 1;
+	}
+
+#if PX_PS4
+	// Reducing to 6 to take into account that the OS appears to use 2 cores at peak currently.
+	return 6;
+#else
+	// For non-Linux kernels this fallback is possibly the best we can do
+	// but will report logical (hyper-threaded) counts
+	int n = sysconf(_SC_NPROCESSORS_CONF);
+	if(n < 0)
+		return 0;
+	else
+		return n;
+#endif
+#endif
+}
+
+// Creates a new pthread thread-specific-data key (no destructor) and returns it as a 32-bit index.
+uint32_t TlsAlloc()
+{
+	pthread_key_t slot;
+	const int rc = pthread_key_create(&slot, NULL);
+	PX_ASSERT(!rc);
+	PX_UNUSED(rc);
+	return uint32_t(slot);
+}
+
+// Deletes the thread-specific-data key previously returned by TlsAlloc().
+void TlsFree(uint32_t index)
+{
+	const int rc = pthread_key_delete(pthread_key_t(index));
+	PX_ASSERT(!rc);
+	PX_UNUSED(rc);
+}
+
+// Returns the calling thread's value stored under the given key.
+void* TlsGet(uint32_t index)
+{
+	const pthread_key_t slot = pthread_key_t(index);
+	return reinterpret_cast<void*>(pthread_getspecific(slot));
+}
+
+// Stores `value` for the calling thread under the given key.
+// Returns 1 on success and 0 on failure.
+uint32_t TlsSet(uint32_t index, void* value)
+{
+	const int rc = pthread_setspecific(pthread_key_t(index), value);
+	PX_ASSERT(!rc);
+	return rc == 0 ? 1u : 0u;
+}
+
+// DM: On Linux x86-32, without implementation-specific restrictions
+// the default stack size for a new thread should be 2 megabytes (kernel.org).
+// NOTE: take care of this value on other architectures!
+uint32_t ThreadImpl::getDefaultStackSize() // stack size in bytes that start() uses when the caller passes 0
+{
+	return 1 << 21; // 2 MiB
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/unix/PsUnixTime.cpp b/PxShared/src/foundation/src/unix/PsUnixTime.cpp
new file mode 100644
index 0000000..43b94f5
--- /dev/null
+++ b/PxShared/src/foundation/src/unix/PsUnixTime.cpp
@@ -0,0 +1,120 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "Ps.h"
+#include "PsTime.h"
+
+#include <time.h>
+#include <sys/time.h>
+
+#if PX_APPLE_FAMILY
+#include <mach/mach_time.h>
+#endif
+
+// Use real-time high-precision timer.
+#if !PX_APPLE_FAMILY
+#define CLOCKID CLOCK_REALTIME
+#endif
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Counter frequency captured once, when this file's statics are initialized.
+static const CounterFrequencyToTensOfNanos gCounterFreq = Time::getCounterFrequency();
+
+// Hands out the cached value above instead of querying the clock again.
+const CounterFrequencyToTensOfNanos& Time::getBootCounterFrequency()
+{
+	const CounterFrequencyToTensOfNanos& cached = gCounterFreq;
+	return cached;
+}
+
+// Returns the current time reported by gettimeofday() as fractional seconds
+// (whole seconds plus microseconds scaled by 1e-6).
+static Time::Second getTimeSeconds()
+{
+	// A plain local on purpose: a function-static buffer would be shared by every
+	// calling thread and written concurrently, which is a data race.
+	struct timeval now;
+	gettimeofday(&now, NULL);
+	return double(now.tv_sec) + double(now.tv_usec) * 0.000001;
+}
+
+// Records the construction time as the reference point for elapsed queries.
+Time::Time() : mLastTime(getTimeSeconds())
+{
+}
+
+// Returns the seconds elapsed since the stored reference time and moves the
+// reference time forward to now.
+Time::Second Time::getElapsedSeconds()
+{
+	const Time::Second now = getTimeSeconds();
+	const Time::Second elapsed = now - mLastTime;
+	mLastTime = now;
+	return elapsed;
+}
+
+// Same measurement as getElapsedSeconds(), but the stored reference time is
+// left untouched.
+Time::Second Time::peekElapsedSeconds()
+{
+	const Time::Second now = getTimeSeconds();
+	return now - mLastTime;
+}
+
+// Returns the stored reference time, in seconds.
+Time::Second Time::getLastTime() const
+{
+	const Time::Second reference = mLastTime;
+	return reference;
+}
+
+#if PX_APPLE_FAMILY
+// Builds the counter conversion from the Mach timebase ratio.
+CounterFrequencyToTensOfNanos Time::getCounterFrequency()
+{
+	mach_timebase_info_data_t timebase;
+	mach_timebase_info(&timebase);
+	// mach_absolute_time * (numer / denom) is in nanoseconds; the denominator
+	// passed on is additionally multiplied by 10
+	const uint64_t numerator = timebase.numer;
+	const uint64_t denominator = timebase.denom * 10;
+	return CounterFrequencyToTensOfNanos(numerator, denominator);
+}
+
+// Returns the raw mach_absolute_time() tick count.
+uint64_t Time::getCurrentCounterValue()
+{
+	const uint64_t ticks = mach_absolute_time();
+	return ticks;
+}
+
+#else
+
+// The counter on this path already counts nanoseconds (see
+// getCurrentCounterValue), so the conversion is the fixed ratio 1 / 10.
+CounterFrequencyToTensOfNanos Time::getCounterFrequency()
+{
+	const CounterFrequencyToTensOfNanos nanosToTens(1, 10);
+	return nanosToTens;
+}
+
+// Reads the CLOCKID clock and returns its value as a nanosecond count.
+uint64_t Time::getCurrentCounterValue()
+{
+	struct timespec now;
+	clock_gettime(CLOCKID, &now);
+	// Widen first so the seconds-to-nanoseconds product is computed in 64 bits
+	const uint64_t wholeSeconds = static_cast<uint64_t>(now.tv_sec);
+	const uint64_t nanoRemainder = static_cast<uint64_t>(now.tv_nsec);
+	return (wholeSeconds * 1000000000) + nanoRemainder;
+}
+#endif
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsAtomic.cpp b/PxShared/src/foundation/src/windows/PsWindowsAtomic.cpp
new file mode 100644
index 0000000..97cdba2
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsAtomic.cpp
@@ -0,0 +1,96 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "windows/PsWindowsInclude.h"
+#include "PsAtomic.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Atomically stores val2 into *val and returns the result of
+// InterlockedExchange (the value held before the store).
+int32_t atomicExchange(volatile int32_t* val, int32_t val2)
+{
+	volatile LONG* target = reinterpret_cast<volatile LONG*>(val);
+	const LONG previous = InterlockedExchange(target, static_cast<LONG>(val2));
+	return static_cast<int32_t>(previous);
+}
+
+// Atomically replaces *dest with exch if *dest equals comp; returns what
+// InterlockedCompareExchange reports (the value observed in *dest).
+int32_t atomicCompareExchange(volatile int32_t* dest, int32_t exch, int32_t comp)
+{
+	volatile LONG* target = reinterpret_cast<volatile LONG*>(dest);
+	const LONG observed = InterlockedCompareExchange(target, exch, comp);
+	return static_cast<int32_t>(observed);
+}
+
+// Pointer-sized variant of atomicCompareExchange.
+void* atomicCompareExchangePointer(volatile void** dest, void* exch, void* comp)
+{
+	// C-style cast kept deliberately: it moves the volatile qualifier between
+	// pointer levels, which reinterpret_cast alone may not do.
+	volatile PVOID* target = (volatile PVOID*)dest;
+	void* observed = InterlockedCompareExchangePointer(target, exch, comp);
+	return observed;
+}
+
+// Atomically increments *val and returns the InterlockedIncrement result.
+int32_t atomicIncrement(volatile int32_t* val)
+{
+	volatile LONG* target = reinterpret_cast<volatile LONG*>(val);
+	const LONG result = InterlockedIncrement(target);
+	return static_cast<int32_t>(result);
+}
+
+// Atomically decrements *val and returns the InterlockedDecrement result.
+int32_t atomicDecrement(volatile int32_t* val)
+{
+	volatile LONG* target = reinterpret_cast<volatile LONG*>(val);
+	const LONG result = InterlockedDecrement(target);
+	return static_cast<int32_t>(result);
+}
+
+// Atomically adds delta to *val and returns the resulting (new) value.
+int32_t atomicAdd(volatile int32_t* val, int32_t delta)
+{
+	// InterlockedExchangeAdd does the read-modify-write as one atomic operation,
+	// so no compare-exchange retry loop is needed. It returns the value *val
+	// held beforehand; adding delta gives the value that was stored.
+	const LONG oldValue = InterlockedExchangeAdd((volatile LONG*)val, (LONG)delta);
+	return (int32_t)(oldValue + delta);
+}
+
+// Atomically raises *val to val2 when val2 is larger and returns the value
+// that the successful compare-exchange left in *val.
+int32_t atomicMax(volatile int32_t* val, int32_t val2)
+{
+	// Could do this more efficiently in asm...
+	volatile LONG* target = (volatile LONG*)val;
+	LONG observed, desired;
+
+	for(;;)
+	{
+		observed = *val;
+		desired = (val2 > observed) ? val2 : observed;
+
+		// retry if another thread changed *val between the read and the exchange
+		if(InterlockedCompareExchange(target, desired, observed) == observed)
+			break;
+	}
+
+	return desired;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsCpu.cpp b/PxShared/src/foundation/src/windows/PsWindowsCpu.cpp
new file mode 100644
index 0000000..14c78fb
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsCpu.cpp
@@ -0,0 +1,64 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsCpu.h"
+#pragma warning(push)
+//'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4668)
+#if PX_VC == 10
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#endif
+#include <intrin.h>
+#pragma warning(pop)
+
+namespace physx
+{
+namespace shdfnd
+{
+
+#if PX_ARM
+// ARM build: no cpuid query is made, the four result words are simply zero,
+// so the returned id is always 0.
+uint8_t Cpu::getCpuId()
+{
+	uint32_t cpuInfo[4] = { 0, 0, 0, 0 };
+	return static_cast<uint8_t>(cpuInfo[1] >> 24); // APIC Physical ID
+}
+#else
+// Queries cpuid with info type 1 and returns bits 24..31 of the second
+// result word.
+uint8_t Cpu::getCpuId()
+{
+	int regs[4];
+	const int infoType = 1;
+	__cpuid(regs, infoType);
+	return static_cast<uint8_t>(regs[1] >> 24); // APIC Physical ID
+}
+#endif
+}
+}
diff --git a/PxShared/src/foundation/src/windows/PsWindowsFPU.cpp b/PxShared/src/foundation/src/windows/PsWindowsFPU.cpp
new file mode 100644
index 0000000..e83cccf
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsFPU.cpp
@@ -0,0 +1,88 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+#include "PsFPU.h"
+#include "float.h"
+#include "PsIntrinsics.h"
+
+// Mask of every control-word field touched by FPUGuard. The expansion is
+// parenthesized so it stays a single operand wherever the macro is used.
+#if PX_X64
+#define _MCW_ALL (_MCW_DN | _MCW_EM | _MCW_RC)
+#else
+#define _MCW_ALL (_MCW_DN | _MCW_EM | _MCW_IC | _MCW_RC | _MCW_PC)
+#endif
+
+// Saves the current floating-point control state in mControlWords, then
+// switches to the default control word with denormal flushing enabled.
+physx::shdfnd::FPUGuard::FPUGuard()
+{
+	// default plus FTZ and DAZ
+	const unsigned int guardedState = _CW_DEFAULT | _DN_FLUSH;
+
+#if PX_X64
+	// snapshot the current control word (a zero mask changes nothing)
+	_controlfp_s(mControlWords, 0, 0);
+
+	// apply the guarded state; the resulting word is not needed
+	unsigned int ignored;
+	_controlfp_s(&ignored, guardedState, _MCW_ALL);
+#else
+	// snapshot the x87 and SSE control words into separate slots
+	__control87_2(0, 0, mControlWords, mControlWords + 1);
+
+	// apply the guarded state to both units; the outputs are not needed
+	unsigned int ignoredX87, ignoredSse;
+	__control87_2(guardedState, _MCW_ALL, &ignoredX87, &ignoredSse);
+#endif
+}
+
+// Clears the floating-point status word and writes the control state saved by
+// the constructor back.
+physx::shdfnd::FPUGuard::~FPUGuard()
+{
+	_clearfp();
+
+#if PX_X64
+	// reset FP state from the single saved control word
+	unsigned int ignored;
+	_controlfp_s(&ignored, mControlWords[0], _MCW_ALL);
+#else
+	// reset FP state: one call per unit, each passing a null pointer for the
+	// other unit's output
+	unsigned int ignoredX87, ignoredSse;
+	__control87_2(mControlWords[0], _MCW_ALL, &ignoredX87, 0);
+	__control87_2(mControlWords[1], _MCW_ALL, 0, &ignoredSse);
+#endif
+}
+
+// Enables floating-point exceptions, leaving inexact and underflow masked.
+void physx::shdfnd::enableFPExceptions()
+{
+	// clear any pending exceptions
+	_clearfp();
+
+	// Only the _MCW_EM field is written. Within it, inexact and underflow stay
+	// masked (common, benign); all other exception bits are cleared.
+	const uint32_t stillMasked = uint32_t(~_MCW_EM) | _EM_INEXACT | _EM_UNDERFLOW;
+	_controlfp_s(NULL, stillMasked, _MCW_EM);
+}
+
+// Masks every floating-point exception by setting all bits of the _MCW_EM field.
+void physx::shdfnd::disableFPExceptions()
+{
+	const unsigned int allMasked = _MCW_EM;
+	_controlfp_s(NULL, allMasked, _MCW_EM);
+}
diff --git a/PxShared/src/foundation/src/windows/PsWindowsMutex.cpp b/PxShared/src/foundation/src/windows/PsWindowsMutex.cpp
new file mode 100644
index 0000000..6174b96
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsMutex.cpp
@@ -0,0 +1,163 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "windows/PsWindowsInclude.h"
+#include "PsFoundation.h"
+#include "PsUserAllocated.h"
+#include "PsMutex.h"
+#include "PsThread.h"
+#include "foundation/PxErrorCallback.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+// Windows-specific state that MutexImpl's storage is reinterpreted as.
+struct MutexWinImpl
+{
+	CRITICAL_SECTION mLock; // the underlying OS lock
+	Thread::Id mOwner;      // zeroed on construction; set to the locking thread in PX_DEBUG builds
+};
+
+// Views the opaque MutexImpl storage as the Windows implementation struct.
+MutexWinImpl* getMutex(MutexImpl* impl)
+{
+	MutexWinImpl* const winImpl = reinterpret_cast<MutexWinImpl*>(impl);
+	return winImpl;
+}
+}
+
+// Initializes the critical section and clears the recorded owner.
+MutexImpl::MutexImpl()
+{
+	MutexWinImpl* const self = getMutex(this);
+	InitializeCriticalSection(&self->mLock);
+	self->mOwner = 0;
+}
+
+// Releases the OS resources held by the critical section.
+MutexImpl::~MutexImpl()
+{
+	MutexWinImpl* const self = getMutex(this);
+	DeleteCriticalSection(&self->mLock);
+}
+
+// Blocks until the critical section is entered. Debug builds also record the
+// calling thread as owner so unlock() can verify it.
+void MutexImpl::lock()
+{
+	MutexWinImpl* const self = getMutex(this);
+	EnterCriticalSection(&self->mLock);
+
+#if PX_DEBUG
+	self->mOwner = Thread::getId();
+#endif
+}
+
+// Attempts to enter the critical section without blocking; returns true when
+// the lock was taken.
+bool MutexImpl::trylock()
+{
+	MutexWinImpl* const self = getMutex(this);
+	const bool acquired = TryEnterCriticalSection(&self->mLock) != 0;
+#if PX_DEBUG
+	// only a successful attempt changes the recorded owner
+	if(acquired)
+		self->mOwner = Thread::getId();
+#endif
+	return acquired;
+}
+
+// Leaves the critical section. In debug builds an unlock from a thread other
+// than the recorded owner is reported as an error and the lock is left held.
+void MutexImpl::unlock()
+{
+	MutexWinImpl* const self = getMutex(this);
+
+#if PX_DEBUG
+	// ensure we are already holding the lock
+	const bool calledByOwner = (self->mOwner == Thread::getId());
+	if(!calledByOwner)
+	{
+		shdfnd::getFoundation().error(PxErrorCode::eINVALID_OPERATION, __FILE__, __LINE__,
+		                              "Mutex must be unlocked only by thread that has already acquired lock");
+		return;
+	}
+#endif
+
+	LeaveCriticalSection(&self->mLock);
+}
+
+// Bytes of storage the Windows mutex implementation needs.
+static const uint32_t gSize = sizeof(MutexWinImpl);
+
+// Returns a reference to the constant above.
+const uint32_t& MutexImpl::getSize()
+{
+	const uint32_t& implSize = gSize;
+	return implSize;
+}
+
+// State behind ReadWriteLock: a mutex plus a count of active readers.
+class ReadWriteLockImpl
+{
+	PX_NOCOPY(ReadWriteLockImpl)
+  public:
+	// readerCount is not set here; ReadWriteLock's constructor zeroes it
+	ReadWriteLockImpl() {}
+
+	Mutex mutex;               // held by a writer for the whole write; taken briefly by readers
+	volatile LONG readerCount; // handle recursive writer locking
+};
+
+// Allocates the implementation, constructs it in place and starts with no
+// readers.
+ReadWriteLock::ReadWriteLock()
+{
+	void* storage = PX_ALLOC(sizeof(ReadWriteLockImpl), "ReadWriteLockImpl");
+	mImpl = reinterpret_cast<ReadWriteLockImpl*>(storage);
+	PX_PLACEMENT_NEW(mImpl, ReadWriteLockImpl);
+
+	mImpl->readerCount = 0;
+}
+
+// Destroys the implementation in place, then frees its storage.
+ReadWriteLock::~ReadWriteLock()
+{
+	ReadWriteLockImpl* const impl = mImpl;
+	impl->~ReadWriteLockImpl();
+	PX_FREE(mImpl);
+}
+
+// Registers a reader. The mutex is held only while the count is bumped, so a
+// writer that holds the mutex blocks new readers here.
+void ReadWriteLock::lockReader()
+{
+	ReadWriteLockImpl& impl = *mImpl;
+
+	impl.mutex.lock();
+	InterlockedIncrement(&impl.readerCount);
+	impl.mutex.unlock();
+}
+
+// Takes the mutex and keeps it until unlockWriter(), then busy-waits until
+// all registered readers have left.
+void ReadWriteLock::lockWriter()
+{
+	mImpl->mutex.lock();
+
+	// spin lock until no readers
+	while(mImpl->readerCount != 0)
+	{
+	}
+}
+
+// Unregisters a reader; the mutex is not taken for this.
+void ReadWriteLock::unlockReader()
+{
+	volatile LONG* const readers = &mImpl->readerCount;
+	InterlockedDecrement(readers);
+}
+
+// Releases the mutex taken by lockWriter().
+void ReadWriteLock::unlockWriter()
+{
+	ReadWriteLockImpl& impl = *mImpl;
+	impl.mutex.unlock();
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsPrintString.cpp b/PxShared/src/foundation/src/windows/PsWindowsPrintString.cpp
new file mode 100644
index 0000000..ee72037
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsPrintString.cpp
@@ -0,0 +1,54 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsString.h"
+#include <stdio.h>
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#include <windows.h>
+#pragma warning(pop)
+
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// Writes str followed by a newline to stdout and to the debugger output.
+void printString(const char* str)
+{
+	// puts rather than printf: str is emitted verbatim, so any '%' characters
+	// in it are not treated as format specifiers
+	puts(str);
+
+	// mirror the same line to the debugger
+	OutputDebugStringA(str);
+	OutputDebugStringA("\n");
+}
+}
+
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsSList.cpp b/PxShared/src/foundation/src/windows/PsWindowsSList.cpp
new file mode 100644
index 0000000..146785e
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsSList.cpp
@@ -0,0 +1,79 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "windows/PsWindowsInclude.h"
+#include "PsAllocator.h"
+#include "PsSList.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+namespace
+{
+// Views the opaque implementation storage as the Win32 SLIST_HEADER it holds.
+template <typename T>
+SLIST_HEADER* getDetail(T* impl)
+{
+	SLIST_HEADER* const header = reinterpret_cast<SLIST_HEADER*>(impl);
+	return header;
+}
+}
+
+// Initializes the list header stored in this object.
+SListImpl::SListImpl()
+{
+	SLIST_HEADER* const header = getDetail(this);
+	InitializeSListHead(header);
+}
+
+// Nothing to release: the destructor body is intentionally empty.
+SListImpl::~SListImpl()
+{
+}
+
+// Pushes entry onto the list with InterlockedPushEntrySList.
+void SListImpl::push(SListEntry* entry)
+{
+	SLIST_ENTRY* const node = reinterpret_cast<SLIST_ENTRY*>(entry);
+	InterlockedPushEntrySList(getDetail(this), node);
+}
+
+// Pops one entry with InterlockedPopEntrySList and returns its result.
+SListEntry* SListImpl::pop()
+{
+	SLIST_HEADER* const header = getDetail(this);
+	return reinterpret_cast<SListEntry*>(InterlockedPopEntrySList(header));
+}
+
+// Detaches the whole list with InterlockedFlushSList and returns its result.
+SListEntry* SListImpl::flush()
+{
+	SLIST_HEADER* const header = getDetail(this);
+	return reinterpret_cast<SListEntry*>(InterlockedFlushSList(header));
+}
+
+// Bytes of storage the Windows list header needs.
+static const uint32_t gSize = sizeof(SLIST_HEADER);
+
+// Returns a reference to the constant above.
+const uint32_t& SListImpl::getSize()
+{
+	const uint32_t& headerSize = gSize;
+	return headerSize;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsSocket.cpp b/PxShared/src/foundation/src/windows/PsWindowsSocket.cpp
new file mode 100644
index 0000000..bd253b9
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsSocket.cpp
@@ -0,0 +1,446 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "foundation/PxIntrinsics.h"
+
+#include "windows/PsWindowsInclude.h"
+#include "PsSocket.h"
+#include "PsThread.h"
+#include "PsArray.h"
+
+#include <Winsock2.h>
+#pragma comment(lib, "Ws2_32")
+
+namespace physx
+{
+namespace shdfnd
+{
+
+// 32 KiB; also the capacity of BufferedSocketImpl's staging buffer.
+const uint32_t Socket::DEFAULT_BUFFER_SIZE = 32u * 1024u;
+
+// Unbuffered Winsock TCP socket. One object can act as a client (connect) or
+// as a single-connection server (listen + accept).
+class SocketImpl
+{
+  public:
+	SocketImpl(bool isBlocking);
+	virtual ~SocketImpl();
+
+	// connection management; timeout is in milliseconds
+	bool connect(const char* host, uint16_t port, uint32_t timeout);
+	bool listen(uint16_t port);
+	bool accept(bool block);
+	void disconnect();
+
+	void setBlocking(bool blocking);
+
+	// data transfer; write/flush are virtual so BufferedSocketImpl can stage data
+	virtual uint32_t write(const uint8_t* data, uint32_t length);
+	virtual bool flush();
+	uint32_t read(uint8_t* data, uint32_t length);
+
+	// simple state accessors
+	PX_FORCE_INLINE bool isBlocking() const { return mIsBlocking; }
+	PX_FORCE_INLINE bool isConnected() const { return mIsConnected; }
+	PX_FORCE_INLINE const char* getHost() const { return mHost; }
+	PX_FORCE_INLINE uint16_t getPort() const { return mPort; }
+
+  protected:
+	bool nonBlockingTimeout() const;
+	void setBlockingInternal(SOCKET socket, bool blocking);
+
+	mutable SOCKET mSocket;      // connected socket, INVALID_SOCKET when none
+	SOCKET mListenSocket;        // listening socket, INVALID_SOCKET when none
+	const char* mHost;           // pointer passed to connect(); the string is not copied
+	uint16_t mPort;              // port passed to connect(), 0 after disconnect()
+	mutable bool mIsConnected;
+	bool mIsBlocking;            // mode requested by the user
+	bool mListenMode;            // set by listen(), cleared by disconnect()
+	bool mSocketLayerIntialized; // true when WSAStartup succeeded in the constructor
+};
+
+// Puts every member into its "not connected" state and starts Winsock 2.2.
+// Whether the start-up succeeded is kept in mSocketLayerIntialized.
+SocketImpl::SocketImpl(bool isBlocking)
+: mSocket(INVALID_SOCKET)
+, mListenSocket(INVALID_SOCKET)
+, mHost(NULL)
+, mPort(0)
+, mIsConnected(false)
+, mIsBlocking(isBlocking)
+, mListenMode(false)
+, mSocketLayerIntialized(false)
+{
+	const WORD requestedVersion = MAKEWORD(2, 2);
+	WSADATA startupInfo;
+	mSocketLayerIntialized = (WSAStartup(requestedVersion, &startupInfo) == 0);
+}
+
+// Balances the constructor's WSAStartup, but only if that call succeeded.
+SocketImpl::~SocketImpl()
+{
+	if(!mSocketLayerIntialized)
+		return;
+
+	WSACleanup();
+}
+
+// Switches the given socket between blocking and non-blocking mode.
+void SocketImpl::setBlockingInternal(SOCKET socket, bool blocking)
+{
+	// FIONBIO takes a flag: non-zero selects non-blocking mode
+	uint32_t nonBlockingFlag = blocking ? 0u : 1u;
+	ioctlsocket(socket, FIONBIO, (u_long*)&nonBlockingFlag);
+}
+
+#ifdef PX_VC11
+#pragma warning(push)
+#pragma warning(disable : 4548) // for FD_SET on vc11 only
+#endif
+// Opens a TCP connection to host:port, waiting at most `timeout` milliseconds
+// for the connection to be established.
+//
+// host    - host name, or dotted address used as a fallback lookup; the pointer
+//           itself is stored in mHost on success (the string is not copied)
+// port    - TCP port in host byte order
+// timeout - upper bound for the connect, in milliseconds
+//
+// Returns true when connected. Returns false when Winsock was not initialized,
+// the host cannot be resolved, no socket can be created, or the connection is
+// not writable within the timeout.
+bool SocketImpl::connect(const char* host, uint16_t port, uint32_t timeout)
+{
+	if(!mSocketLayerIntialized)
+		return false;
+
+	sockaddr_in socketAddress;
+	hostent* hp;
+
+	intrinsics::memSet(&socketAddress, 0, sizeof(sockaddr_in));
+	socketAddress.sin_family = AF_INET;
+	socketAddress.sin_port = htons(port);
+
+	// get host: try a name lookup first, then fall back to treating the string
+	// as a dotted address
+	hp = gethostbyname(host);
+	if(!hp)
+	{
+		in_addr a;
+		a.s_addr = inet_addr(host);
+		hp = gethostbyaddr((const char*)&a, sizeof(in_addr), AF_INET);
+		if(!hp)
+			return false;
+	}
+	// only the first address of the host entry is used
+	intrinsics::memCopy(&socketAddress.sin_addr, hp->h_addr_list[0], (uint32_t)hp->h_length);
+
+	// connect
+	mSocket = socket(PF_INET, SOCK_STREAM, 0);
+	if(mSocket == INVALID_SOCKET)
+		return false;
+
+	// always connect in non-blocking mode so the timeout can be enforced with
+	// select(); the user's mode is applied once the connection is up
+	setBlockingInternal(mSocket, false);
+
+	// the return value is not inspected here; completion is detected by the
+	// select() below
+	::connect(mSocket, (sockaddr*)&socketAddress, sizeof(socketAddress));
+	// Setup select function call to monitor the connect call.
+	fd_set writefs;
+	fd_set exceptfs;
+	FD_ZERO(&writefs);
+	FD_ZERO(&exceptfs);
+#pragma warning(push)
+#pragma warning(disable : 4127 4548)
+	FD_SET(mSocket, &writefs);
+	FD_SET(mSocket, &exceptfs);
+#pragma warning(pop)
+	// split the millisecond timeout into whole seconds and microseconds
+	timeval timeout_;
+	timeout_.tv_sec = long(timeout / 1000);
+	timeout_.tv_usec = long(((timeout % 1000) * 1000));
+	int selret = ::select(1, NULL, &writefs, &exceptfs, &timeout_);
+	int excepted = FD_ISSET(mSocket, &exceptfs);
+	int canWrite = FD_ISSET(mSocket, &writefs);
+	// success requires exactly one ready socket, in the write set and not in
+	// the except set
+	if(selret != 1 || excepted || !canWrite)
+	{
+		disconnect();
+		return false;
+	}
+
+	// switch to the blocking mode the user asked for
+	setBlockingInternal(mSocket, mIsBlocking);
+
+	mIsConnected = true;
+	mPort = port;
+	mHost = host;
+	return true;
+}
+#ifdef PX_VC11
+#pragma warning(pop)
+#endif
+
+// Creates a TCP listen socket bound to `port` (host byte order) on all local
+// interfaces and starts listening on it.
+//
+// Returns true when the socket is bound and listening. Returns false when
+// Winsock was not initialized or any of socket/bind/listen fails; in that case
+// no listen socket is kept and the object is not in listen mode.
+bool SocketImpl::listen(uint16_t port)
+{
+	if(!mSocketLayerIntialized)
+		return false;
+
+	// a repeated listen() replaces the previous listen socket; close the old one
+	// first so its handle is not leaked
+	if(mListenSocket != INVALID_SOCKET)
+	{
+		closesocket(mListenSocket);
+		mListenSocket = INVALID_SOCKET;
+		mListenMode = false;
+	}
+
+	mListenSocket = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if(mListenSocket == INVALID_SOCKET)
+		return false;
+
+	sockaddr_in addr = { 0 };
+	addr.sin_family = AF_INET;
+	addr.sin_port = htons(port);
+	addr.sin_addr.s_addr = htonl(INADDR_ANY);
+
+	if(bind(mListenSocket, (sockaddr*)&addr, sizeof(addr)) != 0 || ::listen(mListenSocket, SOMAXCONN) != 0)
+	{
+		// do not keep a half-configured socket open, and do not report listen mode
+		closesocket(mListenSocket);
+		mListenSocket = INVALID_SOCKET;
+		mListenMode = false;
+		return false;
+	}
+
+	// only a socket that is really listening puts the object into listen mode
+	mListenMode = true;
+	return true;
+}
+
+// Accepts one pending connection on the listen socket. `block` selects
+// whether the accept call itself waits for a client.
+// Returns false when already connected, not in listen mode, or when no
+// client socket was obtained.
+bool SocketImpl::accept(bool block)
+{
+	if(!mListenMode || mIsConnected)
+		return false;
+
+	// the listen socket's blocking mode follows the caller's request
+	setBlockingInternal(mListenSocket, block);
+	const SOCKET clientSocket = ::accept(mListenSocket, 0, 0);
+	if(clientSocket == INVALID_SOCKET)
+		return false;
+
+	mSocket = clientSocket;
+	mIsConnected = true;
+	// force the mode to whatever the user set
+	setBlockingInternal(mSocket, mIsBlocking);
+
+	return true;
+}
+
+// Closes the listen socket and the connected socket (whichever exist) and
+// resets the connection state.
+void SocketImpl::disconnect()
+{
+	const bool hasListenSocket = (mListenSocket != INVALID_SOCKET);
+	if(hasListenSocket)
+	{
+		closesocket(mListenSocket);
+		mListenSocket = INVALID_SOCKET;
+	}
+
+	const bool hasClientSocket = (mSocket != INVALID_SOCKET);
+	if(hasClientSocket)
+	{
+		// WSASendDisconnect is called before the handle is closed
+		WSASendDisconnect(mSocket, NULL);
+		closesocket(mSocket);
+		mSocket = INVALID_SOCKET;
+	}
+
+	mHost = NULL;
+	mPort = 0;
+	mListenMode = false;
+	mIsConnected = false;
+}
+
+bool SocketImpl::nonBlockingTimeout() const
+{
+	return !mIsBlocking && WSAGetLastError() == WSAEWOULDBLOCK;
+}
+
+// should be cross-platform from here down
+
+// Records the requested blocking mode and, when a connection exists, applies
+// it to the connected socket right away. Does nothing if the mode is unchanged.
+void SocketImpl::setBlocking(bool blocking)
+{
+	if(blocking == mIsBlocking)
+		return;
+
+	mIsBlocking = blocking;
+	if(isConnected())
+		setBlockingInternal(mSocket, blocking);
+}
+
+// The unbuffered socket has nothing staged, so flushing always succeeds.
+bool SocketImpl::flush()
+{
+	const bool nothingPending = true;
+	return nothingPending;
+}
+
+// Sends up to `length` bytes and returns how many were accepted by send()
+// (0 on failure). Any failure other than a non-blocking "would block"
+// disconnects the socket.
+uint32_t SocketImpl::write(const uint8_t* data, uint32_t length)
+{
+	if(length == 0)
+		return 0;
+
+	const int sent = send(mSocket, (const char*)data, (int32_t)length, 0);
+	const bool failed = (sent <= 0);
+
+	if(failed && !nonBlockingTimeout())
+		disconnect();
+
+	return failed ? 0u : uint32_t(sent);
+}
+
+// Receives up to `length` bytes into data and returns how many recv()
+// delivered (0 on failure). Any failure other than a non-blocking
+// "would block" disconnects the socket.
+uint32_t SocketImpl::read(uint8_t* data, uint32_t length)
+{
+	if(length == 0)
+		return 0;
+
+	const int32_t received = recv(mSocket, (char*)data, (int32_t)length, 0);
+	const bool failed = (received <= 0);
+
+	if(failed && !nonBlockingTimeout())
+		disconnect();
+
+	return failed ? 0u : uint32_t(received);
+}
+
+// SocketImpl variant that stages outgoing data in a fixed-size buffer and
+// overrides write()/flush() accordingly.
+class BufferedSocketImpl : public SocketImpl
+{
+  public:
+	BufferedSocketImpl(bool isBlocking) : SocketImpl(isBlocking), mBufferPos(0) {}
+	virtual ~BufferedSocketImpl() {}
+
+	bool flush();
+	uint32_t write(const uint8_t* data, uint32_t length);
+
+  private:
+	uint32_t mBufferPos;                          // number of staged bytes at the front of mBuffer
+	uint8_t mBuffer[Socket::DEFAULT_BUFFER_SIZE]; // staging area for outgoing data
+};
+
+// Pushes the staged bytes to the socket, retrying while the socket stays
+// connected. The buffer is emptied afterwards in every case; the return value
+// says whether all staged bytes were actually sent.
+bool BufferedSocketImpl::flush()
+{
+	uint32_t flushed = 0;
+
+	while(flushed < mBufferPos && mIsConnected)
+	{
+		const uint32_t pending = mBufferPos - flushed;
+		flushed += SocketImpl::write(mBuffer + flushed, pending);
+	}
+
+	const bool complete = (flushed == mBufferPos);
+	mBufferPos = 0;
+	return complete;
+}
+
+// Stages `length` bytes from data in the internal buffer, sending the buffer
+// to the socket each time it becomes full.
+//
+// Returns the number of bytes consumed from data. This equals length unless a
+// full-buffer send was incomplete, in which case the function returns early
+// with the bytes consumed so far.
+uint32_t BufferedSocketImpl::write(const uint8_t* data, uint32_t length)
+{
+	uint32_t bytesWritten = 0;
+	// as long as the new data would fill the buffer, top it up and send it
+	while(mBufferPos + length >= Socket::DEFAULT_BUFFER_SIZE)
+	{
+		// currentChunk is the free space left in the buffer
+		uint32_t currentChunk = Socket::DEFAULT_BUFFER_SIZE - mBufferPos;
+		intrinsics::memCopy(mBuffer + mBufferPos, data + bytesWritten, currentChunk);
+		bytesWritten += (uint32_t)currentChunk; // for the user, this is consumed even if we fail to shove it down a
+		// non-blocking socket
+
+		// the buffer is full at this point, so the whole buffer is offered to the socket
+		uint32_t sent = SocketImpl::write(mBuffer, Socket::DEFAULT_BUFFER_SIZE);
+		// whatever was not sent remains staged
+		mBufferPos = Socket::DEFAULT_BUFFER_SIZE - sent;
+
+		if(sent < Socket::DEFAULT_BUFFER_SIZE) // non-blocking or error
+		{
+			// move the unsent tail to the front so the buffer stays contiguous
+			if(sent) // we can reasonably hope this is rare
+				intrinsics::memMove(mBuffer, mBuffer + sent, mBufferPos);
+
+			return bytesWritten;
+		}
+		length -= currentChunk;
+	}
+
+	// the remainder fits without filling the buffer: just stage it
+	if(length > 0)
+	{
+		intrinsics::memCopy(mBuffer + mBufferPos, data + bytesWritten, length);
+		bytesWritten += length;
+		mBufferPos += length;
+	}
+
+	return bytesWritten;
+}
+
+// Creates the implementation object: a BufferedSocketImpl when buffering is
+// requested, a plain SocketImpl otherwise.
+Socket::Socket(bool inIsBuffering, bool isBlocking)
+{
+	if(!inIsBuffering)
+	{
+		void* storage = PX_ALLOC(sizeof(SocketImpl), "SocketImpl");
+		mImpl = PX_PLACEMENT_NEW(storage, SocketImpl)(isBlocking);
+		return;
+	}
+
+	void* storage = PX_ALLOC(sizeof(BufferedSocketImpl), "BufferedSocketImpl");
+	mImpl = PX_PLACEMENT_NEW(storage, BufferedSocketImpl)(isBlocking);
+}
+
+// Flushes pending data, disconnects, then destroys and frees the
+// implementation object.
+Socket::~Socket()
+{
+	SocketImpl* const impl = mImpl;
+
+	impl->flush();
+	impl->disconnect();
+
+	// virtual destructor: also runs the derived destructor for a buffered socket
+	impl->~SocketImpl();
+	PX_FREE(mImpl);
+}
+
+// Forwards the connection attempt to the implementation object.
+bool Socket::connect(const char* host, uint16_t port, uint32_t timeout)
+{
+	const bool connected = mImpl->connect(host, port, timeout);
+	return connected;
+}
+
+// Forwards to the implementation object's listen().
+bool Socket::listen(uint16_t port)
+{
+	const bool listening = mImpl->listen(port);
+	return listening;
+}
+
+// Forwards to the implementation object's accept().
+bool Socket::accept(bool block)
+{
+	const bool accepted = mImpl->accept(block);
+	return accepted;
+}
+
+// Forwards to the implementation object's disconnect().
+void Socket::disconnect()
+{
+	SocketImpl* const impl = mImpl;
+	impl->disconnect();
+}
+
+// Reports the implementation object's connection flag.
+bool Socket::isConnected() const
+{
+	const bool connected = mImpl->isConnected();
+	return connected;
+}
+
+// Returns the host pointer stored by the implementation object.
+const char* Socket::getHost() const
+{
+	const char* const host = mImpl->getHost();
+	return host;
+}
+
+// Returns the port stored by the implementation object.
+uint16_t Socket::getPort() const
+{
+	const uint16_t port = mImpl->getPort();
+	return port;
+}
+
+// Flushes staged data. Fails immediately when there is no connection.
+bool Socket::flush()
+{
+	return mImpl->isConnected() && mImpl->flush();
+}
+
+// Writes through the implementation object; returns 0 without touching it
+// when there is no connection.
+uint32_t Socket::write(const uint8_t* data, uint32_t length)
+{
+	return mImpl->isConnected() ? mImpl->write(data, length) : 0;
+}
+
+// Reads through the implementation object; returns 0 without touching it
+// when there is no connection.
+uint32_t Socket::read(uint8_t* data, uint32_t length)
+{
+	return mImpl->isConnected() ? mImpl->read(data, length) : 0;
+}
+
+// Forwards the requested blocking mode to the implementation object.
+void Socket::setBlocking(bool blocking)
+{
+	SocketImpl* const impl = mImpl;
+	impl->setBlocking(blocking);
+}
+
+// Reports the blocking mode recorded by the implementation object.
+bool Socket::isBlocking() const
+{
+	const bool blocking = mImpl->isBlocking();
+	return blocking;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsSync.cpp b/PxShared/src/foundation/src/windows/PsWindowsSync.cpp
new file mode 100644
index 0000000..5ce8122
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsSync.cpp
@@ -0,0 +1,82 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "windows/PsWindowsInclude.h"
+#include "PsUserAllocated.h"
+#include "PsSync.h"
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+HANDLE& getSync(SyncImpl* impl) // Gives access to the event HANDLE stored in the SyncImpl's own bytes.
+{
+	return *reinterpret_cast<HANDLE*>(impl); // the object storage is treated as a HANDLE; SyncImpl::getSize() reports sizeof(HANDLE)
+}
+}
+
+static const uint32_t gSize = sizeof(HANDLE); // bytes needed for a SyncImpl: just the event handle
+const uint32_t& SyncImpl::getSize() // Returns a reference to the constant holding the storage size of a SyncImpl.
+{
+	return gSize;
+}
+
+SyncImpl::SyncImpl() // Creates the Win32 event and stores its handle in this object.
+{
+	getSync(this) = CreateEvent(0, true, false, 0); // default security, manual-reset, initially non-signalled, unnamed; the result is not checked
+}
+
+SyncImpl::~SyncImpl() // Closes the event handle created by the constructor.
+{
+	CloseHandle(getSync(this));
+}
+
+void SyncImpl::reset() // Puts the event back into the non-signalled state (ResetEvent).
+{
+	ResetEvent(getSync(this));
+}
+
+void SyncImpl::set() // Signals the event (SetEvent).
+{
+	SetEvent(getSync(this));
+}
+
+// Waits for the event for at most 'milliseconds'.
+// A value of uint32_t(-1) stands for "no timeout" and is passed on as INFINITE.
+// Returns true only when WaitForSingleObject reports WAIT_OBJECT_0.
+bool SyncImpl::wait(uint32_t milliseconds)
+{
+	const DWORD timeout = (milliseconds == uint32_t(-1)) ? INFINITE : DWORD(milliseconds);
+	const DWORD waitResult = WaitForSingleObject(getSync(this), timeout);
+	return waitResult == WAIT_OBJECT_0;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsThread.cpp b/PxShared/src/foundation/src/windows/PsWindowsThread.cpp
new file mode 100644
index 0000000..18ad5ee
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsThread.cpp
@@ -0,0 +1,405 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "windows/PsWindowsInclude.h"
+#include "PsFoundation.h"
+#include "PsThread.h"
+#include "foundation/PxErrorCallback.h"
+#include "foundation/PxAssert.h"
+
+// an exception for setting the thread name in Microsoft debuggers
+#define NS_MS_VC_EXCEPTION 0x406D1388
+
+namespace physx
+{
+namespace shdfnd
+{
+
+namespace
+{
+
+#if PX_VC
+#pragma warning(disable : 4061) // enumerator 'identifier' in switch of enum 'enumeration' is not handled
+#pragma warning(disable : 4191) //'operator/operation' : unsafe conversion from 'type of expression' to 'type required'
+#endif
+
+// struct for naming a thread in the debugger
+#pragma pack(push, 8)
+
+typedef struct tagTHREADNAME_INFO // payload of the NS_MS_VC_EXCEPTION raised in ThreadImpl::setName()
+{
+	DWORD dwType;     // Must be 0x1000.
+	LPCSTR szName;    // Pointer to name (in user addr space).
+	DWORD dwThreadID; // Thread ID (-1=caller thread).
+	DWORD dwFlags;    // Reserved for future use, must be zero.
+} THREADNAME_INFO;
+
+#pragma pack(pop)
+
+class _ThreadImpl // Plain data laid over the ThreadImpl storage; reached through getThread().
+{
+  public:
+	enum State // life cycle of the OS thread as tracked by this wrapper
+	{
+		NotStarted, // set by the constructors and restored when start() fails
+		Started,    // set by start() before the OS thread is created
+		Stopped     // set by quit() and kill()
+	};
+
+	HANDLE thread; // handle returned by CreateThread; NULL until start() creates the thread
+	LONG quitNow; // Should be 32bit aligned on SMP systems.
+	State state;
+	DWORD threadID; // filled in by CreateThread inside start()
+
+	ThreadImpl::ExecuteFn fn; // optional function executed by PxThreadStart
+	void* arg; // argument passed to fn, or the Runnable to execute when fn is NULL
+
+	uint32_t affinityMask; // 0 means no affinity has been requested
+};
+
+_ThreadImpl* getThread(ThreadImpl* impl) // Views the ThreadImpl storage as the _ThreadImpl data kept in it.
+{
+	return reinterpret_cast<_ThreadImpl*>(impl); // ThreadImpl::getSize() reports sizeof(_ThreadImpl) for this storage
+}
+
+// Thread entry point handed to CreateThread; 'arg' is the owning ThreadImpl.
+// Runs the user function when one was supplied, otherwise executes the Runnable
+// stored as the thread argument (if any). Always returns 0 as the thread exit code.
+DWORD WINAPI PxThreadStart(LPVOID arg)
+{
+	_ThreadImpl* self = getThread(static_cast<ThreadImpl*>(arg));
+
+	if(self->fn)
+	{
+		// explicit function: the stored argument is passed straight through
+		(*self->fn)(self->arg);
+		return 0;
+	}
+
+	// no function: the stored argument, when present, is the Runnable to execute
+	if(self->arg)
+		static_cast<Runnable*>(self->arg)->execute();
+
+	return 0;
+}
+
+// cached physical core count (0 = not queried yet)
+uint32_t gPhysicalCoreCount = 0;
+}
+
+static const uint32_t gSize = sizeof(_ThreadImpl); // bytes needed for a ThreadImpl: the _ThreadImpl data
+const uint32_t& ThreadImpl::getSize() // Returns a reference to the constant holding the storage size of a ThreadImpl.
+{
+	return gSize;
+}
+
+ThreadImpl::Id ThreadImpl::getId() // Returns the Win32 thread id of the calling thread.
+{
+	return static_cast<Id>(GetCurrentThreadId()); // a thread id, not a thread handle
+}
+
+// function pointer type for GetLogicalProcessorInformation(), which is resolved at run time
+typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
+
+// Returns the number of physical processor cores, or 0 when it cannot be determined.
+// The value is computed on first use with GetLogicalProcessorInformation() and cached in
+// gPhysicalCoreCount. A failed query is not cached, so the next call tries again.
+uint32_t ThreadImpl::getNbPhysicalCores()
+{
+	if(gPhysicalCoreCount)
+		return gPhysicalCoreCount;
+
+	// modified example code from: http://msdn.microsoft.com/en-us/library/ms683194
+	LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
+	if(NULL == glpi)
+	{
+		// GetLogicalProcessorInformation not supported on OS < XP Service Pack 3
+		return 0;
+	}
+
+	// first query reports required buffer space; it is expected to fail with
+	// ERROR_INSUFFICIENT_BUFFER. GetLastError() is only meaningful after a failed call,
+	// so an unexpected success is treated as an error as well.
+	DWORD returnLength = 0;
+	const BOOL sizeQuerySucceeded = glpi(NULL, &returnLength);
+	PX_ASSERT(!sizeQuerySucceeded);
+	if(sizeQuerySucceeded || GetLastError() != ERROR_INSUFFICIENT_BUFFER)
+	{
+		physx::shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__,
+		                                     "Error querying buffer size for number of physical processors");
+		return 0;
+	}
+
+	// stack allocation; it stays valid until this function returns
+	PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)PxAlloca(returnLength);
+
+	// retrieve data; a Win32 BOOL signals success with any non-zero value, so test for zero
+	// instead of comparing against TRUE
+	if(!glpi(buffer, &returnLength))
+	{
+		physx::shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__,
+		                                     "Error querying number of physical processors");
+		return 0;
+	}
+
+	// every complete record with the RelationProcessorCore relationship counts as one core
+	const DWORD recordCount = DWORD(returnLength / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
+	DWORD processorCoreCount = 0;
+	for(DWORD i = 0; i < recordCount; ++i)
+	{
+		if(buffer[i].Relationship == RelationProcessorCore)
+			++processorCoreCount;
+	}
+
+	gPhysicalCoreCount = processorCoreCount;
+	return gPhysicalCoreCount;
+}
+
+// Creates an idle thread object: no function, no argument, no affinity.
+// No OS thread exists until start() is called.
+ThreadImpl::ThreadImpl()
+{
+	_ThreadImpl* self = getThread(this);
+
+	self->thread = NULL;
+	self->state = _ThreadImpl::NotStarted;
+	self->quitNow = 0;
+	// setName() reads threadID even when start() has not run; give it a defined value
+	// (0 is never handed out as a thread id) instead of leaving the storage indeterminate
+	self->threadID = 0;
+	self->fn = NULL;
+	self->arg = NULL;
+	self->affinityMask = 0;
+}
+
+// Creates a thread object that runs fn(arg) and starts it immediately with a stack size
+// argument of 0 and no Runnable.
+ThreadImpl::ThreadImpl(ExecuteFn fn, void* arg)
+{
+	_ThreadImpl* self = getThread(this);
+
+	self->thread = NULL;
+	self->state = _ThreadImpl::NotStarted;
+	self->quitNow = 0;
+	// defined value in case CreateThread fails inside start() and never writes the id;
+	// setName() would otherwise read indeterminate storage
+	self->threadID = 0;
+	self->fn = fn;
+	self->arg = arg;
+	self->affinityMask = 0;
+
+	start(0, NULL);
+}
+
+// Kills the thread when it is still marked as started and releases the thread handle.
+ThreadImpl::~ThreadImpl()
+{
+	_ThreadImpl* self = getThread(this);
+
+	if(self->state == _ThreadImpl::Started)
+		kill();
+
+	// the handle is still NULL when start() was never called or CreateThread failed;
+	// CloseHandle must only be given a handle that was actually opened
+	if(self->thread)
+	{
+		CloseHandle(self->thread);
+		self->thread = NULL;
+	}
+}
+
+// Creates and launches the OS thread. Does nothing unless the state is NotStarted.
+// stackSize is passed to CreateThread. runnable becomes the thread argument only when
+// neither a function nor an argument has been stored already.
+// When creating or resuming the thread fails, an error is reported through the foundation
+// and the state goes back to NotStarted.
+void ThreadImpl::start(uint32_t stackSize, Runnable* runnable)
+{
+	_ThreadImpl* self = getThread(this);
+
+	if(self->state != _ThreadImpl::NotStarted)
+		return;
+	self->state = _ThreadImpl::Started;
+
+	const bool hasStoredWork = self->arg || self->fn;
+	if(runnable && !hasStoredWork)
+		self->arg = runnable;
+
+	// the thread is created suspended so that the affinity can be applied before it runs
+	self->thread = CreateThread(NULL, stackSize, PxThreadStart, (LPVOID) this, CREATE_SUSPENDED, &self->threadID);
+	if(!self->thread)
+	{
+		physx::shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__,
+			                                    "PsWindowsThread::start: Failed to create thread.");
+		self->state = _ThreadImpl::NotStarted;
+		return;
+	}
+
+	// set affinity and resume
+	if(self->affinityMask)
+		setAffinityMask(self->affinityMask);
+
+	if(ResumeThread(self->thread) == DWORD(-1))
+	{
+		physx::shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__,
+			                                    "PsWindowsThread::start: Failed to resume thread.");
+		self->state = _ThreadImpl::NotStarted;
+	}
+}
+
+void ThreadImpl::signalQuit() // Raises the quit flag that quitIsSignalled() tests.
+{
+	InterlockedIncrement(&(getThread(this)->quitNow)); // atomic increment; any non-zero value counts as "quit requested"
+}
+
+bool ThreadImpl::waitForQuit() // Blocks until the thread handle is signalled; returns false when the thread was never started.
+{
+	if(getThread(this)->state == _ThreadImpl::NotStarted)
+		return false;
+
+	WaitForSingleObject(getThread(this)->thread, INFINITE); // no timeout; the wait result is not inspected
+	return true;
+}
+
+bool ThreadImpl::quitIsSignalled() // Returns true once signalQuit() has raised the quit flag.
+{
+	return InterlockedCompareExchange(&(getThread(this)->quitNow), 0, 0) != 0; // compare-exchange of 0 with 0 leaves the value unchanged and returns it atomically
+}
+
+void ThreadImpl::quit() // Marks the thread as stopped and ends the calling thread.
+{
+	getThread(this)->state = _ThreadImpl::Stopped;
+	ExitThread(0); // ends the *calling* thread with exit code 0; presumably meant to be called from the thread itself - confirm
+}
+
+void ThreadImpl::kill() // Forcibly terminates the thread if it is marked as started, then marks it as stopped.
+{
+	if(getThread(this)->state == _ThreadImpl::Started)
+		TerminateThread(getThread(this)->thread, 0); // exit code 0; the result is not checked
+	getThread(this)->state = _ThreadImpl::Stopped; // set unconditionally, even when nothing was terminated
+}
+
+void ThreadImpl::sleep(uint32_t ms) // Suspends the calling thread for 'ms' milliseconds (Win32 Sleep).
+{
+	Sleep(ms);
+}
+
+void ThreadImpl::yield() // Offers the rest of the calling thread's time slice to another ready thread (Win32 SwitchToThread).
+{
+	SwitchToThread();
+}
+
+// Remembers 'mask' as the thread's affinity and applies it right away when the thread is
+// already started. A zero mask is ignored.
+// Returns the value reported by SetThreadAffinityMask (converted to 32 bits) when the mask
+// was applied, 0 in every other case.
+uint32_t ThreadImpl::setAffinityMask(uint32_t mask)
+{
+	if(!mask)
+		return 0;
+
+	_ThreadImpl* self = getThread(this);
+
+	// store affinity; start() applies the stored value to a freshly created thread
+	self->affinityMask = mask;
+
+	if(self->state != _ThreadImpl::Started)
+		return 0;
+
+	// thread already started: apply immediately
+	return uint32_t(SetThreadAffinityMask(self->thread, mask));
+}
+
+void ThreadImpl::setName(const char* name) // Announces 'name' for this thread to an attached Microsoft debugger.
+{
+	THREADNAME_INFO info;
+	info.dwType = 0x1000;
+	info.szName = name;
+	info.dwThreadID = getThread(this)->threadID; // NOTE(review): holds the real thread id only after start() has created the thread
+	info.dwFlags = 0;
+
+	// C++ Exceptions are disabled for this project, but SEH is not (and cannot be)
+	// http://stackoverflow.com/questions/943087/what-exactly-will-happen-if-i-disable-c-exceptions-in-a-project
+	__try
+	{
+		RaiseException(NS_MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info); // the argument count is given in ULONG_PTR units
+	}
+	__except(EXCEPTION_EXECUTE_HANDLER)
+	{
+		// this runs if not attached to a debugger (thus not really naming the thread)
+	}
+}
+
+// Translates 'prio' into the matching THREAD_PRIORITY_* value and applies it to this thread.
+// An error is reported through the foundation when SetThreadPriority fails or when 'prio'
+// is not one of the handled values.
+void ThreadImpl::setPriority(ThreadPriority::Enum prio)
+{
+	int nativePriority = THREAD_PRIORITY_NORMAL;
+	bool isHandled = true;
+
+	switch(prio)
+	{
+	case ThreadPriority::eHIGH:
+		nativePriority = THREAD_PRIORITY_HIGHEST;
+		break;
+	case ThreadPriority::eABOVE_NORMAL:
+		nativePriority = THREAD_PRIORITY_ABOVE_NORMAL;
+		break;
+	case ThreadPriority::eNORMAL:
+		nativePriority = THREAD_PRIORITY_NORMAL;
+		break;
+	case ThreadPriority::eBELOW_NORMAL:
+		nativePriority = THREAD_PRIORITY_BELOW_NORMAL;
+		break;
+	case ThreadPriority::eLOW:
+		nativePriority = THREAD_PRIORITY_LOWEST;
+		break;
+	default:
+		isHandled = false;
+		break;
+	}
+
+	// unhandled values never reach the OS; they take the same error path as a failed call
+	const BOOL succeeded = isHandled ? SetThreadPriority(getThread(this)->thread, nativePriority) : FALSE;
+	if(!succeeded)
+	{
+		physx::shdfnd::getFoundation().error(PxErrorCode::eINTERNAL_ERROR, __FILE__, __LINE__,
+			"PsWindowsThread::setPriority: Failed to set thread priority.");
+	}
+}
+
+// Returns the scheduling priority of the thread identified by 'threadId', mapped onto
+// ThreadPriority. 'threadId' is a thread id as produced by getId(), not a handle, so a
+// query handle is opened for the duration of the call.
+// When the thread cannot be opened or queried, ThreadPriority::eNORMAL is returned.
+ThreadPriority::Enum ThreadImpl::getPriority(Id threadId)
+{
+	// GetThreadPriority needs a handle with THREAD_QUERY_INFORMATION access; casting the
+	// id to HANDLE does not produce one
+	HANDLE handle = OpenThread(THREAD_QUERY_INFORMATION, FALSE, static_cast<DWORD>(threadId));
+	if(!handle)
+		return ThreadPriority::eNORMAL;
+
+	const int priority = GetThreadPriority(handle);
+	CloseHandle(handle);
+
+	// the failure value is a large positive number and would otherwise be classified as eHIGH
+	if(priority == THREAD_PRIORITY_ERROR_RETURN)
+		return ThreadPriority::eNORMAL;
+
+	PX_COMPILE_TIME_ASSERT(THREAD_PRIORITY_HIGHEST > THREAD_PRIORITY_ABOVE_NORMAL);
+	if(priority >= THREAD_PRIORITY_HIGHEST)
+		return ThreadPriority::eHIGH;
+	if(priority >= THREAD_PRIORITY_ABOVE_NORMAL)
+		return ThreadPriority::eABOVE_NORMAL;
+	if(priority >= THREAD_PRIORITY_NORMAL)
+		return ThreadPriority::eNORMAL;
+	if(priority >= THREAD_PRIORITY_BELOW_NORMAL)
+		return ThreadPriority::eBELOW_NORMAL;
+	return ThreadPriority::eLOW;
+}
+
+uint32_t TlsAlloc() // Allocates a thread-local-storage index through the Win32 ::TlsAlloc and returns it.
+{
+	DWORD rv = ::TlsAlloc();
+	PX_ASSERT(rv != TLS_OUT_OF_INDEXES); // failure is only caught by this assert; the value is returned either way
+	return (uint32_t)rv;
+}
+
+void TlsFree(uint32_t index) // Releases a thread-local-storage index through the Win32 ::TlsFree; the result is ignored.
+{
+	::TlsFree(index);
+}
+
+void* TlsGet(uint32_t index) // Returns the calling thread's value stored in TLS slot 'index' (::TlsGetValue).
+{
+	return ::TlsGetValue(index);
+}
+
+uint32_t TlsSet(uint32_t index, void* value) // Stores 'value' in the calling thread's TLS slot 'index' (::TlsSetValue).
+{
+	return (uint32_t)::TlsSetValue(index, value); // the BOOL result of ::TlsSetValue converted to uint32_t
+}
+
+// Returns the default thread stack size in bytes: 1 MiB.
+uint32_t ThreadImpl::getDefaultStackSize()
+{
+	const uint32_t oneMebibyte = 1024u * 1024u;
+	return oneMebibyte;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/foundation/src/windows/PsWindowsTime.cpp b/PxShared/src/foundation/src/windows/PsWindowsTime.cpp
new file mode 100644
index 0000000..2c1e435
--- /dev/null
+++ b/PxShared/src/foundation/src/windows/PsWindowsTime.cpp
@@ -0,0 +1,101 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PsTime.h"
+#include "windows/PsWindowsInclude.h"
+
+namespace
+{
+int64_t getTimeTicks() // Returns the current value of the Win32 performance counter, in ticks.
+{
+	LARGE_INTEGER a;
+	QueryPerformanceCounter(&a); // the call's result is not checked
+	return a.QuadPart;
+}
+
+// Returns the length of one performance-counter tick in seconds, i.e. the reciprocal of
+// the frequency reported by QueryPerformanceFrequency.
+double getTickDuration()
+{
+	LARGE_INTEGER frequency;
+	QueryPerformanceFrequency(&frequency);
+
+	const double ticksPerSecond = double(frequency.QuadPart);
+	return 1.0 / ticksPerSecond;
+}
+
+double sTickDuration = getTickDuration(); // seconds per counter tick, computed once during static initialisation
+} // namespace
+
+namespace physx
+{
+namespace shdfnd
+{
+
+static const CounterFrequencyToTensOfNanos gCounterFreq = Time::getCounterFrequency(); // captured once during static initialisation
+
+const CounterFrequencyToTensOfNanos& Time::getBootCounterFrequency() // Returns the counter-frequency conversion captured at static initialisation.
+{
+	return gCounterFreq;
+}
+
+CounterFrequencyToTensOfNanos Time::getCounterFrequency() // Queries the performance-counter frequency and wraps it in a CounterFrequencyToTensOfNanos.
+{
+	LARGE_INTEGER freq;
+	QueryPerformanceFrequency(&freq); // the call's result is not checked
+	return CounterFrequencyToTensOfNanos(Time::sNumTensOfNanoSecondsInASecond, (uint64_t)freq.QuadPart);
+}
+
+uint64_t Time::getCurrentCounterValue() // Returns the current performance-counter value as an unsigned tick count.
+{
+	LARGE_INTEGER ticks;
+	QueryPerformanceCounter(&ticks); // the call's result is not checked
+	return (uint64_t)ticks.QuadPart;
+}
+
+Time::Time() : mTickCount(0) // Starts the timer at the current counter value.
+{
+	getElapsedSeconds(); // called for its side effect of storing the current tick count; the returned value is discarded
+}
+
+// Returns the seconds elapsed since the stored tick count was last updated and moves the
+// stored tick count to the current counter value.
+Time::Second Time::getElapsedSeconds()
+{
+	const int64_t previousTicks = mTickCount;
+	mTickCount = getTimeTicks();
+
+	const Second elapsed = (mTickCount - previousTicks) * sTickDuration;
+	return elapsed;
+}
+
+Time::Second Time::peekElapsedSeconds() // Returns the seconds elapsed since the stored tick count, without updating it.
+{
+	return (getTimeTicks() - mTickCount) * sTickDuration;
+}
+
+Time::Second Time::getLastTime() const // Returns the stored tick count converted to seconds.
+{
+	return mTickCount * sTickDuration;
+}
+
+} // namespace shdfnd
+} // namespace physx
diff --git a/PxShared/src/pvd/include/PsPvd.h b/PxShared/src/pvd/include/PsPvd.h
new file mode 100644
index 0000000..1f0ae79
--- /dev/null
+++ b/PxShared/src/pvd/include/PsPvd.h
@@ -0,0 +1,85 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PSPVD_H
+#define PXPVDSDK_PSPVD_H
+
+/** \addtogroup pvd
+@{
+*/
+#include "pvd/PxPvd.h"
+#include "PsBroadcast.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+#endif
+
+class PxPvdTransport;
+
+#if !PX_DOXYGEN
+namespace pvdsdk
+{
+#endif
+
+class PvdDataStream;
+class PvdClient;
+class PvdOMMetaDataProvider;
+
+// PsPvd is intended for advanced users; it supports the custom PVD client API.
+class PsPvd : public physx::PxPvd, public shdfnd::AllocationListener
+{
+  public:
+	virtual void addClient(PvdClient* client) = 0; // presumably attaches a client to this PVD instance - confirm against the implementation
+	virtual void removeClient(PvdClient* client) = 0; // presumably the inverse of addClient - confirm against the implementation
+	
+	virtual bool registerObject(const void* inItem) = 0;
+	virtual bool unRegisterObject(const void* inItem) = 0;
+
+	// AllocationListener interface, declared pure again so that implementations must provide it
+	void onAllocation(size_t size, const char* typeName, const char* filename, int line, void* allocatedMemory) = 0;
+	void onDeallocation(void* addr) = 0;
+
+	virtual PvdOMMetaDataProvider& getMetaDataProvider() = 0;
+	
+	virtual uint64_t getNextStreamId() = 0;
+	// Call to flush events to PVD
+	virtual void flush() = 0;
+
+};
+
+PX_PVDSDK_API void PX_CALL_CONV SetPvdAllocatorCallback(PxAllocatorCallback* inAllocatorCallback);
+
+#if !PX_DOXYGEN
+} // namespace pvdsdk
+} // namespace physx
+#endif
+
+/** @} */
+#endif // PXPVDSDK_PSPVD_H
diff --git a/PxShared/src/pvd/include/PxProfileAllocatorWrapper.h b/PxShared/src/pvd/include/PxProfileAllocatorWrapper.h
new file mode 100644
index 0000000..22903ec
--- /dev/null
+++ b/PxShared/src/pvd/include/PxProfileAllocatorWrapper.h
@@ -0,0 +1,231 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PXPVDSDK_PXPROFILEALLOCATORWRAPPER_H
+#define PXPVDSDK_PXPROFILEALLOCATORWRAPPER_H
+
+#include "foundation/PxPreprocessor.h"
+#include "foundation/PxAllocatorCallback.h"
+#include "foundation/PxErrorCallback.h"
+#include "foundation/PxAssert.h"
+
+#include "PsArray.h"
+#include "PsHashMap.h"
+
+namespace physx { namespace profile {
+
+	/**
+	\brief Helper struct to encapsulate the user allocator callback
+	Useful for array and hash templates
+
+	The wrapper only stores a pointer to the callback; it does not own it.
+	*/
+	struct PxProfileAllocatorWrapper
+	{
+		PxAllocatorCallback*			mUserAllocator; // may be NULL when built from a pointer; see getAllocator()
+
+		PxProfileAllocatorWrapper( PxAllocatorCallback& inUserAllocator ) // wraps an existing callback by reference
+			: mUserAllocator( &inUserAllocator )
+		{
+		}
+
+		PxProfileAllocatorWrapper( PxAllocatorCallback* inUserAllocator ) // wraps a callback pointer as given; no NULL check here
+			: mUserAllocator( inUserAllocator )
+		{
+		}
+
+		PxAllocatorCallback&		getAllocator() const // returns the wrapped callback
+		{
+			PX_ASSERT( NULL != mUserAllocator ); // a NULL callback is only caught by this assert
+			return *mUserAllocator;
+		}
+	};
+
+	/**
+	\brief Helper class to encapsulate the reflection allocator
+
+	Allocations are forwarded to the callback held by a PxProfileAllocatorWrapper. In checked
+	builds each allocation is tagged with a name derived from T; other configurations pass a
+	fixed placeholder name. Only a pointer to the wrapper is stored, so the wrapper must
+	outlive the allocator and its copies.
+	*/
+	template <typename T>
+	class PxProfileWrapperReflectionAllocator
+	{
+		/** \brief Returns the allocation name used for T. */
+		static const char* getName()
+		{
+#if PX_LINUX || PX_ANDROID || PX_PS4 || PX_IOS || PX_OSX || PX_EMSCRIPTEN || PX_NX
+			return __PRETTY_FUNCTION__;
+#else
+			return typeid(T).name();
+#endif
+		}
+		PxProfileAllocatorWrapper* mWrapper;
+
+	public:
+		PxProfileWrapperReflectionAllocator(PxProfileAllocatorWrapper& inWrapper) : mWrapper( &inWrapper )	{}
+		PxProfileWrapperReflectionAllocator( const PxProfileWrapperReflectionAllocator& inOther )
+			: mWrapper( inOther.mWrapper )
+		{
+		}
+		PxProfileWrapperReflectionAllocator& operator=( const PxProfileWrapperReflectionAllocator& inOther )
+		{
+			mWrapper = inOther.mWrapper;
+			return *this;
+		}
+		PxAllocatorCallback& getAllocator() { return mWrapper->getAllocator(); }
+
+		/**
+		\brief Allocates size bytes from the wrapped callback.
+		\return The callback's result, or 0 when size is 0.
+		*/
+		void* allocate(size_t size, const char* filename, int line)
+		{
+			// zero-sized requests never reach the user allocator, whatever the build
+			// configuration; this matches PxProfileWrapperNamedAllocator::allocate
+			if(!size)
+				return 0;
+#if PX_CHECKED // checked and debug builds
+			return getAllocator().allocate(size, getName(), filename, line);
+#else
+			return getAllocator().allocate(size, "<no allocation names in this config>", filename, line);
+#endif
+		}
+
+		/** \brief Returns ptr to the wrapped callback; a null ptr is ignored. */
+		void deallocate(void* ptr)
+		{
+			if(ptr)
+				getAllocator().deallocate(ptr);
+		}
+	};
+
+	/**
+	\brief Helper class to encapsulate the named allocator
+
+	Forwards allocations to the callback held by a PxProfileAllocatorWrapper and tags each
+	of them with one fixed name. Neither the wrapper nor the name string is owned.
+	*/
+	struct PxProfileWrapperNamedAllocator
+	{
+		PxProfileAllocatorWrapper*	mWrapper;
+		const char*			mAllocationName;
+
+		PxProfileWrapperNamedAllocator(PxProfileAllocatorWrapper& inWrapper, const char* inAllocationName)
+			: mWrapper( &inWrapper )
+			, mAllocationName( inAllocationName )
+		{
+		}
+
+		PxProfileWrapperNamedAllocator( const PxProfileWrapperNamedAllocator& inSource )
+			: mWrapper( inSource.mWrapper )
+			, mAllocationName( inSource.mAllocationName )
+		{
+		}
+
+		PxProfileWrapperNamedAllocator& operator=( const PxProfileWrapperNamedAllocator& inSource )
+		{
+			mWrapper = inSource.mWrapper;
+			mAllocationName = inSource.mAllocationName;
+			return *this;
+		}
+
+		PxAllocatorCallback& getAllocator()
+		{
+			return mWrapper->getAllocator();
+		}
+
+		/** \brief Allocates size bytes under the stored name; returns 0 for a zero size. */
+		void* allocate(size_t size, const char* filename, int line)
+		{
+			return size ? getAllocator().allocate(size, mAllocationName, filename, line) : 0;
+		}
+
+		/** \brief Hands ptr back to the wrapped callback; a null ptr is ignored. */
+		void deallocate(void* ptr)
+		{
+			if(!ptr)
+				return;
+			getAllocator().deallocate(ptr);
+		}
+	};
+
+	/**
+	\brief Helper struct to encapsulate the array
+
+	A shdfnd::Array<T> whose allocator is PxProfileWrapperReflectionAllocator<T>, so element
+	storage is requested through a PxProfileAllocatorWrapper.
+	*/
+	template<class T>
+	struct PxProfileArray : public shdfnd::Array<T, PxProfileWrapperReflectionAllocator<T> >
+	{
+		typedef PxProfileWrapperReflectionAllocator<T> TAllocatorType;
+
+		PxProfileArray( PxProfileAllocatorWrapper& inWrapper ) // empty array that allocates through inWrapper
+			: shdfnd::Array<T, TAllocatorType >( TAllocatorType( inWrapper ) )
+		{
+		}
+
+		PxProfileArray( const PxProfileArray< T >& inOther ) // copy constructor
+			: shdfnd::Array<T, TAllocatorType >( inOther, inOther ) // inOther is passed for both base-constructor arguments; presumably (source array, allocator) - confirm against shdfnd::Array
+		{
+		}
+	};
+
+	/**
+	\brief Helper struct to encapsulate the hash map
+
+	A shdfnd::HashMap whose allocator is PxProfileWrapperReflectionAllocator<TValueType>, so
+	its storage is requested through a PxProfileAllocatorWrapper.
+	*/
+	template<typename TKeyType, typename TValueType, typename THashType=shdfnd::Hash<TKeyType> >
+	struct PxProfileHashMap : public shdfnd::HashMap<TKeyType, TValueType, THashType, PxProfileWrapperReflectionAllocator< TValueType > >
+	{
+		typedef shdfnd::HashMap<TKeyType, TValueType, THashType, PxProfileWrapperReflectionAllocator< TValueType > > THashMapType;
+		typedef PxProfileWrapperReflectionAllocator<TValueType> TAllocatorType;
+		PxProfileHashMap( PxProfileAllocatorWrapper& inWrapper ) // empty map that allocates through inWrapper
+			: THashMapType( TAllocatorType( inWrapper ) )
+		{
+		}
+	};
+
+	/**
+	\brief Helper function to encapsulate the profile allocation
+
+	Requests sizeof(TDataType) bytes from inAllocator through a temporary
+	PxProfileWrapperReflectionAllocator<TDataType>. The memory is returned as it comes from
+	the allocator; no constructor is run.
+	*/
+	template<typename TDataType>
+	inline TDataType* PxProfileAllocate( PxAllocatorCallback* inAllocator, const char* file, int inLine )
+	{
+		PxProfileAllocatorWrapper allocatorWrapper( inAllocator );
+		PxProfileWrapperReflectionAllocator< TDataType > typedAllocator( allocatorWrapper );
+		void* rawMemory = typedAllocator.allocate( sizeof( TDataType ), file, inLine );
+		return reinterpret_cast<TDataType*>( rawMemory );
+	}
+
+	/**
+	\brief Helper function to encapsulate the profile allocation
+
+	Reference overload; forwards to the pointer overload with the address of inAllocator.
+	*/
+	template<typename TDataType>
+	inline TDataType* PxProfileAllocate( PxAllocatorCallback& inAllocator, const char* file, int inLine )
+	{
+		return PxProfileAllocate<TDataType>( &inAllocator, file, inLine );
+	}
+
+	/**
+	\brief Helper function to encapsulate the profile deallocation
+
+	Runs the destructor of inDType and then returns its memory to the callback held by
+	inAllocator.
+	*/
+	template<typename TDataType>
+	inline void PxProfileDeleteAndDeallocate( PxProfileAllocatorWrapper& inAllocator, TDataType* inDType )
+	{
+		PX_ASSERT(inDType); // a null object is only caught by this assert
+		PxAllocatorCallback& allocator( inAllocator.getAllocator() );
+		inDType->~TDataType();
+		allocator.deallocate( inDType );
+	}
+
+	/**
+	\brief Helper function to encapsulate the profile deallocation
+
+	Callback overload; wraps inAllocator in a temporary PxProfileAllocatorWrapper and forwards
+	to the wrapper overload.
+	*/
+	template<typename TDataType>
+	inline void PxProfileDeleteAndDeallocate( PxAllocatorCallback& inAllocator, TDataType* inDType )
+	{
+		PxProfileAllocatorWrapper wrapper( &inAllocator );
+		PxProfileDeleteAndDeallocate( wrapper, inDType );
+	}
+
+} }
+
+#define PX_PROFILE_NEW( allocator, dtype ) new (physx::profile::PxProfileAllocate<dtype>( allocator, __FILE__, __LINE__ )) dtype // placement-new of dtype in memory from PxProfileAllocate; constructor arguments follow the macro
+#define PX_PROFILE_DELETE( allocator, obj ) physx::profile::PxProfileDeleteAndDeallocate( allocator, obj ); // NOTE(review): the expansion already ends in ';'
+
+#endif // PXPVDSDK_PXPROFILEALLOCATORWRAPPER_H
diff --git a/PxShared/src/pvd/include/PxPvdClient.h b/PxShared/src/pvd/include/PxPvdClient.h
new file mode 100644
index 0000000..fccf76d
--- /dev/null
+++ b/PxShared/src/pvd/include/PxPvdClient.h
@@ -0,0 +1,77 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDCLIENT_H
+#define PXPVDSDK_PXPVDCLIENT_H
+
+/** \addtogroup pvd
+@{
+*/
+#include "foundation/PxFlags.h"
+#include "foundation/PxVec3.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+namespace pvdsdk
+{
+#endif
+
+class PvdDataStream;
+class PvdUserRenderer;
+
+/**
+\brief PvdClient is the per-client connection to PVD.
+It provides callbacks when PVD is connected/disconnected.
+It provides access to the internal objects so that advanced users can create extension clients.
+*/
+class PvdClient
+{
+  public:
+	virtual PvdDataStream* getDataStream() = 0;
+	virtual PvdUserRenderer* getUserRender() = 0;
+
+	virtual bool isConnected() const = 0;
+	virtual void onPvdConnected() = 0; // callback for the "PVD connected" event named in the class comment
+	virtual void onPvdDisconnected() = 0; // callback for the "PVD disconnected" event named in the class comment
+	virtual void flush() = 0;
+
+  protected:
+	virtual ~PvdClient() // protected: clients cannot be deleted through a PvdClient pointer from outside the hierarchy
+	{
+	}
+};
+
+#if !PX_DOXYGEN
+} // namespace pvdsdk
+} // namespace physx
+#endif
+
+/** @} */
+#endif // PXPVDSDK_PXPVDCLIENT_H
diff --git a/PxShared/src/pvd/include/PxPvdDataStream.h b/PxShared/src/pvd/include/PxPvdDataStream.h
new file mode 100644
index 0000000..c47ef12
--- /dev/null
+++ b/PxShared/src/pvd/include/PxPvdDataStream.h
@@ -0,0 +1,272 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#ifndef PXPVDSDK_PXPVDDATASTREAM_H
+#define PXPVDSDK_PXPVDDATASTREAM_H
+
+/** \addtogroup pvd
+@{
+*/
+#include "pvd/PxPvd.h"
+#include "PxPvdErrorCodes.h"
+#include "PxPvdObjectModelBaseTypes.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+namespace pvdsdk
+{
+#endif
+
+class PvdPropertyDefinitionHelper;
+
+class PvdMetaDataStream // interface for declaring PVD classes, properties and property messages
+{
+  protected:
+	virtual ~PvdMetaDataStream() // protected: not deletable through this interface from outside the hierarchy
+	{
+	}
+
+  public:
+	virtual PvdError createClass(const NamespacedName& nm) = 0; // create a class identified by nm
+	template <typename TDataType>
+	PvdError createClass() // typed overload: the name comes from getPvdNamespacedNameForType<TDataType>()
+	{
+		return createClass(getPvdNamespacedNameForType<TDataType>());
+	}
+
+	virtual PvdError deriveClass(const NamespacedName& parent, const NamespacedName& child) = 0; // relate child to parent as a derived class
+	template <typename TParentType, typename TChildType>
+	PvdError deriveClass() // typed overload: both names are looked up from the template arguments
+	{
+		return deriveClass(getPvdNamespacedNameForType<TParentType>(), getPvdNamespacedNameForType<TChildType>());
+	}
+
+	virtual bool isClassExist(const NamespacedName& nm) = 0; // query whether a class named nm exists
+	template <typename TDataType>
+	bool isClassExist() // typed overload of the query above
+	{
+		return isClassExist(getPvdNamespacedNameForType<TDataType>());
+	}
+
+	virtual PvdError createProperty(const NamespacedName& clsName, const char* name, const char* semantic, // property "name" on class clsName
+	                                const NamespacedName& dtypeName, PropertyType::Enum propertyType, // dtypeName: datatype of the property
+	                                DataRef<NamedValue> values = DataRef<NamedValue>()) = 0; // values defaults to an empty DataRef
+	template <typename TClsType, typename TDataType>
+	PvdError createProperty(String name, String semantic = "", PropertyType::Enum propertyType = PropertyType::Scalar, // defaults: empty semantic, scalar property
+	                        DataRef<NamedValue> values = DataRef<NamedValue>())
+	{
+		return createProperty(getPvdNamespacedNameForType<TClsType>(), name, semantic, // class and datatype names are derived from the template arguments
+		                      getPvdNamespacedNameForType<TDataType>(), propertyType, values);
+	}
+
+	virtual PvdError createPropertyMessage(const NamespacedName& cls, const NamespacedName& msgName,
+	                                       DataRef<PropertyMessageArg> entries, uint32_t messageSizeInBytes) = 0;
+
+	template <typename TClsType, typename TMsgType>
+	PvdError createPropertyMessage(DataRef<PropertyMessageArg> entries) // typed overload: the message size is sizeof(TMsgType)
+	{
+		return createPropertyMessage(getPvdNamespacedNameForType<TClsType>(), getPvdNamespacedNameForType<TMsgType>(),
+		                             entries, sizeof(TMsgType));
+	}
+};
+
+class PvdInstanceDataStream // interface for creating instances and sending their property data, messages and deferred commands
+{
+  protected:
+	virtual ~PvdInstanceDataStream() // protected: not deletable through this interface from outside the hierarchy
+	{
+	}
+
+  public:
+	virtual PvdError createInstance(const NamespacedName& cls, const void* instance) = 0; // NOTE(review): the pointer appears to act as the instance's identity (cf. destroyInstance(key)) - confirm
+
+	template <typename TDataType>
+	PvdError createInstance(const TDataType* inst) // typed overload: the class name is derived from TDataType
+	{
+		return createInstance(getPvdNamespacedNameForType<TDataType>(), inst);
+	}
+	virtual bool isInstanceValid(const void* instance) = 0;
+
+	// Use when the property will fit in, or is already completely in, memory: the value is passed as one block.
+	virtual PvdError setPropertyValue(const void* instance, String name, DataRef<const uint8_t> data,
+	                                  const NamespacedName& incomingTypeName) = 0;
+	template <typename TDataType>
+	PvdError setPropertyValue(const void* instance, String name, const TDataType& value) // passes the sizeof(TDataType) raw bytes of value
+	{
+		const uint8_t* dataStart = reinterpret_cast<const uint8_t*>(&value);
+		return setPropertyValue(instance, name, DataRef<const uint8_t>(dataStart, dataStart + sizeof(TDataType)),
+		                        getPvdNamespacedNameForType<TDataType>());
+	}
+
+	template <typename TDataType>
+	PvdError setPropertyValue(const void* instance, String name, const TDataType* value, uint32_t numItems) // array form: numItems * sizeof(TDataType) raw bytes
+	{
+		const uint8_t* dataStart = reinterpret_cast<const uint8_t*>(value);
+		return setPropertyValue(instance, name,
+		                        DataRef<const uint8_t>(dataStart, dataStart + sizeof(TDataType) * numItems),
+		                        getPvdNamespacedNameForType<TDataType>());
+	}
+
+	// Otherwise, if the property is very large (e.g. contact reports), it can be sent in chunks: begin, append..., end.
+	virtual PvdError beginSetPropertyValue(const void* instance, String name, const NamespacedName& incomingTypeName) = 0;
+
+	template <typename TDataType>
+	PvdError beginSetPropertyValue(const void* instance, String name) // typed overload: the type name is derived from TDataType
+	{
+		return beginSetPropertyValue(instance, name, getPvdNamespacedNameForType<TDataType>());
+	}
+	virtual PvdError appendPropertyValueData(DataRef<const uint8_t> data) = 0;
+
+	template <typename TDataType>
+	PvdError appendPropertyValueData(const TDataType* value, uint32_t numItems) // appends numItems * sizeof(TDataType) raw bytes
+	{
+		const uint8_t* dataStart = reinterpret_cast<const uint8_t*>(value);
+		return appendPropertyValueData(DataRef<const uint8_t>(dataStart, dataStart + numItems * sizeof(TDataType)));
+	}
+
+	virtual PvdError endSetPropertyValue() = 0;
+
+	// Set several properties of an object at once by sending a property message.
+
+	virtual PvdError setPropertyMessage(const void* instance, const NamespacedName& msgName,
+	                                    DataRef<const uint8_t> data) = 0;
+
+	template <typename TDataType>
+	PvdError setPropertyMessage(const void* instance, const TDataType& value) // message name derived from TDataType; payload is the raw bytes of value
+	{
+		const uint8_t* dataStart = reinterpret_cast<const uint8_t*>(&value);
+		return setPropertyMessage(instance, getPvdNamespacedNameForType<TDataType>(),
+		                          DataRef<const uint8_t>(dataStart, sizeof(TDataType)));
+	}
+	// If you need to send a lot of identical messages, this avoids a hashtable lookup per message.
+	virtual PvdError beginPropertyMessageGroup(const NamespacedName& msgName) = 0;
+
+	template <typename TDataType>
+	PvdError beginPropertyMessageGroup() // typed overload: the message name is derived from TDataType
+	{
+		return beginPropertyMessageGroup(getPvdNamespacedNameForType<TDataType>());
+	}
+	virtual PvdError sendPropertyMessageFromGroup(const void* instance, DataRef<const uint8_t> data) = 0;
+
+	template <typename TDataType>
+	PvdError sendPropertyMessageFromGroup(const void* instance, const TDataType& value) // payload is the sizeof(TDataType) raw bytes of value
+	{
+		const uint8_t* dataStart = reinterpret_cast<const uint8_t*>(&value);
+		return sendPropertyMessageFromGroup(instance, DataRef<const uint8_t>(dataStart, sizeof(TDataType)));
+	}
+
+	virtual PvdError endPropertyMessageGroup() = 0;
+
+	// These functions ensure the target array doesn't contain duplicates.
+	virtual PvdError pushBackObjectRef(const void* instId, String propName, const void* objRef) = 0;
+	virtual PvdError removeObjectRef(const void* instId, String propName, const void* objRef) = 0;
+
+	// Instance elimination.
+	virtual PvdError destroyInstance(const void* key) = 0;
+
+	// Profiling hooks
+	virtual PvdError beginSection(const void* instance, String name) = 0;
+	virtual PvdError endSection(const void* instance, String name) = 0;
+
+	// Origin shift
+	virtual PvdError originShift(const void* scene, PxVec3 shift) = 0;
+
+  public:
+	/* In some cases a pvd command cannot be run immediately. For example, when joints are created while their actors
+	 * are still pending insertion, the joint update commands can be run deferred.
+	 */
+	class PvdCommand
+	{
+	  public:
+		// Copy construction and assignment are defined explicitly; both are no-ops because PvdCommand has no data members.
+		PvdCommand(const PvdCommand&)
+		{
+		}
+		PvdCommand& operator=(const PvdCommand&)
+		{
+			return *this;
+		}
+
+	  public:
+		PvdCommand()
+		{
+		}
+		virtual ~PvdCommand()
+		{
+		}
+
+		// Not pure virtual so that a default PvdCommand object can be instantiated; the default reports that it cannot run.
+		virtual bool canRun(PvdInstanceDataStream&)
+		{
+			return false;
+		}
+		virtual void run(PvdInstanceDataStream&) // default does nothing
+		{
+		}
+	};
+
+	// The PVD SDK provides this helper to allocate memory for a command; that memory is released after the command queue is flushed.
+	virtual void* allocateMemForCmd(uint32_t length) = 0;
+
+	// PVD will call the destructor of the PvdCommand object at the end of flushPvdCommand.
+	virtual void pushPvdCommand(PvdCommand& cmd) = 0;
+	virtual void flushPvdCommand() = 0;
+};
+
+class PvdDataStream : public PvdInstanceDataStream, public PvdMetaDataStream // combines the instance and meta-data interfaces
+{
+  protected:
+	virtual ~PvdDataStream() // protected destructor; NOTE(review): release() below is presumably how a stream is disposed of - confirm
+	{
+	}
+
+  public:
+	virtual void release() = 0;
+	virtual bool isConnected() = 0; // note: non-const, unlike PvdClient::isConnected()
+
+	virtual void addProfileZone(void* zone, const char* name) = 0;
+	virtual void addProfileZoneEvent(void* zone, const char* name, uint16_t eventId, bool compileTimeEnabled) = 0;
+
+	virtual PvdPropertyDefinitionHelper& getPropertyDefinitionHelper() = 0; // returned by reference, so never null
+
+	virtual void setIsTopLevelUIElement(const void* instance, bool topLevel) = 0;
+	virtual void sendErrorMessage(uint32_t code, const char* message, const char* file, uint32_t line) = 0;
+	virtual void updateCamera(const char* name, const PxVec3& origin, const PxVec3& up, const PxVec3& target) = 0;
+
+/**
+	\brief Create a new PvdDataStream.
+	\param pvd A pointer to a valid PxPvd instance. This must be non-null.
+*/
+	PX_PVDSDK_API static PvdDataStream* create(PxPvd* pvd);
+};
+#if !PX_DOXYGEN
+} // pvdsdk
+} // physx
+#endif
+
+/** @} */
+#endif // PXPVDSDK_PXPVDDATASTREAM_H
diff --git a/PxShared/src/pvd/include/PxPvdDataStreamHelpers.h b/PxShared/src/pvd/include/PxPvdDataStreamHelpers.h
new file mode 100644
index 0000000..7b47db5
--- /dev/null
+++ b/PxShared/src/pvd/include/PxPvdDataStreamHelpers.h
@@ -0,0 +1,120 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#ifndef PXPVDSDK_PXPVDDATASTREAMHELPERS_H
+#define PXPVDSDK_PXPVDDATASTREAMHELPERS_H
+
+/** \addtogroup pvd
+@{
+*/
+#include "PxPvdObjectModelBaseTypes.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+namespace pvdsdk
+{
+#endif
+
+class PvdPropertyDefinitionHelper // helper interface that buffers a name stack, named values and property-message args
+{
+  protected:
+	virtual ~PvdPropertyDefinitionHelper() // protected: not deletable through this interface from outside the hierarchy
+	{
+	}
+
+  public:
+	/**
+	    Push a name c so that it is appended with a separator, giving e.g. a.b.c (separator defaults to ".").
+	*/
+	virtual void pushName(const char* inName, const char* inAppendStr = ".") = 0;
+	/**
+	    Push a name c so that it is appended in brackets, giving e.g. a.b[c] (brackets default to "[" and "]").
+	*/
+	virtual void pushBracketedName(const char* inName, const char* leftBracket = "[", const char* rightBracket = "]") = 0;
+	/**
+	 *	Pop the current name
+	 */
+	virtual void popName() = 0;
+
+	virtual void clearNameStack() = 0;
+	/**
+	 *	Get the current name at the top of the name stack.
+	 *	Would return "a.b.c" or "a.b[c]" in the above examples.
+	 */
+	virtual const char* getTopName() = 0;
+
+	virtual void addNamedValue(const char* name, uint32_t value) = 0;
+	virtual void clearNamedValues() = 0;
+	virtual DataRef<NamedValue> getNamedValues() = 0;
+
+	/**
+	 *	Define a property using the top of the name stack and the passed-in semantic
+	 */
+	virtual void createProperty(const NamespacedName& clsName, const char* inSemantic, const NamespacedName& dtypeName,
+	                            PropertyType::Enum propType = PropertyType::Scalar) = 0;
+
+	template <typename TClsType, typename TDataType>
+	void createProperty(const char* inSemantic = "", PropertyType::Enum propType = PropertyType::Scalar) // typed overload: class and datatype names come from the template arguments
+	{
+		createProperty(getPvdNamespacedNameForType<TClsType>(), inSemantic, getPvdNamespacedNameForType<TDataType>(),
+		               propType);
+	}
+
+	// The datatype used for instances needs to be pointer unless you actually have pvdsdk::InstanceId members on your
+	// value structs.
+	virtual void addPropertyMessageArg(const NamespacedName& inDatatype, uint32_t inOffset, uint32_t inSize) = 0;
+
+	template <typename TDataType>
+	void addPropertyMessageArg(uint32_t offset) // typed overload: the size is sizeof(TDataType)
+	{
+		addPropertyMessageArg(getPvdNamespacedNameForType<TDataType>(), offset, static_cast<uint32_t>(sizeof(TDataType)));
+	}
+	virtual void addPropertyMessage(const NamespacedName& clsName, const NamespacedName& msgName,
+	                                uint32_t inStructSizeInBytes) = 0;
+	template <typename TClsType, typename TMsgType>
+	void addPropertyMessage() // typed overload: the struct size is sizeof(TMsgType)
+	{
+		addPropertyMessage(getPvdNamespacedNameForType<TClsType>(), getPvdNamespacedNameForType<TMsgType>(),
+		                   static_cast<uint32_t>(sizeof(TMsgType)));
+	}
+	virtual void clearPropertyMessageArgs() = 0;
+
+	void clearBufferedData() // non-virtual convenience: clears the name stack, the message args and the named values, in that order
+	{
+		clearNameStack();
+		clearPropertyMessageArgs();
+		clearNamedValues();
+	}
+};
+
+#if !PX_DOXYGEN
+} // pvdsdk
+} // physx
+#endif
+
+/** @} */
+#endif // PXPVDSDK_PXPVDDATASTREAMHELPERS_H
diff --git a/PxShared/src/pvd/include/PxPvdErrorCodes.h b/PxShared/src/pvd/include/PxPvdErrorCodes.h
new file mode 100644
index 0000000..ae17a6f
--- /dev/null
+++ b/PxShared/src/pvd/include/PxPvdErrorCodes.h
@@ -0,0 +1,62 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#ifndef PXPVDSDK_PXPVDERRORCODES_H
+#define PXPVDSDK_PXPVDERRORCODES_H
+
+/** \addtogroup pvd
+@{
+*/
+
+#include "foundation/Px.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+namespace pvdsdk
+{
+#endif
+
+struct PvdErrorType // scoping struct: the enumerators are referred to as PvdErrorType::Success etc.
+{
+	enum Enum
+	{
+		Success = 0, // explicit 0; the remaining enumerators follow as 1..4 in declaration order
+		NetworkError, // 1
+		ArgumentError, // 2
+		Disconnect, // 3
+		InternalProblem // 4
+	};
+};
+
+typedef PvdErrorType::Enum PvdError; // shorthand for the enumeration type
+
+#if !PX_DOXYGEN
+}
+}
+#endif
+/** @} */
+#endif // PXPVDSDK_PXPVDERRORCODES_H
diff --git a/PxShared/src/pvd/include/PxPvdObjectModelBaseTypes.h b/PxShared/src/pvd/include/PxPvdObjectModelBaseTypes.h
new file mode 100644
index 0000000..f65e581
--- /dev/null
+++ b/PxShared/src/pvd/include/PxPvdObjectModelBaseTypes.h
@@ -0,0 +1,428 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDOBJECTMODELBASETYPES_H
+#define PXPVDSDK_PXPVDOBJECTMODELBASETYPES_H
+
+/** \addtogroup pvd
+@{
+*/
+#include "foundation/PxAssert.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+namespace pvdsdk
+{
+#endif
+
+using namespace physx;
+
+inline const char* nonNull(const char* str)
+{
+	return str ? str : ""; // map NULL to the empty string so callers never dereference NULL
+}
+// strcmp must not be handed a null pointer (undefined behavior),
+// so both operands go through nonNull() first. As a result
+// NULL and the empty string, "", compare equal.
+inline bool safeStrEq(const char* lhs, const char* rhs)
+{
+	return ::strcmp(nonNull(lhs), nonNull(rhs)) == 0;
+}
+
+// Does this string have useful information in it, i.e. is it neither NULL nor empty?
+inline bool isMeaningful(const char* str)
+{
+	return *(nonNull(str)) != '\0'; // not "> 0": plain char may be signed, so a first byte >= 0x80 would read as negative
+}
+
+inline uint32_t safeStrLen(const char* str) // length in bytes, truncated to 32 bits; NULL counts as length 0
+{
+	str = nonNull(str);
+	return static_cast<uint32_t>(strlen(str));
+}
+
+// Thin wrapper around a signed 32-bit instance id; implicitly convertible to and from int32_t.
+struct ObjectRef
+{
+	int32_t mInstanceId; // -1 when default-constructed
+
+	// Non-explicit, so a raw int32_t converts implicitly to an ObjectRef.
+	ObjectRef(int32_t iid = -1)
+	{
+		mInstanceId = iid;
+	}
+
+	// Reads the wrapped id back out.
+	operator int32_t() const { return mInstanceId; }
+	// Only strictly positive ids count as a value; 0 and negative ids (including the default -1) do not.
+	bool hasValue() const { return !(mInstanceId <= 0); }
+};
+
+// Four unsigned 32-bit values stored as separate members, in declaration order mD0..mD3.
+struct U32Array4
+{
+	uint32_t mD0;
+	uint32_t mD1;
+	uint32_t mD2;
+	uint32_t mD3;
+
+	// Component-wise constructor: dN initializes mDN.
+	U32Array4(uint32_t d0, uint32_t d1, uint32_t d2, uint32_t d3) { mD0 = d0; mD1 = d1; mD2 = d2; mD3 = d3; }
+	// Default constructor: every component is zero.
+	U32Array4() { mD0 = mD1 = mD2 = mD3 = 0; }
+};
+
+typedef bool				PvdBool; // PVD-side aliases for built-in types; these alias names are the ones listed in DECLARE_TYPES
+typedef const char*			String; // plain C-string pointer; the typedef itself implies no ownership or copying
+typedef void*				VoidPtr;
+typedef double				PvdF64;
+typedef float				PvdF32;
+typedef int64_t				PvdI64;
+typedef uint64_t			PvdU64;
+typedef int32_t				PvdI32;
+typedef uint32_t			PvdU32;
+typedef int16_t				PvdI16;
+typedef uint16_t			PvdU16;
+typedef int8_t				PvdI8;
+typedef uint8_t				PvdU8;
+
+// Color with one byte per channel, laid out in memory in the order r, g, b, a.
+struct PvdColor
+{
+	uint8_t r;
+	uint8_t g;
+	uint8_t b;
+	uint8_t a;
+	// Channel-wise constructor; alpha defaults to 255.
+	PvdColor(uint8_t _r, uint8_t _g, uint8_t _b, uint8_t _a = 255) { r = _r; g = _g; b = _b; a = _a; }
+
+	// Default color: r = g = b = 0 with alpha 255.
+	PvdColor() { r = g = b = 0; a = 255; }
+
+	// Unpacks a 32-bit value by reading its bytes in memory order (byte 0 -> r, byte 1 -> g, byte 2 -> b, byte 3 -> a).
+	// The channel-to-bit mapping therefore follows the host byte order; "abgr" matches the case where r is the lowest byte.
+	PvdColor(uint32_t abgr)
+	{
+		const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&abgr);
+		r = bytes[0]; g = bytes[1]; b = bytes[2]; a = bytes[3];
+	}
+};
+
+// Wraps an unsigned 32-bit handle value; implicitly convertible to and from uint32_t.
+struct StringHandle
+{
+	uint32_t mHandle; // 0 when default-constructed
+
+	// Non-explicit, so a plain uint32_t converts to a StringHandle.
+	StringHandle(uint32_t val = 0) { mHandle = val; }
+
+	// Yields the stored handle value.
+	operator uint32_t() const { return mHandle; }
+};
+
+#define DECLARE_TYPES /* X-macro list of the base PVD types; DECLARE_BASE_PVD_TYPE(type) must be defined before each expansion */ \
+DECLARE_BASE_PVD_TYPE(PvdI8)			\
+DECLARE_BASE_PVD_TYPE(PvdU8)			\
+DECLARE_BASE_PVD_TYPE(PvdI16)			\
+DECLARE_BASE_PVD_TYPE(PvdU16)			\
+DECLARE_BASE_PVD_TYPE(PvdI32)			\
+DECLARE_BASE_PVD_TYPE(PvdU32)			\
+DECLARE_BASE_PVD_TYPE(PvdI64)			\
+DECLARE_BASE_PVD_TYPE(PvdU64)			\
+DECLARE_BASE_PVD_TYPE(PvdF32)			\
+DECLARE_BASE_PVD_TYPE(PvdF64)			\
+DECLARE_BASE_PVD_TYPE(PvdBool)			\
+DECLARE_BASE_PVD_TYPE(PvdColor)			\
+DECLARE_BASE_PVD_TYPE(String)			\
+DECLARE_BASE_PVD_TYPE(StringHandle)		\
+DECLARE_BASE_PVD_TYPE(ObjectRef)		\
+DECLARE_BASE_PVD_TYPE(VoidPtr)			\
+DECLARE_BASE_PVD_TYPE(PxVec2)			\
+DECLARE_BASE_PVD_TYPE(PxVec3)			\
+DECLARE_BASE_PVD_TYPE(PxVec4)			\
+DECLARE_BASE_PVD_TYPE(PxBounds3)		\
+DECLARE_BASE_PVD_TYPE(PxQuat)			\
+DECLARE_BASE_PVD_TYPE(PxTransform)		\
+DECLARE_BASE_PVD_TYPE(PxMat33)			\
+DECLARE_BASE_PVD_TYPE(PxMat44)			\
+DECLARE_BASE_PVD_TYPE(U32Array4)		// last entry; NOTE(review): the Px* types are not declared in this header's visible includes - confirm they arrive via PxAssert.h or the includer
+
+struct PvdBaseType // scoping struct for the ids of the base PVD datatypes
+{
+	enum Enum
+	{
+		None          = 0,
+		InternalStart = 1,
+		InternalStop  = 64, // NOTE(review): 1..64 looks like a range reserved for internal ids - confirm
+#define DECLARE_BASE_PVD_TYPE(type) type, // each DECLARE_TYPES entry becomes one enumerator named after the type
+		DECLARE_TYPES // generated enumerators continue after InternalStop, so the first one (PvdI8) is 65
+		Last // one past the final generated enumerator
+#undef DECLARE_BASE_PVD_TYPE
+	};
+};
+// A (namespace, name) pair of C strings. Only the pointers are stored; the text is not copied.
+struct NamespacedName
+{
+	String mNamespace;
+	String mName;
+	NamespacedName(String ns, String nm) { mNamespace = ns; mName = nm; }
+	// Name-only form (also the default constructor): the namespace is the empty string.
+	NamespacedName(String nm = "") { mNamespace = ""; mName = nm; }
+	// Compares by string content via safeStrEq, so NULL and "" are treated as equal.
+	bool operator==(const NamespacedName& other) const
+	{
+		if(!safeStrEq(mNamespace, other.mNamespace)) return false;
+		return safeStrEq(mName, other.mName);
+	}
+};
+
+// Associates a C-string name (pointer only, not copied) with an unsigned 32-bit value.
+struct NamedValue
+{
+	String mName;
+	uint32_t mValue;
+	// Both arguments are optional: the defaults are the empty string and 0.
+	NamedValue(String nm = "", uint32_t val = 0) { mName = nm; mValue = val; }
+};
+
+template <typename T>
+struct BaseDataTypeToTypeMap // primary template: has no BaseTypeEnum, so getPvdTypeForType<T>() does not compile for unmapped T
+{
+	bool compile_error;
+};
+template <PvdBaseType::Enum>
+struct BaseTypeToDataTypeMap // primary template: has no TDataType typedef; the specializations supply it per enumerator
+{
+	bool compile_error;
+};
+
+// Users can extend this mapping with new datatypes (see DEFINE_PVD_TYPE_NAME_MAP / DEFINE_PVD_TYPE_ALIAS).
+template <typename T>
+struct PvdDataTypeToNamespacedNameMap // primary template: Name is a bool placeholder; specializations replace it with a NamespacedName
+{
+	bool Name;
+};
+// For every base datatype this generates: type -> PvdBaseType id (for both `type` and `const type&`),
+// PvdBaseType id -> type, and type -> NamespacedName("physx3", "<type>") (again for `type` and `const type&`).
+#define DECLARE_BASE_PVD_TYPE(type)                                                                                    \
+	template <>                                                                                                        \
+	struct BaseDataTypeToTypeMap<type>                                                                                 \
+	{                                                                                                                  \
+		enum Enum                                                                                                      \
+		{                                                                                                              \
+			BaseTypeEnum = PvdBaseType::type                                                                           \
+		};                                                                                                             \
+	};                                                                                                                 \
+	template <>                                                                                                        \
+	struct BaseDataTypeToTypeMap<const type&>                                                                          \
+	{                                                                                                                  \
+		enum Enum                                                                                                      \
+		{                                                                                                              \
+			BaseTypeEnum = PvdBaseType::type                                                                           \
+		};                                                                                                             \
+	};                                                                                                                 \
+	template <>                                                                                                        \
+	struct BaseTypeToDataTypeMap<PvdBaseType::type>                                                                    \
+	{                                                                                                                  \
+		typedef type TDataType;                                                                                        \
+	};                                                                                                                 \
+	template <>                                                                                                        \
+	struct PvdDataTypeToNamespacedNameMap<type>                                                                        \
+	{                                                                                                                  \
+		NamespacedName Name;                                                                                           \
+		PvdDataTypeToNamespacedNameMap<type>() : Name("physx3", #type)                                                 \
+		{                                                                                                              \
+		}                                                                                                              \
+	};                                                                                                                 \
+	template <>                                                                                                        \
+	struct PvdDataTypeToNamespacedNameMap<const type&>                                                                 \
+	{                                                                                                                  \
+		NamespacedName Name;                                                                                           \
+		PvdDataTypeToNamespacedNameMap<const type&>() : Name("physx3", #type)                                          \
+		{                                                                                                              \
+		}                                                                                                              \
+	};
+
+DECLARE_TYPES // expands the specializations above once per listed base type
+#undef DECLARE_BASE_PVD_TYPE
+
+template <typename TDataType>
+inline int32_t getPvdTypeForType() // PvdBaseType enumerator for TDataType, returned as int32_t; needs a BaseDataTypeToTypeMap specialization
+{
+	return static_cast<PvdBaseType::Enum>(BaseDataTypeToTypeMap<TDataType>::BaseTypeEnum);
+}
+template <typename TDataType>
+inline NamespacedName getPvdNamespacedNameForType() // copy of the Name held by a temporary PvdDataTypeToNamespacedNameMap<TDataType>
+{
+	return PvdDataTypeToNamespacedNameMap<TDataType>().Name;
+}
+
+#define DEFINE_PVD_TYPE_NAME_MAP(type, ns, name) /* specializes the name map so that `type` maps to NamespacedName(ns, name) */ \
+	template <>                                                                                                        \
+	struct PvdDataTypeToNamespacedNameMap<type>                                                                        \
+	{                                                                                                                  \
+		NamespacedName Name;                                                                                           \
+		PvdDataTypeToNamespacedNameMap<type>() : Name(ns, name)                                                        \
+		{                                                                                                              \
+		}                                                                                                              \
+	};
+
+#define DEFINE_PVD_TYPE_ALIAS(newType, oldType) /* specializes the name map so that newType reuses oldType's NamespacedName */ \
+	template <>                                                                                                        \
+	struct PvdDataTypeToNamespacedNameMap<newType>                                                                     \
+	{                                                                                                                  \
+		NamespacedName Name;                                                                                           \
+		PvdDataTypeToNamespacedNameMap<newType>() : Name(PvdDataTypeToNamespacedNameMap<oldType>().Name)               \
+		{                                                                                                              \
+		}                                                                                                              \
+	};
+
+DEFINE_PVD_TYPE_ALIAS(const void*, void*) // const void* shares the name registered for void* (VoidPtr in DECLARE_TYPES)
+
+// Raw byte range described by three pointers: [mBegin, mEnd) is the used part and
+// mCapacity marks the end of the storage. The struct only records the pointers;
+// nothing in it allocates or frees memory.
+struct ArrayData
+{
+	uint8_t* mBegin;
+	uint8_t* mEnd;
+	uint8_t* mCapacity; // expected to be >= mEnd
+
+	// All three pointers default to NULL, i.e. an empty range.
+	ArrayData(uint8_t* beg = NULL, uint8_t* end = NULL, uint8_t* cap = NULL)
+	{
+		mBegin = beg;
+		mEnd = end;
+		mCapacity = cap;
+	}
+
+	// Start and one-past-the-end of the used range.
+	uint8_t* begin() { return mBegin; }
+	uint8_t* end() { return mEnd; }
+
+	// Distance in bytes from mBegin to mCapacity, truncated to 32 bits.
+	uint32_t byteCapacity() { return static_cast<uint32_t>(mCapacity - mBegin); }
+
+	// Distance in bytes from mBegin to mEnd, truncated to 32 bits.
+	uint32_t byteSize() const { return static_cast<uint32_t>(mEnd - mBegin); }
+
+	// Number of whole objects of objectByteSize bytes in the used range; 0 when objectByteSize is 0.
+	uint32_t numberOfItems(uint32_t objectByteSize)
+	{
+		return objectByteSize != 0 ? byteSize() / objectByteSize : 0;
+	}
+
+	// Resets all three pointers to null without touching the memory they referred to.
+	void forgetData() { mBegin = mEnd = mCapacity = 0; }
+};
+
+// Non-owning, read-only view over the contiguous range [mBegin, mEnd) of T; only the two pointers are stored.
+template <typename T>
+class DataRef
+{
+	const T* mBegin;
+	const T* mEnd;
+
+  public:
+	// View over count elements starting at b.
+	DataRef(const T* b, uint32_t count)
+	{
+		mBegin = b;
+		mEnd = b + count;
+	}
+	// View over [b, e); with no arguments this is the empty view (both pointers NULL).
+	DataRef(const T* b = NULL, const T* e = NULL)
+	{
+		mBegin = b;
+		mEnd = e;
+	}
+	// Copying duplicates the pointers only; both views then refer to the same elements.
+	DataRef(const DataRef& o) { mBegin = o.mBegin; mEnd = o.mEnd; }
+	DataRef& operator=(const DataRef& o)
+	{
+		mBegin = o.mBegin;
+		mEnd = o.mEnd;
+		return *this;
+	}
+	// Number of elements in the view, truncated to 32 bits.
+	uint32_t size() const { return static_cast<uint32_t>(mEnd - mBegin); }
+	const T* begin() const { return mBegin; }
+	const T* end() const { return mEnd; }
+	// Element access; the index is checked with PX_ASSERT only.
+	const T& operator[](uint32_t idx) const
+	{
+		PX_ASSERT(idx < size());
+		return *(mBegin + idx);
+	}
+	// Last element; PX_ASSERT guards against an empty view.
+	const T& back() const
+	{
+		PX_ASSERT(mEnd > mBegin);
+		return mEnd[-1];
+	}
+};
+
+struct PropertyType // scoping struct for the property-kind enumeration
+{
+	enum Enum
+	{
+		Unknown = 0, // explicit 0
+		Scalar, // 1
+		Array // 2
+	};
+};
+
+// Argument to the create-property-message function: describes one property inside the message.
+struct PropertyMessageArg
+{
+	String mPropertyName;
+	NamespacedName mDatatypeName;
+	uint32_t mMessageOffset; // where in the message this property starts
+	uint32_t mByteSize; // size in bytes of this entry
+
+	// Fully specified entry.
+	PropertyMessageArg(String propName, NamespacedName dtype, uint32_t msgOffset, uint32_t byteSize)
+	: mPropertyName(propName), mDatatypeName(dtype)
+	{
+		mMessageOffset = msgOffset;
+		mByteSize = byteSize;
+	}
+	// Empty entry: "" property name, default-constructed datatype name, zero offset and size.
+	PropertyMessageArg() : mDatatypeName() { mPropertyName = ""; mMessageOffset = 0; mByteSize = 0; }
+};
+
+class PvdUserRenderer; // forward declaration; the name mapping below only needs the type name
+DEFINE_PVD_TYPE_NAME_MAP(PvdUserRenderer, "_debugger_", "PvdUserRenderer") // maps the type to name "PvdUserRenderer" in namespace "_debugger_"
+
+#if !PX_DOXYGEN
+}
+}
+#endif
+
+/** @} */
+#endif // PXPVDSDK_PXPVDOBJECTMODELBASETYPES_H
diff --git a/PxShared/src/pvd/include/PxPvdRenderBuffer.h b/PxShared/src/pvd/include/PxPvdRenderBuffer.h
new file mode 100644
index 0000000..58c8d51
--- /dev/null
+++ b/PxShared/src/pvd/include/PxPvdRenderBuffer.h
@@ -0,0 +1,140 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDRENDERBUFFER_H
+#define PXPVDSDK_PXPVDRENDERBUFFER_H
+
+/** \addtogroup pvd
+@{
+*/
+
+#include "foundation/PxVec3.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+namespace pvdsdk
+{
+#endif
+
+/**
+\brief Default color values used for debug rendering.
+
+Each value is a 32-bit 0xAARRGGBB constant, as the eARGB_ prefix indicates.
+*/
+struct PvdDebugColor
+{
+	enum Enum
+	{
+		eARGB_BLACK     = 0xff000000,
+		eARGB_RED       = 0xffff0000,
+		eARGB_GREEN     = 0xff00ff00,
+		eARGB_BLUE      = 0xff0000ff,
+		eARGB_YELLOW    = 0xffffff00,
+		eARGB_MAGENTA   = 0xffff00ff,
+		eARGB_CYAN      = 0xff00ffff,
+		eARGB_WHITE     = 0xffffffff,
+		eARGB_GREY      = 0xff808080,
+		// NOTE(review): the three dark colours use 0x88 in the top byte where
+		// every other entry uses 0xff - confirm this is intentional.
+		eARGB_DARKRED   = 0x88880000,
+		eARGB_DARKGREEN = 0x88008800,
+		eARGB_DARKBLUE  = 0x88000088
+	};
+};
+
+/**
+\brief A single point plus its colour, used for debug rendering.
+*/
+struct PvdDebugPoint
+{
+	PxVec3 pos;
+	uint32_t color;
+
+	PvdDebugPoint(const PxVec3& p, const uint32_t& c)
+	: pos(p)
+	, color(c)
+	{
+	}
+};
+
+/**
+\brief A single line segment for debug rendering. Each end point has its own
+colour slot; the constructor gives both the same colour.
+*/
+struct PvdDebugLine
+{
+	PxVec3 pos0;
+	uint32_t color0;
+	PxVec3 pos1;
+	uint32_t color1;
+
+	PvdDebugLine(const PxVec3& p0, const PxVec3& p1, const uint32_t& c)
+	: pos0(p0)
+	, color0(c)
+	, pos1(p1)
+	, color1(c)
+	{
+	}
+};
+
+/**
+\brief A single triangle for debug rendering. Each vertex has its own colour
+slot; the constructor gives all three the same colour.
+*/
+struct PvdDebugTriangle
+{
+	PxVec3 pos0;
+	uint32_t color0;
+	PxVec3 pos1;
+	uint32_t color1;
+	PxVec3 pos2;
+	uint32_t color2;
+
+	PvdDebugTriangle(const PxVec3& p0, const PxVec3& p1, const PxVec3& p2, const uint32_t& c)
+	: pos0(p0)
+	, color0(c)
+	, pos1(p1)
+	, color1(c)
+	, pos2(p2)
+	, color2(c)
+	{
+	}
+};
+
+/**
+\brief Used to store a text for debug rendering. Doesn't own 'string' array.
+
+The struct keeps the pointer it is given; the caller must keep the characters
+alive for as long as the PvdDebugText is in use.
+*/
+struct PvdDebugText
+{
+	// Empty text: zero position, size and colour, and a null string. All
+	// members are set so that copying or serializing a default-constructed
+	// instance never reads indeterminate values.
+	PvdDebugText() : position(0.0f), size(0.0f), color(0), string(0)
+	{
+	}
+
+	PvdDebugText(const PxVec3& p, const float& s, const uint32_t& c, const char* str)
+	: position(p), size(s), color(c), string(str)
+	{
+	}
+
+	PxVec3 position;
+	float size;
+	uint32_t color;
+	// Not owned by this struct.
+	const char* string;
+};
+
+#if !PX_DOXYGEN
+}
+} // namespace physx
+#endif
+
+/** @} */
+#endif // PXPVDSDK_PXPVDRENDERBUFFER_H
diff --git a/PxShared/src/pvd/include/PxPvdUserRenderer.h b/PxShared/src/pvd/include/PxPvdUserRenderer.h
new file mode 100644
index 0000000..ac6f26d
--- /dev/null
+++ b/PxShared/src/pvd/include/PxPvdUserRenderer.h
@@ -0,0 +1,107 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#ifndef PXPVDSDK_PXPVDUSERRENDERER_H
+#define PXPVDSDK_PXPVDUSERRENDERER_H
+
+/** \addtogroup pvd
+@{
+*/
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "pvd/PxPvd.h"
+
+#include "PxPvdDataStream.h"
+#include "PxPvdRenderBuffer.h"
+#include "PsUserAllocated.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+#endif
+
+class PxPvd;
+
+#if !PX_DOXYGEN
+namespace pvdsdk
+{
+#endif
+
+class RendererEventClient;
+
+// Abstract interface for submitting debug-render primitives and constraint
+// visualizations. Instances are obtained through the static create() factory.
+class PvdUserRenderer : public shdfnd::UserAllocated
+{
+  protected:
+	// Protected, so callers cannot delete through this interface; release()
+	// is the public teardown entry point.
+	virtual ~PvdUserRenderer()
+	{
+	}
+
+  public:
+	virtual void release() = 0;
+	// Sets the RendererEventClient for this renderer.
+	// NOTE(review): presumably the client then receives handleBufferFlush() calls - confirm in the implementation.
+	virtual void setClient(RendererEventClient* client) = 0;
+
+	// Instance to associate the further rendering with.
+	virtual void setInstanceId(const void* instanceId) = 0;
+	// Draw these points associated with this instance
+	virtual void drawPoints(const PvdDebugPoint* points, uint32_t count) = 0;
+	// Draw these lines associated with this instance
+	virtual void drawLines(const PvdDebugLine* lines, uint32_t count) = 0;
+	// Draw these triangles associated with this instance
+	virtual void drawTriangles(const PvdDebugTriangle* triangles, uint32_t count) = 0;
+	// Draw this text associated with this instance
+	virtual void drawText(const PvdDebugText& text) = 0;
+
+	// Draw SDK debug render
+	// Takes the point, line and triangle arrays together, each with its own count.
+	virtual void drawRenderbuffer(const PvdDebugPoint* pointData, uint32_t pointCount, const PvdDebugLine* lineData,
+	                              uint32_t lineCount, const PvdDebugTriangle* triangleData, uint32_t triangleCount) = 0;
+
+	// Constraint visualization routines
+	virtual void visualizeJointFrames(const PxTransform& parent, const PxTransform& child) = 0;
+	virtual void visualizeLinearLimit(const PxTransform& t0, const PxTransform& t1, float value, bool active) = 0;
+	virtual void visualizeAngularLimit(const PxTransform& t0, float lower, float upper, bool active) = 0;
+	virtual void visualizeLimitCone(const PxTransform& t, float ySwing, float zSwing, bool active) = 0;
+	virtual void visualizeDoubleCone(const PxTransform& t, float angle, bool active) = 0;
+
+	// Clear the immediate buffer.
+	virtual void flushRenderEvents() = 0;
+
+	// Factory for a renderer instance; bufferSize defaults to 0x2000.
+	PX_PVDSDK_API static PvdUserRenderer* create(uint32_t bufferSize = 0x2000);
+};
+
+// Callback interface handed to PvdUserRenderer::setClient().
+class RendererEventClient 
+{
+ public:
+	virtual ~RendererEventClient(){}
+
+	// Receives a block of inLength bytes starting at inData.
+	virtual void handleBufferFlush(const uint8_t* inData, uint32_t inLength) = 0;
+};
+
+#if !PX_DOXYGEN
+}
+}
+#endif
+/** @} */
+#endif // PXPVDSDK_PXPVDUSERRENDERER_H
diff --git a/PxShared/src/pvd/src/PxProfileBase.h b/PxShared/src/pvd/src/PxProfileBase.h
new file mode 100644
index 0000000..52918a1
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileBase.h
@@ -0,0 +1,35 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEBASE_H
+#define PXPVDSDK_PXPROFILEBASE_H
+
+#include "foundation/PxSimpleTypes.h"
+
+// Converts a pointer to uint64_t by first reinterpreting it as size_t and then
+// widening; size_t is unsigned, so the widening never sign-extends.
+#define PX_PROFILE_POINTER_TO_U64( pointer ) static_cast<uint64_t>(reinterpret_cast<size_t>(pointer))
+
+#endif // PXPVDSDK_PXPROFILEBASE_H
diff --git a/PxShared/src/pvd/src/PxProfileCompileTimeEventFilter.h b/PxShared/src/pvd/src/PxProfileCompileTimeEventFilter.h
new file mode 100644
index 0000000..428374e
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileCompileTimeEventFilter.h
@@ -0,0 +1,75 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILECOMPILETIMEEVENTFILTER_H
+#define PXPVDSDK_PXPROFILECOMPILETIMEEVENTFILTER_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventId.h"
+
+//Define before including header in order to enable a different
+//compile time event profile threshold.
+#ifndef PX_PROFILE_EVENT_PROFILE_THRESHOLD
+#define PX_PROFILE_EVENT_PROFILE_THRESHOLD EventPriorities::Medium
+#endif
+
+namespace physx { namespace profile {
+
+	/**
+	\brief Profile event priorities. Used to filter out events.
+
+	The enumerators are ordered: EventFilter lets an event through when its
+	priority value is less than or equal to the threshold value.
+	*/
+	struct EventPriorities
+	{
+		enum Enum
+		{
+			None,		// the filter setting to kill all events
+			Coarse,
+			Medium,
+			Detail,
+			Never		// the priority to set for an event if it should never fire.
+		};
+	};
+
+	/**
+	\brief Gets the priority for a given event.
+	Specialize this object in order to get the priorities setup correctly.
+
+	The unspecialized template gives every event id EventPriorities::Medium.
+	*/
+	template<uint16_t TEventId>
+	struct EventPriority { static const uint32_t val = EventPriorities::Medium; };
+
+	/**
+	\brief 	Filter events by given event priority and set threshold.
+
+	val is a compile-time constant: true when EventPriority<TEventId>::val is
+	less than or equal to PX_PROFILE_EVENT_PROFILE_THRESHOLD.
+	*/
+	template<uint16_t TEventId>
+	struct EventFilter
+	{
+		static const bool val = EventPriority<TEventId>::val <= PX_PROFILE_EVENT_PROFILE_THRESHOLD;
+	};
+
+}}
+
+#endif // PXPVDSDK_PXPROFILECOMPILETIMEEVENTFILTER_H
diff --git a/PxShared/src/pvd/src/PxProfileContextProvider.h b/PxShared/src/pvd/src/PxProfileContextProvider.h
new file mode 100644
index 0000000..44f6b94
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileContextProvider.h
@@ -0,0 +1,98 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILECONTEXTPROVIDER_H
+#define PXPVDSDK_PXPROFILECONTEXTPROVIDER_H
+
+#include "PxProfileBase.h"
+
+namespace physx { namespace profile {
+
+	// Records the context an event was emitted from: thread id, CPU id and
+	// thread priority.
+	struct PxProfileEventExecutionContext
+	{
+		uint32_t				mThreadId;
+		uint8_t					mCpuId;
+		uint8_t					mThreadPriority;
+
+		// The argument order (priority before CPU id) differs from the member order.
+		PxProfileEventExecutionContext( uint32_t inThreadId = 0, uint8_t inThreadPriority = 2 /*eThreadPriorityNormal*/, uint8_t inCpuId = 0 )
+			: mThreadId( inThreadId ), mCpuId( inCpuId ), mThreadPriority( inThreadPriority )
+		{
+		}
+
+		// Member-wise equality over all three fields.
+		bool operator==( const PxProfileEventExecutionContext& inOther ) const
+		{
+			if ( mThreadId != inOther.mThreadId )
+				return false;
+			if ( mCpuId != inOther.mCpuId )
+				return false;
+			return mThreadPriority == inOther.mThreadPriority;
+		}
+	};
+
+	//Provides the context in which the event is happening.
+	//Abstract interface; the protected destructor prevents deletion through
+	//a pointer to this base.
+	class PxProfileContextProvider
+	{
+	protected:
+		virtual ~PxProfileContextProvider(){}
+	public:
+		//Full context (thread id, CPU id, thread priority), returned by value.
+		virtual PxProfileEventExecutionContext getExecutionContext() = 0;
+		virtual uint32_t getThreadId() = 0;
+	};
+	//Provides pre-packaged context.
+	//Stores a copy of the context given at construction and returns it unchanged;
+	//it does not derive from PxProfileContextProvider.
+	struct PxProfileTrivialContextProvider
+	{
+		PxProfileEventExecutionContext mContext;
+		PxProfileTrivialContextProvider( PxProfileEventExecutionContext inContext = PxProfileEventExecutionContext() )
+			: mContext( inContext )
+		{
+		}
+		PxProfileEventExecutionContext getExecutionContext() { return mContext; }
+		//Thread id taken from the stored context.
+		uint32_t getThreadId() { return mContext.mThreadId; }
+	};
+	
+	//Forwards the get context calls to another (perhaps shared) context.
+	//Holds a raw pointer that is dereferenced without a null check on every call.
+	template<typename TProviderType>
+	struct PxProfileContextProviderForward
+	{
+		TProviderType* mProvider;
+		PxProfileContextProviderForward( TProviderType* inProvider ) : mProvider( inProvider ) {}
+		PxProfileEventExecutionContext getExecutionContext() { return mProvider->getExecutionContext(); }
+		uint32_t getThreadId() { return mProvider->getThreadId(); }
+	};
+
+	//Adapts any TProviderType exposing getExecutionContext()/getThreadId() to the
+	//virtual PxProfileContextProvider interface by way of a forwarding member.
+	template<typename TProviderType>
+	struct PxProfileContextProviderImpl : public PxProfileContextProvider
+	{
+		PxProfileContextProviderForward<TProviderType> mContext;
+		PxProfileContextProviderImpl( TProviderType* inP ) : mContext( inP ) {}
+		PxProfileEventExecutionContext getExecutionContext() { return mContext.getExecutionContext(); }
+		uint32_t getThreadId() { return mContext.getThreadId(); }
+	};
+
+} }
+
+#endif // PXPVDSDK_PXPROFILECONTEXTPROVIDER_H
diff --git a/PxShared/src/pvd/src/PxProfileContextProviderImpl.h b/PxShared/src/pvd/src/PxProfileContextProviderImpl.h
new file mode 100644
index 0000000..bc5f09f
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileContextProviderImpl.h
@@ -0,0 +1,52 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILECONTEXTPROVIDERIMPL_H
+#define PXPVDSDK_PXPROFILECONTEXTPROVIDERIMPL_H
+
+#include "PxProfileContextProvider.h"
+
+#include "PsThread.h"
+
+namespace physx { namespace profile {
+	
+	//Context provider that asks shdfnd::Thread for the calling thread's id.
+	struct PxDefaultContextProvider
+	{
+		//Thread id from shdfnd::Thread::getId(), narrowed to 32 bits.
+		uint32_t getThreadId()
+		{
+			return static_cast<uint32_t>( shdfnd::Thread::getId() );
+		}
+
+		//Always reports shdfnd::ThreadPriority::eNORMAL and CPU id 0.
+		PxProfileEventExecutionContext getExecutionContext()
+		{
+			const uint32_t theThreadId = static_cast<uint32_t>( shdfnd::Thread::getId() );
+			const uint8_t thePriority = static_cast<uint8_t>( shdfnd::ThreadPriority::eNORMAL );
+			return PxProfileEventExecutionContext( theThreadId, thePriority, 0 );
+		}
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILECONTEXTPROVIDERIMPL_H
diff --git a/PxShared/src/pvd/src/PxProfileDataBuffer.h b/PxShared/src/pvd/src/PxProfileDataBuffer.h
new file mode 100644
index 0000000..2191519
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileDataBuffer.h
@@ -0,0 +1,167 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEDATABUFFER_H
+#define PXPVDSDK_PXPROFILEDATABUFFER_H
+
+#include "PxProfileAllocatorWrapper.h"
+#include "PxProfileMemoryBuffer.h"
+#include "PxProfileEventBufferClient.h"
+
+namespace physx { namespace profile {
+
+	//Caches serialized bytes in mDataArray and hands them to every registered
+	//PxProfileEventBufferClient when flushed. TScopedLock is constructed from a
+	//TMutex* at the top of each locking member function.
+	template<typename TMutex
+			, typename TScopedLock>
+	class DataBuffer //base class for buffers that cache data and then dump the data to clients.
+	{
+	public:
+		typedef TMutex				TMutexType;
+		typedef TScopedLock			TScopedLockType;
+		typedef PxProfileWrapperNamedAllocator TU8AllocatorType;
+
+		typedef MemoryBuffer<TU8AllocatorType > TMemoryBufferType;
+		typedef PxProfileArray<PxProfileEventBufferClient*> TBufferClientArray;
+
+	protected:
+		
+		PxProfileAllocatorWrapper					mWrapper;
+		TMemoryBufferType					mDataArray;
+		TBufferClientArray					mClients;
+		//Threshold compared against in handleBufferFlush().
+		uint32_t								mBufferFullAmount;
+		//Not named in the constructor's initializer list, so it is default-initialized.
+		EventContextInformation				mEventContextInformation;		
+		//Stored as a raw pointer and replaceable through setBufferMutex().
+		TMutexType*							mBufferMutex;
+		//Mirrors "mClients is non-empty"; written under the lock, read by hasClients() without it.
+		volatile bool						mHasClients;
+		EventSerializer<TMemoryBufferType >	mSerializer;
+
+	public:
+		
+		//Reserves inBufferFullAmount + 68 bytes in mDataArray up front.
+		DataBuffer( PxAllocatorCallback* inFoundation
+					, uint32_t inBufferFullAmount
+					, TMutexType* inBufferMutex
+					, const char* inAllocationName )
+			: mWrapper( inFoundation )
+			, mDataArray( TU8AllocatorType( mWrapper, inAllocationName ) )
+			, mClients( mWrapper )
+			, mBufferFullAmount( inBufferFullAmount )
+			, mBufferMutex( inBufferMutex )
+			, mHasClients( false )
+			, mSerializer( &mDataArray )
+		{
+			//The data array is never resized really.  We ensure
+			//it is bigger than it will ever need to be.
+			mDataArray.reserve( inBufferFullAmount + 68 );
+		}
+		
+		//Removes every remaining client; each one gets handleClientRemoved() via removeClient().
+		virtual ~DataBuffer()
+		{
+			while( mClients.size() )
+			{
+				removeClient( *mClients[0] );
+			}
+		}
+
+		PxProfileAllocatorWrapper& getWrapper() { return mWrapper; }
+		TMutexType*		  getBufferMutex() { return mBufferMutex; }
+		void			  setBufferMutex(TMutexType* mutex) { mBufferMutex = mutex; }
+
+		//Appends the client under the lock; no check for duplicates is made.
+		void addClient( PxProfileEventBufferClient& inClient ) 
+		{ 
+			TScopedLockType lock( mBufferMutex ); 
+			mClients.pushBack( &inClient );
+			mHasClients = true;
+		}
+
+		//Removes the first entry equal to &inClient, notifying it through
+		//handleClientRemoved() before it is taken out. Unknown clients are ignored.
+		void removeClient( PxProfileEventBufferClient& inClient ) 
+		{
+			TScopedLockType lock( mBufferMutex );
+			for ( uint32_t idx =0; idx < mClients.size(); ++idx )
+			{
+				if ( mClients[idx] == &inClient )
+				{
+					inClient.handleClientRemoved();
+					//Swap-with-last removal: the order of the remaining clients is not preserved.
+					mClients.replaceWithLast( idx );
+					break;
+				}
+			}
+			mHasClients = mClients.size() != 0;
+		}
+
+		
+		bool hasClients() const 
+		{ 
+			return mHasClients;
+		}
+
+		//Sends the cached bytes to every client, then empties the cache and
+		//calls the clearCachedData() hook. All of this happens under the lock.
+		virtual void flushEvents()
+		{	
+			TScopedLockType lock(mBufferMutex);
+			const uint8_t* theData = mDataArray.begin();
+			uint32_t theDataSize = mDataArray.size();
+			sendDataToClients(theData, theDataSize);
+			mDataArray.clear();
+			clearCachedData();
+		}
+
+		//Used for chaining together event buffers.
+		//Null or empty input is ignored. Otherwise the cached data is flushed first
+		//if adding inDataSize would reach mBufferFullAmount; input that is itself
+		//at least mBufferFullAmount bytes goes straight to the clients, and smaller
+		//input is appended to the cache.
+		virtual void handleBufferFlush( const uint8_t* inData, uint32_t inDataSize )
+		{
+			TScopedLockType lock( mBufferMutex );
+			if ( inData && inDataSize )
+			{
+				clearCachedData();
+				//NOTE(review): flushEvents() constructs its own TScopedLockType on
+				//mBufferMutex while this function's lock is still held - this
+				//presumably relies on a re-entrant lock type; confirm.
+				if ( mDataArray.size() + inDataSize >= mBufferFullAmount )
+					flushEvents();
+				if ( inDataSize >= mBufferFullAmount )
+					sendDataToClients( inData, inDataSize );
+				else
+					mDataArray.write( inData, inDataSize );
+			}
+		}
+
+	protected:
+		//Hook for derived classes; the base implementation does nothing.
+		virtual void clearCachedData()
+		{
+		}
+
+	private:
+			
+		//Calls handleBufferFlush() on every client in array order. Takes no lock
+		//itself; both callers in this class hold one.
+		void sendDataToClients( const uint8_t* inData, uint32_t inDataSize )
+		{
+			uint32_t clientCount = mClients.size();
+			for( uint32_t idx =0; idx < clientCount; ++idx )
+				mClients[idx]->handleBufferFlush( inData, inDataSize );
+		}
+
+	};
+
+}}
+
+
+#endif // PXPVDSDK_PXPROFILEDATABUFFER_H
diff --git a/PxShared/src/pvd/src/PxProfileDataParsing.h b/PxShared/src/pvd/src/PxProfileDataParsing.h
new file mode 100644
index 0000000..e7fa0df
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileDataParsing.h
@@ -0,0 +1,218 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEDATAPARSING_H
+#define PXPVDSDK_PXPROFILEDATAPARSING_H
+
+#include "PxProfileBase.h"
+
+namespace physx { namespace profile {
+
+	//Converts datatypes without using type punning.
+	//Eight bytes of storage viewed through an anonymous union: callers fill mU8
+	//byte by byte and read the typed value back through a convert<T>() specialization.
+	//NOTE(review): reading a union member other than the one last written is
+	//itself a form of type punning in ISO C++ - presumably the original comment
+	//means "without pointer casts"; confirm.
+	struct BlockParserDataConverter
+	{
+		union
+		{
+			uint8_t	mU8[8];
+			uint16_t	mU16[4];
+			uint32_t	mU32[2];
+			uint64_t	mU64[1];
+			
+			int8_t	mI8[8];
+			int16_t	mI16[4];
+			int32_t	mI32[2];
+			int64_t	mI64[1];
+
+
+			float	mF32[2];
+			double	mF64[1];
+		};
+
+		//Fallback for types that have no specialization: asserts and returns TDataType().
+		template<typename TDataType> inline TDataType convert() { PX_ASSERT( false ); return TDataType(); }
+
+		//Fallback store for types that have no specialization: does nothing.
+		template<typename TDataType>
+		inline void convert( const TDataType& ) {}
+	};
+	
+	//Read specializations: each returns element 0 of the union member of the matching type.
+	template<> inline uint8_t BlockParserDataConverter::convert<uint8_t>() { return mU8[0]; }
+	template<> inline uint16_t BlockParserDataConverter::convert<uint16_t>() { return mU16[0]; }
+	template<> inline uint32_t BlockParserDataConverter::convert<uint32_t>() { return mU32[0]; }
+	template<> inline uint64_t BlockParserDataConverter::convert<uint64_t>() { return mU64[0]; }
+	template<> inline int8_t BlockParserDataConverter::convert<int8_t>() { return mI8[0]; }
+	template<> inline int16_t BlockParserDataConverter::convert<int16_t>() { return mI16[0]; }
+	template<> inline int32_t BlockParserDataConverter::convert<int32_t>() { return mI32[0]; }
+	template<> inline int64_t BlockParserDataConverter::convert<int64_t>() { return mI64[0]; }
+	template<> inline float BlockParserDataConverter::convert<float>() { return mF32[0]; }
+	template<> inline double BlockParserDataConverter::convert<double>() { return mF64[0]; }
+	
+	//Write specializations: each stores inData into element 0 of the union member of the matching type.
+	template<> inline void BlockParserDataConverter::convert<uint8_t>( const uint8_t& inData ) { mU8[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<uint16_t>( const uint16_t& inData ) { mU16[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<uint32_t>( const uint32_t& inData ) { mU32[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<uint64_t>( const uint64_t& inData ) { mU64[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<int8_t>( const int8_t& inData ) { mI8[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<int16_t>( const int16_t& inData ) { mI16[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<int32_t>( const int32_t& inData ) { mI32[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<int64_t>( const int64_t& inData ) { mI64[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<float>( const float& inData ) { mF32[0] = inData; }
+	template<> inline void BlockParserDataConverter::convert<double>( const double& inData ) { mF64[0] = inData; }
+
+
+	//Handles various details around parsing blocks of uint8_t data.
+	//All members are static. Read positions are passed by reference and are only
+	//advanced when the requested amount of data is actually available.
+	struct BlockParseFunctions
+	{
+		//Reverses the ByteCount bytes starting at inData, in place.
+		template<uint8_t ByteCount>
+		static inline void swapBytes( uint8_t* inData )
+		{
+			for ( uint32_t idx = 0; idx < ByteCount/2; ++idx )
+			{
+				uint32_t endIdx = ByteCount-idx-1;
+				uint8_t theTemp = inData[idx];
+				inData[idx] = inData[endIdx];
+				inData[endIdx] = theTemp;
+			}
+		}
+
+		//Returns true when [inStart, inStop) holds at least inLength bytes.
+		static inline bool checkLength( const uint8_t* inStart, const uint8_t* inStop, uint32_t inLength )
+		{
+			//A reversed range holds no data; without this guard the negative
+			//difference would wrap to a huge unsigned value and pass the check.
+			if ( inStop < inStart )
+				return false;
+			//Compare in 64 bits so ranges of 4GB or more are not truncated.
+			return static_cast<uint64_t>( inStop - inStart ) >= inLength;
+		}
+
+		//Computes sizeof(TDataType) * inNumItems in 64 bits. Returns false, leaving
+		//outByteSize untouched, when the product does not fit in 32 bits. This stops
+		//a wrapped byte count from passing checkLength() while the item loops still
+		//run inNumItems times.
+		template<typename TDataType>
+		static inline bool blockByteSize( uint32_t inNumItems, uint32_t& outByteSize )
+		{
+			const uint64_t theTotal = static_cast<uint64_t>( sizeof( TDataType ) ) * inNumItems;
+			if ( theTotal > 0xffffffffu )
+				return false;
+			outByteSize = static_cast<uint32_t>( theTotal );
+			return true;
+		}
+
+		//warning work-around
+		template<typename T>
+		static inline T val(T v) {return v;}
+
+		//Reads one TDataType from inStart, byte-swapping it when DoSwapBytes is true.
+		//On success outData is written, inStart advances by sizeof(TDataType) and
+		//true is returned; otherwise nothing is modified and false is returned.
+		//The bytes are staged in BlockParserDataConverter::mU8, which holds 8 bytes,
+		//so TDataType must not be larger than that.
+		template<bool DoSwapBytes, typename TDataType>
+		static inline bool parse( const uint8_t*& inStart, const uint8_t* inStop, TDataType& outData )
+		{
+			if ( checkLength( inStart, inStop, sizeof( TDataType ) ) )
+			{
+				BlockParserDataConverter theConverter;
+				for ( uint32_t idx =0; idx < sizeof( TDataType ); ++idx )
+					theConverter.mU8[idx] = inStart[idx];
+				if ( val(DoSwapBytes))
+					swapBytes<sizeof(TDataType)>( theConverter.mU8 );
+				outData = theConverter.convert<TDataType>();
+				inStart += sizeof( TDataType );
+				return true;
+			}
+			return false;
+		}
+
+		//Reads inNumItems values into outData. The swapping path converts item by
+		//item; the non-swapping path copies the whole byte range with memmove.
+		//Returns false without touching anything when the byte count overflows
+		//32 bits or the input holds fewer bytes than requested.
+		template<bool DoSwapBytes, typename TDataType>
+		static inline bool parseBlock( const uint8_t*& inStart, const uint8_t* inStop, TDataType* outData, uint32_t inNumItems )
+		{
+			uint32_t desired = 0;
+			if ( blockByteSize<TDataType>( inNumItems, desired ) && checkLength( inStart, inStop, desired ) )
+			{
+				if ( val(DoSwapBytes) )
+				{
+					for ( uint32_t item = 0; item < inNumItems; ++item )
+					{
+						BlockParserDataConverter theConverter;
+						for ( uint32_t idx =0; idx < sizeof( TDataType ); ++idx )
+							theConverter.mU8[idx] = inStart[idx];
+						swapBytes<sizeof(TDataType)>( theConverter.mU8 );
+						outData[item] = theConverter.convert<TDataType>();
+						inStart += sizeof(TDataType);
+					}
+				}
+				else
+				{
+					uint8_t* target = reinterpret_cast<uint8_t*>(outData);
+					memmove( target, inStart, desired );
+					inStart += desired;
+				}
+				return true;
+			}
+			return false;
+		}
+		
+		//In-place byte swapping block
+		//Swaps each item where it lies (or just skips the range when DoSwapBytes is
+		//false) and advances inStart past it. Fails the same way as the copying overload.
+		template<bool DoSwapBytes, typename TDataType>
+		static inline bool parseBlock( uint8_t*& inStart, const uint8_t* inStop, uint32_t inNumItems )
+		{
+			uint32_t desired = 0;
+			if ( blockByteSize<TDataType>( inNumItems, desired ) && checkLength( inStart, inStop, desired ) )
+			{
+				if ( val(DoSwapBytes) )
+				{
+					for ( uint32_t item = 0; item < inNumItems; ++item, inStart += sizeof( TDataType ) )
+						swapBytes<sizeof(TDataType)>( inStart ); //In-place swap.
+				}
+				else
+					inStart += desired;
+				return true;
+			}
+			return false;
+		}
+	};
+
+	//Wraps the begin/end keeping track of them.
+	//Cursor over [mBegin, mEnd): every read forwards to BlockParseFunctions with
+	//this parser's DoSwapBytes setting, and mBegin advances on success.
+	template<bool DoSwapBytes>
+	struct BlockParser
+	{
+		const uint8_t* mBegin;
+		const uint8_t* mEnd;
+		BlockParser( const uint8_t* inBegin=NULL, const uint8_t* inEnd=NULL )
+			: mBegin( inBegin )
+			, mEnd( inEnd )
+		{
+		}
+		//Pointer inequality test, so it is also true if mBegin has moved past mEnd.
+		inline bool hasMoreData() const { return mBegin != mEnd; }
+		inline bool checkLength( uint32_t inLength ) { return BlockParseFunctions::checkLength( mBegin, mEnd, inLength ); }
+		
+		//Reads a single value; returns false when too few bytes remain.
+		template<typename TDataType>
+		inline bool read( TDataType& outDatatype ) { return BlockParseFunctions::parse<DoSwapBytes>( mBegin, mEnd, outDatatype ); }
+
+		//Reads inNumItems values into outDataPtr.
+		template<typename TDataType>
+		inline bool readBlock( TDataType* outDataPtr, uint32_t inNumItems ) { return BlockParseFunctions::parseBlock<DoSwapBytes>( mBegin, mEnd, outDataPtr, inNumItems ); }
+
+		//In-place variant: no output buffer, the items are processed where they lie.
+		//Casts away const on mBegin, so when DoSwapBytes is true the underlying
+		//buffer is written to and must really be mutable.
+		template<typename TDataType>
+		inline bool readBlock( uint32_t inNumItems ) 
+		{ 
+			uint8_t* theTempPtr = const_cast<uint8_t*>(mBegin);
+			bool retval = BlockParseFunctions::parseBlock<DoSwapBytes, TDataType>( theTempPtr, mEnd, inNumItems ); 
+			mBegin = theTempPtr;
+			return retval;
+		}
+
+		//Remaining byte count, narrowed to 32 bits.
+		uint32_t amountLeft() const { return static_cast<uint32_t>( mEnd - mBegin ); }
+	};
+
+	//Reads the data without checking for error conditions
+	//The result of inType.read() is ignored. The return value is value-initialized
+	//first, so a failed read yields TDataType() (zero for arithmetic types)
+	//instead of an indeterminate value.
+	template<typename TDataType, typename TBlockParserType>
+	inline TDataType blockParserRead( TBlockParserType& inType )
+	{
+		TDataType retval = TDataType();
+		inType.read( retval );
+		return retval;
+	}
+}}
+
+#endif // PXPVDSDK_PXPROFILEDATAPARSING_H
diff --git a/PxShared/src/pvd/src/PxProfileEventBuffer.h b/PxShared/src/pvd/src/PxProfileEventBuffer.h
new file mode 100644
index 0000000..213d980
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventBuffer.h
@@ -0,0 +1,270 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEEVENTBUFFER_H
+#define PXPVDSDK_PXPROFILEEVENTBUFFER_H
+
+#include "PxProfileEvents.h"
+#include "PxProfileEventSerialization.h"
+#include "PxProfileEventSystem.h"
+#include "PxProfileDataBuffer.h"
+#include "PxProfileContextProvider.h"
+
+#include "PsArray.h"
+#include "PsTime.h"
+#include "PsCpu.h"
+
+namespace physx { namespace profile {
+
+	/**
+	 *	An event buffer maintains an in-memory buffer of events.  When this buffer is full
+	 *	it sends the buffer to all registered handlers and resets the buffer.
+	 *
+	 *	It is parameterized in four ways.  The first is a context provider that provides
+	 *	both thread id and context id.
+	 *	
+	 *	The second is the mutex (which may be null) and a scoped locking mechanism.  Thus the buffer
+	 *	may be used in a multithreaded context but clients of the buffer don't pay for this if they
+	 *	don't intend to use it this way.
+	 *
+	 *	Finally the buffer may use an event filtering mechanism.  This mechanism needs one function,
+	 *	namely isEventEnabled( uint16_t eventId ), which the buffer calls with the event id only.
+	 *
+	 *	All of these systems can be parameterized at compile time leading to an event buffer
+	 *	that should be as fast as possible given the constraints.
+	 *
+	 *	Buffers may be chained together as this buffer has a handleBufferFlush method that
+	 *	will grab the mutex and add the data to this event buffer.
+	 *
+	 *	Overall, let's look at the PhysX SDK and how all the pieces fit together.
+	 *	The SDK should have a mutex-protected event buffer where actual devs or users of PhysX
+	 *	can register handlers.  This buffer has slow but correct implementations of the
+	 *	context provider interface.
+	 *
+	 *	The SDK object should also have a concrete event filter which was used in the
+	 *	construction of the event buffer and which it exposes through opaque interfaces.
+	 *
+	 *	The SDK should protect its event buffer and its event filter from multithreaded
+	 *	access and thus this provides the safest and slowest way to log events and to
+	 *	enable/disable events.
+	 *
+	 *	Each scene should also have a concrete event filter.  This filter is updated from
+	 *	the SDK event filter (in a mutex protected way) every frame.  Thus scenes can change
+	 *	their event filtering on a frame-by-frame basis.  It means that tasks running
+	 *	under the scene don't need a mutex when accessing the filter.
+	 *
+	 *	Furthermore the scene should have an event buffer that always sets the context id
+	 *	on each event to the scene.  This allows PVD and other systems to correlate events
+	 *	to scenes.  Scenes should provide access only to a relative event sending system
+	 *	that looks up thread id upon each event but uses the scene id.
+	 *
+	 *	The SDK's event buffer should be setup as an EventBufferClient for each scene's
+	 *	event buffer. Thus the SDK should expose an EventBufferClient interface that
+	 *	any client can use.
+	 *
+	 *	For extremely *extremely* performance sensitive areas we should create a specialized
+	 *	per-scene, per-thread event buffer that is set on the task for these occasions.  This buffer
+	 *	uses a trivial event context setup with the scene's context id and the thread id.  It should
+	 *	share the scene's concrete event filter and it should have absolutely no locking.  It should
+	 *	empty into the scene's event buffer which in some cases should empty into the SDK's event buffer
+	 *	which when full will push events all the way out of the system.  The task should *always* flush
+	 *	the event buffer (if it has one) when it is finished; nothing else will work reliably.
+	 *
+	 *	If the per-scene,per-thread event buffer is correctly parameterized and fully defined adding
+	 *	a new event should be an inline operation requiring no mutex grabs in the common case.  I don't
+	 *	believe you can get faster event production than this; the events are as small as possible (all
+	 *	relative events) and they are all produced inline resulting in one 4 byte header and one
+	 *	8 byte timestamp per event.  Reducing the memory pressure in this way reduces the communication
+	 *	overhead, the mutex grabs, basically everything that makes profiling expensive at the cost
+	 *	of a per-scene,per-thread event buffer (which could easily be reduced to a per-thread event
+	 *	buffer).
+	 */
+	template<typename TContextProvider, typename TMutex, typename TScopedLock, typename TEventFilter>
+	class EventBuffer : public DataBuffer<TMutex, TScopedLock>
+	{
+	public:
+		// Aliases for the base class and the template arguments.
+		typedef DataBuffer<TMutex, TScopedLock>			TBaseType;
+		typedef TContextProvider						TContextProviderType;
+		typedef TEventFilter							TEventFilterType;
+		// Types re-exported from the base class.
+		typedef typename TBaseType::TMutexType			TMutexType;
+		typedef typename TBaseType::TScopedLockType		TScopedLockType;
+		typedef typename TBaseType::TU8AllocatorType	TU8AllocatorType;
+		typedef typename TBaseType::TMemoryBufferType	TMemoryBufferType;
+		typedef typename TBaseType::TBufferClientArray	TBufferClientArray;
+
+	private:
+		// Context of the most recently written full event; compared against each new event.
+		EventContextInformation				mEventContextInformation;
+		// Timestamp of the most recently written event, handed to setupHeader of the next one.
+		uint64_t								mLastTimestamp;
+		TContextProvider					mContextProvider;
+		TEventFilterType					mEventFilter;
+
+	public:
+		/**
+		 *	Forwards allocator, flush threshold and mutex to the DataBuffer base and copies
+		 *	the context provider and the event filter.  The cached context information is
+		 *	zero-filled and the last timestamp starts at 0.
+		 */
+		EventBuffer( PxAllocatorCallback* inFoundation,
+					 uint32_t inBufferFullAmount,
+					 const TContextProvider& inProvider,
+					 TMutexType* inBufferMutex,
+					 const TEventFilterType& inEventFilter )
+			: TBaseType( inFoundation, inBufferFullAmount, inBufferMutex, "struct physx::profile::ProfileEvent" )
+			, mLastTimestamp( 0 )
+			, mContextProvider( inProvider )
+			, mEventFilter( inEventFilter )
+		{
+			memset( &mEventContextInformation, 0, sizeof( mEventContextInformation ) );
+		}
+
+		TContextProvider& getContextProvider()
+		{
+			return mContextProvider;
+		}
+
+		// Fully specified start event: takes the buffer lock, consults the filter and,
+		// if the event id is enabled, records a StartEvent.
+		PX_FORCE_INLINE void startEvent(uint16_t inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t inTimestamp)
+		{
+			TScopedLockType scopedLock( TBaseType::mBufferMutex );
+			if ( !mEventFilter.isEventEnabled( inId ) )
+				return;
+
+			StartEvent startEvt;
+			startEvt.init( threadId, contextId, cpuId, threadPriority, inTimestamp );
+			doAddProfileEvent( inId, startEvt );
+		}
+
+		// Start event using the execution context from the context provider and the current counter value.
+		PX_FORCE_INLINE void startEvent(uint16_t inId, uint64_t contextId)
+		{
+			PxProfileEventExecutionContext execCtx( mContextProvider.getExecutionContext() );
+			const uint8_t priority = static_cast<uint8_t>( execCtx.mThreadPriority );
+			startEvent( inId, execCtx.mThreadId, contextId, execCtx.mCpuId, priority, shdfnd::Time::getCurrentCounterValue() );
+		}
+
+		// Start event for an explicit thread id; cpu id and thread priority are passed as 0.
+		PX_FORCE_INLINE void startEvent(uint16_t inId, uint64_t contextId, uint32_t threadId)
+		{
+			startEvent( inId, threadId, contextId, 0, 0, shdfnd::Time::getCurrentCounterValue() );
+		}
+
+		// Fully specified stop event: takes the buffer lock, consults the filter and,
+		// if the event id is enabled, records a StopEvent.
+		PX_FORCE_INLINE void stopEvent(uint16_t inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t inTimestamp)
+		{
+			TScopedLockType scopedLock( TBaseType::mBufferMutex );
+			if ( !mEventFilter.isEventEnabled( inId ) )
+				return;
+
+			StopEvent stopEvt;
+			stopEvt.init( threadId, contextId, cpuId, threadPriority, inTimestamp );
+			doAddProfileEvent( inId, stopEvt );
+		}
+
+		// Stop event using the execution context from the context provider and the current counter value.
+		PX_FORCE_INLINE void stopEvent(uint16_t inId, uint64_t contextId)
+		{
+			PxProfileEventExecutionContext execCtx( mContextProvider.getExecutionContext() );
+			const uint8_t priority = static_cast<uint8_t>( execCtx.mThreadPriority );
+			stopEvent( inId, execCtx.mThreadId, contextId, execCtx.mCpuId, priority, shdfnd::Time::getCurrentCounterValue() );
+		}
+
+		// Stop event for an explicit thread id; cpu id and thread priority are passed as 0.
+		PX_FORCE_INLINE void stopEvent(uint16_t inId, uint64_t contextId, uint32_t threadId)
+		{
+			stopEvent( inId, threadId, contextId, 0, 0, shdfnd::Time::getCurrentCounterValue() );
+		}
+
+		// Value event tagged with the thread id reported by the context provider.
+		inline void eventValue( uint16_t inId, uint64_t contextId, int64_t inValue )
+		{
+			const uint32_t providerThreadId = mContextProvider.getThreadId();
+			eventValue( inId, providerThreadId, contextId, inValue );
+		}
+
+		// Value event for an explicit thread id.  Written under the buffer lock;
+		// the event filter is not consulted on this path.
+		inline void eventValue( uint16_t inId, uint32_t threadId, uint64_t contextId, int64_t inValue )
+		{
+			TScopedLockType scopedLock( TBaseType::mBufferMutex );
+
+			EventValue valueEvt;
+			valueEvt.init( inValue, contextId, threadId );
+
+			EventHeader header( static_cast<uint8_t>( getEventType<EventValue>() ), inId );
+			valueEvt.setupHeader( header );
+			sendEvent( header, valueEvt );
+		}
+
+		// Delegates to the base class flush.
+		void flushProfileEvents()
+		{
+			TBaseType::flushEvents();
+		}
+
+		// Destroys this object through the profile delete macro and the wrapper's user foundation.
+		void release()
+		{
+			PX_PROFILE_DELETE( TBaseType::mWrapper.mUserFoundation, this );
+		}
+
+	protected:
+		// Resets the cached context and timestamp so that event compression
+		// starts over.  Only called while the buffer mutex is held.
+		void clearCachedData()
+		{
+			mLastTimestamp = 0;
+			mEventContextInformation.setToDefault();
+		}
+
+		// Writes a start/stop style event.  When its context differs from the cached
+		// one, the cache is refreshed and the full event is written; otherwise only
+		// the relative form of the event is written.
+		template<typename TProfileEventType>
+		PX_FORCE_INLINE void doAddProfileEvent(uint16_t eventId, const TProfileEventType& inType)
+		{
+			TScopedLockType scopedLock( TBaseType::mBufferMutex );
+			if ( !( mEventContextInformation == inType.mContextInformation ) )
+			{
+				mEventContextInformation = inType.mContextInformation;
+				doAddEvent( static_cast<uint8_t>( getEventType<TProfileEventType>() ), eventId, inType );
+				return;
+			}
+			doAddEvent( static_cast<uint8_t>( inType.getRelativeEventType() ), eventId, inType.getRelativeEvent() );
+		}
+
+		// Builds the header for one event and sends it.  The event's timestamp is read
+		// before setupHeader is called and becomes the new mLastTimestamp afterwards.
+		template<typename TDataType>
+		PX_FORCE_INLINE void doAddEvent(uint8_t inEventType, uint16_t eventId, const TDataType& inType)
+		{
+			EventHeader header( inEventType, eventId );
+			const uint64_t eventTimestamp = inType.getTimestamp();
+
+			// setupHeader is called on a non-const view of the event.
+			TDataType& mutableEvent( const_cast<TDataType&>( inType ) );
+			mutableEvent.setupHeader( header, mLastTimestamp );
+			mLastTimestamp = eventTimestamp;
+
+			sendEvent( header, mutableEvent );
+		}
+
+		// Serializes header and payload into the base class serializer, asserts that the
+		// written size matches the announced size and flushes once the data array has
+		// reached the configured threshold.
+		template<typename TDataType>
+		PX_FORCE_INLINE void sendEvent( EventHeader& inHeader, TDataType& inType )
+		{
+			uint32_t expectedSize = sizeof(inHeader) + inType.getEventSize(inHeader);
+			PX_UNUSED(expectedSize);
+
+			uint32_t writtenSize = inHeader.streamify( TBaseType::mSerializer );
+			writtenSize += inType.streamify( TBaseType::mSerializer, inHeader );
+
+			PX_ASSERT(writtenSize == expectedSize);
+
+			const bool bufferFull = TBaseType::mDataArray.size() >= TBaseType::mBufferFullAmount;
+			if ( bufferFull )
+				flushProfileEvents();
+		}
+
+	};
+}}
+#endif // PXPVDSDK_PXPROFILEEVENTBUFFER_H
diff --git a/PxShared/src/pvd/src/PxProfileEventBufferAtomic.h b/PxShared/src/pvd/src/PxProfileEventBufferAtomic.h
new file mode 100644
index 0000000..f87839f
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventBufferAtomic.h
@@ -0,0 +1,320 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEEVENTBUFFERATOMIC_H
+#define PXPVDSDK_PXPROFILEEVENTBUFFERATOMIC_H
+
+#include "PxProfileEvents.h"
+#include "PxProfileEventSerialization.h"
+#include "PxProfileEventSystem.h"
+#include "PxProfileDataBuffer.h"
+#include "PxProfileContextProvider.h"
+
+#include "PsArray.h"
+#include "PsAlloca.h"
+#include "PsTime.h"
+#include "PsCpu.h"
+#include "PsAtomic.h"
+#include "PsAllocator.h"
+
+
+namespace physx {
+	namespace profile {
+
+		// NOTE(review): this constant is not referenced anywhere in the visible part of
+		// this header; confirm whether it is still needed and what unit it is in.
+		static const uint32_t LOCAL_BUFFER_SIZE = 512;
+
+		/**
+		*	An event buffer maintains an in-memory buffer of events.  When this buffer is full
+		*	it sends the buffer to all registered handlers and resets the buffer.
+		*
+		*	It is parameterized in four ways.  The first is a context provider that provides
+		*	both thread id and context id.
+		*
+		*	The second is the mutex (which may be null) and a scoped locking mechanism.  Thus the buffer
+		*	may be used in a multithreaded context but clients of the buffer don't pay for this if they
+		*	don't intend to use it this way.
+		*
+		*	Finally the buffer may use an event filtering mechanism.  This mechanism needs one function,
+		*	namely isEventEnabled( uint16_t eventId ), which the buffer calls with the event id only.
+		*
+		*	All of these systems can be parameterized at compile time leading to an event buffer
+		*	that should be as fast as possible given the constraints.
+		*
+		*	Buffers may be chained together as this buffer has a handleBufferFlush method that
+		*	will grab the mutex and add the data to this event buffer.
+		*
+		*	Overall, let's look at the PhysX SDK and how all the pieces fit together.
+		*	The SDK should have a mutex-protected event buffer where actual devs or users of PhysX
+		*	can register handlers.  This buffer has slow but correct implementations of the
+		*	context provider interface.
+		*
+		*	The SDK object should also have a concrete event filter which was used in the
+		*	construction of the event buffer and which it exposes through opaque interfaces.
+		*
+		*	The SDK should protect its event buffer and its event filter from multithreaded
+		*	access and thus this provides the safest and slowest way to log events and to
+		*	enable/disable events.
+		*
+		*	Each scene should also have a concrete event filter.  This filter is updated from
+		*	the SDK event filter (in a mutex protected way) every frame.  Thus scenes can change
+		*	their event filtering on a frame-by-frame basis.  It means that tasks running
+		*	under the scene don't need a mutex when accessing the filter.
+		*
+		*	Furthermore the scene should have an event buffer that always sets the context id
+		*	on each event to the scene.  This allows PVD and other systems to correlate events
+		*	to scenes.  Scenes should provide access only to a relative event sending system
+		*	that looks up thread id upon each event but uses the scene id.
+		*
+		*	The SDK's event buffer should be setup as an EventBufferClient for each scene's
+		*	event buffer. Thus the SDK should expose an EventBufferClient interface that
+		*	any client can use.
+		*
+		*	For extremely *extremely* performance sensitive areas we should create a specialized
+		*	per-scene, per-thread event buffer that is set on the task for these occasions.  This buffer
+		*	uses a trivial event context setup with the scene's context id and the thread id.  It should
+		*	share the scene's concrete event filter and it should have absolutely no locking.  It should
+		*	empty into the scene's event buffer which in some cases should empty into the SDK's event buffer
+		*	which when full will push events all the way out of the system.  The task should *always* flush
+		*	the event buffer (if it has one) when it is finished; nothing else will work reliably.
+		*
+		*	If the per-scene,per-thread event buffer is correctly parameterized and fully defined adding
+		*	a new event should be an inline operation requiring no mutex grabs in the common case.  I don't
+		*	believe you can get faster event production than this; the events are as small as possible (all
+		*	relative events) and they are all produced inline resulting in one 4 byte header and one
+		*	8 byte timestamp per event.  Reducing the memory pressure in this way reduces the communication
+		*	overhead, the mutex grabs, basically everything that makes profiling expensive at the cost
+		*	of a per-scene,per-thread event buffer (which could easily be reduced to a per-thread event
+		*	buffer).
+		*/
+		template<typename TContextProvider,
+			typename TMutex,
+			typename TScopedLock,			
+			typename TEventFilter>
+		class EventBufferAtomic : public DataBuffer < TMutex, TScopedLock >
+		{
+		public:
+			typedef DataBuffer<TMutex, TScopedLock> TBaseType;
+			typedef TContextProvider	TContextProviderType;
+			typedef TEventFilter		TEventFilterType;
+			typedef typename TBaseType::TMutexType TMutexType;
+			typedef typename TBaseType::TScopedLockType TScopedLockType;
+			typedef typename TBaseType::TU8AllocatorType TU8AllocatorType;
+			typedef typename TBaseType::TMemoryBufferType TMemoryBufferType;
+			typedef typename TBaseType::TBufferClientArray TBufferClientArray;
+
+		private:						
+			TContextProvider					mContextProvider;
+			TEventFilterType					mEventFilter;
+			// Running total of bytes reserved by writers; bumped with atomicAdd before each write.
+			volatile int32_t					mReserved;
+			// Running total of bytes whose write has completed; bumped with atomicAdd after each write.
+			volatile int32_t					mWritten;
+			
+		public:
+			/**
+			*	Forwards allocator, flush threshold and mutex to the DataBuffer base, copies the
+			*	context provider and event filter, and starts both byte counters at 0.
+			*/
+			EventBufferAtomic(PxAllocatorCallback* inFoundation
+				, uint32_t inBufferFullAmount
+				, const TContextProvider& inProvider
+				, TMutexType* inBufferMutex
+				, const TEventFilterType& inEventFilter)
+				: TBaseType(inFoundation, inBufferFullAmount, inBufferMutex, "struct physx::profile::ProfileEvent")				
+				, mContextProvider(inProvider)
+				, mEventFilter(inEventFilter)
+				, mReserved(0)
+				, mWritten(0)		 
+			{				
+			}
+
+			TContextProvider& getContextProvider() { return mContextProvider; }
+
+			// Fully specified start event.  The filter is consulted without taking the buffer mutex.
+			PX_FORCE_INLINE void startEvent(uint16_t inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t inTimestamp)
+			{
+				if (mEventFilter.isEventEnabled(inId))
+				{
+					StartEvent theEvent;
+					theEvent.init(threadId, contextId, cpuId, threadPriority, inTimestamp);
+					doAddProfileEvent(inId, theEvent);
+				}
+			}
+
+			// Start event using the execution context from the context provider and the current counter value.
+			PX_FORCE_INLINE void startEvent(uint16_t inId, uint64_t contextId)
+			{
+				PxProfileEventExecutionContext ctx(mContextProvider.getExecutionContext());
+				startEvent(inId, ctx.mThreadId, contextId, ctx.mCpuId, static_cast<uint8_t>(ctx.mThreadPriority), shdfnd::Time::getCurrentCounterValue());
+			}
+
+			// Start event for an explicit thread id; cpu id and thread priority are passed as 0.
+			PX_FORCE_INLINE void startEvent(uint16_t inId, uint64_t contextId, uint32_t threadId)
+			{
+				startEvent(inId, threadId, contextId, 0, 0, shdfnd::Time::getCurrentCounterValue());
+			}
+
+			// Fully specified stop event.  The filter is consulted without taking the buffer mutex.
+			PX_FORCE_INLINE void stopEvent(uint16_t inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t inTimestamp)
+			{
+				if (mEventFilter.isEventEnabled(inId))
+				{
+					StopEvent theEvent;
+					theEvent.init(threadId, contextId, cpuId, threadPriority, inTimestamp);
+					doAddProfileEvent(inId, theEvent);
+				}
+			}
+
+			// Stop event using the execution context from the context provider and the current counter value.
+			PX_FORCE_INLINE void stopEvent(uint16_t inId, uint64_t contextId)
+			{
+				PxProfileEventExecutionContext ctx(mContextProvider.getExecutionContext());
+				stopEvent(inId, ctx.mThreadId, contextId, ctx.mCpuId, static_cast<uint8_t>(ctx.mThreadPriority), shdfnd::Time::getCurrentCounterValue());
+			}
+
+			// Stop event for an explicit thread id; cpu id and thread priority are passed as 0.
+			PX_FORCE_INLINE void stopEvent(uint16_t inId, uint64_t contextId, uint32_t threadId)
+			{
+				stopEvent(inId, threadId, contextId, 0, 0, shdfnd::Time::getCurrentCounterValue());
+			}
+
+			// Value event tagged with the thread id reported by the context provider.
+			inline void eventValue(uint16_t inId, uint64_t contextId, int64_t inValue)
+			{
+				eventValue(inId, mContextProvider.getThreadId(), contextId, inValue);
+			}
+
+			// Value event for an explicit thread id.  The event filter is not consulted on this path.
+			inline void eventValue(uint16_t inId, uint32_t threadId, uint64_t contextId, int64_t inValue)
+			{
+				EventValue theEvent;
+				theEvent.init(inValue, contextId, threadId);
+				EventHeader theHeader(static_cast<uint8_t>(getEventType<EventValue>()), inId);
+				//set the header relative timestamp;
+				EventValue& theType(theEvent);
+				theType.setupHeader(theHeader);
+
+				// Reserve the bytes for this event, then let sendEvent write into the reserved range.
+				int32_t sizeToWrite = int32_t(sizeof(theHeader) + theType.getEventSize(theHeader));
+				int32_t reserved = shdfnd::atomicAdd(&mReserved, sizeToWrite);
+				sendEvent(theHeader, theType, reserved, sizeToWrite);
+			}
+
+			/**
+			*	Flushes the buffer under the buffer mutex.
+			*
+			*	\param reserved Byte count that must have been written before flushing; with the
+			*	default of -1 the value previously held by mReserved is used instead.
+			*/
+			void flushProfileEvents(int32_t reserved = -1)
+			{
+				TScopedLockType lock(TBaseType::mBufferMutex);
+
+				// set the buffer full to lock additional writes
+				// (sendEvent spins while its reservation starts at or beyond mBufferFullAmount)
+				int32_t reservedOld = shdfnd::atomicExchange(&mReserved, int32_t(TBaseType::mBufferFullAmount + 1));
+				if (reserved == -1)
+					reserved = reservedOld;
+
+				// spin till we have written all the data				
+				while (reserved > mWritten)
+				{
+				}
+
+				// check if we have written all data
+				PX_ASSERT(reserved == mWritten);
+
+				// set the correct size of the serialization data buffer
+				TBaseType::mSerializer.mArray->setEnd(TBaseType::mSerializer.mArray->begin() + mWritten);
+
+				// flush events
+				TBaseType::flushEvents();
+
+				// reset written/reserved so that writers can start filling the buffer again
+				mWritten = 0;
+				mReserved = 0;
+			}
+
+			// Destroys this object through the profile delete macro and the wrapper's user foundation.
+			void release()
+			{
+				PX_PROFILE_DELETE(TBaseType::mWrapper.mUserFoundation, this);
+			}
+		protected:
+			//Clears the cache meaning event compression
+			//starts over again.
+			//only called when the buffer mutex is held
+			//Empty in this class: none of its members caches per-event state.
+			void clearCachedData()
+			{								
+			}
+
+			// Always writes the full event type; no relative event form is used on this path.
+			template<typename TProfileEventType>
+			PX_FORCE_INLINE void doAddProfileEvent(uint16_t eventId, const TProfileEventType& inType)
+			{				
+				doAddEvent(static_cast<uint8_t>(getEventType<TProfileEventType>()), eventId, inType);
+			}
+
+			// Builds the header (setupHeader is given 0 as the previous timestamp), reserves
+			// the bytes for the event with an atomic add and hands over to sendEvent.
+			template<typename TDataType>
+			PX_FORCE_INLINE void doAddEvent(uint8_t inEventType, uint16_t eventId, const TDataType& inType)
+			{
+				EventHeader theHeader(inEventType, eventId);
+				TDataType& theType(const_cast<TDataType&>(inType));				
+				theType.setupHeader(theHeader, 0);
+
+				const int32_t sizeToWrite = int32_t(sizeof(theHeader) + theType.getEventSize(theHeader));
+
+				int32_t reserved = shdfnd::atomicAdd(&mReserved, sizeToWrite);
+				sendEvent(theHeader, theType, reserved, sizeToWrite);				
+			}
+
+			/**
+			*	Writes one event into the range previously reserved for it.
+			*
+			*	\param inHeader Event header to serialize.
+			*	\param inType Event payload to serialize.
+			*	\param reserved Value returned by the atomic add on mReserved; this code treats it as
+			*	the end offset of the reserved range (the write index is reserved - sizeToWrite).
+			*	\param sizeToWrite Number of bytes reserved for header plus payload.
+			*/
+			template<typename TDataType>
+			PX_FORCE_INLINE void sendEvent(EventHeader& inHeader, TDataType& inType, int32_t reserved, int32_t sizeToWrite)
+			{
+				// if we don't fit to the buffer, we wait till it is flushed
+				if (reserved - sizeToWrite >= int32_t(TBaseType::mBufferFullAmount))
+				{
+					while (reserved - sizeToWrite >= int32_t(TBaseType::mBufferFullAmount))
+					{
+						// I32 overflow 
+						// once mReserved is back below the full mark (flushProfileEvents resets it to 0),
+						// reserve space again; the loop condition re-checks the new reservation
+						if (mReserved < int32_t(TBaseType::mBufferFullAmount))
+						{							
+							reserved = shdfnd::atomicAdd(&mReserved, sizeToWrite);
+						}
+					}
+				}
+
+				int32_t writeIndex = reserved - sizeToWrite;
+				uint32_t writtenSize = 0;
+
+				PX_ASSERT(writeIndex >= 0);
+				
+				// serialize into a temporary buffer first, then copy it to the reserved offset of the shared array
+				PX_ALLOCA(tempBuffer, uint8_t, sizeToWrite);
+				TempMemoryBuffer memoryBuffer(tempBuffer, sizeToWrite);
+				EventSerializer<TempMemoryBuffer> eventSerializer(&memoryBuffer);
+
+				writtenSize = inHeader.streamify(eventSerializer);
+				writtenSize += inType.streamify(eventSerializer, inHeader);
+
+				TBaseType::mSerializer.mArray->reserve(writeIndex + writtenSize);
+				TBaseType::mSerializer.mArray->write(&tempBuffer[0], writtenSize, writeIndex);
+				
+				PX_ASSERT(writtenSize == uint32_t(sizeToWrite));					
+				// publish the completed write; flushProfileEvents spins on mWritten
+				shdfnd::atomicAdd(&mWritten, sizeToWrite);
+
+				if (reserved >= int32_t(TBaseType::mBufferFullAmount))
+				{	
+					TScopedLockType lock(TBaseType::mBufferMutex);
+					// flush the buffer if it is full and has not already been flushed in the meantime
+					// NOTE(review): flushProfileEvents locks the same mutex again - presumably the mutex is recursive; confirm
+					if(mReserved >= reserved)
+						flushProfileEvents(reserved);
+				}
+			}
+
+		};
+	}
+}
+#endif // PXPVDSDK_PXPROFILEEVENTBUFFERATOMIC_H
diff --git a/PxShared/src/pvd/src/PxProfileEventBufferClient.h b/PxShared/src/pvd/src/PxProfileEventBufferClient.h
new file mode 100644
index 0000000..d8a1ff2
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventBufferClient.h
@@ -0,0 +1,81 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTBUFFERCLIENT_H
+#define PXPVDSDK_PXPROFILEEVENTBUFFERCLIENT_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventNames.h"
+
+namespace physx { namespace profile {
+	
+	/**
+	\brief Client handles the data when an event buffer flushes.  This data
+	can be parsed (PxProfileEventHandler.h) as a binary set of events.
+
+	Pure interface: both callbacks are pure virtual and must be implemented by the concrete client.
+	*/
+	class PxProfileEventBufferClient
+	{
+	protected:
+		// Protected destructor: code outside the class hierarchy cannot delete a client through this interface.
+		virtual ~PxProfileEventBufferClient(){}
+	public:
+		/**
+		\brief Callback when the event buffer is full. This data is serialized profile events
+		and can be read back using: PxProfileEventHandler::parseEventBuffer.
+
+		\param inData Provided buffer data.
+		\param inLength Data length.
+
+		@see PxProfileEventHandler::parseEventBuffer.
+		 */
+		virtual void handleBufferFlush( const uint8_t* inData, uint32_t inLength ) = 0;
+
+		/**
+		\brief Happens if something removes all the clients from the manager.
+		*/
+		virtual void handleClientRemoved() = 0; 
+	};
+
+	/**
+	\brief Client that is additionally notified when a new profile event is added.
+
+	Extends PxProfileEventBufferClient, so implementers must also provide
+	handleBufferFlush and handleClientRemoved.
+	*/
+	class PxProfileZoneClient : public PxProfileEventBufferClient
+	{
+	protected:
+		// Protected destructor: code outside the class hierarchy cannot delete a client through this interface.
+		virtual ~PxProfileZoneClient(){}
+	public:
+		/**
+		\brief Callback when new profile event is added.
+
+		\param inName Added profile event name.
+		*/
+		virtual void handleEventAdded( const PxProfileEventName& inName ) = 0;
+	};
+
+} }
+
+
+#endif // PXPVDSDK_PXPROFILEEVENTBUFFERCLIENT_H
diff --git a/PxShared/src/pvd/src/PxProfileEventBufferClientManager.h b/PxShared/src/pvd/src/PxProfileEventBufferClientManager.h
new file mode 100644
index 0000000..1402183
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventBufferClientManager.h
@@ -0,0 +1,94 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTBUFFERCLIENTMANAGER_H
+#define PXPVDSDK_PXPROFILEEVENTBUFFERCLIENTMANAGER_H
+
+#include "PxProfileEventBufferClient.h"
+
+namespace physx { namespace profile {
+	
+	/**
+	\brief	Manager keeps a collection of PxProfileEventBufferClient clients.
+
+	Pure interface: all three operations are pure virtual.
+
+	@see PxProfileEventBufferClient
+	*/
+	class PxProfileEventBufferClientManager
+	{
+	protected:
+		// Protected destructor: code outside the class hierarchy cannot delete a manager through this interface.
+		virtual ~PxProfileEventBufferClientManager(){}
+	public:
+		/**
+		\brief Adds new client.
+		\param inClient Client to add.
+		*/
+		virtual void addClient( PxProfileEventBufferClient& inClient ) = 0;
+
+		/**
+		\brief Removes a client.
+		\param inClient Client to remove.
+		*/
+		virtual void removeClient( PxProfileEventBufferClient& inClient ) = 0;
+
+		/**
+		\brief Check if manager has clients.
+		\return True if manager has added clients.
+		*/
+		virtual bool hasClients() const = 0;
+	};
+
+	/**
+	\brief	Manager keeps a collection of PxProfileZoneClient clients.
+
+	Pure interface: all three operations are pure virtual.  This is a stand-alone
+	interface; it does not derive from PxProfileEventBufferClientManager.
+
+	@see PxProfileZoneClient
+	*/
+	class PxProfileZoneClientManager
+	{
+	protected:
+		// Protected destructor: code outside the class hierarchy cannot delete a manager through this interface.
+		virtual ~PxProfileZoneClientManager(){}
+	public:
+		/**
+		\brief Adds new client.
+		\param inClient Client to add.
+		*/
+		virtual void addClient( PxProfileZoneClient& inClient ) = 0;
+
+		/**
+		\brief Removes a client.
+		\param inClient Client to remove.
+		*/
+		virtual void removeClient( PxProfileZoneClient& inClient ) = 0;
+
+		/**
+		\brief Check if manager has clients.
+		\return True if manager has added clients.
+		*/
+		virtual bool hasClients() const = 0;
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTBUFFERCLIENTMANAGER_H
diff --git a/PxShared/src/pvd/src/PxProfileEventFilter.h b/PxShared/src/pvd/src/PxProfileEventFilter.h
new file mode 100644
index 0000000..0f38d65
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventFilter.h
@@ -0,0 +1,93 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTFILTER_H
+#define PXPVDSDK_PXPROFILEEVENTFILTER_H
+
+#include "foundation/PxAssert.h"
+#include "PxProfileBase.h"
+#include "PxProfileEventId.h"
+
+namespace physx { namespace profile {
+
+	/**
+	\brief Called upon every event to give a quick-out before adding the event to the event buffer.
+
+	\note Documented as not thread safe even though it can be called from different threads at
+	the same time. NOTE(review): confirm who is responsible for synchronization.
+	*/
+	class PxProfileEventFilter
+	{
+	protected:
+		virtual ~PxProfileEventFilter(){}
+	public:
+		/**
+		\brief Disabled events will not go into the event buffer and will not be 
+		transmitted to clients.
+		\param inId Profile event id.
+		\param isEnabled True if event should be enabled.
+		*/
+		virtual void setEventEnabled( const PxProfileEventId& inId, bool isEnabled ) = 0;
+
+		/**
+		\brief Returns the current state of the profile event identified by inId.
+		\return True if profile event is enabled.
+		*/
+		virtual bool isEventEnabled( const PxProfileEventId& inId ) const = 0;
+	};
+
+	/**
+	\brief Thin adapter that relays every filter call to a wrapped filter object.
+	*/
+	template<typename TFilterType>
+	struct PxProfileEventFilterForward
+	{
+		//! Target of all forwarded calls; this struct never deletes it.
+		TFilterType* filter;
+
+		/**
+		\brief Wraps the given filter.
+		\param inFilter Filter that receives the forwarded calls; it is dereferenced on every call.
+		*/
+		PxProfileEventFilterForward( TFilterType* inFilter )
+			: filter( inFilter )
+		{
+		}
+
+		//! Forwards to TFilterType::setEventEnabled with the same id and enabled flag.
+		void setEventEnabled( const PxProfileEventId& inId, bool isEnabled )
+		{
+			filter->setEventEnabled( inId, isEnabled );
+		}
+
+		//! Forwards to TFilterType::isEventEnabled and returns its result.
+		bool isEventEnabled( const PxProfileEventId& inId ) const { return filter->isEventEnabled( inId ); }
+	};
+
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTFILTER_H
diff --git a/PxShared/src/pvd/src/PxProfileEventHandler.h b/PxShared/src/pvd/src/PxProfileEventHandler.h
new file mode 100644
index 0000000..0e0f0f9
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventHandler.h
@@ -0,0 +1,99 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTHANDLER_H
+#define PXPVDSDK_PXPROFILEEVENTHANDLER_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventId.h"
+
+namespace physx { namespace profile {
+
+	/**
+	\brief A client of the event system can expect to find these events in the event buffer.
+	*/
+	class PxProfileEventHandler
+	{
+	protected:
+		virtual ~PxProfileEventHandler(){}
+	public:
+		/**
+		\brief Callback for a profile start event.
+			
+		\param[in] inId Profile event id.
+		\param[in] threadId Thread id.
+		\param[in] contextId Context id.
+		\param[in] cpuId CPU id.
+		\param[in] threadPriority Thread priority.
+		\param[in] timestamp Timestamp in cycles.		
+		 */
+		virtual void onStartEvent( const PxProfileEventId& inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t timestamp ) = 0;
+
+		/**
+		\brief Callback for a profile stop event.
+			
+		\param[in] inId Profile event id.
+		\param[in] threadId Thread id.
+		\param[in] contextId Context id.
+		\param[in] cpuId CPU id.
+		\param[in] threadPriority Thread priority.
+		\param[in] timestamp Timestamp in cycles.		
+		 */
+		virtual void onStopEvent( const PxProfileEventId& inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t timestamp ) = 0;
+
+		/**
+		\brief Callback for a profile value event.
+			
+		\param[in] inId Profile event id.
+		\param[in] threadId Thread id.
+		\param[in] contextId Context id.
+		\param[in] inValue Value.
+		 */
+		virtual void onEventValue( const PxProfileEventId& inId, uint32_t threadId, uint64_t contextId, int64_t inValue ) = 0;
+
+		/**
+		\brief Parse the flushed profile buffer which contains the profile events.
+			
+		\param[in] inBuffer The profile buffer with profile events.
+		\param[in] inBufferSize Buffer size.
+		\param[in] inHandler The profile event callback to receive the parsed events.
+		\param[in] inSwapBytes True to parse with the byte-swapping deserializer, false to parse the data as is.
+		 */
+		static void parseEventBuffer( const uint8_t* inBuffer, uint32_t inBufferSize, PxProfileEventHandler& inHandler, bool inSwapBytes );
+
+		/**
+		\brief Translates event duration in timestamp (cycles) into nanoseconds.
+			
+		\param[in] duration Timestamp duration of the event.
+
+		\return Event duration in nanoseconds; computed in units of 10 ns, so always a multiple of 10.
+		 */
+		static uint64_t durationToNanoseconds(uint64_t duration);
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTHANDLER_H
diff --git a/PxShared/src/pvd/src/PxProfileEventId.h b/PxShared/src/pvd/src/PxProfileEventId.h
new file mode 100644
index 0000000..dd98cd5
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventId.h
@@ -0,0 +1,80 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTID_H
+#define PXPVDSDK_PXPROFILEEVENTID_H
+
+#include "PxProfileBase.h"
+
+namespace physx { namespace profile {
+	/**
+	\brief An event id structure. Optionally records whether the event
+	was enabled at compile time.
+	*/
+	struct PxProfileEventId
+	{
+		uint16_t		eventId;				//!< Numeric id of the event.
+		mutable bool	compileTimeEnabled;		//!< Compile-time enabled flag; mutable, so writable through const ids.
+
+		/**
+		\brief Builds an id from a raw value and a compile-time enabled flag.
+		\param inId Profile event id (defaults to 0).
+		\param inCompileTimeEnabled Compile time enabled (defaults to true).
+		*/
+		PxProfileEventId( uint16_t inId = 0, bool inCompileTimeEnabled = true )
+			: eventId( inId )
+			, compileTimeEnabled( inCompileTimeEnabled ) {}
+
+		//! Implicit conversion to the raw numeric id.
+		operator uint16_t () const
+		{
+			return eventId;
+		}
+
+		//! Ids compare equal when their numeric ids match; compileTimeEnabled is not compared.
+		bool operator==( const PxProfileEventId& inOther ) const { return inOther.eventId == eventId; }
+	};
+
+	/**
+	\brief Event id whose compile-time enabled flag comes from a template argument.
+
+	\tparam TEnabled Value passed to the base class as its compile-time enabled flag.
+	*/
+	template<bool TEnabled>
+	struct PxProfileCompileTimeFilteredEventId : public PxProfileEventId
+	{
+		/**
+		\brief Constructor.
+		\param inId Profile event id.
+		*/
+		PxProfileCompileTimeFilteredEventId( uint16_t inId = 0 )
+			: PxProfileEventId( inId, TEnabled ) {}
+	};
+		
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTID_H
diff --git a/PxShared/src/pvd/src/PxProfileEventImpl.cpp b/PxShared/src/pvd/src/PxProfileEventImpl.cpp
new file mode 100644
index 0000000..a519f92
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventImpl.cpp
@@ -0,0 +1,230 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "foundation/PxErrorCallback.h"
+#include "foundation/PxAllocatorCallback.h"
+
+#include "PxProfileEvents.h"
+#include "PxProfileEventSerialization.h"
+#include "PxProfileEventBuffer.h"
+#include "PxProfileZoneImpl.h"
+#include "PxProfileZoneManagerImpl.h"
+#include "PxProfileEventParser.h"
+#include "PxProfileEventHandler.h"
+#include "PxProfileScopedMutexLock.h"
+#include "PxProfileEventFilter.h"
+#include "PxProfileContextProvider.h"
+#include "PxProfileEventMutex.h"
+#include "PxProfileMemoryEventTypes.h"
+#include "PxProfileMemoryEventRecorder.h"
+#include "PxProfileMemoryEventBuffer.h"
+#include "PxProfileMemoryEventParser.h"
+#include "PxProfileContextProviderImpl.h"
+
+#include "PsUserAllocated.h"
+#include "PsTime.h"
+
+#include <stdio.h>
+
+namespace physx { namespace profile {
+
+	// Converts a cycle count to nanoseconds: the boot counter frequency yields tens of nanoseconds, scaled by 10 here.
+	uint64_t PxProfileEventHandler::durationToNanoseconds(uint64_t duration)
+	{
+		return 10 * shdfnd::Time::getBootCounterFrequency().toTensOfNanos(duration);
+	}
+
+	void PxProfileEventHandler::parseEventBuffer( const uint8_t* inBuffer, uint32_t inBufferSize, PxProfileEventHandler& inHandler, bool inSwapBytes )
+	{
+		if ( inSwapBytes ) // byte swapping is a template argument, so the runtime flag selects the instantiation
+			parseEventData<true>( inBuffer, inBufferSize, &inHandler );
+		else
+			parseEventData<false>( inBuffer, inBufferSize, &inHandler );
+	}
+
+	// Collects parsed events in a fixed array and passes them to the bulk handler in batches of TNumEvents.
+	template<uint32_t TNumEvents>
+	struct ProfileBulkEventHandlerBuffer
+	{
+		Event mEvents[TNumEvents];
+		uint32_t mEventCount;
+		PxProfileBulkEventHandler* mHandler;
+		ProfileBulkEventHandlerBuffer( PxProfileBulkEventHandler* inHdl )
+			: mEventCount( 0 ), mHandler( inHdl ) {}
+		void flush()
+		{
+			if ( mEventCount != 0 )
+				mHandler->handleEvents( mEvents, mEventCount );
+			mEventCount = 0;
+		}
+		// Stores one event and flushes as soon as the array is full.
+		void onEvent( const Event& inEvent )
+		{
+			mEvents[mEventCount++] = inEvent;
+			if ( mEventCount == TNumEvents )
+				flush();
+		}
+		// Shared body of onStartEvent/onStopEvent: both are stored with a StartEvent payload and differ only in inType.
+		void onEvent( const PxProfileEventId& inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t timestamp, EventTypes::Enum inType )
+		{
+			StartEvent payload;
+			payload.init( threadId, contextId, cpuId, threadPriority, timestamp );
+			EventHeader header( static_cast<uint8_t>( inType ), inId.eventId );
+			onEvent( Event( header, payload ) );
+		}
+		void onStartEvent( const PxProfileEventId& inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t timestamp )
+		{
+			onEvent( inId, threadId, contextId, cpuId, threadPriority, timestamp, EventTypes::StartEvent );
+		}
+		void onStopEvent( const PxProfileEventId& inId, uint32_t threadId, uint64_t contextId, uint8_t cpuId, uint8_t threadPriority, uint64_t timestamp )
+		{
+			onEvent( inId, threadId, contextId, cpuId, threadPriority, timestamp, EventTypes::StopEvent );
+		}
+		void onEventValue( const PxProfileEventId& inId, uint32_t threadId, uint64_t contextId, int64_t value )
+		{
+			EventValue payload;
+			payload.init( value, contextId, threadId );
+			onEvent( Event( inId.eventId, payload ) );
+		}
+	};
+
+
+	void PxProfileBulkEventHandler::parseEventBuffer( const uint8_t* inBuffer, uint32_t inBufferSize, PxProfileBulkEventHandler& inHandler, bool inSwapBytes )
+	{
+		ProfileBulkEventHandlerBuffer<256> batcher( &inHandler ); // at most 256 events per handleEvents call
+		if ( !inSwapBytes )
+			parseEventData<false>( inBuffer, inBufferSize, &batcher );
+		else
+			parseEventData<true>( inBuffer, inBufferSize, &batcher );
+		batcher.flush(); // deliver the final, partially filled batch
+	}
+
+	struct PxProfileNameProviderImpl
+	{
+		PxProfileNameProvider* mImpl; // provider that every getProfileNames() call is forwarded to
+		PxProfileNameProviderImpl( PxProfileNameProvider* inImpl ) : mImpl( inImpl ) {}
+		PxProfileNames getProfileNames() const
+		{
+			return mImpl->getProfileNames();
+		}
+	};
+
+	
+	struct PxProfileNameProviderForward
+	{
+		PxProfileNames mNames; // copy of the names handed to the constructor; returned by value
+		PxProfileNameProviderForward( PxProfileNames inNames ) : mNames( inNames ) {}
+		PxProfileNames getProfileNames() const
+		{
+			return mNames;
+		}
+	};
+
+	// Creates a ZoneImpl through PX_PROFILE_NEW with inAllocator; the names are wrapped in a PxProfileNameProviderForward.
+	PX_FOUNDATION_API PxProfileZone& PxProfileZone::createProfileZone( PxAllocatorCallback* inAllocator, const char* inSDKName, PxProfileNames inNames, uint32_t inEventBufferByteSize )
+	{
+		typedef ZoneImpl<PxProfileNameProviderForward> TSDKType;
+		return *PX_PROFILE_NEW( inAllocator, TSDKType ) ( inAllocator, inSDKName, inEventBufferByteSize, PxProfileNameProviderForward( inNames ) );
+	}
+	
+	PxProfileZoneManager& PxProfileZoneManager::createProfileZoneManager(PxAllocatorCallback* inAllocator )
+	{ // Creates a ZoneManagerImpl through PX_PROFILE_NEW with inAllocator and returns it by reference.
+		return *PX_PROFILE_NEW( inAllocator, ZoneManagerImpl ) ( inAllocator );
+	}
+
+	PxProfileMemoryEventRecorder& PxProfileMemoryEventRecorder::createRecorder( PxAllocatorCallback* inAllocator )
+	{ // Creates a PxProfileMemoryEventRecorderImpl through PX_PROFILE_NEW with inAllocator and returns it by reference.
+		return *PX_PROFILE_NEW( inAllocator, PxProfileMemoryEventRecorderImpl )( inAllocator );
+	}
+	
+	PxProfileMemoryEventBuffer& PxProfileMemoryEventBuffer::createMemoryEventBuffer( PxAllocatorCallback& inAllocator, uint32_t inBufferSize )
+	{ // Creates a PxProfileMemoryEventBufferImpl through PX_PROFILE_NEW with inAllocator and returns it by reference.
+		return *PX_PROFILE_NEW( &inAllocator, PxProfileMemoryEventBufferImpl )( inAllocator, inBufferSize );
+	}
+	// Collects memory events in a fixed array and passes them to the bulk handler in batches of TNumEvents.
+	template<uint32_t TNumEvents>
+	struct ProfileBulkMemoryEventHandlerBuffer
+	{
+		PxProfileBulkMemoryEvent mEvents[TNumEvents];
+		uint32_t mEventCount;
+		PxProfileBulkMemoryEventHandler* mHandler;
+		ProfileBulkMemoryEventHandlerBuffer( PxProfileBulkMemoryEventHandler* inHdl )
+			: mEventCount( 0 ), mHandler( inHdl ) {}
+		// Hands the buffered events (if any) to the handler and resets the count.
+		void flush()
+		{
+			if ( mEventCount != 0 )
+				mHandler->handleEvents( mEvents, mEventCount );
+			mEventCount = 0;
+		}
+
+		// Appends one event and flushes as soon as the array is full.
+		void onEvent( const PxProfileBulkMemoryEvent& evt )
+		{
+			mEvents[mEventCount++] = evt;
+			if ( mEventCount == TNumEvents )
+				flush();
+		}
+
+		// Memory event types other than allocation and deallocation are ignored.
+		template<typename TDataType>
+		void operator()( const MemoryEventHeader&, const TDataType& ) {}
+
+		void operator()( const MemoryEventHeader&, const AllocationEvent& evt )
+		{
+			onEvent( PxProfileBulkMemoryEvent( evt.mSize, evt.mType, evt.mFile, evt.mLine, evt.mAddress ) );
+		}
+
+		void operator()( const MemoryEventHeader&, const DeallocationEvent& evt )
+		{
+			onEvent( PxProfileBulkMemoryEvent( evt.mAddress ) );
+		}
+	};
+
+	void PxProfileBulkMemoryEventHandler::parseEventBuffer( const uint8_t* inBuffer, uint32_t inBufferSize, PxProfileBulkMemoryEventHandler& inHandler, bool inSwapBytes, PxAllocatorCallback* inAlloc )
+	{
+		PX_ASSERT(inAlloc);
+		// The 0x1000-entry batching buffer is created through the caller's allocator and released at the end.
+		ProfileBulkMemoryEventHandlerBuffer<0x1000>* batcher = PX_PROFILE_NEW(inAlloc, ProfileBulkMemoryEventHandlerBuffer<0x1000>)(&inHandler);
+
+		if ( !inSwapBytes )
+		{
+			MemoryEventParser<false> parser( *inAlloc );
+			parser.parseEventData( inBuffer, inBufferSize, batcher );
+		}
+		else
+		{
+			MemoryEventParser<true> parser( *inAlloc );
+			parser.parseEventData( inBuffer, inBufferSize, batcher );
+		}
+		batcher->flush(); // deliver the final, partially filled batch
+
+		PX_PROFILE_DELETE(*inAlloc, batcher);
+	}
+
+} }
+
diff --git a/PxShared/src/pvd/src/PxProfileEventMutex.h b/PxShared/src/pvd/src/PxProfileEventMutex.h
new file mode 100644
index 0000000..5ec837b
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventMutex.h
@@ -0,0 +1,63 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+
+#ifndef PXPVDSDK_PXPROFILEEVENTMUTEX_H
+#define PXPVDSDK_PXPROFILEEVENTMUTEX_H
+
+#include "PxProfileBase.h"
+
+namespace physx { namespace profile {
+	
+	/**
+	 *	Mutex interface that hides the implementation behind lock and unlock.
+	 *	The event system locks the mutex for every interaction.
+	 */
+	class PxProfileEventMutex
+	{
+	protected:
+		virtual ~PxProfileEventMutex(){} // protected: cannot be deleted through this interface from outside
+	public:
+		virtual void lock() = 0;	// acquire the underlying mutex
+		virtual void unlock() = 0;	// release the underlying mutex
+	};
+
+	/** Adapts any mutex type that provides lock() and unlock() to the PxProfileEventMutex interface. */
+	template<typename TMutexType>
+	struct PxProfileEventMutexImpl : public PxProfileEventMutex
+	{
+		TMutexType* mMutex; // wrapped mutex; this adapter never deletes it
+
+		PxProfileEventMutexImpl( TMutexType* inMtx )
+			: mMutex( inMtx ) {}
+		virtual void lock() { mMutex->lock(); }
+		virtual void unlock() { mMutex->unlock(); }
+	};
+
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTMUTEX_H
diff --git a/PxShared/src/pvd/src/PxProfileEventNames.h b/PxShared/src/pvd/src/PxProfileEventNames.h
new file mode 100644
index 0000000..831da20
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventNames.h
@@ -0,0 +1,90 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTNAMES_H
+#define PXPVDSDK_PXPROFILEEVENTNAMES_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventId.h"
+
+namespace physx { namespace profile {
+
+	/** \brief Mapping from event id to name. */
+	struct PxProfileEventName
+	{
+		const char*					name;		//!< Profile event name.
+		PxProfileEventId			eventId;	//!< Profile event id.
+
+		/**
+		\brief Constructs a mapping entry from a name and an id.
+		\param inName Profile event name; only the pointer is stored, the string is not copied.
+		\param inId Profile event id.
+		*/
+		PxProfileEventName( const char* inName, PxProfileEventId inId )
+			: name( inName )
+			, eventId( inId ) {}
+	};
+
+	/**
+	\brief Aggregator of event id -> name mappings.
+
+	\note Only the pointer to the name array is stored; the array itself is not copied.
+	*/
+	struct PxProfileNames
+	{
+		uint32_t							eventCount;	//!< Number of provided events.
+		const PxProfileEventName*			events;		//!< Event names array.
+
+		/**
+		\brief Constructor; with its default arguments the object does not point to any names.
+		\param inEventCount Number of provided events.
+		\param inSubsystems Event names array.
+		*/
+		PxProfileNames( uint32_t inEventCount = 0, const PxProfileEventName* inSubsystems = NULL )
+			: eventCount( inEventCount )
+			, events( inSubsystems ) {}
+	};
+
+	/**
+	\brief Provides a mapping from event ID -> name.
+	*/
+	class PxProfileNameProvider
+	{
+	public:
+		/**
+		\brief Returns profile event names.
+		\return Profile event names, returned by value.
+		*/
+		virtual PxProfileNames getProfileNames() const = 0;
+
+	protected:
+		virtual ~PxProfileNameProvider(){} // protected: cannot be deleted through this interface from outside
+		PxProfileNameProvider& operator=(const PxProfileNameProvider&) { return *this; } // no-op assignment: copies nothing
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTNAMES_H
diff --git a/PxShared/src/pvd/src/PxProfileEventParser.h b/PxShared/src/pvd/src/PxProfileEventParser.h
new file mode 100644
index 0000000..60aebdc
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventParser.h
@@ -0,0 +1,193 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEEVENTPARSER_H
+#define PXPVDSDK_PXPROFILEEVENTPARSER_H
+
+#include "PxProfileEvents.h"
+#include "PxProfileEventSerialization.h"
+
+namespace physx { namespace profile {
+	
+	struct EventParserData
+	{
+		// Context of the most recent absolute start/stop event; relative events are reported with it.
+		EventContextInformation		mContextInformation;
+		// Last uncompressed timestamp; used as the base when uncompressing the next one. Starts at 0.
+		uint64_t					mLastTimestamp;
+
+		EventParserData() : mLastTimestamp( 0 ) {}
+	};
+
+	//This object is passed around by value (copied a lot), so all of its important
+	//state is held through pointers to data that lives outside of it.
+	template<typename THandlerType, bool TSwapBytes>
+	struct EventParseOperator
+	{
+		typedef EventDeserializer<TSwapBytes>	TDeserializerType;
+
+		EventParserData*						mData;			// running parser state (last context, last timestamp)
+		TDeserializerType*						mDeserializer;	// stream the events are read from
+		EventHeader*							mHeader;		// header of the event currently being parsed
+		THandlerType*							mHandler;		// receiver of the parsed events
+
+		EventParseOperator( EventParserData* inData, TDeserializerType* inDeserializer, EventHeader* inHeader, THandlerType* inHandler ) 
+			: mData( inData )
+			, mDeserializer( inDeserializer )
+			, mHeader( inHeader )
+			, mHandler( inHandler ) 
+		{}
+		
+		template<typename TEventType>
+		bool parse( TEventType& ioEvent ) // reads the event body from the stream; false once the deserializer has failed
+		{
+			ioEvent.streamify( *mDeserializer, *mHeader );
+			bool success = mDeserializer->mFail == false;
+			PX_ASSERT( success );
+			return success;
+		}
+
+		bool parseHeader( EventHeader& ioEvent ) // reads the next event header; false once the deserializer has failed
+		{
+			ioEvent.streamify( *mDeserializer );
+			bool success = mDeserializer->mFail == false;
+			PX_ASSERT( success );
+			return success;
+		}
+
+		template<typename TEventType>
+		bool handleProfileEvent( TEventType& evt ) // absolute event: carries its own context, which is remembered
+		{
+			bool retval = parse( evt );
+			if ( retval )
+			{
+				mData->mContextInformation = evt.mContextInformation;
+				handle( evt.getRelativeEvent(), evt.mContextInformation );
+			}
+			return retval;
+		}
+		
+		template<typename TEventType>
+		bool handleRelativeProfileEvent( TEventType& evt ) // relative event: reuses the last remembered context
+		{
+			bool retval = parse( evt );
+			if ( retval )
+				handle( evt, mData->mContextInformation );
+			return retval;
+		}
+
+		template<typename TRelativeEventType>
+		void handle( const TRelativeEventType& evt, const EventContextInformation& inInfo )
+		{	
+			mData->mLastTimestamp = mHeader->uncompressTimestamp( mData->mLastTimestamp, evt.getTimestamp());
+			const_cast<TRelativeEventType&>(evt).setTimestamp( mData->mLastTimestamp ); // overwrite with the uncompressed timestamp before dispatch
+			evt.handle( mHandler, mHeader->mEventId
+						, inInfo.mThreadId
+						, inInfo.mContextId
+						, inInfo.mCpuId
+						, inInfo.mThreadPriority );
+		}
+
+		bool operator()( const StartEvent& ) // the argument is ignored; the event is deserialized from the stream
+		{
+			StartEvent evt;
+			return handleProfileEvent( evt );
+		}
+		bool operator()( const StopEvent& )
+		{
+			StopEvent evt;
+			return handleProfileEvent( evt );
+		}
+		bool operator()( const RelativeStartEvent& )
+		{
+			RelativeStartEvent evt;
+			return handleRelativeProfileEvent( evt );
+
+		}
+		bool operator()( const RelativeStopEvent& )
+		{
+			RelativeStopEvent evt;
+			return handleRelativeProfileEvent( evt );
+		}
+		bool operator()( const EventValue& )
+		{
+			EventValue evt;
+			bool retval = parse( evt );
+			if ( retval )
+			{
+				evt.mValue = mHeader->uncompressTimestamp( 0, evt.mValue ); // NOTE(review): presumably values are compressed like timestamps; base is 0
+				evt.handle( mHandler, mHeader->mEventId );
+			}
+			return retval;
+		}
+
+		//obsolete, placeholder to skip data from PhysX SDKs < 3.4; the event is parsed but not passed to the handler
+		bool operator()( const CUDAProfileBuffer& )
+		{
+			CUDAProfileBuffer evt;
+			bool retval = parse( evt );
+			return retval;
+		}
+
+		//Unknown event type: asserts and reports failure without reading from the stream.
+		bool operator()(uint8_t )
+		{
+			PX_ASSERT( false );
+			return false;
+		}
+	};
+	// Parses the inLength bytes at inData and dispatches each event to inHandler; returns false on a parse failure or unknown event type.
+	template<bool TSwapBytes, typename THandlerType> 
+	inline bool parseEventData( const uint8_t* inData, uint32_t inLength, THandlerType* inHandler )
+	{
+		EventDeserializer<TSwapBytes> deserializer( inData, inLength );
+		Event::EventData unusedData; // only handed to visit() for dispatch; every operator overload ignores its argument
+		EventHeader theHeader;
+		EventParserData theData;
+		EventParseOperator<THandlerType,TSwapBytes> theOp( &theData, &deserializer, &theHeader, inHandler );
+		while( deserializer.mLength && deserializer.mFail == false)
+		{
+			if ( !theOp.parseHeader( theHeader ) || !visit<bool>( static_cast<EventTypes::Enum>( theHeader.mEventType ), unusedData, theOp ) )
+				return false; // a failed event (e.g. unknown type) leaves the stream position unreliable, so stop here
+		}
+		return deserializer.mFail == false;
+	}
+	// Receives parsed events in batches; the callers in this SDK pass the number of events in inEvents as inBufferSize.
+	class PxProfileBulkEventHandler
+	{
+	protected:
+		virtual ~PxProfileBulkEventHandler(){}
+	public:
+		virtual void handleEvents(const physx::profile::Event* inEvents, uint32_t inBufferSize) = 0;
+		static void parseEventBuffer(const uint8_t* inBuffer, uint32_t inBufferSize, PxProfileBulkEventHandler& inHandler, bool inSwapBytes);
+	};
+}}
+
+#endif // PXPVDSDK_PXPROFILEEVENTPARSER_H
diff --git a/PxShared/src/pvd/src/PxProfileEventSender.h b/PxShared/src/pvd/src/PxProfileEventSender.h
new file mode 100644
index 0000000..3c1cf5e
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventSender.h
@@ -0,0 +1,129 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTSENDER_H
+#define PXPVDSDK_PXPROFILEEVENTSENDER_H
+
+#include "PxProfileBase.h"
+
+namespace physx { namespace profile {
+
+	/**
+	\brief Tagging interface to indicate an object that is capable of flushing a profile
+	event stream at a certain point.
+	 */
+	class PxProfileEventFlusher
+	{
+	protected:
+		virtual ~PxProfileEventFlusher(){} // protected: objects cannot be deleted through this interface
+	public:
+		/**
+		\brief Flush profile events. Sends the profile event buffer to hooked clients.
+		*/
+		virtual void flushProfileEvents() = 0;
+	};
+
+	/**
+	\brief Sends the full events where the caller must provide the context and thread id.
+	 */
+	class PxProfileEventSender
+	{
+	protected:
+		virtual ~PxProfileEventSender(){}
+	public:
+	
+		/**
+		\brief Use this as a thread id for events that start on one thread and end on another
+		*/
+		static const uint32_t CrossThreadId = 99999789;
+
+		/**
+		\brief Send a start profile event, optionally with a context. Events are sorted by thread
+		and context on the client side. No thread id is passed to this overload.
+		\param inId Profile event id.
+		\param contextId Context id.
+		*/
+		virtual void startEvent( uint16_t inId, uint64_t contextId) = 0;
+		/**
+		\brief Send a stop profile event, optionally with a context. Events are sorted by thread
+		and context on the client side. No thread id is passed to this overload.
+		\param inId Profile event id.
+		\param contextId Context id.
+		*/
+		virtual void stopEvent( uint16_t inId, uint64_t contextId) = 0;
+
+		/**
+		\brief Send a start profile event, optionally with a context. Events are sorted by thread
+		and context on the client side.
+		\param inId Profile event id.
+		\param contextId Context id.
+		\param threadId Thread id.
+		*/
+		virtual void startEvent( uint16_t inId, uint64_t contextId, uint32_t threadId) = 0;
+		/**
+		\brief Send a stop profile event, optionally with a context. Events are sorted by thread
+		and context on the client side.
+		\param inId Profile event id.
+		\param contextId Context id.
+		\param threadId Thread id.
+		*/
+		virtual void stopEvent( uint16_t inId, uint64_t contextId, uint32_t threadId ) = 0;
+		/** \brief Send an event with caller-supplied start and stop values (presumably timestamps; their units are not stated here - confirm). */
+		virtual void atEvent(uint16_t inId, uint64_t contextId, uint32_t threadId, uint64_t start, uint64_t stop) = 0;
+
+		/**
+		\brief Set a specific event's value. This is different from the profiling value
+		for the event; it is a value recorded and kept around without a timestamp associated
+		with it. This value is displayed when the event itself is processed.
+		\param inId Profile event id.
+		\param contextId Context id.
+		\param inValue Value to set for the event.
+		 */
+		virtual void eventValue( uint16_t inId, uint64_t contextId, int64_t inValue ) = 0;
+	};
+
+	/**
+	\brief Tagging interface to indicate an object that may or may not return
+	an object capable of adding profile events to a buffer.
+	*/
+	class PxProfileEventSenderProvider
+	{
+	protected:
+		virtual ~PxProfileEventSenderProvider(){}
+	public:
+		/**
+		\brief This method can *always* return NULL.
+		Callers must therefore always check the returned pointer
+		before using it.
+
+		\return Perhaps a profile event sender.
+		*/
+		virtual PxProfileEventSender* getProfileEventSender() = 0;
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTSENDER_H
diff --git a/PxShared/src/pvd/src/PxProfileEventSerialization.h b/PxShared/src/pvd/src/PxProfileEventSerialization.h
new file mode 100644
index 0000000..07c0563
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventSerialization.h
@@ -0,0 +1,258 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEEVENTSERIALIZATION_H
+#define PXPVDSDK_PXPROFILEEVENTSERIALIZATION_H
+
+#include "PxProfileBase.h"
+#include "PxProfileDataParsing.h"
+#include "PxProfileEvents.h"
+
+namespace physx { namespace profile {
+
+	/**
+	 *	Array type must be a pxu8 container.  Templated so that this object can write
+	 *	to different collections.
+	 */
+	
+	template<typename TArrayType>
+	struct EventSerializer
+	{
+		TArrayType* mArray; //destination the streamify overloads write to; this struct never frees it
+		EventSerializer( TArrayType* inA ) : mArray( inA ) {}
+		//Generic case: hands the value to the array's write() and returns whatever write() returns.
+		template<typename TDataType>
+		uint32_t streamify( const char*, const TDataType& inType )
+		{
+			return mArray->write( inType );
+		}
+		//Strings: a uint32_t length that counts the terminator, followed by the characters and the terminator.
+		uint32_t streamify( const char*, const char*& inType )
+		{
+			PX_ASSERT( inType != NULL );
+			const char* text = inType != NULL ? inType : ""; //if the assert is compiled out, NULL is written as "" rather than passed to strlen
+			uint32_t len = static_cast<uint32_t>( strlen( text ) ) + 1; //include the null terminator
+			//length prefix first, then the bytes themselves
+			uint32_t writtenSize = mArray->write(len);
+			writtenSize += mArray->write(text, len);
+			return writtenSize;
+		}
+		//Raw bytes: a uint32_t length followed by len bytes; nothing follows the length when len is 0.
+		uint32_t streamify( const char*, const uint8_t* inData, uint32_t len )
+		{
+			uint32_t writtenSize = mArray->write(len);
+			if ( len )
+				writtenSize += mArray->write(inData, len);
+			return writtenSize;
+		}
+		//64-bit value written at the width named by inFlags; narrower widths truncate via static_cast. Returns 0 for an unlisted flag.
+		uint32_t streamify( const char* nm, const uint64_t& inType, EventStreamCompressionFlags::Enum inFlags )
+		{
+			uint32_t writtenSize = 0;
+			switch( inFlags )
+			{
+			case EventStreamCompressionFlags::U8:
+				writtenSize = streamify(nm, static_cast<uint8_t>(inType));
+				break;
+			case EventStreamCompressionFlags::U16:
+				writtenSize = streamify(nm, static_cast<uint16_t>(inType));
+				break;
+			case EventStreamCompressionFlags::U32:
+				writtenSize = streamify(nm, static_cast<uint32_t>(inType));
+				break;
+			case EventStreamCompressionFlags::U64:
+				writtenSize = streamify(nm, inType);
+				break;
+			}
+			return writtenSize;
+		}
+		//32-bit value written at the width named by inFlags; U64 is handled exactly like U32.
+		uint32_t streamify( const char* nm, const uint32_t& inType, EventStreamCompressionFlags::Enum inFlags )
+		{
+			uint32_t writtenSize = 0;
+			switch( inFlags )
+			{
+			case EventStreamCompressionFlags::U8:
+				writtenSize = streamify(nm, static_cast<uint8_t>(inType));
+				break;
+			case EventStreamCompressionFlags::U16:
+				writtenSize = streamify(nm, static_cast<uint16_t>(inType));
+				break;
+			case EventStreamCompressionFlags::U32:
+			case EventStreamCompressionFlags::U64:
+				writtenSize = streamify(nm, inType);
+				break;
+			}
+			return writtenSize;
+		}
+	};
+
+	/**
+	 *	The event deserializer takes a buffer and implements the streamify functions
+	 *	by copying data out of the buffer into the values passed in.
+	 */
+	template<bool TSwapBytes>
+	struct EventDeserializer
+	{
+		const uint8_t* mData; //read cursor into the caller's buffer
+		uint32_t		mLength; //number of bytes left at mData
+		bool		mFail; //set when a fixed-size read needs more bytes than remain; such reads are rejected from then on
+
+		EventDeserializer( const uint8_t* inData,  uint32_t inLength )
+			: mData( inData )
+			, mLength( inLength )
+			, mFail( false )
+		{
+			if ( mData == NULL )
+				mLength = 0; //a NULL buffer is treated as empty
+		}
+
+		bool val() { return TSwapBytes; }
+		//Single byte: no byte swapping is involved. inType is left untouched when the read is rejected.
+		uint32_t streamify( const char* , uint8_t& inType )
+		{
+			uint8_t* theData = reinterpret_cast<uint8_t*>( &inType ); //type punned pointer...
+			if ( mFail || sizeof( inType ) > mLength )
+			{
+				PX_ASSERT( false );
+				mFail = true;
+			}
+			else
+			{
+				for( uint32_t idx = 0; idx < sizeof( uint8_t ); ++idx, ++mData, --mLength )
+					theData[idx] = *mData;
+			}
+			return 0;
+		}
+
+		//default streamify reads things natively as bytes, then swaps them when TSwapBytes is true.
+		template<typename TDataType>
+		uint32_t streamify( const char* , TDataType& inType )
+		{
+			uint8_t* theData = reinterpret_cast<uint8_t*>( &inType ); //type punned pointer...
+			if ( mFail || sizeof( inType ) > mLength )
+			{
+				PX_ASSERT( false );
+				mFail = true; //inType is not written in this case, so callers must pre-initialize it
+			}
+			else
+			{
+				for( uint32_t idx = 0; idx < sizeof( TDataType ); ++idx, ++mData, --mLength )
+					theData[idx] = *mData;
+				bool temp = val();
+				if ( temp )
+					BlockParseFunctions::swapBytes<sizeof(TDataType)>( theData );
+			}
+			return 0;
+		}
+		//Strings: inType is pointed into the buffer (no copy). An oversized length is clamped without setting mFail; the terminator is not checked.
+		uint32_t streamify( const char*, const char*& inType )
+		{
+			uint32_t theLen = 0; //stays 0 if the length read below is rejected, instead of being read uninitialized
+			streamify( "", theLen );
+			theLen = PxMin( theLen, mLength );
+			inType = reinterpret_cast<const char*>( mData );
+			mData += theLen;
+			mLength -= theLen;
+			return 0;
+		}
+		//Raw bytes: inData is pointed into the buffer and len receives the (clamped) byte count.
+		uint32_t streamify( const char*, const uint8_t*& inData, uint32_t& len )
+		{
+			uint32_t theLen = 0; //stays 0 if the length read below is rejected, instead of being read uninitialized
+			streamify( "", theLen );
+			theLen = PxMin( theLen, mLength );
+			len = theLen;
+			inData = reinterpret_cast<const uint8_t*>( mData );
+			mData += theLen;
+			mLength -= theLen;
+			return 0;
+		}
+		//64-bit value stored at the width named by inFlags; a rejected narrow read now yields 0 rather than an indeterminate value.
+		uint32_t streamify( const char* nm, uint64_t& inType, EventStreamCompressionFlags::Enum inFlags )
+		{
+			switch( inFlags )
+			{
+			case EventStreamCompressionFlags::U8:
+				{
+					uint8_t narrowed = 0;
+					streamify( nm, narrowed );
+					inType = narrowed;
+				}
+				break;
+			case EventStreamCompressionFlags::U16:
+				{
+					uint16_t narrowed = 0;
+					streamify( nm, narrowed );
+					inType = narrowed;
+				}
+				break;
+			case EventStreamCompressionFlags::U32:
+				{
+					uint32_t narrowed = 0;
+					streamify( nm, narrowed );
+					inType = narrowed;
+				}
+				break;
+			case EventStreamCompressionFlags::U64:
+				streamify( nm, inType );
+				break;
+			}
+			return 0;
+		}
+		//32-bit value stored at the width named by inFlags; U64 is handled exactly like U32.
+		uint32_t streamify( const char* nm, uint32_t& inType, EventStreamCompressionFlags::Enum inFlags )
+		{
+			switch( inFlags )
+			{
+			case EventStreamCompressionFlags::U8:
+				{
+					uint8_t narrowed = 0;
+					streamify( nm, narrowed );
+					inType = narrowed;
+				}
+				break;
+			case EventStreamCompressionFlags::U16:
+				{
+					uint16_t narrowed = 0;
+					streamify( nm, narrowed );
+					inType = narrowed;
+				}
+				break;
+			case EventStreamCompressionFlags::U32:
+			case EventStreamCompressionFlags::U64:
+				streamify( nm, inType );
+				break;
+			}
+			return 0;
+		}
+	};
+}}
+#endif // PXPVDSDK_PXPROFILEEVENTSERIALIZATION_H
diff --git a/PxShared/src/pvd/src/PxProfileEventSystem.h b/PxShared/src/pvd/src/PxProfileEventSystem.h
new file mode 100644
index 0000000..7411824
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEventSystem.h
@@ -0,0 +1,56 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTSYSTEM_H
+#define PXPVDSDK_PXPROFILEEVENTSYSTEM_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventSender.h"
+#include "PxProfileEventBufferClient.h"
+#include "PxProfileEventBufferClientManager.h"
+
+namespace physx { namespace profile {
+	class PxProfileContextProvider;
+	class PxProfileEventMutex;
+	class PxProfileEventFilter;
+
+	/**
+	 *	Wraps the different interfaces into one object.
+	 */
+	class PxProfileEventSystem : public PxProfileEventSender
+							, public PxProfileEventBufferClient
+							, public PxProfileEventBufferClientManager
+							, public PxProfileEventFlusher
+	{
+	protected:
+		~PxProfileEventSystem(){} //implicitly virtual: PxProfileEventSender and PxProfileEventFlusher declare virtual destructors
+	public:
+		virtual void release() = 0; //presumably how owners dispose of the object, since the destructor is protected - confirm
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTSYSTEM_H
diff --git a/PxShared/src/pvd/src/PxProfileEvents.h b/PxShared/src/pvd/src/PxProfileEvents.h
new file mode 100644
index 0000000..25f09d7
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileEvents.h
@@ -0,0 +1,706 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEEVENTS_H
+#define PXPVDSDK_PXPROFILEEVENTS_H
+
+#include "foundation/PxMath.h"
+#include "foundation/PxAssert.h"
+
+#include "PxProfileBase.h"
+#include "PxProfileEventId.h"
+
+//PX_PROFILE_UNION_N(t1..tN) expands to a right-nested TUnion chain over the N types, terminated by physx::profile::Empty.
+#define	PX_PROFILE_UNION_1(a)					physx::profile::TUnion<a, physx::profile::Empty>
+#define	PX_PROFILE_UNION_2(a,b)					physx::profile::TUnion<a, PX_PROFILE_UNION_1(b)>
+#define	PX_PROFILE_UNION_3(a,b,c)				physx::profile::TUnion<a, PX_PROFILE_UNION_2(b,c)>
+#define	PX_PROFILE_UNION_4(a,b,c,d)				physx::profile::TUnion<a, PX_PROFILE_UNION_3(b,c,d)>
+#define	PX_PROFILE_UNION_5(a,b,c,d,e)			physx::profile::TUnion<a, PX_PROFILE_UNION_4(b,c,d,e)>
+#define	PX_PROFILE_UNION_6(a,b,c,d,e,f)			physx::profile::TUnion<a, PX_PROFILE_UNION_5(b,c,d,e,f)>
+#define	PX_PROFILE_UNION_7(a,b,c,d,e,f,g)		physx::profile::TUnion<a, PX_PROFILE_UNION_6(b,c,d,e,f,g)>
+#define	PX_PROFILE_UNION_8(a,b,c,d,e,f,g,h)		physx::profile::TUnion<a, PX_PROFILE_UNION_7(b,c,d,e,f,g,h)>
+#define	PX_PROFILE_UNION_9(a,b,c,d,e,f,g,h,i)	physx::profile::TUnion<a, PX_PROFILE_UNION_8(b,c,d,e,f,g,h,i)>
+
+namespace physx { namespace profile {
+
+	struct Empty {}; //terminates a TUnion chain (it is the tail used by PX_PROFILE_UNION_1)
+
+	template <typename T> struct Type2Type {}; //empty tag type; lets TUnion::toType overloads be selected by type
+
+	template <typename U, typename V>
+	union TUnion
+	{
+		typedef U Head;
+		typedef V Tail;
+		//head and tail share storage; with the PX_PROFILE_UNION_N macros the tail is another TUnion or Empty.
+		Head	head;
+		Tail	tail;
+		//Locates the member of type TDataType and calls that member's own init() with inData.
+		template <typename TDataType>
+		void init(const TDataType& inData)
+		{
+			toType(Type2Type<TDataType>()).init(inData);
+		}
+		//Generic overload: the requested type is not Head, so the lookup continues in the tail.
+		template <typename TDataType>
+		PX_FORCE_INLINE TDataType& toType(const Type2Type<TDataType>& outData) { return tail.toType(outData); }
+		//Non-template overload: preferred by overload resolution when the requested type is exactly Head.
+		PX_FORCE_INLINE Head& toType(const Type2Type<Head>&) { return head; }
+		//Const counterparts of the two overloads above.
+		template <typename TDataType>
+		PX_FORCE_INLINE const TDataType& toType(const Type2Type<TDataType>& outData) const { return tail.toType(outData); }
+
+		PX_FORCE_INLINE const Head& toType(const Type2Type<Head>&) const { return head; }
+	};
+
+	struct EventTypes //Scope for the event type enum; EventHeader stores these values in its uint8_t mEventType field.
+	{
+		enum Enum
+		{
+			Unknown = 0, //also what the unspecialized getEventType<T>() returns
+			StartEvent,
+			StopEvent,
+			RelativeStartEvent, //reuses context,id from the earlier event.
+			RelativeStopEvent, //reuses context,id from the earlier event.
+			EventValue,
+			CUDAProfileBuffer //obsolete, placeholder to skip data from PhysX SDKs < 3.4
+		};
+	};
+
+	struct EventStreamCompressionFlags //Names the integer width used to store a value in the event stream.
+	{
+		enum Enum
+		{
+			U8 = 0, //stored as uint8_t
+			U16 = 1, //stored as uint16_t
+			U32 = 2, //stored as uint32_t
+			U64 = 3, //stored as uint64_t
+			CompressionMask = 3 //two-bit mask; EventHeader applies it to pull one flag field out of mStreamOptions
+		};
+	};
+
+#if (PX_PS4) || (PX_APPLE_FAMILY)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wimplicit-fallthrough"
+#endif
+
+	//Find the smallest compression setting that represents the incoming value without loss.
+	//The current compression value can be enlarged, but it is never made smaller.
+	//Feeding each result back in as the current value therefore yields the smallest
+	//setting that works for a whole set of values.
+	inline EventStreamCompressionFlags::Enum findCompressionValue( uint64_t inValue, EventStreamCompressionFlags::Enum inCurrentCompressionValue = EventStreamCompressionFlags::U8 )
+	{
+		PX_ASSERT_WITH_MESSAGE( (inCurrentCompressionValue >= EventStreamCompressionFlags::U8) &&
+								(inCurrentCompressionValue <= EventStreamCompressionFlags::U64),
+								"Invalid inCurrentCompressionValue in profile::findCompressionValue");
+
+		//A width may only be chosen when the current setting is not already wider than it.
+		const bool mayUseU8 = inCurrentCompressionValue == EventStreamCompressionFlags::U8;
+		const bool mayUseU16 = mayUseU8 || inCurrentCompressionValue == EventStreamCompressionFlags::U16;
+		const bool mayUseU32 = mayUseU16 || inCurrentCompressionValue == EventStreamCompressionFlags::U32;
+
+		//Try the widths from narrowest to widest.
+		if ( mayUseU8 && inValue <= UINT8_MAX )
+			return EventStreamCompressionFlags::U8;
+		if ( mayUseU16 && inValue <= UINT16_MAX )
+			return EventStreamCompressionFlags::U16;
+		if ( mayUseU32 && inValue <= UINT32_MAX )
+			return EventStreamCompressionFlags::U32;
+
+		//No narrower width is both allowed and sufficient.
+		//This is also the result for any current value other than U8, U16 or U32.
+		return EventStreamCompressionFlags::U64;
+	}
+
+	//32-bit overload: find the smallest compression setting that represents the incoming
+	//value without loss. The current compression value can be enlarged, but it is never
+	//made smaller, so the function can be folded over a set of values.
+	//The result is U8, U16 or U32; U64 is never returned by this overload.
+	inline EventStreamCompressionFlags::Enum findCompressionValue( uint32_t inValue, EventStreamCompressionFlags::Enum inCurrentCompressionValue = EventStreamCompressionFlags::U8 )
+	{
+		PX_ASSERT_WITH_MESSAGE( (inCurrentCompressionValue >= EventStreamCompressionFlags::U8) &&
+								(inCurrentCompressionValue <= EventStreamCompressionFlags::U64),
+								"Invalid inCurrentCompressionValue in profile::findCompressionValue");
+
+		//A width may only be chosen when the current setting is not already wider than it.
+		const bool mayUseU8 = inCurrentCompressionValue == EventStreamCompressionFlags::U8;
+		const bool mayUseU16 = mayUseU8 || inCurrentCompressionValue == EventStreamCompressionFlags::U16;
+
+		//Try the widths from narrowest to widest.
+		if ( mayUseU8 && inValue <= UINT8_MAX )
+			return EventStreamCompressionFlags::U8;
+		if ( mayUseU16 && inValue <= UINT16_MAX )
+			return EventStreamCompressionFlags::U16;
+
+		//Everything else, including a current value of U32 or U64, is reported as U32:
+		//a uint32_t never needs more than 32 bits.
+		return EventStreamCompressionFlags::U32;
+	}
+
+
+#if (PX_PS4) || (PX_APPLE_FAMILY)
+#pragma clang diagnostic pop
+#endif
+
+	//Event header is 32 bits (4 bytes: two uint8_t fields and one uint16_t) and precedes all events.
+	struct EventHeader
+	{
+		uint8_t	mEventType; //Used to parse the correct event out of the stream
+		uint8_t	mStreamOptions; //Bits 0-1: timestamp compression flags; bits 2-3: context id compression flags.
+		uint16_t	mEventId;	//16 bit per-event-system event id
+		EventHeader( uint8_t type = 0, uint16_t id = 0 )
+			: mEventType( type )
+			, mStreamOptions( uint8_t(-1) ) //every option bit starts out set (0xFF)
+			, mEventId( id )
+		{
+		}
+		//Same initialization, taking the event type as an EventTypes value.
+		EventHeader( EventTypes::Enum type, uint16_t id )
+			: mEventType( static_cast<uint8_t>( type ) )
+			, mStreamOptions( uint8_t(-1) )
+			, mEventId( id )
+		{
+		}
+		//Timestamp width: the low two bits of mStreamOptions.
+		EventStreamCompressionFlags::Enum getTimestampCompressionFlags() const 
+		{ 
+			return static_cast<EventStreamCompressionFlags::Enum> ( mStreamOptions & EventStreamCompressionFlags::CompressionMask );
+		}
+		//Returns the value to store for inCurrentTimestamp (a delta against inLastTimestamp when that fits in under 64 bits) and records the width.
+		uint64_t compressTimestamp( uint64_t inLastTimestamp, uint64_t inCurrentTimestamp )
+		{
+			mStreamOptions = EventStreamCompressionFlags::U64; //overwrites every option bit, the context id flags included
+			uint64_t retval = inCurrentTimestamp;
+			if ( inLastTimestamp ) //a last timestamp of 0 means "no previous timestamp": store the full value
+			{
+				retval = inCurrentTimestamp - inLastTimestamp;
+				EventStreamCompressionFlags::Enum compressionValue = findCompressionValue( retval );
+				mStreamOptions = static_cast<uint8_t>( compressionValue );
+				if ( compressionValue == EventStreamCompressionFlags::U64 )
+					retval = inCurrentTimestamp; //just send the timestamp as is.
+			}
+			return retval;
+		}
+		//Inverse of compressTimestamp: anything narrower than U64 is a delta to add to inLastTimestamp.
+		uint64_t uncompressTimestamp( uint64_t inLastTimestamp, uint64_t inCurrentTimestamp ) const
+		{
+			if ( getTimestampCompressionFlags() != EventStreamCompressionFlags::U64 )
+				return inLastTimestamp + inCurrentTimestamp;
+			return inCurrentTimestamp;
+		}
+		//Records the width needed for inContextId in bits 2-3 of mStreamOptions.
+		void setContextIdCompressionFlags( uint64_t inContextId )
+		{
+			uint8_t options = static_cast<uint8_t>( findCompressionValue( inContextId ) );
+			mStreamOptions = uint8_t(mStreamOptions | options << 2); //ORs into bits 2-3 without clearing them, so it relies on compressTimestamp having reset them
+		}
+		//Context id width: bits 2-3 of mStreamOptions.
+		EventStreamCompressionFlags::Enum getContextIdCompressionFlags() const 
+		{
+			return static_cast< EventStreamCompressionFlags::Enum >( ( mStreamOptions >> 2 ) & EventStreamCompressionFlags::CompressionMask );
+		}
+
+		bool operator==( const EventHeader& inOther ) const
+		{
+			return mEventType == inOther.mEventType
+				&& mStreamOptions == inOther.mStreamOptions
+				&& mEventId == inOther.mEventId;
+		}
+		//Passes the three fields to inStream in declaration order and returns the sum of what the stream's streamify calls return.
+		template<typename TStreamType>
+		inline uint32_t streamify( TStreamType& inStream )
+		{
+			uint32_t writtenSize = inStream.streamify( "EventType", mEventType ); 
+			writtenSize += inStream.streamify("StreamOptions", mStreamOptions); //Timestamp compression, etc.
+			writtenSize += inStream.streamify("EventId", mEventId);	//16 bit per-event-system event id
+			return writtenSize;
+		}
+
+
+	};
+
+	//Type-level map from an event struct to its EventTypes value; this unspecialized fallback asserts and returns Unknown.
+	template<typename TDataType>
+	inline EventTypes::Enum getEventType() { PX_ASSERT( false ); return EventTypes::Unknown; }
+
+	//A relative profile event carries only a timestamp: it shares the context and
+	//thread id of the event before it.
+	struct RelativeProfileEvent
+	{
+		uint64_t	mTensOfNanoSeconds; //timestamp, in tens of nanoseconds
+		//No constructors are declared; the init() overloads set the value instead.
+		void init( uint64_t inTs ) { mTensOfNanoSeconds = inTs; }
+		void init( const RelativeProfileEvent& inData ) { init( inData.mTensOfNanoSeconds ); }
+
+		bool operator==( const RelativeProfileEvent& other ) const
+		{
+			return other.mTensOfNanoSeconds == mTensOfNanoSeconds;
+		}
+
+		//Passes the timestamp to inStream together with the width taken from the header's timestamp flags.
+		template<typename TStreamType>
+		uint32_t streamify( TStreamType& inStream, const EventHeader& inHeader )
+		{
+			return inStream.streamify( "TensOfNanoSeconds", mTensOfNanoSeconds, inHeader.getTimestampCompressionFlags() );
+		}
+
+		uint64_t getTimestamp() const { return mTensOfNanoSeconds; }
+		void setTimestamp( uint64_t inTs ) { mTensOfNanoSeconds = inTs; }
+
+		//Replaces the stored timestamp with the value compressTimestamp() returns and lets
+		//the header record the width that value needs.
+		void setupHeader( EventHeader& inHeader, uint64_t inLastTimestamp )
+		{
+			mTensOfNanoSeconds = inHeader.compressTimestamp( inLastTimestamp, mTensOfNanoSeconds );
+		}
+
+		//Width in bytes of the timestamp for the header's timestamp compression flags.
+		uint32_t getEventSize(const EventHeader& inHeader)
+		{
+			switch (inHeader.getTimestampCompressionFlags())
+			{
+			case EventStreamCompressionFlags::U8:	return 1;
+			case EventStreamCompressionFlags::U16:	return 2;
+			case EventStreamCompressionFlags::U32:	return 4;
+			case EventStreamCompressionFlags::U64:	return 8;
+			}
+			return 0; //only reached for a value outside the four flags above
+		}
+	};
+
+	//Start version of the relative event; it adds no data members to RelativeProfileEvent.
+	struct RelativeStartEvent : public RelativeProfileEvent
+	{
+		void init( uint64_t inTs = 0 ) { RelativeProfileEvent::init( inTs ); }
+		void init( const RelativeStartEvent& inData ) { RelativeProfileEvent::init( inData ); }
+		template<typename THandlerType> //THandlerType must provide onStartEvent with the argument list used below
+		void handle( THandlerType* inHdlr, uint16_t eventId, uint32_t thread, uint64_t context, uint8_t inCpuId, uint8_t threadPriority ) const
+		{
+			inHdlr->onStartEvent( PxProfileEventId( eventId ), thread, context, inCpuId, threadPriority, mTensOfNanoSeconds ); //only the timestamp comes from this event
+		}
+	};
+	//Maps RelativeStartEvent to its EventTypes value.
+	template<> inline EventTypes::Enum getEventType<RelativeStartEvent>() { return EventTypes::RelativeStartEvent; }
+	
+	//Stop version of the relative event; it adds no data members to RelativeProfileEvent.
+	struct RelativeStopEvent : public RelativeProfileEvent
+	{
+		void init( uint64_t inTs = 0 ) { RelativeProfileEvent::init( inTs ); }
+		void init( const RelativeStopEvent& inData ) { RelativeProfileEvent::init( inData ); }
+		template<typename THandlerType> //THandlerType must provide onStopEvent with the argument list used below
+		void handle( THandlerType* inHdlr, uint16_t eventId, uint32_t thread, uint64_t context, uint8_t inCpuId, uint8_t threadPriority ) const
+		{
+			inHdlr->onStopEvent( PxProfileEventId( eventId ), thread, context, inCpuId, threadPriority, mTensOfNanoSeconds ); //only the timestamp comes from this event
+		}
+	};
+	//Maps RelativeStopEvent to its EventTypes value.
+	template<> inline EventTypes::Enum getEventType<RelativeStopEvent>() { return EventTypes::RelativeStopEvent; }
+
+	struct EventContextInformation //The per-event context fields: context id, thread id, thread priority and cpu id.
+	{
+		uint64_t mContextId;
+		uint32_t mThreadId; //Thread this event was taken from
+		uint8_t  mThreadPriority;
+		uint8_t  mCpuId;
+		//Sets all four fields; every parameter defaults to the maximum value of its type. Note the order: thread id first.
+		void init( uint32_t inThreadId = UINT32_MAX
+								, uint64_t inContextId = (uint64_t(-1))
+								, uint8_t inPriority = UINT8_MAX
+								, uint8_t inCpuId = UINT8_MAX )
+		{
+			mContextId = inContextId;
+			mThreadId = inThreadId;
+			mThreadPriority = inPriority;
+			mCpuId = inCpuId;
+		}
+
+		void init( const EventContextInformation& inData )
+		{
+			mContextId = inData.mContextId;
+			mThreadId = inData.mThreadId;
+			mThreadPriority = inData.mThreadPriority;
+			mCpuId = inData.mCpuId;
+		}
+		//Streams thread id, context id (at the width given by inContextIdFlags), thread priority, then cpu id.
+		template<typename TStreamType> 
+		uint32_t streamify( TStreamType& inStream, EventStreamCompressionFlags::Enum inContextIdFlags )
+		{
+			uint32_t writtenSize = inStream.streamify( "ThreadId", mThreadId );
+			writtenSize += inStream.streamify("ContextId", mContextId, inContextIdFlags);
+			writtenSize += inStream.streamify("ThreadPriority", mThreadPriority);
+			writtenSize += inStream.streamify("CpuId", mCpuId);
+			return writtenSize;
+		}
+		
+		bool operator==( const EventContextInformation& other ) const 
+		{ 
+			return mThreadId == other.mThreadId
+				&& mContextId == other.mContextId
+				&& mThreadPriority == other.mThreadPriority
+				&& mCpuId == other.mCpuId;
+		}
+		//NOTE(review): this yields all-zero fields, which is not the same as the all-maximum defaults of init().
+		void setToDefault()
+		{
+			*this = EventContextInformation(); //value-initialization: the struct declares no constructor, so every field becomes 0
+		}
+	};
+	
+	//A full profile event: the context information (thread id, context id, priority,
+	//cpu id) together with the timestamp data.
+	struct ProfileEvent
+	{
+		EventContextInformation mContextInformation;
+		RelativeProfileEvent	mTimeData; //timestamp, in tens of nanoseconds
+
+		//Sets every field. Note the argument order: cpu id precedes priority here, whereas
+		//EventContextInformation::init takes the priority first.
+		void init( uint32_t inThreadId, uint64_t inContextId, uint8_t inCpuId, uint8_t inPriority, uint64_t inTs )
+		{
+			mTimeData.init( inTs );
+			mContextInformation.init( inThreadId, inContextId, inPriority, inCpuId );
+		}
+
+		//Copies both parts from another event.
+		void init( const ProfileEvent& inData )
+		{
+			mTimeData.init( inData.mTimeData );
+			mContextInformation.init( inData.mContextInformation );
+		}
+
+		//Equal when the context information and the timestamp both match.
+		bool operator==( const ProfileEvent& other ) const
+		{
+			if ( !( mContextInformation == other.mContextInformation ) )
+				return false;
+			return mTimeData == other.mTimeData;
+		}
+
+		//Streams the context information first and the timestamp second; the widths of the
+		//context id and of the timestamp are taken from inHeader.
+		//Returns the sum of what the two nested streamify calls return.
+		template<typename TStreamType>
+		uint32_t streamify( TStreamType& inStream, const EventHeader& inHeader )
+		{
+			const uint32_t contextSize = mContextInformation.streamify( inStream, inHeader.getContextIdCompressionFlags() );
+			const uint32_t timeSize = mTimeData.streamify( inStream, inHeader );
+			return contextSize + timeSize;
+		}
+
+		//Size in bytes of this event for the compression flags stored in inHeader:
+		//a timestamp whose width follows the timestamp flags, six fixed bytes
+		//(uint32_t mThreadId, uint8_t mThreadPriority, uint8_t mCpuId) and a context id
+		//whose width follows the context id flags.
+		uint32_t getEventSize(const EventHeader& inHeader)
+		{
+			//timestamp: 1, 2, 4 or 8 bytes depending on the compression flag
+			uint32_t eventSize = mTimeData.getEventSize( inHeader );
+
+			//fixed-width part of the context information
+			eventSize += 6;
+			//context id: 1, 2, 4 or 8 bytes depending on its own compression flag
+			switch (inHeader.getContextIdCompressionFlags())
+			{
+			case EventStreamCompressionFlags::U8:
+				eventSize += 1;
+				break;
+			case EventStreamCompressionFlags::U16:
+				eventSize += 2;
+				break;
+			case EventStreamCompressionFlags::U32:
+				eventSize += 4;
+				break;
+			case EventStreamCompressionFlags::U64:
+				eventSize += 8;
+				break;
+			}
+
+			return eventSize;
+		}
+
+		uint64_t getTimestamp() const { return mTimeData.getTimestamp(); }
+		void setTimestamp( uint64_t inTs ) { mTimeData.setTimestamp( inTs ); }
+
+		//Compresses the timestamp first (that step rewrites all of the header's stream
+		//options), then records the context id width in the header.
+		void setupHeader( EventHeader& inHeader, uint64_t inLastTimestamp )
+		{
+			mTimeData.setupHeader( inHeader, inLastTimestamp );
+			inHeader.setContextIdCompressionFlags( mContextInformation.mContextId );
+		}
+	};
+
+	//profile start event starts the profile session.
+	struct StartEvent : public ProfileEvent
+	{
+		void init( uint32_t inThreadId = 0, uint64_t inContextId = 0, uint8_t inCpuId = 0, uint8_t inPriority = 0, uint64_t inTensOfNanoSeconds = 0 ) 
+		{
+			ProfileEvent::init( inThreadId, inContextId, inCpuId, inPriority, inTensOfNanoSeconds );
+		}
+		void init( const StartEvent& inData )
+		{
+			ProfileEvent::init( inData );
+		}
+
+		RelativeStartEvent getRelativeEvent() const { RelativeStartEvent theEvent; theEvent.init( mTimeData.mTensOfNanoSeconds ); return theEvent; }
+		EventTypes::Enum getRelativeEventType() const { return getEventType<RelativeStartEvent>(); }
+	};
+	
+	template<> inline EventTypes::Enum getEventType<StartEvent>() { return EventTypes::StartEvent; }
+
+	//Profile stop event stops the profile session.
+	struct StopEvent : public ProfileEvent
+	{
+		void init( uint32_t inThreadId = 0, uint64_t inContextId = 0, uint8_t inCpuId = 0, uint8_t inPriority = 0, uint64_t inTensOfNanoSeconds = 0 )
+		{
+			ProfileEvent::init( inThreadId, inContextId, inCpuId, inPriority, inTensOfNanoSeconds );
+		}
+		void init( const StopEvent& inData )
+		{
+			ProfileEvent::init( inData );
+		}
+		RelativeStopEvent getRelativeEvent() const { RelativeStopEvent theEvent; theEvent.init( mTimeData.mTensOfNanoSeconds ); return theEvent; }
+		EventTypes::Enum getRelativeEventType() const { return getEventType<RelativeStopEvent>(); }
+	};
+	
+	template<> inline EventTypes::Enum getEventType<StopEvent>() { return EventTypes::StopEvent; }
+
+	struct EventValue
+	{
+		uint64_t	mValue;
+		uint64_t	mContextId;
+		uint32_t	mThreadId;
+		void init( int64_t inValue = 0, uint64_t inContextId = 0, uint32_t inThreadId = 0 )
+		{
+			mValue = static_cast<uint64_t>( inValue );
+			mContextId = inContextId;
+			mThreadId = inThreadId;
+		}
+
+		void init( const EventValue& inData )
+		{
+			mValue = inData.mValue;
+			mContextId = inData.mContextId;
+			mThreadId = inData.mThreadId;
+		}
+
+		int64_t getValue() const { return static_cast<int64_t>( mValue ); } // signed view of the full 64-bit value stored by init(); a 16-bit cast would truncate it
+
+		void setupHeader( EventHeader& inHeader )
+		{
+			mValue = inHeader.compressTimestamp( 0, mValue );
+			inHeader.setContextIdCompressionFlags( mContextId );
+		}
+
+		template<typename TStreamType> 
+		uint32_t streamify( TStreamType& inStream, const EventHeader& inHeader )
+		{
+			uint32_t writtenSize = inStream.streamify("Value", mValue, inHeader.getTimestampCompressionFlags());
+			writtenSize += inStream.streamify("ContextId", mContextId, inHeader.getContextIdCompressionFlags());
+			writtenSize += inStream.streamify("ThreadId", mThreadId);
+			return writtenSize;
+		}
+
+		uint32_t getEventSize(const EventHeader& inHeader)
+		{
+			uint32_t eventSize = 0;
+			// value
+			switch (inHeader.getTimestampCompressionFlags())
+			{
+			case EventStreamCompressionFlags::U8:
+				eventSize++;
+				break;
+			case EventStreamCompressionFlags::U16:
+				eventSize += 2;
+				break;
+			case EventStreamCompressionFlags::U32:
+				eventSize += 4;
+				break;
+			case EventStreamCompressionFlags::U64:
+				eventSize += 8;
+				break;
+			}
+
+			// context information						
+			switch (inHeader.getContextIdCompressionFlags())
+			{
+			case EventStreamCompressionFlags::U8:
+				eventSize++;
+				break;
+			case EventStreamCompressionFlags::U16:
+				eventSize += 2;
+				break;
+			case EventStreamCompressionFlags::U32:
+				eventSize += 4;
+				break;
+			case EventStreamCompressionFlags::U64:
+				eventSize += 8;
+				break;
+			}
+
+			eventSize += 4;  // 		uint32_t mThreadId;
+
+			return eventSize;
+		}
+
+		bool operator==( const EventValue& other ) const 
+		{ 
+			return mValue == other.mValue
+				&& mContextId == other.mContextId
+				&& mThreadId == other.mThreadId;
+		}
+
+		template<typename THandlerType>
+		void handle( THandlerType* inHdlr, uint16_t eventId ) const
+		{
+			inHdlr->onEventValue( PxProfileEventId( eventId ), mThreadId, mContextId, getValue() );
+		}
+
+	};
+	template<> inline EventTypes::Enum getEventType<EventValue>() { return EventTypes::EventValue; }
+
+	//obsolete, placeholder to skip data from PhysX SDKs < 3.4
+	struct CUDAProfileBuffer
+	{
+		uint64_t mTimestamp;
+		float mTimespan;
+		const uint8_t* mCudaData;
+		uint32_t mBufLen;
+		uint32_t mVersion;
+
+		template<typename TStreamType> 
+		uint32_t streamify( TStreamType& inStream, const EventHeader& )
+		{
+			uint32_t writtenSize = inStream.streamify("Timestamp", mTimestamp);
+			writtenSize += inStream.streamify("Timespan", mTimespan);
+			writtenSize += inStream.streamify("CudaData", mCudaData, mBufLen);
+			writtenSize += inStream.streamify("BufLen", mBufLen);
+			writtenSize += inStream.streamify("Version", mVersion);
+			return writtenSize;
+		}
+
+		bool operator==( const CUDAProfileBuffer& other ) const 
+		{ 
+			return mTimestamp == other.mTimestamp
+				&& mTimespan == other.mTimespan
+				&& mBufLen == other.mBufLen
+				&& memcmp( mCudaData, other.mCudaData, mBufLen ) == 0
+				&& mVersion == other.mVersion;
+		}
+	};
+
+	template<> inline EventTypes::Enum getEventType<CUDAProfileBuffer>() { return EventTypes::CUDAProfileBuffer; }
+
+	//Provides a generic equal operation for event data objects.
+	template <typename TEventData>
+	struct EventDataEqualOperator
+	{
+		TEventData mData;
+		EventDataEqualOperator( const TEventData& inD ) : mData( inD ) {}
+		template<typename TDataType> bool operator()( const TDataType& inRhs ) const { return mData.toType( Type2Type<TDataType>() ) == inRhs; }
+		bool operator()() const { return false; }
+	};
+
+	/**
+	 *	Generic event container that combines an event header with the generic event data type.
+	 *	Provides unsafe and typesafe access to the event data.
+	 */
+	class Event
+	{
+	public:
+		typedef PX_PROFILE_UNION_7(StartEvent, StopEvent, RelativeStartEvent, RelativeStopEvent, EventValue, CUDAProfileBuffer, uint8_t) EventData;
+
+	private:
+		EventHeader mHeader;
+		EventData	mData;
+	public:
+		Event() {}
+
+		template <typename TDataType>
+		Event( EventHeader inHeader, const TDataType& inData )
+			: mHeader( inHeader )
+		{
+			mData.init<TDataType>(inData);
+		}
+
+		template<typename TDataType>
+		Event( uint16_t eventId, const TDataType& inData )
+			: mHeader( getEventType<TDataType>(), eventId )
+		{
+			mData.init<TDataType>(inData);
+		}
+		const EventHeader& getHeader() const { return mHeader; }
+		const EventData& getData() const { return mData; }
+
+		template<typename TDataType>
+		const TDataType& getValue() const { PX_ASSERT( mHeader.mEventType == getEventType<TDataType>() ); return mData.toType<TDataType>(); }
+
+		template<typename TDataType>
+		TDataType& getValue() { PX_ASSERT( mHeader.mEventType == getEventType<TDataType>() ); return mData.toType<TDataType>(); }
+
+		template<typename TRetVal, typename TOperator>
+		inline TRetVal visit( TOperator inOp ) const;
+
+		bool operator==( const Event& inOther ) const
+		{
+			if ( !(mHeader == inOther.mHeader ) ) return false;
+			if ( mHeader.mEventType )
+				return inOther.visit<bool>( EventDataEqualOperator<EventData>( mData ) );
+			return true;
+		}
+	};
+
+	//Combining the above union type with an event type means that an object can get the exact
+	//data out of the union.  Using this function means that all callsites will be forced to
+	//deal with the newer datatypes and that the switch statement only exists in one place.
+	//Implements conversion from enum -> datatype
+	template<typename TRetVal, typename TOperator>
+	TRetVal visit( EventTypes::Enum inEventType, const Event::EventData& inData, TOperator inOperator )
+	{
+		switch( inEventType )
+		{
+		case EventTypes::StartEvent:			return inOperator( inData.toType( Type2Type<StartEvent>() ) );
+		case EventTypes::StopEvent:				return inOperator( inData.toType( Type2Type<StopEvent>() ) );
+		case EventTypes::RelativeStartEvent:	return inOperator( inData.toType( Type2Type<RelativeStartEvent>() ) );
+		case EventTypes::RelativeStopEvent:		return inOperator( inData.toType( Type2Type<RelativeStopEvent>() ) );
+		case EventTypes::EventValue:			return inOperator( inData.toType( Type2Type<EventValue>() ) );
+		//obsolete, placeholder to skip data from PhysX SDKs < 3.4
+		case EventTypes::CUDAProfileBuffer:		return inOperator( inData.toType( Type2Type<CUDAProfileBuffer>() ) );
+		case EventTypes::Unknown:				break;
+		}
+		uint8_t type = static_cast<uint8_t>( inEventType );
+		return inOperator( type );
+	}
+
+	template<typename TRetVal, typename TOperator>
+	inline TRetVal Event::visit( TOperator inOp ) const
+	{ 
+		return physx::profile::visit<TRetVal>( static_cast<EventTypes::Enum>(mHeader.mEventType), mData, inOp ); 
+	}
+} }
+
+#endif // PXPVDSDK_PXPROFILEEVENTS_H
diff --git a/PxShared/src/pvd/src/PxProfileMemory.h b/PxShared/src/pvd/src/PxProfileMemory.h
new file mode 100644
index 0000000..30e8bdc
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemory.h
@@ -0,0 +1,99 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEMEMORY_H
+#define PXPVDSDK_PXPROFILEMEMORY_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventBufferClientManager.h"
+#include "PxProfileEventSender.h"
+#include "PsBroadcast.h"
+
+namespace physx { namespace profile {
+
+	/**
+	\brief Record events so a late-connecting client knows about
+	all outstanding allocations
+	*/
+	class PxProfileMemoryEventRecorder : public shdfnd::AllocationListener
+	{
+	protected:
+		virtual ~PxProfileMemoryEventRecorder(){}
+	public:
+		/**
+		\brief Set the allocation listener
+		\param inListener Allocation listener.
+		*/
+		virtual void setListener(AllocationListener* inListener) = 0;
+		/**
+		\brief Release the instance.
+		*/
+		virtual void release() = 0;
+
+		/**
+		\brief Create the profile memory event recorder.
+		\param inAllocator Allocation callback.
+		*/
+		static PxProfileMemoryEventRecorder& createRecorder(PxAllocatorCallback* inAllocator);
+	};
+
+	/**
+	\brief Stores memory events into the memory buffer. 
+	*/
+	class PxProfileMemoryEventBuffer
+		: public shdfnd::AllocationListener //add a new event to the buffer
+		, public PxProfileEventBufferClientManager //add clients to handle the serialized memory events
+		, public PxProfileEventFlusher //flush the buffer
+	{
+	protected:
+		virtual ~PxProfileMemoryEventBuffer(){}
+	public:
+
+		/**
+		\brief Release the instance.
+		*/
+		virtual void release() = 0;
+		
+		/**
+		\brief Create a non-mutex-protected event buffer.		
+		\param inAllocator Allocation callback.
+		\param inBufferSize Internal buffer size.
+		*/
+		static PxProfileMemoryEventBuffer& createMemoryEventBuffer(PxAllocatorCallback& inAllocator, uint32_t inBufferSize = 0x1000);
+	};
+
+
+
+} } // namespace physx
+
+
+#endif // PXPVDSDK_PXPROFILEMEMORY_H
+
+
diff --git a/PxShared/src/pvd/src/PxProfileMemoryBuffer.h b/PxShared/src/pvd/src/PxProfileMemoryBuffer.h
new file mode 100644
index 0000000..ae957ce
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryBuffer.h
@@ -0,0 +1,193 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYBUFFER_H
+#define PXPVDSDK_PXPROFILEMEMORYBUFFER_H
+
+#include "PxProfileBase.h"
+#include "PsAllocator.h"
+#include "foundation/PxMemory.h"
+
+namespace physx { namespace profile {
+
+	template<typename TAllocator = typename shdfnd::AllocatorTraits<uint8_t>::Type >
+	class MemoryBuffer : public TAllocator
+	{
+		uint8_t* mBegin;
+		uint8_t* mEnd;
+		uint8_t* mCapacityEnd;
+
+	public:
+		MemoryBuffer( const TAllocator& inAlloc = TAllocator() ) : TAllocator( inAlloc ), mBegin( 0 ), mEnd( 0 ), mCapacityEnd( 0 ) {}
+		~MemoryBuffer()
+		{
+			if ( mBegin ) TAllocator::deallocate( mBegin );
+		}
+		uint32_t size() const { return static_cast<uint32_t>( mEnd - mBegin ); }
+		uint32_t capacity() const { return static_cast<uint32_t>( mCapacityEnd - mBegin ); }
+		uint8_t* begin() { return mBegin; }
+		uint8_t* end() { return mEnd; }
+		void setEnd(uint8_t* nEnd) { mEnd = nEnd; }
+		const uint8_t* begin() const { return mBegin; }
+		const uint8_t* end() const { return mEnd; }
+		void clear() { mEnd = mBegin; }
+		uint32_t write( uint8_t inValue )
+		{
+			growBuf( 1 );
+			*mEnd = inValue;
+			++mEnd;
+			return 1;
+		}
+
+		template<typename TDataType>
+		uint32_t write( const TDataType& inValue )
+		{
+			uint32_t writtenSize = sizeof(TDataType);
+			growBuf(writtenSize);
+			const uint8_t* __restrict readPtr = reinterpret_cast< const uint8_t* >( &inValue );
+			uint8_t* __restrict writePtr = mEnd;
+			for ( uint32_t idx = 0; idx < sizeof(TDataType); ++idx ) writePtr[idx] = readPtr[idx];
+			mEnd += writtenSize;
+			return writtenSize;
+		}
+		
+		template<typename TDataType>
+		uint32_t write( const TDataType* inValue, uint32_t inLength )
+		{
+			if ( inValue && inLength )
+			{
+				uint32_t writeSize = inLength * sizeof( TDataType );
+				growBuf( writeSize );
+				PxMemCopy( mBegin + size(), inValue, writeSize );
+				mEnd += writeSize;
+				return writeSize;
+			}
+			return 0;
+		}
+
+		// Atomic-write helper: copies inLength elements to byte offset index without moving mEnd (the caller sets the end afterwards).
+		// Never grows the buffer, so the written range must already fit in the capacity. Returns the bytes copied, or 0 if there is nothing to copy.
+		template<typename TDataType>
+		uint32_t write(const TDataType* inValue, uint32_t inLength, int32_t index)
+		{
+			if (inValue && inLength)
+			{
+				uint32_t writeSize = inLength * sizeof(TDataType);
+				PX_ASSERT(index >= 0 && mBegin + index + writeSize <= mCapacityEnd); // a write may end exactly at the capacity end
+				PxMemCopy(mBegin + index, inValue, writeSize);
+				return writeSize;
+			}
+			return 0;
+		}
+		
+		void growBuf( uint32_t inAmount )
+		{
+			uint32_t newSize = size() + inAmount;
+			reserve( newSize );
+		}
+		void resize( uint32_t inAmount )
+		{
+			reserve( inAmount );
+			mEnd = mBegin + inAmount;
+		}
+		void reserve( uint32_t newSize ) // Reallocates when newSize >= capacity(), keeping the bytes already written; size() is unchanged.
+		{
+			uint32_t currentSize = size();
+			if ( newSize >= capacity() ) // note: also taken when newSize equals the current capacity
+			{
+				const uint32_t allocSize = mBegin ? newSize * 2 : newSize; // first allocation is exact, later ones are twice the request
+
+				uint8_t* newData = static_cast<uint8_t*>(TAllocator::allocate(allocSize, __FILE__, __LINE__)); // NOTE(review): result is used without a NULL check
+				memset(newData, 0xf,allocSize); // fill the fresh storage with the byte value 0x0f
+				if ( mBegin )
+				{
+					PxMemCopy( newData, mBegin, currentSize );
+					TAllocator::deallocate( mBegin );
+				}
+				mBegin = newData;
+				mEnd = mBegin + currentSize; // re-anchor the end pointer in the new storage
+				mCapacityEnd = mBegin + allocSize;
+			}
+		}
+	};
+
+	
+	class TempMemoryBuffer // Write cursor over storage handed to the constructor; this class never allocates, frees or grows it.
+	{
+		uint8_t* mBegin;
+		uint8_t* mEnd;
+		uint8_t* mCapacityEnd;
+
+	public:
+		TempMemoryBuffer(uint8_t* data, int32_t size) : mBegin(data), mEnd(data), mCapacityEnd(data + size) {}
+		~TempMemoryBuffer() // empty: the storage passed to the constructor is not released here
+		{			
+		}
+		uint32_t size() const { return static_cast<uint32_t>(mEnd - mBegin); }
+		uint32_t capacity() const { return static_cast<uint32_t>(mCapacityEnd - mBegin); }
+		const uint8_t* begin() { return mBegin; }
+		uint8_t* end() { return mEnd; }
+		const uint8_t* begin() const { return mBegin; }
+		const uint8_t* end() const { return mEnd; }		
+		uint32_t write(uint8_t inValue) // none of the write() overloads compare against mCapacityEnd; the caller must size the storage
+		{			
+			*mEnd = inValue;
+			++mEnd;
+			return 1;
+		}
+
+		template<typename TDataType>
+		uint32_t write(const TDataType& inValue) // copies the object byte by byte to the end and returns sizeof(TDataType)
+		{
+			uint32_t writtenSize = sizeof(TDataType);			
+			const uint8_t* __restrict readPtr = reinterpret_cast<const uint8_t*>(&inValue);
+			uint8_t* __restrict writePtr = mEnd;
+			for (uint32_t idx = 0; idx < sizeof(TDataType); ++idx) writePtr[idx] = readPtr[idx];
+			mEnd += writtenSize;
+			return writtenSize;
+		}
+
+		template<typename TDataType>
+		uint32_t write(const TDataType* inValue, uint32_t inLength) // appends inLength elements; returns the bytes written, 0 for NULL or empty input
+		{
+			if (inValue && inLength)
+			{
+				uint32_t writeSize = inLength * sizeof(TDataType);
+				PxMemCopy(mBegin + size(), inValue, writeSize);
+				mEnd += writeSize;
+				return writeSize;
+			}
+			return 0;
+		}
+	};
+
+}}
+
+#endif // PXPVDSDK_PXPROFILEMEMORYBUFFER_H
diff --git a/PxShared/src/pvd/src/PxProfileMemoryEventBuffer.h b/PxShared/src/pvd/src/PxProfileMemoryEventBuffer.h
new file mode 100644
index 0000000..7cc50b6
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryEventBuffer.h
@@ -0,0 +1,156 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYEVENTBUFFER_H
+#define PXPVDSDK_PXPROFILEMEMORYEVENTBUFFER_H
+
+#include "PxProfileDataBuffer.h"
+#include "PxProfileMemoryEvents.h"
+#include "PxProfileMemoryEventTypes.h"
+#include "PxProfileScopedMutexLock.h"
+#include "PxProfileAllocatorWrapper.h"
+
+#include "PsHash.h"
+#include "PsHashMap.h"
+#include "PsUserAllocated.h"
+
+namespace physx { namespace profile {
+
+	template<typename TMutex,
+			 typename TScopedLock>
+	class MemoryEventBuffer : public DataBuffer<TMutex, TScopedLock>
+	{
+	public:
+		typedef DataBuffer<TMutex, TScopedLock> TBaseType;
+		typedef typename TBaseType::TMutexType TMutexType;
+		typedef typename TBaseType::TScopedLockType TScopedLockType;
+		typedef typename TBaseType::TU8AllocatorType TU8AllocatorType;
+		typedef typename TBaseType::TMemoryBufferType TMemoryBufferType;
+		typedef typename TBaseType::TBufferClientArray TBufferClientArray;
+		typedef shdfnd::HashMap<const char*, uint32_t, shdfnd::Hash<const char*>, TU8AllocatorType> TCharPtrToHandleMap;
+
+	protected:
+		TCharPtrToHandleMap mStringTable;
+
+	public:
+
+		MemoryEventBuffer( PxAllocatorCallback& cback
+					, uint32_t inBufferFullAmount
+					, TMutexType* inBufferMutex )
+			: TBaseType( &cback, inBufferFullAmount, inBufferMutex, "struct physx::profile::MemoryEvent" )
+			, mStringTable( TU8AllocatorType( TBaseType::getWrapper(), "MemoryEventStringBuffer" ) )
+		{
+		}
+
+		uint32_t getHandle( const char* inData ) // Returns the handle for inData, registering the string first when it is not in the table yet.
+		{
+			if ( inData == NULL ) inData = ""; // NULL is treated as the empty string
+			const typename TCharPtrToHandleMap::Entry* result( mStringTable.find( inData ) );
+			if ( result )
+				return result->second;
+			uint32_t hdl = mStringTable.size() + 1; // new handle is the table size plus one
+			mStringTable.insert( inData, hdl ); // the key type is const char*, so the pointer is stored rather than a copy of the characters
+			StringTableEvent theEvent;
+			theEvent.init( inData, hdl );
+			sendEvent( theEvent ); // emit a string-table event for the newly registered string
+			return hdl;
+		}
+
+		void onAllocation( size_t inSize, const char* inType, const char* inFile, uint32_t inLine, uint64_t addr )
+		{
+			if ( addr == 0 )
+				return;
+			uint32_t typeHdl( getHandle( inType ) );
+			uint32_t fileHdl( getHandle( inFile ) );
+			AllocationEvent theEvent;
+			theEvent.init( inSize, typeHdl, fileHdl, inLine, addr );
+			sendEvent( theEvent );
+		}
+
+		void onDeallocation( uint64_t addr )
+		{
+			if ( addr == 0 )
+				return;
+			DeallocationEvent theEvent;
+			theEvent.init( addr );
+			sendEvent( theEvent );
+		}
+
+		void flushProfileEvents()
+		{
+			TBaseType::flushEvents();
+		}
+
+	protected:
+		
+		template<typename TDataType>
+		void sendEvent( TDataType inType )
+		{
+			MemoryEventHeader theHeader( getMemoryEventType<TDataType>() );
+			inType.setup( theHeader );
+			theHeader.streamify( TBaseType::mSerializer );
+			inType.streamify( TBaseType::mSerializer, theHeader );
+			if ( TBaseType::mDataArray.size() >= TBaseType::mBufferFullAmount )
+				flushProfileEvents();
+		}
+	};
+
+	class PxProfileMemoryEventBufferImpl : public shdfnd::UserAllocated
+		, public PxProfileMemoryEventBuffer
+	{
+		typedef MemoryEventBuffer<PxProfileEventMutex, NullLock> TMemoryBufferType;
+		TMemoryBufferType mBuffer;
+
+	public:
+		PxProfileMemoryEventBufferImpl( PxAllocatorCallback& alloc, uint32_t inBufferFullAmount )
+			: mBuffer( alloc, inBufferFullAmount, NULL )
+		{
+		}
+
+		virtual void onAllocation( size_t size, const char* typeName, const char* filename, int line, void* allocatedMemory )
+		{
+			mBuffer.onAllocation( size, typeName, filename, uint32_t(line), PX_PROFILE_POINTER_TO_U64( allocatedMemory ) );
+		}
+		virtual void onDeallocation( void* allocatedMemory )
+		{
+			mBuffer.onDeallocation( PX_PROFILE_POINTER_TO_U64( allocatedMemory ) );
+		}
+		
+		virtual void addClient( PxProfileEventBufferClient& inClient ) { mBuffer.addClient( inClient ); }
+		virtual void removeClient( PxProfileEventBufferClient& inClient ) { mBuffer.removeClient( inClient ); }
+		virtual bool hasClients() const { return mBuffer.hasClients(); }
+
+		virtual void flushProfileEvents() { mBuffer.flushProfileEvents(); }
+
+		virtual void release(){ PX_PROFILE_DELETE( mBuffer.getWrapper().getAllocator(), this ); }
+	};
+}}
+
+#endif // PXPVDSDK_PXPROFILEMEMORYEVENTBUFFER_H
diff --git a/PxShared/src/pvd/src/PxProfileMemoryEventParser.h b/PxShared/src/pvd/src/PxProfileMemoryEventParser.h
new file mode 100644
index 0000000..feb8063
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryEventParser.h
@@ -0,0 +1,185 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYEVENTPARSER_H
+#define PXPVDSDK_PXPROFILEMEMORYEVENTPARSER_H
+
+#include "PxProfileMemoryEvents.h"
+#include "PxProfileAllocatorWrapper.h"
+#include "PxProfileEventSerialization.h"
+
+#include "PsHashMap.h"
+#include "PsString.h"
+
+namespace physx { namespace profile {
+
+	template<bool TSwapBytes, typename TParserType, typename THandlerType> 
+	bool parseEventData( TParserType& inParser, const uint8_t* inData, uint32_t inLength, THandlerType* inHandler );
+
+	template<bool TSwapBytes>
+	struct MemoryEventParser
+	{
+		typedef PxProfileWrapperReflectionAllocator<uint8_t> TAllocatorType;
+		typedef shdfnd::HashMap<uint32_t, char*, shdfnd::Hash<uint32_t>, TAllocatorType > THdlToStringMap;
+		typedef EventDeserializer<TSwapBytes>	TDeserializerType;
+		
+		PxProfileAllocatorWrapper	mWrapper;
+		THdlToStringMap		mHdlToStringMap;
+		TDeserializerType	mDeserializer;
+
+		MemoryEventParser( PxAllocatorCallback& inAllocator )
+			: mWrapper( inAllocator )
+			, mHdlToStringMap( TAllocatorType( mWrapper ) )
+			, mDeserializer ( 0, 0 )
+		{
+		}
+
+		~MemoryEventParser()
+		{
+			for ( THdlToStringMap::Iterator iter( mHdlToStringMap.getIterator() ); iter.done() == false; ++iter )
+				mWrapper.getAllocator().deallocate( reinterpret_cast<void*>(iter->second) );
+		}
+
+		template<typename TOperator> // Reads a string-table event from mDeserializer, keeps a private copy of its string, then forwards the event.
+		void parse(const StringTableEvent&, const MemoryEventHeader& inHeader, TOperator& inOperator)
+		{
+			StringTableEvent evt;
+			evt.init();
+			evt.streamify( mDeserializer, inHeader );
+			uint32_t len = static_cast<uint32_t>( strlen( evt.mString ) ); // NOTE(review): assumes mString is non-NULL after streamify -- confirm
+			char* newStr = static_cast<char*>( mWrapper.getAllocator().allocate( len + 1, "const char*", __FILE__, __LINE__ ) ); // map values are deallocated in ~MemoryEventParser
+			shdfnd::strlcpy( newStr, len+1, evt.mString );
+			mHdlToStringMap[evt.mHandle] = newStr; // NOTE(review): a copy already stored under this handle is not deallocated here -- confirm handles never repeat
+			inOperator( inHeader, evt );
+		}
+
+		const char* getString( uint32_t inHdl )
+		{
+			const THdlToStringMap::Entry* entry = mHdlToStringMap.find( inHdl );
+			if ( entry ) return entry->second;
+			return "";
+		}
+
+		//Slow reverse lookup used only for testing.
+		uint32_t getHandle( const char* inStr )
+		{
+			for ( THdlToStringMap::Iterator iter = mHdlToStringMap.getIterator();
+				!iter.done();
+				++iter )
+			{
+				if ( safeStrEq( iter->second, inStr ) )
+					return iter->first;
+			}
+			return 0;
+		}
+
+		template<typename TOperator>
+		void parse(const AllocationEvent&, const MemoryEventHeader& inHeader, TOperator& inOperator)
+		{
+			AllocationEvent evt;
+			evt.streamify( mDeserializer, inHeader );
+			inOperator( inHeader, evt );
+		}
+
+		template<typename TOperator>
+		void parse(const DeallocationEvent&, const MemoryEventHeader& inHeader, TOperator& inOperator)
+		{
+			DeallocationEvent evt;
+			evt.streamify( mDeserializer, inHeader );
+			inOperator( inHeader, evt );
+		}
+
+		template<typename TOperator>
+		void parse(const FullAllocationEvent&, const MemoryEventHeader&, TOperator& )
+		{
+			PX_ASSERT( false ); //will never happen.
+		}
+
+		template<typename THandlerType>
+		void parseEventData( const uint8_t* inData, uint32_t inLength, THandlerType* inOperator )
+		{
+			physx::profile::parseEventData<TSwapBytes>( *this, inData, inLength, inOperator );
+		}
+	};
+	
+
+	template<typename THandlerType, bool TSwapBytes>
+	struct MemoryEventParseOperator
+	{
+		MemoryEventParser<TSwapBytes>* mParser;
+		THandlerType* mOperator;
+		MemoryEventHeader* mHeader;
+		MemoryEventParseOperator( MemoryEventParser<TSwapBytes>* inParser, THandlerType* inOperator, MemoryEventHeader* inHeader )
+			: mParser( inParser )
+			, mOperator( inOperator )
+			, mHeader( inHeader )
+		{
+		}
+
+		bool wasSuccessful() { return mParser->mDeserializer.mFail == false; }
+
+		bool parseHeader()
+		{
+			mHeader->streamify( mParser->mDeserializer );
+			return wasSuccessful();
+		}
+
+		template<typename TDataType>
+		bool operator()( const TDataType& inType )
+		{
+			mParser->parse( inType, *mHeader, *mOperator );
+			return wasSuccessful();
+		}
+		
+		bool operator()( uint8_t ) { PX_ASSERT( false ); return false;}
+	};
+
+	template<bool TSwapBytes, typename TParserType, typename THandlerType> // Parses the events in inData (inLength bytes), passing each to inHandler; returns false once a failure is flagged.
+	inline bool parseEventData( TParserType& inParser, const uint8_t* inData, uint32_t inLength, THandlerType* inHandler )
+	{
+		inParser.mDeserializer = EventDeserializer<TSwapBytes>( inData, inLength );
+		MemoryEvent::EventData crapData; // never initialized; presumably only lets visit() pick the parse() overload for the header's type -- confirm
+		uint32_t eventCount = 0; // incremented once per loop iteration but never read
+		MemoryEventHeader theHeader;
+		MemoryEventParseOperator<THandlerType, TSwapBytes> theOp( &inParser, inHandler, &theHeader );
+		while( inParser.mDeserializer.mLength && inParser.mDeserializer.mFail == false) // loop while mLength is non-zero and no failure is flagged
+		{
+			if ( theOp.parseHeader() )
+			{
+				if( visit<bool>( theHeader.getType(), crapData, theOp ) == false )
+					inParser.mDeserializer.mFail = true;
+			}
+			++eventCount;
+		}
+		return inParser.mDeserializer.mFail == false;
+	}
+}}
+
+#endif // PXPVDSDK_PXPROFILEMEMORYEVENTPARSER_H
diff --git a/PxShared/src/pvd/src/PxProfileMemoryEventRecorder.h b/PxShared/src/pvd/src/PxProfileMemoryEventRecorder.h
new file mode 100644
index 0000000..a3d1ed8
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryEventRecorder.h
@@ -0,0 +1,147 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYEVENTRECORDER_H
+#define PXPVDSDK_PXPROFILEMEMORYEVENTRECORDER_H
+
+
+#include "PxProfileBase.h"
+#include "PxProfileAllocatorWrapper.h"
+#include "PxProfileMemoryEvents.h"
+#include "PxProfileMemoryEventTypes.h"
+
+#include "PsHashMap.h"
+#include "PsUserAllocated.h"
+#include "PsBroadcast.h"
+#include "PxProfileMemory.h"
+
+namespace physx { namespace profile {
+
+	//Remembers every allocation that has not been freed yet. Allocations and deallocations
+	//are also passed straight on to the attached listener, and a newly attached listener is
+	//first sent all allocations that are still outstanding.
+	struct MemoryEventRecorder : public shdfnd::AllocationListener
+	{
+		typedef PxProfileWrapperReflectionAllocator<uint8_t> TAllocatorType;
+		typedef shdfnd::HashMap<uint64_t,FullAllocationEvent,shdfnd::Hash<uint64_t>,TAllocatorType> THashMapType;
+
+		PxProfileAllocatorWrapper		mWrapper;
+		THashMapType			mOutstandingAllocations;	//keyed by allocation address
+		AllocationListener*		mListener;					//NULL while nobody is attached
+
+		MemoryEventRecorder( PxAllocatorCallback* inFoundation )
+			: mWrapper( inFoundation ), mOutstandingAllocations( TAllocatorType( mWrapper ) ), mListener( NULL )
+		{
+		}
+
+		static uint64_t ToU64( void* inData ) { return PX_PROFILE_POINTER_TO_U64( inData ); }
+		static void* ToVoidPtr( uint64_t inData ) { return reinterpret_cast<void*>(size_t(inData)); }
+
+		//Listener callback: converts line and address to integers and calls the overload below.
+		virtual void onAllocation( size_t size, const char* typeName, const char* filename, int line, void* allocatedMemory )
+		{
+			onAllocation( size, typeName, filename, uint32_t(line), ToU64( allocatedMemory ) );
+		}
+		
+		//Records the allocation (address 0 is ignored) and forwards it to the listener, if any.
+		void onAllocation( size_t size, const char* typeName, const char* filename, uint32_t line, uint64_t allocatedMemory )
+		{
+			if ( allocatedMemory == 0 )
+				return;
+			FullAllocationEvent record;
+			record.init( size, typeName, filename, line, allocatedMemory );
+			mOutstandingAllocations.insert( allocatedMemory, record );
+			if ( mListener )
+				mListener->onAllocation( size, typeName, filename, int(line), ToVoidPtr(allocatedMemory) );
+		}
+		
+		virtual void onDeallocation( void* allocatedMemory )
+		{
+			onDeallocation( ToU64( allocatedMemory ) );
+		}
+
+		void onDeallocation( uint64_t allocatedMemory )
+		{
+			if ( allocatedMemory == 0 )
+				return;
+			mOutstandingAllocations.erase( allocatedMemory ); //the address is no longer outstanding
+			if ( mListener )
+				mListener->onDeallocation( ToVoidPtr( allocatedMemory ) );
+		}
+
+		void flushProfileEvents() {}
+
+		//Replaces the listener; a non-NULL listener is sent every outstanding allocation.
+		void setListener( AllocationListener* inListener )
+		{
+			mListener = inListener;
+			if ( !mListener )
+				return;
+			for ( THashMapType::Iterator iter = mOutstandingAllocations.getIterator(); !iter.done(); ++iter )
+			{
+				const FullAllocationEvent& evt( iter->second );
+				mListener->onAllocation( evt.mSize, evt.mType, evt.mFile, int(evt.mLine), ToVoidPtr( evt.mAddress ) );
+			}
+		}
+	};
+
+	class PxProfileMemoryEventRecorderImpl : public shdfnd::UserAllocated //implements PxProfileMemoryEventRecorder by delegating to mRecorder
+											, public physx::profile::PxProfileMemoryEventRecorder
+	{
+		MemoryEventRecorder mRecorder; //does the bookkeeping and the listener forwarding
+	public:
+		PxProfileMemoryEventRecorderImpl( PxAllocatorCallback* inFnd )
+			: mRecorder( inFnd )
+		{
+		}
+
+		virtual void onAllocation( size_t size, const char* typeName, const char* filename, int line, void* allocatedMemory )
+		{
+			mRecorder.onAllocation( size, typeName, filename, line, allocatedMemory ); //resolves to the (int, void*) overload
+		}
+
+		virtual void onDeallocation( void* allocatedMemory )
+		{
+			mRecorder.onDeallocation( allocatedMemory );
+		}
+		
+		virtual void setListener( AllocationListener* inListener )
+		{
+			mRecorder.setListener( inListener ); //also replays outstanding allocations to a non-NULL listener
+		}
+
+		virtual void release()
+		{
+			PX_PROFILE_DELETE( mRecorder.mWrapper.getAllocator(), this ); //NOTE(review): presumably destroys and frees this object through that allocator - macro is defined elsewhere
+		}
+	};
+
+}}
+#endif // PXPVDSDK_PXPROFILEMEMORYEVENTRECORDER_H
diff --git a/PxShared/src/pvd/src/PxProfileMemoryEventReflexiveWriter.h b/PxShared/src/pvd/src/PxProfileMemoryEventReflexiveWriter.h
new file mode 100644
index 0000000..75fbd03
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryEventReflexiveWriter.h
@@ -0,0 +1,71 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYEVENTREFLEXIVEWRITER_H
+#define PXPVDSDK_PXPROFILEMEMORYEVENTREFLEXIVEWRITER_H
+
+#include "PxProfileMemoryBuffer.h"
+#include "PxProfileFoundationWrapper.h"
+#include "PxProfileMemoryEvents.h"
+
+namespace physx { namespace profile {
+
+	//Event handler that serializes every (header, event) pair it is given through
+	//mSerializer, which is constructed over mBuffer.
+	struct MemoryEventReflexiveWriter
+	{
+		typedef PxProfileWrapperReflectionAllocator<uint8_t>	TAllocatorType;
+		typedef MemoryBuffer<TAllocatorType>		TMemoryBufferType;
+		typedef EventSerializer<TMemoryBufferType>	TSerializerType;
+
+		PxProfileAllocatorWrapper	mWrapper;
+		TMemoryBufferType	mBuffer;
+		TSerializerType		mSerializer;
+
+		MemoryEventReflexiveWriter( PxAllocatorCallback* inFoundation )
+			: mWrapper( inFoundation ), mBuffer( TAllocatorType( mWrapper ) ), mSerializer( &mBuffer )
+		{
+		}
+
+		//Writes inHeader, then inType, to mSerializer.
+		template<typename TDataType>
+		void operator()( const MemoryEventHeader& inHeader, const TDataType& inType )
+		{
+			//streamify() is a non-const member, so work on mutable copies of both arguments.
+			MemoryEventHeader headerCopy( inHeader );
+			TDataType dataCopy( inType );
+
+			//Header first, then the event body (which is handed the header).
+			headerCopy.streamify( mSerializer );
+			dataCopy.streamify( mSerializer, headerCopy );
+		}
+	};
+}}
+
+#endif // PXPVDSDK_PXPROFILEMEMORYEVENTREFLEXIVEWRITER_H
+\ No newline at end of file
diff --git a/PxShared/src/pvd/src/PxProfileMemoryEventSummarizer.h b/PxShared/src/pvd/src/PxProfileMemoryEventSummarizer.h
new file mode 100644
index 0000000..788636e
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryEventSummarizer.h
@@ -0,0 +1,304 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYEVENTSUMMARIZER_H
+#define PXPVDSDK_PXPROFILEMEMORYEVENTSUMMARIZER_H
+
+#include "PxProfileBase.h"
+#include "PxProfileAllocatorWrapper.h"
+#include "PxProfileMemoryEvents.h"
+#include "PxProfileMemoryEventRecorder.h"
+#include "PxProfileMemoryEventParser.h"
+
+#include "PsHashMap.h"
+
+namespace physx { namespace profile {
+
+	struct MemoryEventSummarizerEntry //identifies one allocation site by type, file and line
+	{
+		uint32_t		mType; //same value as AllocationEvent::mType
+		uint32_t		mFile; //same value as AllocationEvent::mFile
+		uint32_t		mLine; //same value as AllocationEvent::mLine
+
+		MemoryEventSummarizerEntry( const AllocationEvent& evt ) //copies the three key fields out of evt
+			: mType( evt.mType )
+			, mFile( evt.mFile )
+			, mLine( evt.mLine )
+		{
+		}
+
+		MemoryEventSummarizerEntry( uint32_t tp, uint32_t f, uint32_t line ) //builds the key from explicit values
+			: mType( tp )
+			, mFile( f )
+			, mLine( line )
+		{
+		}
+	};
+}}
+
+
+namespace physx { namespace shdfnd {
+
+	template <>
+	struct Hash<physx::profile::MemoryEventSummarizerEntry> //hash + equality functor so the entry can key a shdfnd::HashMap
+	{
+	public:
+		uint32_t operator()(const physx::profile::MemoryEventSummarizerEntry& entry) const
+		{
+			//Order-dependent combine: a plain XOR of the three hashes cancels out whenever
+			//mType == mFile and collides for entries that only swap those two values.
+			uint32_t combined = Hash<uint32_t>()( entry.mType );
+			combined = combined * 31u + Hash<uint32_t>()( entry.mFile );
+			combined = combined * 31u + Hash<uint32_t>()( entry.mLine );
+			return combined;
+		}
+
+		bool operator()(const physx::profile::MemoryEventSummarizerEntry& lhs, const physx::profile::MemoryEventSummarizerEntry& rhs) const
+		{
+			return equal( lhs, rhs ); //single definition of field-wise equality
+		}
+
+		bool equal(const physx::profile::MemoryEventSummarizerEntry& lhs, const physx::profile::MemoryEventSummarizerEntry& rhs) const
+		{
+			return lhs.mType == rhs.mType
+				&& lhs.mFile == rhs.mFile
+				&& lhs.mLine == rhs.mLine;
+		}
+	};
+}}
+
+namespace physx { namespace profile {
+
+	struct MemoryEventSummarizerAllocatedValue //what MemoryEventSummarizer stores per outstanding allocation
+	{
+		MemoryEventSummarizerEntry	mEntry; //allocation site the size was added to
+		uint32_t						mSize; //size that is subtracted again on deallocation
+		MemoryEventSummarizerAllocatedValue( MemoryEventSummarizerEntry en, uint32_t sz )
+			: mEntry( en )
+			, mSize( sz )
+		{
+		}
+	};
+
+	template<typename TSummarizerType>
+	struct SummarizerParseHandler //parse handler that relays every event to TSummarizerType::handleParsedData
+	{
+		TSummarizerType* mSummarizer; //target of the relay
+		SummarizerParseHandler( TSummarizerType* inType )
+			: mSummarizer( inType )
+		{
+		}
+		template<typename TDataType>
+		void operator()( const MemoryEventHeader& inHeader, const TDataType& inType )
+		{
+			mSummarizer->handleParsedData( inHeader, inType ); //header and event are passed through unchanged
+		}
+	};
+
+	//Event handler that relays every (header, event) pair to a TForwardType held by pointer.
+	template<typename TForwardType>
+	struct MemoryEventForward
+	{
+		TForwardType* mForward;
+		MemoryEventForward( TForwardType& inForward )
+			: mForward( &inForward )
+		{
+		}
+		template<typename TDataType>
+		void operator()( const MemoryEventHeader& inHeader, const TDataType& inType )
+		{
+			( *mForward )( inHeader, inType );
+		}
+	};
+
+	struct NullMemoryEventHandler //event handler that ignores every event it is given
+	{
+		template<typename TDataType>
+		void operator()( const MemoryEventHeader&, const TDataType&)
+		{
+		}
+	};
+
+	//New-entry callback that relays its four arguments to a TForwardType held by pointer.
+	template<typename TForwardType>
+	struct NewEntryOperatorForward
+	{
+		TForwardType* mForward;
+		NewEntryOperatorForward( TForwardType& inForward )
+			: mForward( &inForward )
+		{
+		}
+		void operator()( const MemoryEventSummarizerEntry& inEntry, const char* inTypeStr, const char* inFileStr, uint32_t inTotalsArrayIndex )
+		{
+			( *mForward )( inEntry, inTypeStr, inFileStr, inTotalsArrayIndex );
+		}
+	};
+
+	struct NullNewEntryOperator //new-entry callback that does nothing
+	{
+		void operator()( const MemoryEventSummarizerEntry&, const char*, const char*, uint32_t)
+		{
+		}
+	};
+
+	//Consumes a stream of serialized memory events, forwards every parsed event to
+	//mEventHandler and keeps a running byte total for each distinct allocation site
+	//(type, file, line). Byte swapping while parsing is selected by TSwapBytes.
+	//mNewEntryOperator is called the first time a site is seen, with the index of the
+	//site's slot in mTotalsArray. Live allocations are remembered by address so that a
+	//later deallocation can be subtracted from the right site.
+	template<bool TSwapBytes
+			, typename TNewEntryOperator
+			, typename MemoryEventHandler>
+	struct MemoryEventSummarizer : public PxProfileEventBufferClient
+	{
+		typedef MemoryEventSummarizer< TSwapBytes, TNewEntryOperator, MemoryEventHandler > TThisType;
+		typedef PxProfileWrapperReflectionAllocator<MemoryEventSummarizerEntry> TAllocatorType;
+		typedef shdfnd::HashMap<MemoryEventSummarizerEntry, uint32_t, shdfnd::Hash<MemoryEventSummarizerEntry>, TAllocatorType> TSummarizeEntryToU32Hash;
+		typedef shdfnd::HashMap<uint64_t, MemoryEventSummarizerAllocatedValue, shdfnd::Hash<uint64_t>, TAllocatorType> TU64ToSummarizerValueHash;
+		PxProfileAllocatorWrapper mWrapper;
+		TSummarizeEntryToU32Hash		mEntryIndexHash;			//allocation site -> index into mTotalsArray
+		PxProfileArray<int32_t>				mTotalsArray;			//running byte total per allocation site
+		MemoryEventParser<TSwapBytes>	mParser;
+		TU64ToSummarizerValueHash		mOutstandingAllocations;	//address -> site and size of each live allocation
+		TNewEntryOperator				mNewEntryOperator;
+		MemoryEventHandler				mEventHandler;
+
+		//inAllocator backs the containers and the parser; both functors are stored by value.
+		MemoryEventSummarizer( PxAllocatorCallback& inAllocator
+								, TNewEntryOperator inNewEntryOperator
+								, MemoryEventHandler inEventHandler)
+			: mWrapper( inAllocator )
+			, mEntryIndexHash( TAllocatorType( mWrapper ) )
+			, mTotalsArray( mWrapper )
+			, mParser( inAllocator )
+			, mOutstandingAllocations( TAllocatorType( mWrapper ) )
+			, mNewEntryOperator( inNewEntryOperator )
+			, mEventHandler( inEventHandler )
+		{
+		}
+		virtual ~MemoryEventSummarizer(){}
+
+		//Parse this data block, endian-converting it if necessary,
+		//and route every parsed event to handleParsedData().
+		void handleData( const uint8_t* inData, uint32_t inLen )
+		{
+			SummarizerParseHandler<TThisType> theHandler( this );
+			parseEventData<TSwapBytes>( mParser, inData, inLen, &theHandler );
+		}
+
+		template<typename TDataType>
+		void handleParsedData( const MemoryEventHeader& inHeader, const TDataType& inData )
+		{
+			//forward it to someone who might care
+			mEventHandler( inHeader, inData );
+			//handle the parsed data.
+			doHandleParsedData( inData );
+		}
+
+		//Only allocation and deallocation events change the totals; any other event type is ignored.
+		template<typename TDataType>
+		void doHandleParsedData( const TDataType& ) {}
+		void doHandleParsedData( const AllocationEvent& inEvt ) { onAllocation( inEvt.mSize, inEvt.mType, inEvt.mFile, inEvt.mLine, inEvt.mAddress ); }
+		void doHandleParsedData( const DeallocationEvent& inEvt ) { onDeallocation( inEvt.mAddress ); }
+
+		//Returns the mTotalsArray index for inEvent. A site that has not been seen before
+		//gets a new zeroed slot and is announced to mNewEntryOperator.
+		uint32_t getOrCreateEntryIndex( const MemoryEventSummarizerEntry& inEvent )
+		{
+			uint32_t index = 0;
+			const TSummarizeEntryToU32Hash::Entry* entry( mEntryIndexHash.find(inEvent ) );
+			if ( !entry )
+			{
+				index = mTotalsArray.size();
+				mTotalsArray.pushBack( 0 );
+				mEntryIndexHash.insert( inEvent, index );
+
+				//Force a string lookup and such here.
+				mNewEntryOperator( inEvent, mParser.getString( inEvent.mType), mParser.getString( inEvent.mFile ), index );
+			}
+			else
+				index = entry->second;
+			return index;
+		}
+
+		//Keep a running total of what is going on, letting a listener know when new events happen.
+		void onMemoryEvent( const MemoryEventSummarizerEntry& inEvent, int32_t inSize )
+		{
+			MemoryEventSummarizerEntry theEntry( inEvent );
+			uint32_t index = getOrCreateEntryIndex( theEntry );
+			mTotalsArray[index] += inSize;
+		}
+
+		//Adds inSize to the site's total and remembers the allocation by address.
+		void onAllocation( uint32_t inSize, uint32_t inType, uint32_t inFile, uint32_t inLine, uint64_t inAddress )
+		{
+			//If this address is still recorded as live (its free was never seen, or it is reported
+			//twice), retire the old record first; otherwise one of the two sizes would stay in the totals forever.
+			onDeallocation( inAddress );
+			MemoryEventSummarizerEntry theEntry( inType, inFile, inLine );
+			onMemoryEvent( theEntry, static_cast<int32_t>( inSize ) );
+			mOutstandingAllocations.insert( inAddress, MemoryEventSummarizerAllocatedValue( theEntry, inSize ) );
+		}
+
+		//Subtracts the recorded size from the owning site's total and forgets the address.
+		void onDeallocation( uint64_t inAddress )
+		{
+			const TU64ToSummarizerValueHash::Entry* existing( mOutstandingAllocations.find( inAddress ) );
+			if ( existing )
+			{
+				const MemoryEventSummarizerAllocatedValue& data( existing->second );
+				onMemoryEvent( data.mEntry, -1 * static_cast<int32_t>( data.mSize ) );
+				mOutstandingAllocations.erase( inAddress );
+			}
+			//Not much we can do with a deallocation when we didn't track the allocation.
+		}
+
+		//Current total for the given site. Note that an unknown site is created (and
+		//reported to mNewEntryOperator) as a side effect, with a total of 0.
+		int32_t getTypeTotal( const char* inTypeName, const char* inFilename, uint32_t inLine )
+		{
+			uint32_t theType( mParser.getHandle( inTypeName ) );
+			uint32_t theFile( mParser.getHandle( inFilename ) );
+			uint32_t index = getOrCreateEntryIndex( MemoryEventSummarizerEntry( theType, theFile, inLine ) );
+			return mTotalsArray[index];
+		}
+
+		virtual void handleBufferFlush( const uint8_t* inData, uint32_t inLength )
+		{
+			handleData( inData, inLength );
+		}
+		
+		virtual void handleClientRemoved() {}
+	};
+
+}}
+
+#endif // PXPVDSDK_PXPROFILEMEMORYEVENTSUMMARIZER_H
diff --git a/PxShared/src/pvd/src/PxProfileMemoryEventTypes.h b/PxShared/src/pvd/src/PxProfileMemoryEventTypes.h
new file mode 100644
index 0000000..c737451
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryEventTypes.h
@@ -0,0 +1,90 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYEVENTTYPES_H
+#define PXPVDSDK_PXPROFILEMEMORYEVENTTYPES_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventBufferClientManager.h"
+#include "PxProfileEventSender.h"
+#include "PsBroadcast.h"
+
+namespace physx { namespace profile {
+
+	struct PxProfileMemoryEventType //kind tag carried by PxProfileBulkMemoryEvent::mType
+	{
+		enum Enum
+		{
+			Unknown = 0,
+			Allocation, //set by the five-argument PxProfileBulkMemoryEvent constructor
+			Deallocation //set by the address-only PxProfileBulkMemoryEvent constructor
+		};
+	};
+
+	struct PxProfileBulkMemoryEvent //one allocation or deallocation as handed to PxProfileBulkMemoryEventHandler
+	{
+		uint64_t mAddress;
+		uint32_t mDatatype;
+		uint32_t mFile;
+		uint32_t mLine;
+		uint32_t mSize;
+		PxProfileMemoryEventType::Enum mType;
+		//Default-constructed events are zeroed and typed Unknown so no member is ever read uninitialized.
+		PxProfileBulkMemoryEvent() : mAddress( 0 ), mDatatype( 0 ), mFile( 0 ), mLine( 0 ), mSize( 0 ), mType( PxProfileMemoryEventType::Unknown ) {}
+		//Builds an Allocation event.
+		PxProfileBulkMemoryEvent( uint32_t size, uint32_t type, uint32_t file, uint32_t line, uint64_t addr )
+			: mAddress( addr )
+			, mDatatype( type )
+			, mFile( file )
+			, mLine( line )
+			, mSize( size )
+			, mType( PxProfileMemoryEventType::Allocation )
+		{
+		}
+		//Builds a Deallocation event; every field other than the address is zero.
+		PxProfileBulkMemoryEvent( uint64_t addr )
+			: mAddress( addr )
+			, mDatatype( 0 )
+			, mFile( 0 )
+			, mLine( 0 )
+			, mSize( 0 )
+			, mType( PxProfileMemoryEventType::Deallocation )
+		{
+		}
+	};
+	
+	class PxProfileBulkMemoryEventHandler //interface for receiving memory events as arrays of PxProfileBulkMemoryEvent
+	{
+	protected:
+		virtual ~PxProfileBulkMemoryEventHandler(){} //protected: handlers cannot be deleted through this interface
+	public:
+		virtual void handleEvents( const PxProfileBulkMemoryEvent* inEvents, uint32_t inBufferSize ) = 0; //NOTE(review): inBufferSize is presumably the number of events in inEvents - confirm against the implementation
+		static void parseEventBuffer( const uint8_t* inBuffer, uint32_t inBufferSize, PxProfileBulkMemoryEventHandler& inHandler, bool inSwapBytes, PxAllocatorCallback* inAlloc ); //declared only; defined outside this header
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILEMEMORYEVENTTYPES_H
diff --git a/PxShared/src/pvd/src/PxProfileMemoryEvents.h b/PxShared/src/pvd/src/PxProfileMemoryEvents.h
new file mode 100644
index 0000000..6fcb032
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileMemoryEvents.h
@@ -0,0 +1,411 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEMEMORYEVENTS_H
+#define PXPVDSDK_PXPROFILEMEMORYEVENTS_H
+
+#include "PxProfileEvents.h"
+
+//Memory events define their own event stream
+
+namespace physx { namespace profile {
+	struct MemoryEventTypes //event kinds; MemoryEventHeader stores this value in a 4 bit field
+	{
+		enum Enum
+		{
+			Unknown = 0, //also the default type of a MemoryEventHeader
+			StringTableEvent, //introduce a new mapping of const char* -> integer
+			AllocationEvent,
+			DeallocationEvent,
+			FullAllocationEvent
+		};
+	};
+
+	//Narrows inType to a byte; the assert checks that the *original* value fits in numBits bits.
+	template<unsigned numBits, typename TDataType>
+	inline unsigned char convertToNBits( TDataType inType )
+	{
+		PX_ASSERT( static_cast<uint64_t>( inType ) < ( uint64_t(1) << numBits ) ); //checked before the cast so truncation cannot hide an overflow
+		return static_cast<uint8_t>( inType );
+	}
+
+	template<typename TDataType>
+	inline unsigned char convertToTwoBits( TDataType inType )
+	{
+		return convertToNBits<2>( inType ); //2 bit variant; used for the compression flags
+	}
+
+	template<typename TDataType>
+	inline unsigned char convertToFourBits( TDataType inType )
+	{
+		return convertToNBits<4>( inType ); //4 bit variant; used for the event type
+	}
+
+	inline EventStreamCompressionFlags::Enum fromNumber( uint8_t inNum ) { return static_cast<EventStreamCompressionFlags::Enum>( inNum ); } //plain cast, no range check
+	
+	template<unsigned lhs, unsigned rhs>
+	inline void compileCheckSize()
+	{
+		PX_COMPILE_TIME_ASSERT( lhs <= rhs ); //compile-time check on the two template arguments
+	}
+
+	//Reads and writes a TNumBits-wide field located TOffset bits into a TDataType value.
+	template<typename TDataType
+			, uint8_t TNumBits
+			, uint8_t TOffset
+			, typename TInputType>
+	struct BitMaskSetter
+	{
+		//Create a mask that covers the field's bits at their shifted position.
+		static TDataType createOffsetMask() { return TDataType(createMask() << TOffset); }
+		//Create a mask of TNumBits low bits.
+		static TDataType createMask() { return static_cast<TDataType>((1 << TNumBits) - 1); }
+		void setValue( TDataType& inCurrent, TInputType inData )
+		{
+			PX_ASSERT( inData < ( 1 << TNumBits ) );
+			//If the assert is compiled out, the masking below keeps bad input inside this field.
+			//Create a mask to remove the current value.
+			TDataType theMask = TDataType(~(createOffsetMask()));
+			//Clear out current value.
+			inCurrent = TDataType(inCurrent & theMask);
+			//Create the new value, limited to TNumBits bits.
+			TDataType theAddition = static_cast<TDataType>( ( inData & createMask() ) << TOffset );
+			//or it into the existing value.
+			inCurrent = TDataType(inCurrent | theAddition);
+		}
+
+		TInputType getValue( TDataType inCurrent )
+		{
+			return static_cast<TInputType>( ( inCurrent >> TOffset ) & createMask() );
+		}
+	};
+
+
+	struct MemoryEventHeader //16 bit header that is streamed in front of every memory event
+	{
+		uint16_t mValue; //bits 0-3: event type; bits 4-13: five 2 bit compression flags (see the typedefs below)
+
+		typedef BitMaskSetter<uint16_t, 4, 0, uint8_t> TTypeBitmask;
+		typedef BitMaskSetter<uint16_t, 2, 4, uint8_t> TAddrCompressBitmask;
+		typedef BitMaskSetter<uint16_t, 2, 6, uint8_t> TTypeCompressBitmask;
+		typedef BitMaskSetter<uint16_t, 2, 8, uint8_t> TFnameCompressBitmask;
+		typedef BitMaskSetter<uint16_t, 2, 10, uint8_t> TSizeCompressBitmask;
+		typedef BitMaskSetter<uint16_t, 2, 12, uint8_t> TLineCompressBitmask;
+
+		//That leaves size as the only thing not compressed usually.
+
+		MemoryEventHeader( MemoryEventTypes::Enum inType = MemoryEventTypes::Unknown ) 
+			: mValue( 0 )
+		{
+			uint8_t defaultCompression( convertToTwoBits( EventStreamCompressionFlags::U64 ) ); //every flag starts out as U64
+			TTypeBitmask().setValue( mValue, convertToFourBits( inType ) );
+			TAddrCompressBitmask().setValue( mValue, defaultCompression );
+			TTypeCompressBitmask().setValue( mValue, defaultCompression );
+			TFnameCompressBitmask().setValue( mValue, defaultCompression );
+			TSizeCompressBitmask().setValue( mValue, defaultCompression );
+			TLineCompressBitmask().setValue( mValue, defaultCompression );
+		}
+
+		MemoryEventTypes::Enum getType() const { return static_cast<MemoryEventTypes::Enum>( TTypeBitmask().getValue( mValue ) ); } //decodes the 4 bit type field
+
+#define DEFINE_MEMORY_HEADER_COMPRESSION_ACCESSOR( name )																			\
+	void set##name( EventStreamCompressionFlags::Enum inEnum ) { T##name##Bitmask().setValue( mValue, convertToTwoBits( inEnum ) ); }	\
+		EventStreamCompressionFlags::Enum get##name() const { return fromNumber( T##name##Bitmask().getValue( mValue ) ); }
+
+		DEFINE_MEMORY_HEADER_COMPRESSION_ACCESSOR( AddrCompress ) //generates setAddrCompress / getAddrCompress
+		DEFINE_MEMORY_HEADER_COMPRESSION_ACCESSOR( TypeCompress ) //generates setTypeCompress / getTypeCompress
+		DEFINE_MEMORY_HEADER_COMPRESSION_ACCESSOR( FnameCompress ) //generates setFnameCompress / getFnameCompress
+		DEFINE_MEMORY_HEADER_COMPRESSION_ACCESSOR( SizeCompress ) //generates setSizeCompress / getSizeCompress
+		DEFINE_MEMORY_HEADER_COMPRESSION_ACCESSOR( LineCompress ) //generates setLineCompress / getLineCompress
+
+#undef DEFINE_MEMORY_HEADER_COMPRESSION_ACCESSOR
+
+		bool operator==( const MemoryEventHeader& inOther ) const 
+		{ 
+			return mValue == inOther.mValue; 
+		}
+		template<typename TStreamType>
+		void streamify( TStreamType& inStream ) 
+		{ 
+			inStream.streamify( "Header", mValue ); //the packed 16 bit value is the only thing streamed
+		}
+	};
+	
+	//Declaration of type level getMemoryEventType function that maps enumeration event types to datatypes
+	template<typename TDataType>
+	inline MemoryEventTypes::Enum getMemoryEventType() { PX_ASSERT( false ); return MemoryEventTypes::Unknown; } //primary template: asserts; event structs supply specializations
+
+	//Null-tolerant string comparison.
+	//Returns true when both pointers are identical (including both NULL) or when both
+	//are non-NULL and their contents match. A NULL pointer is deliberately not treated
+	//as equal to the empty string "".
+	inline bool safeStrEq( const char* lhs, const char* rhs )
+	{
+		if ( lhs != rhs )
+		{
+			//Distinct pointers: both must be non-NULL before the contents can be compared.
+			return lhs && rhs && ::strcmp( lhs, rhs ) == 0;
+		}
+		return true;
+	}
+
+	//Event that introduces a mapping from a string to an integer handle.
+	struct StringTableEvent
+	{
+		const char* mString;
+		uint32_t		mHandle;
+
+		void init( const char* inStr = "", uint32_t inHdl = 0 )
+		{
+			mHandle = inHdl;
+			mString = inStr;
+		}
+
+		void init( const StringTableEvent& inData ) { init( inData.mString, inData.mHandle ); } //copies the pointer, not the characters
+
+		//Handles must match and the strings must compare equal via safeStrEq.
+		bool operator==( const StringTableEvent& inOther ) const
+		{
+			if ( mHandle != inOther.mHandle )
+				return false;
+			return safeStrEq( mString, inOther.mString );
+		}
+
+		void setup( MemoryEventHeader& ) const {}
+
+		//Streams the string, then the handle; the header argument is not used.
+		template<typename TStreamType>
+		void streamify( TStreamType& inStream, const MemoryEventHeader& )
+		{
+			inStream.streamify( "String", mString );
+			inStream.streamify( "Handle", mHandle );
+		}
+	};
+	template<> inline MemoryEventTypes::Enum getMemoryEventType<StringTableEvent>() { return MemoryEventTypes::StringTableEvent; } //ties the struct to its enum value
+
+	//Base of the allocation event structs: the address the event refers to.
+	struct MemoryEventData
+	{
+		uint64_t mAddress;
+
+		void init( uint64_t addr )
+		{
+			mAddress = addr;
+		}
+		void init( const MemoryEventData& inData ) { init( inData.mAddress ); }
+
+		bool operator==( const MemoryEventData& inOther ) const
+		{
+			return inOther.mAddress == mAddress;
+		}
+
+		//Stores in the header the compression flag that findCompressionValue picks for mAddress.
+		void setup( MemoryEventHeader& inHeader ) const
+		{
+			inHeader.setAddrCompress( findCompressionValue( mAddress ) );
+		}
+
+		//Streams mAddress using the address compression flag carried by the header.
+		template<typename TStreamType>
+		void streamify( TStreamType& inStream, const MemoryEventHeader& inHeader )
+		{
+			inStream.streamify( "Address", mAddress, inHeader.getAddrCompress() );
+		}
+	};
+
+	//Allocation record in its compact form: size, type, file and line as 32 bit values,
+	//on top of the address held by the MemoryEventData base.
+	struct AllocationEvent : public MemoryEventData
+	{
+		uint32_t mSize;
+		uint32_t mType;
+		uint32_t mFile;
+		uint32_t mLine;
+
+		void init( size_t size = 0, uint32_t type = 0, uint32_t file = 0, uint32_t line = 0, uint64_t addr = 0 )
+		{
+			MemoryEventData::init( addr );
+			mSize = static_cast<uint32_t>( size ); //size is narrowed to 32 bits
+			mType = type;
+			mFile = file;
+			mLine = line;
+		}
+
+		void init( const AllocationEvent& inData )
+		{
+			init( inData.mSize, inData.mType, inData.mFile, inData.mLine, inData.mAddress );
+		}
+
+		bool operator==( const AllocationEvent& inOther ) const
+		{
+			if ( !MemoryEventData::operator==( inOther ) )
+				return false;
+			return mSize == inOther.mSize && mType == inOther.mType
+				&& mFile == inOther.mFile && mLine == inOther.mLine;
+		}
+
+		//Picks a compression flag for each field and stores it in the header.
+		void setup( MemoryEventHeader& inHeader ) const
+		{
+			inHeader.setTypeCompress( findCompressionValue( mType ) );
+			inHeader.setFnameCompress( findCompressionValue( mFile ) );
+			inHeader.setSizeCompress( findCompressionValue( mSize ) );
+			inHeader.setLineCompress( findCompressionValue( mLine ) );
+			MemoryEventData::setup( inHeader );
+		}
+
+		//Stream order is size, type, file, line, then the base class address.
+		template<typename TStreamType>
+		void streamify( TStreamType& inStream, const MemoryEventHeader& inHeader )
+		{
+			inStream.streamify( "Size", mSize, inHeader.getSizeCompress() );
+			inStream.streamify( "Type", mType, inHeader.getTypeCompress() );
+			inStream.streamify( "File", mFile, inHeader.getFnameCompress() );
+			inStream.streamify( "Line", mLine, inHeader.getLineCompress() );
+			MemoryEventData::streamify( inStream, inHeader );
+		}
+	};
+	template<> inline MemoryEventTypes::Enum getMemoryEventType<AllocationEvent>() { return MemoryEventTypes::AllocationEvent; }
+	
+
+	/**
+	\brief Allocation record that keeps the full size and the type and file
+	as C strings instead of 32 bit values.
+
+	The struct only stores the pointers it is given; it has no streamify method.
+	*/
+	struct FullAllocationEvent : public MemoryEventData
+	{
+		size_t mSize;
+		const char* mType;
+		const char* mFile;
+		uint32_t mLine;
+
+		/**
+		\brief Assign every field.
+
+		\param size Allocation size.
+		\param type Type string; the pointer is stored, the text is not copied.
+		\param file File string; the pointer is stored, the text is not copied.
+		\param line Value stored in mLine.
+		\param addr Address stored in the MemoryEventData base.
+		*/
+		void init( size_t size, const char* type, const char* file, uint32_t line, uint64_t addr )
+		{
+			mSize = size;
+			mType = type;
+			mFile = file;
+			mLine = line;
+			MemoryEventData::init( addr );
+		}
+
+		/** \brief Copy every field, including the address, from another event. */
+		void init( const FullAllocationEvent& inData )
+		{
+			init( inData.mSize, inData.mType, inData.mFile, inData.mLine, inData.mAddress );
+		}
+
+		/** \brief Field-by-field comparison; the two strings are compared through safeStrEq. */
+		bool operator==( const FullAllocationEvent& inOther ) const
+		{
+			if ( !MemoryEventData::operator==( inOther ) )
+				return false;
+			return mSize == inOther.mSize
+				&& safeStrEq( mType, inOther.mType )
+				&& safeStrEq( mFile, inOther.mFile )
+				&& mLine == inOther.mLine;
+		}
+
+		/** \brief Leaves the header untouched. */
+		void setup( MemoryEventHeader& ) const
+		{
+		}
+	};
+
+	// Maps the FullAllocationEvent payload type to its MemoryEventTypes tag.
+	template<> inline MemoryEventTypes::Enum getMemoryEventType<FullAllocationEvent>()
+	{
+		return MemoryEventTypes::FullAllocationEvent;
+	}
+
+	/**
+	\brief Deallocation record; it carries nothing beyond the address held by MemoryEventData.
+	*/
+	struct DeallocationEvent : public MemoryEventData
+	{
+		/** \brief Store the address; defaults to zero. */
+		void init( uint64_t addr = 0 )
+		{
+			MemoryEventData::init( addr );
+		}
+
+		/** \brief Copy the address from another event. */
+		void init( const DeallocationEvent& inData )
+		{
+			MemoryEventData::init( inData );
+		}
+	};
+
+	// Maps the DeallocationEvent payload type to its MemoryEventTypes tag.
+	template<> inline MemoryEventTypes::Enum getMemoryEventType<DeallocationEvent>()
+	{
+		return MemoryEventTypes::DeallocationEvent;
+	}
+
+	/**
+	\brief A memory event: a header plus storage that holds one of the event payload types.
+
+	The header's type selects which payload the storage is read as.
+	*/
+	class MemoryEvent
+	{
+	public:
+		typedef PX_PROFILE_UNION_5(StringTableEvent, AllocationEvent, DeallocationEvent, FullAllocationEvent, uint8_t) EventData;
+
+	private:
+		MemoryEventHeader mHeader;
+		EventData mData;
+
+	public:
+		MemoryEvent() {}
+
+		/** \brief Build an event from an existing header and, optionally, existing payload storage. */
+		MemoryEvent( MemoryEventHeader inHeader, const EventData& inData = EventData() )
+			: mHeader( inHeader )
+			, mData( inData )
+		{
+		}
+
+		/**
+		\brief Build an event from a payload object.
+
+		The header type is taken from getMemoryEventType<TDataType>() and the payload
+		then sets the appropriate compression bits on the header.
+		*/
+		template<typename TDataType>
+		MemoryEvent( const TDataType& inType )
+			: mHeader( getMemoryEventType<TDataType>() )
+			, mData( inType )
+		{
+			inType.setup( mHeader );
+		}
+
+		const MemoryEventHeader& getHeader() const { return mHeader; }
+		const EventData& getData() const { return mData; }
+
+		/** \brief Read the payload as TDataType; asserts that the header type matches. */
+		template<typename TDataType>
+		const TDataType& getValue() const
+		{
+			PX_ASSERT( mHeader.getType() == getMemoryEventType<TDataType>() );
+			return mData.toType<TDataType>();
+		}
+
+		/** \brief Mutable access to the payload as TDataType; asserts that the header type matches. */
+		template<typename TDataType>
+		TDataType& getValue()
+		{
+			PX_ASSERT( mHeader.getType() == getMemoryEventType<TDataType>() );
+			return mData.toType<TDataType>();
+		}
+
+		/** \brief Call inOp with the payload converted to the type named by the header. */
+		template<typename TRetVal, typename TOperator>
+		inline TRetVal visit( TOperator inOp ) const;
+
+		/**
+		\brief Events are equal when their headers are equal and, unless the header type
+		converts to false, their payloads are equal as well.
+		*/
+		bool operator==( const MemoryEvent& inOther ) const
+		{
+			if ( !( mHeader == inOther.mHeader ) )
+				return false;
+			if ( !mHeader.getType() )
+				return true;
+			return inOther.visit<bool>( EventDataEqualOperator<EventData>( mData ) );
+		}
+	};
+
+	/**
+	\brief Call inOperator with the event data converted to the payload type that
+	matches inEventType.
+
+	\param inEventType Type tag that selects the payload type.
+	\param inData Storage holding the payload.
+	\param inOperator Callable that is invoked with the selected payload. For
+		MemoryEventTypes::Unknown it receives the type tag itself as a uint8_t.
+	\return The operator's result, or a default constructed TRetVal when
+		inEventType matches none of the cases.
+	*/
+	template<typename TRetVal, typename TOperator>
+	inline TRetVal visit( MemoryEventTypes::Enum inEventType, const MemoryEvent::EventData& inData, TOperator inOperator )
+	{
+		// Every enumerator handled here is listed explicitly; there is no default label.
+		switch( inEventType )
+		{
+		case MemoryEventTypes::StringTableEvent:		return inOperator( inData.toType( Type2Type<StringTableEvent>() ) );
+		case MemoryEventTypes::AllocationEvent:			return inOperator( inData.toType( Type2Type<AllocationEvent>() ) );
+		case MemoryEventTypes::DeallocationEvent:		return inOperator( inData.toType( Type2Type<DeallocationEvent>() ) );
+		case MemoryEventTypes::FullAllocationEvent:		return inOperator( inData.toType( Type2Type<FullAllocationEvent>() ) );
+		case MemoryEventTypes::Unknown:					return inOperator( static_cast<uint8_t>( inEventType ) );
+		}
+		// Reached only when inEventType holds a value outside the cases above.
+		return TRetVal();
+	}
+
+	/**
+	\brief Member form of visit: dispatches on this event's header type and payload.
+	*/
+	template<typename TRetVal, typename TOperator>
+	inline TRetVal MemoryEvent::visit( TOperator inOp ) const
+	{
+		// Fully qualified so that the free function is called rather than this member.
+		return physx::profile::visit<TRetVal>(
+			mHeader.getType(),
+			mData,
+			inOp );
+	}
+}}
+
+#endif // PXPVDSDK_PXPROFILEMEMORYEVENTS_H
diff --git a/PxShared/src/pvd/src/PxProfileScopedEvent.h b/PxShared/src/pvd/src/PxProfileScopedEvent.h
new file mode 100644
index 0000000..953fcf8
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileScopedEvent.h
@@ -0,0 +1,150 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILESCOPEDEVENT_H
+#define PXPVDSDK_PXPROFILESCOPEDEVENT_H
+
+#include "PxProfileBase.h"
+#include "PxProfileEventId.h"
+#include "PxProfileCompileTimeEventFilter.h"
+
+namespace physx { namespace profile {
+
+// Builds a PxProfileEventId from SubsystemIds::<subsystem> and EventIds::<subsystem><eventId>.
+// NOTE(review): the expansion already ends in ';', so the macro cannot be used inside a
+// larger expression - confirm whether the trailing semicolon is intended.
+#define TO_PXPVDSDK_PXPROFILEEVENTID( subsystem, eventId ) PxProfileEventId( SubsystemIds::subsystem, EventIds::subsystem##eventId );
+
+	/**
+	\brief Forward a start event to the given buffer when the compile time switch is on.
+
+	Nothing happens when TEnabled is false or inBuffer is NULL.
+
+	\param inBuffer Profile event buffer; may be NULL.
+	\param inId Profile event id.
+	\param inContext Profile event context.
+	*/
+	template<bool TEnabled, typename TBufferType>
+	inline void startEvent( TBufferType* inBuffer, const PxProfileEventId& inId, uint64_t inContext )
+	{
+		if ( !TEnabled || !inBuffer )
+			return;
+		inBuffer->startEvent( inId, inContext );
+	}
+
+	/**
+	\brief Forward a stop event to the given buffer when the compile time switch is on.
+
+	Nothing happens when TEnabled is false or inBuffer is NULL.
+
+	\param inBuffer Profile event buffer; may be NULL.
+	\param inId Profile event id.
+	\param inContext Profile event context.
+	*/
+	template<bool TEnabled, typename TBufferType>
+	inline void stopEvent( TBufferType* inBuffer, const PxProfileEventId& inId, uint64_t inContext )
+	{
+		if ( !TEnabled || !inBuffer )
+			return;
+		inBuffer->stopEvent( inId, inContext );
+	}
+	
+	/**
+	\brief Forward a start event to the given buffer when a run time flag is set.
+
+	Nothing happens when inEnabled is false or inBuffer is NULL.
+
+	\param inEnabled If profile event is enabled.
+	\param inBuffer Profile event buffer; may be NULL.
+	\param inId Profile event id.
+	\param inContext Profile event context.
+	*/
+	template<typename TBufferType>
+	inline void startEvent( bool inEnabled, TBufferType* inBuffer, const PxProfileEventId& inId, uint64_t inContext )
+	{
+		if ( !inEnabled || !inBuffer )
+			return;
+		inBuffer->startEvent( inId, inContext );
+	}
+
+	/**
+	\brief Forward a stop event to the given buffer when a run time flag is set.
+
+	Nothing happens when inEnabled is false or inBuffer is NULL.
+
+	\param inEnabled If profile event is enabled.
+	\param inBuffer Profile event buffer; may be NULL.
+	\param inId Profile event id.
+	\param inContext Profile event context.
+	*/
+	template<typename TBufferType>
+	inline void stopEvent( bool inEnabled, TBufferType* inBuffer, const PxProfileEventId& inId, uint64_t inContext )
+	{
+		if ( !inEnabled || !inBuffer )
+			return;
+		inBuffer->stopEvent( inId, inContext );
+	}
+	
+	/**
+	\brief Forward an event value to the given buffer when a run time flag is set.
+
+	Nothing happens when inEnabled is false or inBuffer is NULL.
+
+	\param inEnabled If profile event is enabled.
+	\param inBuffer Profile event buffer; may be NULL.
+	\param inId Profile event id.
+	\param inContext Profile event context.
+	\param inValue Event value.
+	*/
+	template<typename TBufferType>
+	inline void eventValue( bool inEnabled, TBufferType* inBuffer, const PxProfileEventId& inId, uint64_t inContext, int64_t inValue )
+	{
+		if ( !inEnabled || !inBuffer )
+			return;
+		inBuffer->eventValue( inId, inContext, inValue );
+	}
+
+	/**
+	\brief Scoped profile event whose id is a template argument.
+
+	The constructor sends the start event and the destructor sends the matching stop
+	event to the same buffer with the same context. A NULL buffer is tolerated.
+	The partial specialization for TEnabled == false below does nothing at all.
+	*/
+	template<bool TEnabled, typename TBufferType, uint16_t eventId>
+	struct ScopedEventWithContext
+	{
+		uint64_t				mContext;
+		TBufferType*		mBuffer;
+
+		ScopedEventWithContext( TBufferType* inBuffer, uint64_t inContext )
+			: mContext( inContext )
+			, mBuffer( inBuffer )
+		{
+			const PxProfileEventId theId( eventId );
+			startEvent<true>( mBuffer, theId, mContext );
+		}
+
+		~ScopedEventWithContext()
+		{
+			const PxProfileEventId theId( eventId );
+			stopEvent<true>( mBuffer, theId, mContext );
+		}
+	};
+
+	/**
+	\brief Disabled variant: accepts the same constructor arguments and ignores them.
+	*/
+	template<typename TBufferType, uint16_t eventId>
+	struct ScopedEventWithContext<false,TBufferType,eventId>
+	{
+		ScopedEventWithContext( TBufferType*, uint64_t ) {}
+	};
+
+	/**
+	\brief Scoped profile event whose id is supplied at run time.
+
+	The start event is sent from the constructor and the stop event from the destructor,
+	in both cases only if the id's compileTimeEnabled flag is set and the buffer is not NULL.
+	*/
+	template<typename TBufferType>
+	struct DynamicallyEnabledScopedEvent
+	{
+		TBufferType*		mBuffer;
+		PxProfileEventId	mId;
+		uint64_t				mContext;
+
+		DynamicallyEnabledScopedEvent( TBufferType* inBuffer, const PxProfileEventId& inId, uint64_t inContext )
+			: mBuffer( inBuffer )
+			, mId( inId )
+			, mContext( inContext )
+		{
+			// The helper itself ignores a NULL buffer, so no separate check is needed here.
+			startEvent( mId.compileTimeEnabled, mBuffer, mId, mContext );
+		}
+
+		~DynamicallyEnabledScopedEvent()
+		{
+			// Same as above: a NULL buffer is filtered out by the helper.
+			stopEvent( mId.compileTimeEnabled, mBuffer, mId, mContext );
+		}
+	};
+}}
+
+#endif // PXPVDSDK_PXPROFILESCOPEDEVENT_H
diff --git a/PxShared/src/pvd/src/PxProfileScopedMutexLock.h b/PxShared/src/pvd/src/PxProfileScopedMutexLock.h
new file mode 100644
index 0000000..9d21cb8
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileScopedMutexLock.h
@@ -0,0 +1,64 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILESCOPEDMUTEXLOCK_H
+#define PXPVDSDK_PXPROFILESCOPEDMUTEXLOCK_H
+
+#include "PxProfileBase.h"
+
+namespace physx { namespace profile {
+
+	/**
+	 *	Generic scoped lock for any mutex type that has lock and unlock methods.
+	 *
+	 *	The mutex is locked in the constructor and unlocked in the destructor.
+	 *	A NULL mutex pointer is accepted; the guard then does nothing.
+	 *
+	 *	The guard cannot be copied: a copy would call unlock on the same mutex
+	 *	a second time when it goes out of scope.
+	 */
+	template<typename TMutexType>
+	struct ScopedLockImpl
+	{
+		TMutexType* mMutex;
+		ScopedLockImpl( TMutexType* inM ) : mMutex( inM )
+		{
+			if ( mMutex ) mMutex->lock();
+		}
+		~ScopedLockImpl()
+		{
+			if ( mMutex ) mMutex->unlock();
+		}
+
+	private:
+		// Declared but never defined, so an accidental copy fails to build.
+		ScopedLockImpl( const ScopedLockImpl& );
+		ScopedLockImpl& operator=( const ScopedLockImpl& );
+	};
+
+	/**
+	 *	Stand-in for a scoped lock that performs no locking.
+	 *	It can be constructed from a pointer to any type and ignores it.
+	 */
+	struct NullLock
+	{
+		template<typename TDataType>
+		NullLock( TDataType* )
+		{
+		}
+	};
+}}
+
+#endif // PXPVDSDK_PXPROFILESCOPEDMUTEXLOCK_H
diff --git a/PxShared/src/pvd/src/PxProfileZone.h b/PxShared/src/pvd/src/PxProfileZone.h
new file mode 100644
index 0000000..1573c2f
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileZone.h
@@ -0,0 +1,142 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEZONE_H
+#define PXPVDSDK_PXPROFILEZONE_H
+
+#include "foundation/PxPreprocessor.h"
+
+#include "PxProfileEventBufferClientManager.h"
+#include "PxProfileEventNames.h"
+#include "PxProfileEventSender.h"
+
+namespace physx { 
+	class PxAllocatorCallback;
+
+	namespace profile {
+
+	class PxProfileZoneManager;	
+
+	/**
+	\brief The profiling system was set up in the expectation that there would be several
+	 systems that each had its own island of profile information.  PhysX, client code,
+	 and APEX would be the first examples of these.  Each one of these islands is represented
+	 by a profile zone.
+	 
+	 A profile zone combines a name, a place where all the events coming from its interface
+	 can be flushed, and a mapping from event number to full event name.
+	 	
+	 It also provides a top level filtering service where profile events
+	 can be filtered by event id.  
+	 
+	 The profile zone implements a system where if there is no one
+	 listening to events it doesn't provide a mechanism to send them.  In this way
+	 the event system is short circuited when there aren't any clients.
+	 
+	 All functions on this interface should be considered threadsafe.
+
+	 @see PxProfileZoneClientManager, PxProfileNameProvider, PxProfileEventSender, PxProfileEventFlusher
+	 */
+	class PxProfileZone : public PxProfileZoneClientManager
+						, public PxProfileNameProvider
+						, public PxProfileEventSender
+						, public PxProfileEventFlusher
+	{
+	protected:
+		// Protected destructor: a zone is disposed of through release(), not through delete.
+		virtual ~PxProfileZone(){}
+	public:
+		/**
+		\brief Get profile zone name.
+		\return Zone name.
+		*/
+		virtual const char* getName() = 0;
+		/**
+		\brief Release the profile zone.
+		*/
+		virtual void release() = 0;
+
+		/**
+		\brief Set profile zone manager for the zone.
+		\param inMgr Profile zone manager.
+		*/
+		virtual void setProfileZoneManager(PxProfileZoneManager* inMgr) = 0;
+		/**
+		\brief Get profile zone manager for the zone.
+		\return Profile zone manager.
+		*/
+		virtual PxProfileZoneManager* getProfileZoneManager() = 0;
+
+		/**
+		\brief Get or create a new event id for a given name.
+		If you pass in a previously defined event name (including one returned
+		from the name provider) you will just get the same event id back.
+		\param inName Profile event name.
+		\return Event id associated with the name.
+		*/
+		virtual uint16_t getEventIdForName( const char* inName ) = 0;
+
+		/**
+		\brief Specifies that it is a safe point to flush the read-write name map into
+		the read-only map. Make sure getEventIdForName is not called from a different thread.
+		*/
+		virtual void flushEventIdNameMap() = 0;
+
+		/**
+		\brief Reserve a contiguous set of profile event ids for a set of names.
+			
+		This function does not do any meaningful error checking other than to ensure
+		that if it does generate new ids they are contiguous.  If the first name is already
+		registered, that is the ID that will be returned regardless of what other
+		names are registered.  Thus either use this function alone (without the above
+		function) or don't use it.  
+		If you register "one","two","three" and the function returns an id of 4, then
+		"one" is mapped to 4, "two" is mapped to 5, and "three" is mapped to 6.
+
+		\param inNames Set of names to register.
+		\param inLen Length of the name list.
+
+		\return The id associated with the first name.  The rest of the names
+		will be associated with monotonically incrementing uint16_t values from the first
+		id.  
+		 */
+		virtual uint16_t getEventIdsForNames( const char** inNames, uint32_t inLen ) = 0;
+
+		/**
+		\brief Create a new profile zone.  
+
+		\param inAllocator Memory allocation is controlled through the foundation if one is passed in.
+		\param inSDKName Name of the profile zone; useful for clients to understand where events came from.
+		\param inNames Mapping from event id -> event name.
+		\param inEventBufferByteSize Size of the canonical event buffer.  This does not need to be a large number
+			as profile events are fairly small individually.
+		\return A profile zone implementation.
+		 */		
+		static PX_FOUNDATION_API PxProfileZone& createProfileZone(PxAllocatorCallback* inAllocator, const char* inSDKName, PxProfileNames inNames = PxProfileNames(), uint32_t inEventBufferByteSize = 0x10000 /*64k*/);
+
+	};
+} }
+
+#endif // PXPVDSDK_PXPROFILEZONE_H
diff --git a/PxShared/src/pvd/src/PxProfileZoneImpl.h b/PxShared/src/pvd/src/PxProfileZoneImpl.h
new file mode 100644
index 0000000..981180f
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileZoneImpl.h
@@ -0,0 +1,318 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PXPVDSDK_PXPROFILEZONEIMPL_H
+#define PXPVDSDK_PXPROFILEZONEIMPL_H
+
+#include "PxProfileZone.h"
+#include "PxProfileEventFilter.h"
+#include "PxProfileZoneManager.h"
+#include "PxProfileContextProviderImpl.h"
+#include "PxProfileScopedMutexLock.h"
+#include "PxProfileEventBufferAtomic.h"
+#include "PsMutex.h"
+
+namespace physx { namespace profile {
+
+	/**
+	\brief Event filter that reports every event as enabled.
+
+	Changing the state of an event is not supported: setEventEnabled only trips an assert.
+	*/
+	struct PxProfileNullEventFilter
+	{
+		void setEventEnabled( const PxProfileEventId&, bool )
+		{
+			PX_ASSERT( false );
+		}
+
+		bool isEventEnabled( const PxProfileEventId& ) const
+		{
+			return true;
+		}
+	};
+
+	// Mutex type handed to the zone event buffer.
+	typedef shdfnd::MutexT<PxProfileWrapperReflectionAllocator<uint8_t> >	TZoneMutexType;
+	// Scoped lock over TZoneMutexType.
+	typedef ScopedLockImpl<TZoneMutexType>				TZoneLockType;
+	// Event buffer type that ZoneImpl derives from; PxProfileNullEventFilter reports every event as enabled.
+	typedef EventBuffer< PxDefaultContextProvider, TZoneMutexType, TZoneLockType, PxProfileNullEventFilter > TZoneEventBufferType;
+	// Alternative buffer type, currently disabled:
+	//typedef EventBufferAtomic< PxDefaultContextProvider, TZoneMutexType, TZoneLockType, PxProfileNullEventFilter > TZoneEventBufferType;
+
+	/**
+	\brief Profile zone implementation built on top of the zone event buffer.
+
+	The class privately derives from TZoneEventBufferType and registers itself as a
+	client of that buffer, so flushed buffer data arrives in handleBufferFlush and is
+	passed on to every registered PxProfileZoneClient.
+
+	Event names live in mEventNames and are indexed by two maps: a read-only map that
+	getEventIdsForNames searches without taking the mutex, and a read-write map that it
+	searches and extends while holding mMutex. flushEventIdNameMap moves the entries of
+	the read-write map into the read-only one.
+	*/
+	template<typename TNameProvider>
+	class ZoneImpl : TZoneEventBufferType //private inheritance intended
+					, public PxProfileZone
+					, public PxProfileEventBufferClient
+	{
+		typedef shdfnd::MutexT<PxProfileWrapperReflectionAllocator<uint8_t> >	TMutexType;
+		typedef PxProfileHashMap<const char*, uint32_t>			TNameToEvtIndexMap;
+		//ensure we don't reuse event ids.
+		typedef PxProfileHashMap<uint16_t, const char*>			TEvtIdToNameMap;
+		typedef TMutexType::ScopedLock						TLockType;
+
+
+		const char*										mName;
+		PxProfileAllocatorWrapper								mWrapper;
+		mutable TMutexType								mMutex;
+		PxProfileArray<PxProfileEventName>				mEventNames;
+		// to avoid locking, read-only and read-write map exist
+		TNameToEvtIndexMap								mNameToEvtIndexMapR;
+		TNameToEvtIndexMap								mNameToEvtIndexMapRW;
+		//ensure we don't reuse event ids.
+		TEvtIdToNameMap									mEvtIdToNameMap;
+
+		PxProfileZoneManager*							mProfileZoneManager;
+
+		PxProfileArray<PxProfileZoneClient*>				mClients;
+		// True while at least one client is registered; the event methods check it before recording.
+		volatile bool									mEventsActive;
+
+		PX_NOCOPY(ZoneImpl<TNameProvider>)
+	public:
+		/**
+		\brief Create the zone and register the names supplied by the name provider.
+
+		\param inAllocator Allocator handed to the event buffer and to the internal containers.
+		\param inName Zone name; the pointer is stored, the text is not copied.
+		\param bufferSize Size passed to the event buffer.
+		\param inProvider Source of the initial event names.
+		*/
+		ZoneImpl( PxAllocatorCallback* inAllocator, const char* inName, uint32_t bufferSize = 0x10000 /*64k*/, const TNameProvider& inProvider = TNameProvider() )
+			: TZoneEventBufferType( inAllocator, bufferSize, PxDefaultContextProvider(), NULL, PxProfileNullEventFilter() )
+			, mName( inName )
+			, mWrapper( inAllocator )
+			, mMutex( PxProfileWrapperReflectionAllocator<uint8_t>( mWrapper ) )
+			, mEventNames( mWrapper )
+			, mNameToEvtIndexMapR( mWrapper )
+			, mNameToEvtIndexMapRW(mWrapper)
+			, mEvtIdToNameMap( mWrapper )
+			, mProfileZoneManager( NULL )
+			, mClients( mWrapper )
+			, mEventsActive( false )
+		{
+			// The base class was constructed with a NULL mutex because mMutex did not exist yet.
+			TZoneEventBufferType::setBufferMutex( &mMutex );
+			//Initialize the event name structure with existing names from the name provider.
+			PxProfileNames theNames( inProvider.getProfileNames() );
+			for ( uint32_t idx = 0; idx < theNames.eventCount; ++idx )
+			{
+				const PxProfileEventName& theName (theNames.events[idx]);
+				doAddName( theName.name, theName.eventId.eventId, theName.eventId.compileTimeEnabled );
+			}
+			// Receive the buffer's flushes through handleBufferFlush.
+			TZoneEventBufferType::addClient( *this );
+		}
+
+		/**
+		\brief Detach from the zone manager, if any, and from the event buffer.
+		*/
+		virtual ~ZoneImpl() {
+			if ( mProfileZoneManager != NULL )
+				mProfileZoneManager->removeProfileZone( *this );
+			mProfileZoneManager = NULL;
+			TZoneEventBufferType::removeClient( *this );
+		}
+
+		/**
+		\brief Register one event name under the given id.
+
+		The id is recorded as used, the name is appended to mEventNames and its index is
+		entered into the read-write name map. Takes mMutex.
+		*/
+		void doAddName( const char* inName, uint16_t inEventId, bool inCompileTimeEnabled )
+		{
+			TLockType theLocker( mMutex );
+			mEvtIdToNameMap.insert( inEventId, inName );
+			uint32_t idx = static_cast<uint32_t>( mEventNames.size() );
+			mNameToEvtIndexMapRW.insert( inName, idx );
+			mEventNames.pushBack( PxProfileEventName( inName, PxProfileEventId( inEventId, inCompileTimeEnabled ) ) );
+		}
+
+		/**
+		\brief Move every entry of the read-write name map into the read-only map.
+
+		No lock is taken here.
+		*/
+		virtual void flushEventIdNameMap()
+		{
+			// copy the RW map into R map
+			if (mNameToEvtIndexMapRW.size())
+			{
+				for (TNameToEvtIndexMap::Iterator iter = mNameToEvtIndexMapRW.getIterator(); !iter.done(); ++iter)
+				{
+					mNameToEvtIndexMapR.insert(iter->first, iter->second);
+				}
+				mNameToEvtIndexMapRW.clear();
+			}
+		}
+
+		/** \brief Single name form of getEventIdsForNames. */
+		virtual uint16_t getEventIdForName( const char* inName )
+		{
+			return getEventIdsForNames( &inName, 1 );
+		}
+
+		/**
+		\brief Return the id of the first name, registering all names under a contiguous
+		range of new ids if the first name is not known yet.
+
+		Only the first name is looked up. Registered clients are told about every
+		newly added name through handleEventAdded.
+
+		\param inNames Names to register.
+		\param inLen Number of names. Event ids are 16 bit values, so a request for more
+			than 0xFFFF names cannot be satisfied; it asserts and returns 0.
+		\return Id of the first name, or 0 when inLen is 0 or too large.
+		*/
+		virtual uint16_t getEventIdsForNames( const char** inNames, uint32_t inLen )
+		{
+			if ( inLen == 0 )
+				return 0;
+
+			// The loops below count with uint16_t. With a larger inLen the counter would wrap
+			// before reaching inLen and the loops would never terminate.
+			PX_ASSERT( inLen <= 0xFFFF );
+			if ( inLen > 0xFFFF )
+				return 0;
+
+			// search the read-only map first
+			const TNameToEvtIndexMap::Entry* theEntry( mNameToEvtIndexMapR.find( inNames[0] ) );
+			if ( theEntry )
+				return mEventNames[theEntry->second].eventId;
+
+			// NOTE(review): doAddName below takes mMutex again while it is held here, which
+			// relies on the mutex being recursive - confirm.
+			TLockType theLocker(mMutex);
+
+			const TNameToEvtIndexMap::Entry* theReEntry(mNameToEvtIndexMapRW.find(inNames[0]));
+			if (theReEntry)
+				return mEventNames[theReEntry->second].eventId;
+
+			// The name is unknown, so new ids have to be allocated.
+			uint16_t nameSize = static_cast<uint16_t>( mEventNames.size() );
+			//We don't allow 0 as an event id.
+			uint16_t eventId = nameSize;
+			//Find a contiguous set of unique event ids
+			bool foundAnEventId = false;
+			do
+			{
+				foundAnEventId = false;
+				++eventId;
+				for ( uint16_t idx = 0; idx < inLen && foundAnEventId == false; ++idx )
+					foundAnEventId = mEvtIdToNameMap.find( uint16_t(eventId + idx) ) != NULL;
+			}
+			while( foundAnEventId );
+
+			uint32_t clientCount = mClients.size();
+			for ( uint16_t nameIdx = 0; nameIdx < inLen; ++nameIdx )
+			{
+				uint16_t newId = uint16_t(eventId + nameIdx);
+				doAddName( inNames[nameIdx], newId, true );
+				for( uint32_t clientIdx =0; clientIdx < clientCount; ++clientIdx )
+					mClients[clientIdx]->handleEventAdded( PxProfileEventName( inNames[nameIdx], PxProfileEventId( newId ) ) );
+			}
+
+			return eventId;
+		}
+
+		virtual void setProfileZoneManager(PxProfileZoneManager* inMgr)
+		{
+			mProfileZoneManager = inMgr;
+		}
+
+		virtual PxProfileZoneManager* getProfileZoneManager()
+		{
+			return mProfileZoneManager;
+		}
+
+
+
+		const char* getName() { return mName; }
+
+		PxProfileEventBufferClient* getEventBufferClient() { return this; }
+
+		//SDK implementation
+
+		/** \brief Register a zone client and switch event recording on. */
+		void addClient( PxProfileZoneClient& inClient )
+		{
+			TLockType lock( mMutex );
+			mClients.pushBack( &inClient );
+			mEventsActive = true;
+		}
+
+		/**
+		\brief Remove a zone client, notifying it through handleClientRemoved.
+
+		Event recording is switched off once no client is left.
+		*/
+		void removeClient( PxProfileZoneClient& inClient )
+		{
+			TLockType lock( mMutex );
+			for ( uint32_t idx =0; idx < mClients.size(); ++idx )
+			{
+				if ( mClients[idx] == &inClient )
+				{
+					inClient.handleClientRemoved();
+					mClients.replaceWithLast( idx );
+					break;
+				}
+			}
+			mEventsActive = mClients.size() != 0;
+		}
+
+		virtual bool hasClients() const
+		{
+			return mEventsActive;
+		}
+
+		/**
+		\brief Return the registered event names.
+
+		The result refers to the storage of mEventNames; the lock is only held while
+		the pointer and the count are read.
+		*/
+		virtual PxProfileNames getProfileNames() const
+		{
+			TLockType theLocker( mMutex );
+			const PxProfileEventName* theNames = mEventNames.begin();
+			uint32_t theEventCount = uint32_t(mEventNames.size());
+			return PxProfileNames( theEventCount, theNames );
+		}
+
+		/** \brief Destroy this zone through the allocator it was created with. */
+		virtual void release()
+		{
+			PX_PROFILE_DELETE( mWrapper.getAllocator(), this );
+		}
+
+		//Implementation chaining the buffer flush to our clients
+		virtual void handleBufferFlush( const uint8_t* inData, uint32_t inLength )
+		{
+			TLockType theLocker( mMutex );
+
+			uint32_t clientCount = mClients.size();
+			for( uint32_t idx =0; idx < clientCount; ++idx )
+				mClients[idx]->handleBufferFlush( inData, inLength );
+		}
+		//Happens if something removes all the clients from the manager.
+		virtual void handleClientRemoved() {}
+
+		//Send a profile event, optionally with a context.  Events are sorted by thread
+		//and context in the client side.
+		virtual void startEvent( uint16_t inId, uint64_t contextId)
+		{
+			if( mEventsActive )
+			{
+				TZoneEventBufferType::startEvent( inId, contextId );
+			}
+		}
+		virtual void stopEvent( uint16_t inId, uint64_t contextId)
+		{
+			if( mEventsActive )
+			{
+				TZoneEventBufferType::stopEvent( inId, contextId );
+			}
+		}
+
+		// Variants that take the thread id from the caller.
+		virtual void startEvent( uint16_t inId, uint64_t contextId, uint32_t threadId)
+		{
+			if( mEventsActive )
+			{
+				TZoneEventBufferType::startEvent( inId, contextId, threadId );
+			}
+		}
+		virtual void stopEvent( uint16_t inId, uint64_t contextId, uint32_t threadId )
+		{
+			if( mEventsActive )
+			{
+				TZoneEventBufferType::stopEvent( inId, contextId, threadId );
+			}
+		}
+
+		/**
+		 *	Record a start/stop pair for an event using the supplied start and stop values.
+		 *	NOTE(review): the two zero arguments are presumably cpu id and thread priority -
+		 *	confirm against the event buffer overload.
+		 */
+		virtual void atEvent(uint16_t inId, uint64_t contextId, uint32_t threadId, uint64_t start, uint64_t stop)
+		{
+			if (mEventsActive)
+			{
+				TZoneEventBufferType::startEvent(inId, threadId, contextId, 0, 0, start);
+				TZoneEventBufferType::stopEvent(inId, threadId, contextId, 0, 0, stop);
+			}
+		}
+
+		/**
+		 *	Set a specific event's value.  This is different from the profiling value
+		 *	for the event; it is a value recorded and kept around without a timestamp associated
+		 *	with it.  This value is displayed when the event itself is processed.
+		 */
+		virtual void eventValue( uint16_t inId, uint64_t contextId, int64_t inValue )
+		{
+			if( mEventsActive )
+			{
+				TZoneEventBufferType::eventValue( inId, contextId, inValue );
+			}
+		}
+		/** \brief Flush the underlying event buffer; this happens regardless of whether clients are registered. */
+		virtual void flushProfileEvents()
+		{
+			TZoneEventBufferType::flushProfileEvents();
+		}
+	};
+
+}}
+#endif // PXPVDSDK_PXPROFILEZONEIMPL_H
diff --git a/PxShared/src/pvd/src/PxProfileZoneManager.h b/PxShared/src/pvd/src/PxProfileZoneManager.h
new file mode 100644
index 0000000..9668460
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileZoneManager.h
@@ -0,0 +1,155 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPROFILEZONEMANAGER_H
+#define PXPVDSDK_PXPROFILEZONEMANAGER_H
+
+#include "PxProfileEventSender.h"
+#include "PxProfileEventNames.h"
+
+namespace physx { 
+	
+	class PxAllocatorCallback;
+	
+	namespace profile {
+
+	class PxProfileZone;
+	class PxProfileNameProvider;	
+
+	/**
+	\brief Profile zone handler for zone add/remove notification.
+
+	Pure callback interface. The destructor is protected, so a handler cannot be
+	deleted through a PxProfileZoneHandler pointer.
+	*/
+	class PxProfileZoneHandler
+	{
+	protected:
+		virtual ~PxProfileZoneHandler(){}
+	public:
+		/**
+		\brief Notification that a zone has been added.
+
+		\note Not a threadsafe call; implementations are expected to be able to
+		handle being invoked from any thread.
+
+		\param inSDK Added zone.
+		*/
+		virtual void onZoneAdded( PxProfileZone& inSDK ) = 0;
+		/**
+		\brief Notification that a zone has been removed.
+
+		\note Not a threadsafe call; implementations are expected to be able to
+		handle being invoked from any thread.
+
+		\param inSDK Removed zone.
+		*/
+		virtual void onZoneRemoved( PxProfileZone& inSDK ) = 0;
+	};
+
+	/**
+	\brief The profiling system was set up in the expectation that there would be several
+	systems that each had its own island of profile information.  PhysX, client code,
+	and APEX would be the first examples of these.  Each one of these islands is represented
+	by a profile zone.
+	 	
+	The Manager is a singleton-like object where all these different systems can be registered
+	so that clients of the profiling system can have one point to capture *all* profiling events.
+	 
+	Flushing the manager implies that you want to loop through all the profile zones and flush
+	each one.
+
+	@see PxProfileEventFlusher
+	*/
+	class PxProfileZoneManager 
+		: public PxProfileEventFlusher // Flushing the manager tells every registered zone to flush its queue of profile events.
+	{
+	protected:
+		// Protected: instances are destroyed through release(), not by deleting this interface.
+		virtual ~PxProfileZoneManager(){}
+	public:
+		/**
+		\brief Add new profile zone for the manager.
+		\note Threadsafe call, can be done from any thread.  Handlers that are already connected
+		will get a new callback on the current thread.
+
+		\param inSDK Profile zone to add.
+		 */
+		virtual void addProfileZone( PxProfileZone& inSDK ) = 0;
+		/**
+		\brief Removes profile zone from the manager.
+		\note Threadsafe call, can be done from any thread.  Handlers that are already connected
+		will get a new callback on the current thread.
+
+		\param inSDK Profile zone to remove.
+		 */
+		virtual void removeProfileZone( PxProfileZone& inSDK ) = 0;
+
+		/**
+		\brief Add profile zone handler callback for the profile zone notifications.
+
+		\note Threadsafe call.  The new handler will immediately be notified about all
+		known SDKs.
+
+		\param inHandler Profile zone handler to add.
+		 */
+		virtual void addProfileZoneHandler( PxProfileZoneHandler& inHandler ) = 0;
+		/**
+		\brief Removes profile zone handler callback for the profile zone notifications.
+
+		\note Threadsafe call.  The handler being removed will immediately receive a
+		zone-removed notification for every known SDK.
+
+		\param inHandler Profile zone handler to remove.
+		 */
+		virtual void removeProfileZoneHandler( PxProfileZoneHandler& inHandler ) = 0;
+
+
+		/**
+		\brief Create a new profile zone.  This means you don't need access to a PxFoundation to 
+		create your profile zone object, and your object is automatically registered with
+		the profile zone manager.
+		
+		You still need to release your object when you are finished with it.
+		\param inSDKName Name of the SDK object.
+		\param inNames Optional set of event id to name mappings.
+		\param inEventBufferByteSize Rough maximum size of the event buffer.  May exceed this size
+		by sizeof one event.  When full an immediate call to all listeners is made.
+		\return Reference to the newly created zone.
+		*/
+		virtual PxProfileZone& createProfileZone( const char* inSDKName, PxProfileNames inNames = PxProfileNames(), uint32_t inEventBufferByteSize = 0x4000 /*16k*/ ) = 0;
+
+		/**
+		\brief Releases the profile manager instance.
+		*/
+		virtual void release() = 0;
+		
+		/**
+		\brief Create the profile zone manager.
+		\param inAllocatorCallback Allocator callback.
+		\return Reference to the newly created manager; dispose of it with release().
+		*/
+		static PxProfileZoneManager& createProfileZoneManager(PxAllocatorCallback* inAllocatorCallback );
+	};
+
+} }
+
+#endif // PXPVDSDK_PXPROFILEZONEMANAGER_H
diff --git a/PxShared/src/pvd/src/PxProfileZoneManagerImpl.h b/PxShared/src/pvd/src/PxProfileZoneManagerImpl.h
new file mode 100644
index 0000000..6542917
--- /dev/null
+++ b/PxShared/src/pvd/src/PxProfileZoneManagerImpl.h
@@ -0,0 +1,174 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#ifndef PXPVDSDK_PXPROFILEZONEMANAGERIMPL_H
+#define PXPVDSDK_PXPROFILEZONEMANAGERIMPL_H
+
+#include "PxProfileZoneManager.h"
+#include "PxProfileBase.h"
+#include "PxProfileScopedMutexLock.h"
+#include "PxProfileZone.h"
+#include "PxProfileAllocatorWrapper.h"
+
+#include "PsArray.h"
+#include "PsMutex.h"
+
+namespace physx { namespace profile {
+
+	// Name provider that exposes no event names: it always hands back a
+	// PxProfileNames built from ( 0, 0 ).
+	struct NullEventNameProvider : public PxProfileNameProvider
+	{
+		virtual PxProfileNames getProfileNames() const
+		{
+			PxProfileNames noNames( 0, 0 );
+			return noNames;
+		}
+	};
+
+	/**
+	 *	Concrete PxProfileZoneManager.
+	 *
+	 *	Registered zones and zone handlers are kept as raw pointers in two arrays.
+	 *	Every method that adds to or removes from those arrays takes mMutex first;
+	 *	flushProfileEvents() reads mZones without taking it.
+	 */
+	class ZoneManagerImpl : public PxProfileZoneManager
+	{
+		typedef ScopedLockImpl<shdfnd::Mutex> TScopedLockType;
+		PxProfileAllocatorWrapper					mWrapper;
+		PxProfileArray<PxProfileZone*>		mZones;
+		PxProfileArray<PxProfileZoneHandler*>	mHandlers;
+		shdfnd::Mutex mMutex;
+
+		// Copying is disabled: both operations are private declarations only.
+		ZoneManagerImpl( const ZoneManagerImpl& inOther );
+		ZoneManagerImpl& operator=( const ZoneManagerImpl& inOther );
+
+	public:
+
+		// inFoundation is the allocator callback wrapped by mWrapper; both arrays
+		// allocate through that wrapper.
+		ZoneManagerImpl(PxAllocatorCallback* inFoundation) 
+			: mWrapper( inFoundation )
+			, mZones( mWrapper )
+			, mHandlers( mWrapper ) 
+		{}
+
+		virtual ~ZoneManagerImpl()
+		{
+			//This assert would mean that a profile zone is outliving us.
+			//This will cause a crash when the profile zone is released.
+			PX_ASSERT( mZones.size() == 0 );
+			// Detach any zone that is still registered so it no longer points at us.
+			while( mZones.size() )
+				removeProfileZone( *mZones.back() );
+		}
+
+		// Registers inSDK with this manager and notifies every handler.
+		// Adding a zone that already belongs to this manager is a no-op; a zone that
+		// belongs to a different manager is first removed from that manager.
+		virtual void addProfileZone( PxProfileZone& inSDK )
+		{
+			TScopedLockType lock( &mMutex );
+			
+			if ( inSDK.getProfileZoneManager() != NULL )
+			{
+				if ( inSDK.getProfileZoneManager() == this )
+					return;
+				else //there must be two managers in the system somehow.
+				{
+					PX_ASSERT( false );
+					inSDK.getProfileZoneManager()->removeProfileZone( inSDK );
+				}
+			}
+			mZones.pushBack( &inSDK );
+			inSDK.setProfileZoneManager( this );
+			for ( uint32_t idx =0; idx < mHandlers.size(); ++idx )
+				mHandlers[idx]->onZoneAdded( inSDK );
+		}
+
+		// Unregisters inSDK and notifies every handler. Zones without a manager are
+		// ignored; zones owned by another manager are forwarded to that manager.
+		virtual void removeProfileZone( PxProfileZone& inSDK )
+		{
+			TScopedLockType lock( &mMutex );
+			if ( inSDK.getProfileZoneManager() == NULL )
+				return;
+
+			else if ( inSDK.getProfileZoneManager() != this )
+			{
+				PX_ASSERT( false );
+				inSDK.getProfileZoneManager()->removeProfileZone( inSDK );
+				return;
+			}
+
+			inSDK.setProfileZoneManager( NULL );
+			for ( uint32_t idx = 0; idx < mZones.size(); )
+			{
+				if ( mZones[idx] != &inSDK )
+				{
+					++idx;
+					continue;
+				}
+				for ( uint32_t handler =0; handler < mHandlers.size(); ++handler )
+					mHandlers[handler]->onZoneRemoved( inSDK );
+				// replaceWithLast() moves the former last element into slot idx, so idx
+				// is deliberately not advanced: the moved-in element is checked next.
+				mZones.replaceWithLast( idx );
+			}
+		}
+
+		// Flushes every registered zone.
+		// NOTE(review): mZones is read here without taking mMutex — confirm callers
+		// do not add/remove zones concurrently with a flush.
+		virtual void flushProfileEvents()
+		{
+			uint32_t sdkCount = mZones.size();
+			for ( uint32_t idx = 0; idx < sdkCount; ++idx )
+				mZones[idx]->flushProfileEvents();
+		}
+
+		// Registers inHandler and immediately reports every known zone to it.
+		// No de-duplication is performed: adding the same handler twice stores it twice.
+		virtual void addProfileZoneHandler( PxProfileZoneHandler& inHandler )
+		{
+			TScopedLockType lock( &mMutex );
+			mHandlers.pushBack( &inHandler );
+			for ( uint32_t idx = 0; idx < mZones.size(); ++idx )
+				inHandler.onZoneAdded( *mZones[idx] );
+		}
+
+		// Reports every known zone as removed to inHandler, then drops every stored
+		// occurrence of inHandler.
+		virtual void removeProfileZoneHandler( PxProfileZoneHandler& inHandler )
+		{
+			TScopedLockType lock( &mMutex );
+			for( uint32_t idx = 0; idx < mZones.size(); ++idx )
+				inHandler.onZoneRemoved( *mZones[idx] );
+			for( uint32_t idx = 0; idx < mHandlers.size(); )
+			{
+				// Only advance when nothing was removed; after replaceWithLast() slot
+				// idx holds a not-yet-examined element (possibly the same handler again).
+				if ( mHandlers[idx] == &inHandler )
+					mHandlers.replaceWithLast( idx );
+				else
+					++idx;
+			}
+		}
+		
+		// Convenience overload: takes the names from inProvider, or uses an empty
+		// name set when inProvider is NULL.
+		virtual PxProfileZone& createProfileZone( const char* inSDKName, PxProfileNameProvider* inProvider, uint32_t inEventBufferByteSize )
+		{
+			NullEventNameProvider nullProvider;
+			if ( inProvider == NULL )
+				inProvider = &nullProvider;
+			return createProfileZone( inSDKName, inProvider->getProfileNames(), inEventBufferByteSize );
+		}
+		
+		
+		// Creates a zone using this manager's allocator and registers it with this manager.
+		virtual PxProfileZone& createProfileZone( const char* inSDKName, PxProfileNames inNames, uint32_t inEventBufferByteSize )
+		{
+			PxProfileZone& retval( PxProfileZone::createProfileZone( &mWrapper.getAllocator(), inSDKName, inNames, inEventBufferByteSize ) );
+			addProfileZone( retval );
+			return retval;
+		}
+
+		// Destroys this manager through PX_PROFILE_DELETE with the wrapped allocator.
+		virtual void release() 
+		{  
+			PX_PROFILE_DELETE( mWrapper.getAllocator(), this );
+		}
+	};
+} }
+
+
+#endif // PXPVDSDK_PXPROFILEZONEMANAGERIMPL_H
diff --git a/PxShared/src/pvd/src/PxPvd.cpp b/PxShared/src/pvd/src/PxPvd.cpp
new file mode 100644
index 0000000..4e1eb09
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvd.cpp
@@ -0,0 +1,56 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "pvd/PxPvd.h"
+
+#include "PxPvdImpl.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Process-wide allocator used by PVD. It initially points at gForwardingAllocator
+// and is re-pointed by SetPvdAllocatorCallback() and PxCreatePvd().
+ForwardingAllocator gForwardingAllocator;
+PxAllocatorCallback* gPvdAllocatorCallback = &gForwardingAllocator;
+
+// Replaces the global PVD allocator callback with inAllocatorCallback.
+// The pointer is stored as-is with no null check.
+void SetPvdAllocatorCallback(PxAllocatorCallback* inAllocatorCallback)
+{
+	gPvdAllocatorCallback = inAllocatorCallback;
+}
+
+} // namespace pvdsdk
+
+// Points the global PVD allocator at the foundation's allocator callback,
+// initializes PvdImpl and returns its instance.
+PxPvd* PxCreatePvd(PxFoundation& foundation)
+{
+	PxAllocatorCallback* const foundationAllocator = &foundation.getAllocatorCallback();
+	pvdsdk::gPvdAllocatorCallback = foundationAllocator;
+
+	pvdsdk::PvdImpl::initialize();
+
+	PxPvd* const pvd = pvdsdk::PvdImpl::getInstance();
+	return pvd;
+}
+
+} // namespace physx
diff --git a/PxShared/src/pvd/src/PxPvdBits.h b/PxShared/src/pvd/src/PxPvdBits.h
new file mode 100644
index 0000000..b763065
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdBits.h
@@ -0,0 +1,173 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDBITS_H
+#define PXPVDSDK_PXPVDBITS_H
+
+#include "PxPvdObjectModelBaseTypes.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Marshallers cannot assume src is aligned, but they can assume dest is aligned.
+typedef void (*TSingleMarshaller)(const uint8_t* src, uint8_t* dest);
+typedef void (*TBlockMarshaller)(const uint8_t* src, uint8_t* dest, uint32_t numItems);
+
+// Reverses the ByteCount bytes starting at inData in place.
+// Walks inward from both ends; the middle byte of an odd-sized item stays put.
+template <uint8_t ByteCount>
+static inline void doSwapBytes(uint8_t* __restrict inData)
+{
+	uint32_t lo = 0;
+	uint32_t hi = ByteCount;
+	while(lo + 1 < hi)
+	{
+		--hi;
+		const uint8_t saved = inData[lo];
+		inData[lo] = inData[hi];
+		inData[hi] = saved;
+		++lo;
+	}
+}
+
+// Reverses the bytes of each of itemCount consecutive ByteCount-sized items
+// starting at inData, in place.
+template <uint8_t ByteCount>
+static inline void doSwapBytes(uint8_t* __restrict inData, uint32_t itemCount)
+{
+	uint8_t* const stop = inData + itemCount * ByteCount;
+	while(inData < stop)
+	{
+		doSwapBytes<ByteCount>(inData);
+		inData += ByteCount;
+	}
+}
+
+// Byte-swaps a buffer of numBytes bytes made of items that are itemWidth bytes wide.
+// Supported widths are 1 (no-op), 2, 4, 8 and 16; any other width, including 0,
+// trips PX_ASSERT and leaves the buffer untouched. Trailing bytes that do not
+// form a whole item are not swapped.
+static inline void swapBytes(uint8_t* __restrict dataPtr, uint32_t numBytes, uint32_t itemWidth)
+{
+	// The item count is computed per case with a constant divisor so that an
+	// invalid itemWidth of 0 reaches the assert instead of dividing by zero.
+	switch(itemWidth)
+	{
+	case 1:
+		break;
+	case 2:
+		doSwapBytes<2>(dataPtr, numBytes / 2);
+		break;
+	case 4:
+		doSwapBytes<4>(dataPtr, numBytes / 4);
+		break;
+	case 8:
+		doSwapBytes<8>(dataPtr, numBytes / 8);
+		break;
+	case 16:
+		doSwapBytes<16>(dataPtr, numBytes / 16);
+		break;
+	default:
+		PX_ASSERT(false);
+		break;
+	}
+}
+
+// Primary template: the swapping variant. Each member forwards to the matching
+// free function, using TByteCount as the item width where no width is passed.
+template <uint8_t TByteCount, bool TShouldSwap>
+struct PvdByteSwapper
+{
+	// Swap a single TByteCount-wide item in place.
+	void swapBytes(uint8_t* __restrict inData)
+	{
+		doSwapBytes<TByteCount>(inData);
+	}
+	// Swap itemCount consecutive TByteCount-wide items in place.
+	void swapBytes(uint8_t* __restrict inData, uint32_t itemCount)
+	{
+		doSwapBytes<TByteCount>(inData, itemCount);
+	}
+	// Swap a buffer whose item width is only known at run time; TByteCount is not used here.
+	void swapBytes(uint8_t* __restrict dataPtr, uint32_t numBytes, uint32_t itemWidth)
+	{
+		physx::pvdsdk::swapBytes(dataPtr, numBytes, itemWidth);
+	}
+};
+
+// Swapper with the same three swapBytes overloads as PvdByteSwapper, all of
+// which do nothing.
+struct PvdNullSwapper
+{
+
+	void swapBytes(uint8_t* __restrict)
+	{
+	}
+	void swapBytes(uint8_t* __restrict, uint32_t)
+	{
+	}
+	void swapBytes(uint8_t* __restrict, uint32_t, uint32_t)
+	{
+	}
+};
+// Anything that doesn't need swapping (TShouldSwap == false) gets the null swapper,
+// whatever its width.
+template <uint8_t TByteCount>
+struct PvdByteSwapper<TByteCount, false> : public PvdNullSwapper
+{
+};
+// A 1 byte byte swapper can't really do anything, so it is a null swapper too
+// even when swapping is requested.
+template <>
+struct PvdByteSwapper<1, true> : public PvdNullSwapper
+{
+};
+
+// In-place byte reversal for individual scalar values, one overload per type.
+// The 8-bit overloads are no-ops; the others reinterpret the value as raw bytes
+// and reverse 2, 4 or 8 of them according to the type's width.
+static inline void swapBytes(uint8_t&)
+{
+}
+static inline void swapBytes(int8_t&)
+{
+}
+static inline void swapBytes(uint16_t& inData)
+{
+	doSwapBytes<2>(reinterpret_cast<uint8_t*>(&inData));
+}
+static inline void swapBytes(int16_t& inData)
+{
+	doSwapBytes<2>(reinterpret_cast<uint8_t*>(&inData));
+}
+static inline void swapBytes(uint32_t& inData)
+{
+	doSwapBytes<4>(reinterpret_cast<uint8_t*>(&inData));
+}
+static inline void swapBytes(int32_t& inData)
+{
+	doSwapBytes<4>(reinterpret_cast<uint8_t*>(&inData));
+}
+static inline void swapBytes(float& inData)
+{
+	doSwapBytes<4>(reinterpret_cast<uint8_t*>(&inData));
+}
+static inline void swapBytes(uint64_t& inData)
+{
+	doSwapBytes<8>(reinterpret_cast<uint8_t*>(&inData));
+}
+static inline void swapBytes(int64_t& inData)
+{
+	doSwapBytes<8>(reinterpret_cast<uint8_t*>(&inData));
+}
+static inline void swapBytes(double& inData)
+{
+	doSwapBytes<8>(reinterpret_cast<uint8_t*>(&inData));
+}
+
+// Returns true when the range [inStart, inStop) holds at least inLength bytes.
+// An inverted range (inStop before inStart) is reported as too short rather than
+// letting the negative difference wrap to a huge unsigned value, and the
+// difference is compared at 64-bit width so ranges above 4 GiB are not truncated.
+static inline bool checkLength(const uint8_t* inStart, const uint8_t* inStop, uint32_t inLength)
+{
+	if(inStop < inStart)
+		return false;
+	return static_cast<uint64_t>(inStop - inStart) >= inLength;
+}
+}
+}
+#endif // PXPVDSDK_PXPVDBITS_H
diff --git a/PxShared/src/pvd/src/PxPvdByteStreams.h b/PxShared/src/pvd/src/PxPvdByteStreams.h
new file mode 100644
index 0000000..fff3c4f
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdByteStreams.h
@@ -0,0 +1,155 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDBYTESTREAMS_H
+#define PXPVDSDK_PXPVDBYTESTREAMS_H
+#include "PxPvdObjectModelBaseTypes.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Length of a NUL-terminated string, not counting the terminator.
+// A null pointer is treated as an empty string and yields 0.
+static inline uint32_t strLen(const char* inStr)
+{
+	if(!inStr)
+		return 0;
+
+	const char* cursor = inStr;
+	while(*cursor)
+		++cursor;
+	return static_cast<uint32_t>(cursor - inStr);
+}
+
+// Abstract source of raw bytes, with typed convenience wrappers layered on top of
+// the single virtual byte-level read.
+class PvdInputStream
+{
+  protected:
+	// Protected so a stream cannot be deleted through this interface.
+	virtual ~PvdInputStream()
+	{
+	}
+
+  public:
+	// Return false if you can't read the number of bytes requested
+	// But make an absolute best effort to read the data...
+	// len is passed by reference; the typed overload below reads it back after the
+	// call and treats it as the number of bytes actually read.
+	virtual bool read(uint8_t* buffer, uint32_t& len) = 0;
+
+	// Reads numItems elements of TDataType into buffer.
+	// Success is derived from the byte count reported back through the virtual
+	// read (its bool result is not consulted). A short read asserts and returns false.
+	template <typename TDataType>
+	bool read(TDataType* buffer, uint32_t numItems)
+	{
+		uint32_t expected = numItems;
+		uint32_t amountToRead = numItems * sizeof(TDataType);
+		read(reinterpret_cast<uint8_t*>(buffer), amountToRead);
+		numItems = amountToRead / sizeof(TDataType);
+		PX_ASSERT(numItems == expected);
+		return expected == numItems;
+	}
+
+	// Reads sizeof(TDataType) raw bytes into data and returns the stream for chaining.
+	// Failures are deliberately ignored here (the asserts are commented out), so a
+	// short read is not reported to the caller.
+	template <typename TDataType>
+	PvdInputStream& operator>>(TDataType& data)
+	{
+		uint32_t dataSize = static_cast<uint32_t>(sizeof(TDataType));
+		bool success = read(reinterpret_cast<uint8_t*>(&data), dataSize);
+		// PX_ASSERT( success );
+		// PX_ASSERT( dataSize == sizeof( data ) );
+		(void)success;
+		return *this;
+	}
+};
+
+// Wraps a PvdInputStream and byte-swaps every value after it has been read.
+// Holds the wrapped stream by reference, so assignment is declared but kept
+// inaccessible to outside callers.
+struct ByteSwappingPvdInputStream
+{
+  protected:
+	ByteSwappingPvdInputStream& operator=(ByteSwappingPvdInputStream& other);
+
+  public:
+	PvdInputStream& mStream;
+	ByteSwappingPvdInputStream(PvdInputStream& stream) : mStream(stream)
+	{
+	}
+
+	// Reads numItems elements through the wrapped stream, then swaps each of them.
+	// NOTE(review): for element types other than uint8_t the call below appears to
+	// resolve to PvdInputStream's template overload, which takes the count by value,
+	// so numItems would not shrink on a short read and all requested elements are
+	// swapped regardless — confirm this is intended.
+	template <typename TDataType>
+	bool read(TDataType* buffer, uint32_t& numItems)
+	{
+		bool retval = mStream.read(buffer, numItems);
+		for(uint32_t idx = 0; idx < numItems; ++idx)
+			swapBytes(buffer[idx]);
+		return retval;
+	}
+
+	// Reads one value through the wrapped stream, swaps it in place, and returns
+	// this stream for chaining.
+	template <typename TDataType>
+	ByteSwappingPvdInputStream& operator>>(TDataType& data)
+	{
+		mStream >> data;
+		swapBytes(data);
+		return *this;
+	}
+};
+
+// Abstract sink for raw bytes, with typed convenience wrappers layered on top of
+// the virtual byte-level write.
+class PvdOutputStream
+{
+  protected:
+	// Protected so a stream cannot be deleted through this interface.
+	virtual ~PvdOutputStream()
+	{
+	}
+
+  public:
+	// Return false if you can't write the number of bytes requested
+	// But make an absolute best effort to write the data...
+	virtual bool write(const uint8_t* buffer, uint32_t len) = 0;
+	// NOTE(review): presumably copies len bytes from inStream into this stream —
+	// confirm against the implementations.
+	virtual bool directCopy(PvdInputStream& inStream, uint32_t len) = 0;
+
+	// Writes numItems elements of TDataType as raw bytes via the virtual write.
+	template <typename TDataType>
+	bool write(const TDataType* buffer, uint32_t numItems)
+	{
+		return write(reinterpret_cast<const uint8_t*>(buffer), numItems * sizeof(TDataType));
+	}
+
+	// Writes the raw bytes of data and returns the stream for chaining.
+	// A failed write trips PX_ASSERT but is otherwise not reported.
+	template <typename TDataType>
+	PvdOutputStream& operator<<(const TDataType& data)
+	{
+		bool success = write(reinterpret_cast<const uint8_t*>(&data), sizeof(data));
+		PX_ASSERT(success);
+		(void)success;
+		return *this;
+	}
+
+	// Writes the characters of a C string without its NUL terminator.
+	// Null and empty strings write nothing; the write result is not checked.
+	PvdOutputStream& operator<<(const char* inString)
+	{
+		if(inString && *inString)
+		{
+			uint32_t len(strLen(inString));
+			write(inString, len);
+		}
+		return *this;
+	}
+};
+}
+}
+#endif // PXPVDSDK_PXPVDBYTESTREAMS_H
diff --git a/PxShared/src/pvd/src/PxPvdCommStreamEventSink.h b/PxShared/src/pvd/src/PxPvdCommStreamEventSink.h
new file mode 100644
index 0000000..57e8635
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdCommStreamEventSink.h
@@ -0,0 +1,55 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDCOMMSTREAMEVENTSINK_H
+#define PXPVDSDK_PXPVDCOMMSTREAMEVENTSINK_H
+
+#include "PxPvdObjectModelBaseTypes.h"
+#include "PxPvdCommStreamEvents.h"
+#include "PxPvdCommStreamTypes.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Stateless helper that serializes one comm-stream event into a stream.
+class PvdCommStreamEventSink
+{
+  public:
+	// Writes evtType to the stream first, followed by the serialized body of evt.
+	// All calls go through the PvdEventSerializer interface of a stack-local
+	// EventStreamifier bound to the stream. serialize() is invoked on a
+	// non-const view of evt, hence the const_cast.
+	template <typename TStreamType>
+	static void writeStreamEvent(const EventSerializeable& evt, PvdCommStreamEventTypes::Enum evtType, TStreamType& stream)
+	{
+		EventStreamifier<TStreamType> concreteSerializer(stream);
+		PvdEventSerializer& serializer = concreteSerializer;
+
+		serializer.streamify(evtType);
+
+		EventSerializeable& mutableEvt = const_cast<EventSerializeable&>(evt);
+		mutableEvt.serialize(serializer);
+	}
+};
+
+} // namespace pvdsdk
+} // namespace physx
+#endif // PXPVDSDK_PXPVDCOMMSTREAMEVENTSINK_H
diff --git a/PxShared/src/pvd/src/PxPvdCommStreamEvents.h b/PxShared/src/pvd/src/PxPvdCommStreamEvents.h
new file mode 100644
index 0000000..81770da
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdCommStreamEvents.h
@@ -0,0 +1,987 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDCOMMSTREAMEVENTS_H
+#define PXPVDSDK_PXPVDCOMMSTREAMEVENTS_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxFlags.h"
+
+#include "PxPvdObjectModelBaseTypes.h"
+#include "PsTime.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+/** Bit values that can be stored in a stream's CommStreamFlags. */
+struct CommStreamFlagTypes
+{
+	enum Enum
+	{
+		// StreamInitialization sets this bit whenever sizeof(void*) is not 4.
+		Is64BitPtr = (1 << 0)
+	};
+};
+
+/** Flag set over CommStreamFlagTypes::Enum, stored in a uint32_t. */
+typedef PxFlags<CommStreamFlagTypes::Enum, uint32_t> CommStreamFlags;
+
+template <typename TDataType>
+struct PvdCommVariableSizedEventCheck
+{
+	bool variable_size_check; // Primary template: the marker member is a bool.
+};
+
+// Pick out the events that are possibly very large.
+// This helps us keep our buffers close to the size the user requested. The macro below specializes the check for `type` so that its marker member is a uint32_t instead of a bool. NOTE(review): the code that tells the two apart is not in this file - confirm how the marker is consumed.
+#define DECLARE_TYPE_VARIABLE_SIZED(type)                                                                              \
+	template <>                                                                                                        \
+	struct PvdCommVariableSizedEventCheck<type>                                                                        \
+	{                                                                                                                  \
+		uint32_t variable_size_check;                                                                                  \
+	};
+
+struct NameHandleValue; // Forward declarations: these types are named by the serializer interface and the type-mapping templates before their definitions appear further down.
+struct StreamPropMessageArg;
+struct StringHandleEvent;
+struct CreateClass;
+struct DeriveClass;
+struct CreateProperty;
+struct CreatePropertyMessage;
+struct CreateInstance;
+struct SetPropertyValue;
+struct BeginSetPropertyValue;
+struct AppendPropertyValueData;
+struct EndSetPropertyValue;
+struct SetPropertyMessage;
+struct BeginPropertyMessageGroup;
+struct SendPropertyMessageFromGroup;
+struct EndPropertyMessageGroup;
+struct CreateDestroyInstanceProperty;
+struct PushBackObjectRef;
+struct RemoveObjectRef;
+struct BeginSection;
+struct EndSection;
+struct SetPickable;
+struct SetColor;
+struct SetIsTopLevel;
+struct SetCamera;
+struct AddProfileZone;
+struct AddProfileZoneEvent;
+struct StreamEndEvent;
+struct ErrorMessage;
+struct OriginShift;
+struct DestroyInstance;
+
+#define DECLARE_COMM_STREAM_EVENTS /* X-macro list; the entry order fixes the PvdCommStreamEventTypes values, which are streamed as one byte */ \
+	\
+DECLARE_PVD_COMM_STREAM_EVENT(StringHandleEvent) \
+DECLARE_PVD_COMM_STREAM_EVENT(CreateClass) \
+DECLARE_PVD_COMM_STREAM_EVENT(DeriveClass) \
+DECLARE_PVD_COMM_STREAM_EVENT(CreateProperty) \
+DECLARE_PVD_COMM_STREAM_EVENT(CreatePropertyMessage) \
+DECLARE_PVD_COMM_STREAM_EVENT(CreateInstance) \
+DECLARE_PVD_COMM_STREAM_EVENT(SetPropertyValue) \
+DECLARE_PVD_COMM_STREAM_EVENT(BeginSetPropertyValue) \
+DECLARE_PVD_COMM_STREAM_EVENT(AppendPropertyValueData) \
+DECLARE_PVD_COMM_STREAM_EVENT(EndSetPropertyValue) \
+DECLARE_PVD_COMM_STREAM_EVENT(SetPropertyMessage) \
+DECLARE_PVD_COMM_STREAM_EVENT(BeginPropertyMessageGroup) \
+DECLARE_PVD_COMM_STREAM_EVENT(SendPropertyMessageFromGroup) \
+DECLARE_PVD_COMM_STREAM_EVENT(EndPropertyMessageGroup) \
+DECLARE_PVD_COMM_STREAM_EVENT(DestroyInstance) \
+DECLARE_PVD_COMM_STREAM_EVENT(PushBackObjectRef) \
+DECLARE_PVD_COMM_STREAM_EVENT(RemoveObjectRef) \
+DECLARE_PVD_COMM_STREAM_EVENT(BeginSection) \
+DECLARE_PVD_COMM_STREAM_EVENT(EndSection) \
+DECLARE_PVD_COMM_STREAM_EVENT(SetPickable) \
+DECLARE_PVD_COMM_STREAM_EVENT(SetColor) \
+DECLARE_PVD_COMM_STREAM_EVENT(SetIsTopLevel) \
+DECLARE_PVD_COMM_STREAM_EVENT(SetCamera) \
+DECLARE_PVD_COMM_STREAM_EVENT(AddProfileZone) \
+DECLARE_PVD_COMM_STREAM_EVENT(AddProfileZoneEvent) \
+DECLARE_PVD_COMM_STREAM_EVENT(StreamEndEvent) \
+DECLARE_PVD_COMM_STREAM_EVENT(ErrorMessage) \
+DECLARE_PVD_COMM_STREAM_EVENT_NO_COMMA(OriginShift)
+
+struct PvdCommStreamEventTypes
+{
+	enum Enum
+	{
+		Unknown = 0, // The listed events follow, numbered from 1 in list order.
+#define DECLARE_PVD_COMM_STREAM_EVENT(x) x,
+#define DECLARE_PVD_COMM_STREAM_EVENT_NO_COMMA(x) x
+		DECLARE_COMM_STREAM_EVENTS
+#undef DECLARE_PVD_COMM_STREAM_EVENT_NO_COMMA
+#undef DECLARE_PVD_COMM_STREAM_EVENT
+        , Last // One past the final listed event.
+	};
+};
+
+template <typename TDataType>
+struct DatatypeToCommEventType
+{
+	bool compile_error; // Primary template has no EEventTypeMap, so looking up an unlisted type fails to compile.
+};
+template <PvdCommStreamEventTypes::Enum TEnumType>
+struct CommEventTypeToDatatype
+{
+	bool compile_error; // Primary template has no TEventType, so looking up an unlisted enum value fails to compile.
+};
+
+#define DECLARE_PVD_COMM_STREAM_EVENT(x)                                                                               \
+	template <>                                                                                                        \
+	struct DatatypeToCommEventType<x>                                                                                  \
+	{                                                                                                                  \
+		enum Enum                                                                                                      \
+		{                                                                                                              \
+			EEventTypeMap = PvdCommStreamEventTypes::x                                                                 \
+		};                                                                                                             \
+	};                                                                                                                 \
+	template <>                                                                                                        \
+	struct CommEventTypeToDatatype<PvdCommStreamEventTypes::x>                                                         \
+	{                                                                                                                  \
+		typedef x TEventType;                                                                                          \
+	};
+#define DECLARE_PVD_COMM_STREAM_EVENT_NO_COMMA(x)                                                                      \
+	\
+template<> struct DatatypeToCommEventType<x>                                                                           \
+	{                                                                                                                  \
+		enum Enum                                                                                                      \
+		{                                                                                                              \
+			EEventTypeMap = PvdCommStreamEventTypes::x                                                                 \
+		};                                                                                                             \
+	};                                                                                                                 \
+	\
+template<> struct CommEventTypeToDatatype<PvdCommStreamEventTypes::x>                                                  \
+	{                                                                                                                  \
+		typedef x TEventType;                                                                                          \
+	};
+
+DECLARE_COMM_STREAM_EVENTS // Emits both specializations (type -> enum value, enum value -> type) for every listed event.
+#undef DECLARE_PVD_COMM_STREAM_EVENT_NO_COMMA
+#undef DECLARE_PVD_COMM_STREAM_EVENT
+
+/**
+ * Returns the PvdCommStreamEventTypes value registered for TDataType.
+ * Only compiles for types that have a DatatypeToCommEventType specialization.
+ */
+template <typename TDataType>
+PvdCommStreamEventTypes::Enum getCommStreamEventType()
+{
+	typedef DatatypeToCommEventType<TDataType> TTypeMap;
+	return static_cast<PvdCommStreamEventTypes::Enum>(TTypeMap::EEventTypeMap);
+}
+
+/** A (namespace, name) pair in which each part is held as a StringHandle. */
+struct StreamNamespacedName
+{
+	StringHandle mNamespace;
+	StringHandle mName;
+
+	/** Either part defaults to a handle built from 0 when omitted. */
+	StreamNamespacedName(StringHandle ns = 0, StringHandle nm = 0)
+	: mNamespace(ns)
+	, mName(nm)
+	{
+	}
+};
+
+class EventSerializeable;
+
+/**
+ * Interface through which events stream their fields.
+ *
+ * Concrete serializers implement the pure virtual overloads. The non-virtual overloads break
+ * composite values down into those primitives. Every overload takes a non-const reference, and
+ * the converting helpers assign the streamed primitive back into the argument afterwards -
+ * presumably so that an implementation may either read or overwrite the value (confirm
+ * against the concrete serializers).
+ */
+class PvdEventSerializer
+{
+  protected:
+	// Protected destructor: objects cannot be deleted through this interface.
+	virtual ~PvdEventSerializer()
+	{
+	}
+
+  public:
+	// Primitive and array overloads supplied by the concrete serializer.
+	virtual void streamify(uint8_t& val) = 0;
+	virtual void streamify(uint16_t& val) = 0;
+	virtual void streamify(uint32_t& val) = 0;
+	virtual void streamify(float& val) = 0;
+	virtual void streamify(uint64_t& val) = 0;
+	virtual void streamify(String& val) = 0;
+	virtual void streamify(DataRef<const uint8_t>& data) = 0;
+	virtual void streamify(DataRef<NameHandleValue>& data) = 0;
+	virtual void streamify(DataRef<StreamPropMessageArg>& data) = 0;
+	virtual void streamify(DataRef<StringHandle>& data) = 0;
+
+	// A string handle is streamed as its mHandle member.
+	void streamify(StringHandle& hdl)
+	{
+		streamify(hdl.mHandle);
+	}
+
+	// Flags are streamed as a uint32_t and rebuilt from the streamed value.
+	void streamify(CommStreamFlags& flags)
+	{
+		uint32_t rawBits(flags);
+		streamify(rawBits);
+		flags = CommStreamFlags(rawBits);
+	}
+
+	// Event type tags are narrowed to a single byte.
+	void streamify(PvdCommStreamEventTypes::Enum& val)
+	{
+		uint8_t asByte = static_cast<uint8_t>(val);
+		streamify(asByte);
+		val = static_cast<PvdCommStreamEventTypes::Enum>(asByte);
+	}
+
+	// Property types are narrowed to a single byte as well.
+	void streamify(PropertyType::Enum& val)
+	{
+		uint8_t asByte = static_cast<uint8_t>(val);
+		streamify(asByte);
+		val = static_cast<PropertyType::Enum>(asByte);
+	}
+
+	// Booleans are streamed as a byte holding 0 or 1; any non-zero byte yields true.
+	void streamify(bool& val)
+	{
+		uint8_t asByte = uint8_t(val ? 1 : 0);
+		streamify(asByte);
+		val = (asByte != 0);
+	}
+
+	// Namespace first, then name.
+	void streamify(StreamNamespacedName& name)
+	{
+		streamify(name.mNamespace);
+		streamify(name.mName);
+	}
+
+	// Channels in r, g, b, a order.
+	void streamify(PvdColor& color)
+	{
+		streamify(color.r);
+		streamify(color.g);
+		streamify(color.b);
+		streamify(color.a);
+	}
+
+	// Components in x, y, z order.
+	void streamify(PxVec3& vec)
+	{
+		streamify(vec.x);
+		streamify(vec.y);
+		streamify(vec.z);
+	}
+
+	// Declared here only; the definition is not part of this header.
+	static uint32_t measure(const EventSerializeable& evt);
+};
+
+/** Base of every event that can stream itself through a PvdEventSerializer. */
+class EventSerializeable
+{
+  public:
+	/** Streams every field of the event through serializer; non-const by contract. */
+	virtual void serialize(PvdEventSerializer& serializer) = 0;
+
+  protected:
+	// Protected destructor: events cannot be deleted through this base class.
+	virtual ~EventSerializeable() {}
+};
+
+/** Numbers generated from random.org
+129919156	17973702	401496246	144984007	336950759
+907025328	837150850	679717896	601529147	269478202
+*/
+/** Event describing a stream: its id, version, timestamp ratio and flags. */
+struct StreamInitialization : public EventSerializeable
+{
+	// Fixed stream identifier; the value is one of the random.org numbers listed above.
+	static uint32_t getStreamId() { return 837150850; }
+
+	static uint32_t getStreamVersion() { return 1; }
+
+	uint32_t mStreamId;
+	uint32_t mStreamVersion;
+	uint64_t mTimestampNumerator;
+	uint64_t mTimestampDenominator;
+	CommStreamFlags mStreamFlags;
+
+	// NOTE(review): the counter-frequency numerator is scaled by 10 here; the reason is not
+	// visible in this file - confirm against the consumer of the timestamp ratio.
+	StreamInitialization()
+	: mStreamId(getStreamId())
+	, mStreamVersion(getStreamVersion())
+	, mTimestampNumerator(physx::shdfnd::Time::getCounterFrequency().mNumerator * 10)
+	, mTimestampDenominator(physx::shdfnd::Time::getCounterFrequency().mDenominator)
+	// No flag when pointers are 4 bytes wide, Is64BitPtr otherwise.
+	, mStreamFlags(sizeof(void*) == 4 ? uint32_t(0) : uint32_t(CommStreamFlagTypes::Is64BitPtr))
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mStreamId);
+		s.streamify(mStreamVersion);
+		s.streamify(mTimestampNumerator);
+		s.streamify(mTimestampDenominator);
+		s.streamify(mStreamFlags);
+	}
+};
+
+/** Header that precedes a batch of events; the batch data directly follows it. */
+struct EventGroup : public EventSerializeable
+{
+	uint32_t mDataSize; // Size in bytes of the data that follows this header.
+	uint32_t mNumEvents;
+	uint64_t mStreamId;
+	uint64_t mTimestamp;
+
+	/** Every argument defaults to 0. */
+	EventGroup(uint32_t dataSize = 0, uint32_t numEvents = 0, uint64_t streamId = 0, uint64_t ts = 0)
+	: mDataSize(dataSize)
+	, mNumEvents(numEvents)
+	, mStreamId(streamId)
+	, mTimestamp(ts)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mDataSize);
+		s.streamify(mNumEvents);
+		s.streamify(mStreamId);
+		s.streamify(mTimestamp);
+	}
+};
+
+/**
+ * Event carrying a string together with a 32-bit handle value.
+ *
+ * The default constructor sets mString to NULL and mHandle to 0, so a default-constructed
+ * event never hands indeterminate members to serialize().
+ */
+struct StringHandleEvent : public EventSerializeable
+{
+	String mString;
+	uint32_t mHandle;
+	StringHandleEvent(String str, uint32_t hdl) : mString(str), mHandle(hdl)
+	{
+	}
+	StringHandleEvent() : mString(NULL), mHandle(0)
+	{
+	}
+
+	// Streams the string first, then the handle.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mString);
+		s.streamify(mHandle);
+	}
+};
+
+// Marks this event type as variable-sized via DECLARE_TYPE_VARIABLE_SIZED.
+DECLARE_TYPE_VARIABLE_SIZED(StringHandleEvent)
+
+/** 64-bit timestamp type used by the section events. */
+typedef uint64_t Timestamp;
+
+/** Event carrying the namespaced name of a class. */
+struct CreateClass : public EventSerializeable
+{
+	StreamNamespacedName mName;
+
+	CreateClass() {}
+	CreateClass(StreamNamespacedName nm) : mName(nm) {}
+
+	// The name is the only streamed field.
+	void serialize(PvdEventSerializer& s) { s.streamify(mName); }
+};
+
+/** Event carrying the namespaced names of a parent class and a child class. */
+struct DeriveClass : public EventSerializeable
+{
+	StreamNamespacedName mParent;
+	StreamNamespacedName mChild;
+
+	DeriveClass() {}
+	DeriveClass(StreamNamespacedName p, StreamNamespacedName c)
+	: mParent(p)
+	, mChild(c)
+	{
+	}
+
+	// Parent first, then child.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mParent);
+		s.streamify(mChild);
+	}
+};
+
+/**
+ * Pair of a name handle and a 32-bit value.
+ *
+ * The default constructor zero-initializes mValue so that serializing a default-constructed
+ * object never reads an indeterminate value.
+ */
+struct NameHandleValue : public EventSerializeable
+{
+	StringHandle mName;
+	uint32_t mValue;
+	NameHandleValue(StringHandle name, uint32_t val) : mName(name), mValue(val)
+	{
+	}
+	NameHandleValue() : mValue(0)
+	{
+	}
+
+	// Name first, then value.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mName);
+		s.streamify(mValue);
+	}
+};
+/*virtual PvdError createProperty( StreamNamespacedName clsName, StringHandle name, StringHandle semantic
+                                    , StreamNamespacedName dtypeName, PropertyType::Enum propertyType
+                                    , DataRef<NamedValue> values = DataRef<NamedValue>() ) = 0; */
+/**
+ * Event describing a property: owning class, name, semantic, datatype name, property type and
+ * a range of named values.
+ *
+ * The default constructor sets mPropertyType to the enumerator with value 0 so that
+ * serializing a default-constructed event never reads an indeterminate enum.
+ */
+struct CreateProperty : public EventSerializeable
+{
+	StreamNamespacedName mClass;
+	StringHandle mName;
+	StringHandle mSemantic;
+	StreamNamespacedName mDatatypeName;
+	PropertyType::Enum mPropertyType;
+	DataRef<NameHandleValue> mValues;
+
+	CreateProperty(StreamNamespacedName cls, StringHandle name, StringHandle semantic, StreamNamespacedName dtypeName,
+	               PropertyType::Enum ptype, DataRef<NameHandleValue> values)
+	: mClass(cls), mName(name), mSemantic(semantic), mDatatypeName(dtypeName), mPropertyType(ptype), mValues(values)
+	{
+	}
+	// The enumerator names live outside this header, hence the cast from 0.
+	CreateProperty() : mPropertyType(static_cast<PropertyType::Enum>(0))
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mClass);
+		s.streamify(mName);
+		s.streamify(mSemantic);
+		s.streamify(mDatatypeName);
+		s.streamify(mPropertyType);
+		s.streamify(mValues);
+	}
+};
+
+/**
+ * One entry of a property message: property name, datatype name, offset and byte size.
+ *
+ * The default constructor zero-initializes mMessageOffset and mByteSize so that serializing a
+ * default-constructed entry never reads indeterminate values.
+ */
+struct StreamPropMessageArg : public EventSerializeable
+{
+	StringHandle mPropertyName;
+	StreamNamespacedName mDatatypeName;
+	uint32_t mMessageOffset;
+	uint32_t mByteSize;
+	StreamPropMessageArg(StringHandle pname, StreamNamespacedName dtypeName, uint32_t offset, uint32_t byteSize)
+	: mPropertyName(pname), mDatatypeName(dtypeName), mMessageOffset(offset), mByteSize(byteSize)
+	{
+	}
+
+	StreamPropMessageArg() : mMessageOffset(0), mByteSize(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mPropertyName);
+		s.streamify(mDatatypeName);
+		s.streamify(mMessageOffset);
+		s.streamify(mByteSize);
+	}
+};
+
+/*
+    virtual PvdError createPropertyMessage( StreamNamespacedName cls, StreamNamespacedName msgName
+                                                , DataRef<PropertyMessageArg> entries, uint32_t messageSizeInBytes ) =
+   0;*/
+/**
+ * Event describing a property message: owning class, message name, its entries and the
+ * message size in bytes.
+ *
+ * The default constructor zero-initializes mMessageByteSize so that serializing a
+ * default-constructed event never reads an indeterminate value.
+ */
+struct CreatePropertyMessage : public EventSerializeable
+{
+	StreamNamespacedName mClass;
+	StreamNamespacedName mMessageName;
+	DataRef<StreamPropMessageArg> mMessageEntries;
+	uint32_t mMessageByteSize;
+
+	CreatePropertyMessage(StreamNamespacedName cls, StreamNamespacedName msgName, DataRef<StreamPropMessageArg> propArg,
+	                      uint32_t messageByteSize)
+	: mClass(cls), mMessageName(msgName), mMessageEntries(propArg), mMessageByteSize(messageByteSize)
+	{
+	}
+	CreatePropertyMessage() : mMessageByteSize(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mClass);
+		s.streamify(mMessageName);
+		s.streamify(mMessageEntries);
+		s.streamify(mMessageByteSize);
+	}
+};
+
+/**Changing immediate data on instances*/
+
+// virtual PvdError createInstance( StreamNamespacedName cls, uint64_t instance ) = 0;
+/**
+ * Event carrying a class name and a 64-bit instance id.
+ *
+ * The default constructor zero-initializes mInstanceId so that serializing a
+ * default-constructed event never reads an indeterminate value.
+ */
+struct CreateInstance : public EventSerializeable
+{
+	StreamNamespacedName mClass;
+	uint64_t mInstanceId;
+
+	CreateInstance(StreamNamespacedName cls, uint64_t streamId) : mClass(cls), mInstanceId(streamId)
+	{
+	}
+	CreateInstance() : mInstanceId(0)
+	{
+	}
+
+	// Class first, then instance id.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mClass);
+		s.streamify(mInstanceId);
+	}
+};
+
+// virtual PvdError setPropertyValue( uint64_t instance, StringHandle name, DataRef<const uint8_t> data,
+// StreamNamespacedName incomingTypeName ) = 0;
+/**
+ * Event carrying an instance id, a property name, the incoming type name, an item count and
+ * a range of raw bytes.
+ *
+ * The default constructor zero-initializes mInstanceId and mNumItems so that serializing a
+ * default-constructed event never reads indeterminate values.
+ */
+struct SetPropertyValue : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	StringHandle mPropertyName;
+	DataRef<const uint8_t> mData;
+	StreamNamespacedName mIncomingTypeName;
+	uint32_t mNumItems;
+
+	SetPropertyValue(uint64_t instance, StringHandle name, DataRef<const uint8_t> data,
+	                 StreamNamespacedName incomingTypeName, uint32_t numItems)
+	: mInstanceId(instance), mPropertyName(name), mData(data), mIncomingTypeName(incomingTypeName), mNumItems(numItems)
+	{
+	}
+
+	SetPropertyValue() : mInstanceId(0), mNumItems(0)
+	{
+	}
+
+	// Streams every field except the data bytes.
+	void serializeBeginning(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mPropertyName);
+		s.streamify(mIncomingTypeName);
+		s.streamify(mNumItems);
+	}
+
+	// Streams the leading fields and then the data bytes. Note that mData goes last, although
+	// it is declared third.
+	void serialize(PvdEventSerializer& s)
+	{
+		serializeBeginning(s);
+		s.streamify(mData);
+	}
+};
+
+// Marks this event type as variable-sized via DECLARE_TYPE_VARIABLE_SIZED.
+DECLARE_TYPE_VARIABLE_SIZED(SetPropertyValue)
+
+/**
+ * Event carrying an instance id, a property name and the incoming type name.
+ *
+ * The default constructor zero-initializes mInstanceId so that serializing a
+ * default-constructed event never reads an indeterminate value.
+ */
+struct BeginSetPropertyValue : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	StringHandle mPropertyName;
+	StreamNamespacedName mIncomingTypeName;
+
+	BeginSetPropertyValue(uint64_t instance, StringHandle name, StreamNamespacedName incomingTypeName)
+	: mInstanceId(instance), mPropertyName(name), mIncomingTypeName(incomingTypeName)
+	{
+	}
+	BeginSetPropertyValue() : mInstanceId(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mPropertyName);
+		s.streamify(mIncomingTypeName);
+	}
+};
+
+// virtual PvdError appendPropertyValueData( DataRef<const uint8_t> data ) = 0;
+/**
+ * Event carrying a range of raw bytes and an item count.
+ *
+ * The default constructor zero-initializes mNumItems so that serializing a
+ * default-constructed event never reads an indeterminate value.
+ */
+struct AppendPropertyValueData : public EventSerializeable
+{
+	DataRef<const uint8_t> mData;
+	uint32_t mNumItems;
+	AppendPropertyValueData(DataRef<const uint8_t> data, uint32_t numItems) : mData(data), mNumItems(numItems)
+	{
+	}
+	AppendPropertyValueData() : mNumItems(0)
+	{
+	}
+
+	// Data first, then the item count.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mData);
+		s.streamify(mNumItems);
+	}
+};
+
+// Marks this event type as variable-sized via DECLARE_TYPE_VARIABLE_SIZED.
+DECLARE_TYPE_VARIABLE_SIZED(AppendPropertyValueData)
+
+// virtual PvdError endSetPropertyValue() = 0;
+/** Payload-free event: serialize() streams nothing. */
+struct EndSetPropertyValue : public EventSerializeable
+{
+	EndSetPropertyValue() {}
+
+	// There are no fields to stream.
+	void serialize(PvdEventSerializer&) {}
+};
+
+// virtual PvdError setPropertyMessage( uint64_t instance, StreamNamespacedName msgName, DataRef<const uint8_t> data ) =
+// 0;
+/**
+ * Event carrying an instance id, a message name and a range of raw bytes.
+ *
+ * The default constructor zero-initializes mInstanceId so that serializing a
+ * default-constructed event never reads an indeterminate value.
+ */
+struct SetPropertyMessage : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	StreamNamespacedName mMessageName;
+	DataRef<const uint8_t> mData;
+
+	SetPropertyMessage(uint64_t instance, StreamNamespacedName msgName, DataRef<const uint8_t> data)
+	: mInstanceId(instance), mMessageName(msgName), mData(data)
+	{
+	}
+
+	SetPropertyMessage() : mInstanceId(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mMessageName);
+		s.streamify(mData);
+	}
+};
+
+// Marks this event type as variable-sized via DECLARE_TYPE_VARIABLE_SIZED.
+DECLARE_TYPE_VARIABLE_SIZED(SetPropertyMessage)
+
+// virtual PvdError beginPropertyMessageGroup( StreamNamespacedName msgName ) = 0;
+/** Event carrying the namespaced name of a property message. */
+struct BeginPropertyMessageGroup : public EventSerializeable
+{
+	StreamNamespacedName mMsgName;
+
+	BeginPropertyMessageGroup() {}
+	BeginPropertyMessageGroup(StreamNamespacedName msgName) : mMsgName(msgName) {}
+
+	// The message name is the only streamed field.
+	void serialize(PvdEventSerializer& s) { s.streamify(mMsgName); }
+};
+
+// virtual PvdError sendPropertyMessageFromGroup( uint64_t instance, DataRef<const uint8_t*> data ) = 0;
+/**
+ * Event carrying an instance id and a range of raw bytes.
+ *
+ * The default constructor zero-initializes mInstance so that serializing a
+ * default-constructed event never reads an indeterminate value.
+ */
+struct SendPropertyMessageFromGroup : public EventSerializeable
+{
+	uint64_t mInstance;
+	DataRef<const uint8_t> mData;
+
+	SendPropertyMessageFromGroup(uint64_t instance, DataRef<const uint8_t> data) : mInstance(instance), mData(data)
+	{
+	}
+	SendPropertyMessageFromGroup() : mInstance(0)
+	{
+	}
+
+	// Instance id first, then the data bytes.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstance);
+		s.streamify(mData);
+	}
+};
+
+// Marks this event type as variable-sized via DECLARE_TYPE_VARIABLE_SIZED.
+DECLARE_TYPE_VARIABLE_SIZED(SendPropertyMessageFromGroup)
+
+// virtual PvdError endPropertyMessageGroup() = 0;
+/** Payload-free event: serialize() streams nothing. */
+struct EndPropertyMessageGroup : public EventSerializeable
+{
+	EndPropertyMessageGroup() {}
+
+	// There are no fields to stream.
+	void serialize(PvdEventSerializer&) {}
+};
+
+/**
+ * Event carrying an instance id, a property handle and an object reference id.
+ *
+ * The default constructor zero-initializes mInstanceId and mObjectRef so that serializing a
+ * default-constructed event never reads indeterminate values.
+ */
+struct PushBackObjectRef : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	StringHandle mProperty;
+	uint64_t mObjectRef;
+
+	PushBackObjectRef(uint64_t instId, StringHandle prop, uint64_t objRef)
+	: mInstanceId(instId), mProperty(prop), mObjectRef(objRef)
+	{
+	}
+
+	PushBackObjectRef() : mInstanceId(0), mObjectRef(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mProperty);
+		s.streamify(mObjectRef);
+	}
+};
+
+/**
+ * Event carrying an instance id, a property handle and an object reference id.
+ *
+ * The default constructor zero-initializes mInstanceId and mObjectRef so that serializing a
+ * default-constructed event never reads indeterminate values.
+ */
+struct RemoveObjectRef : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	StringHandle mProperty;
+	uint64_t mObjectRef;
+
+	RemoveObjectRef(uint64_t instId, StringHandle prop, uint64_t objRef)
+	: mInstanceId(instId), mProperty(prop), mObjectRef(objRef)
+	{
+	}
+
+	RemoveObjectRef() : mInstanceId(0), mObjectRef(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mProperty);
+		s.streamify(mObjectRef);
+	}
+};
+
+// virtual PvdError destroyInstance( uint64_t key ) = 0;
+/**
+ * Event carrying the 64-bit id of an instance.
+ *
+ * The default constructor zero-initializes mInstanceId so that serializing a
+ * default-constructed event never reads an indeterminate value.
+ */
+struct DestroyInstance : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	DestroyInstance(uint64_t instance) : mInstanceId(instance)
+	{
+	}
+	DestroyInstance() : mInstanceId(0)
+	{
+	}
+
+	// The instance id is the only streamed field.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+	}
+};
+
+// virtual PvdError beginSection( uint64_t sectionId, StringHandle name ) = 0;
+/**
+ * Event carrying a section id, a name handle and a timestamp.
+ *
+ * The default constructor zero-initializes mSectionId and mTimestamp so that serializing a
+ * default-constructed event never reads indeterminate values.
+ */
+struct BeginSection : public EventSerializeable
+{
+	uint64_t mSectionId;
+	StringHandle mName;
+	Timestamp mTimestamp;
+	BeginSection(uint64_t sectionId, StringHandle name, uint64_t timestamp)
+	: mSectionId(sectionId), mName(name), mTimestamp(timestamp)
+	{
+	}
+	BeginSection() : mSectionId(0), mTimestamp(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mSectionId);
+		s.streamify(mName);
+		s.streamify(mTimestamp);
+	}
+};
+// virtual PvdError endSection( uint64_t sectionId, StringHandle name ) = 0;
+/**
+ * Event carrying a section id, a name handle and a timestamp.
+ *
+ * The default constructor zero-initializes mSectionId and mTimestamp so that serializing a
+ * default-constructed event never reads indeterminate values.
+ */
+struct EndSection : public EventSerializeable
+{
+	uint64_t mSectionId;
+	StringHandle mName;
+	Timestamp mTimestamp;
+	EndSection(uint64_t sectionId, StringHandle name, uint64_t timestamp)
+	: mSectionId(sectionId), mName(name), mTimestamp(timestamp)
+	{
+	}
+	EndSection() : mSectionId(0), mTimestamp(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mSectionId);
+		s.streamify(mName);
+		s.streamify(mTimestamp);
+	}
+};
+
+// virtual void setPickable( void* instance, bool pickable ) = 0;
+/**
+ * Event carrying an instance id and a pickable flag.
+ *
+ * The default constructor initializes mInstanceId to 0 and mPickable to false so that
+ * serializing a default-constructed event never reads indeterminate values.
+ */
+struct SetPickable : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	bool mPickable;
+	SetPickable(uint64_t instId, bool pick) : mInstanceId(instId), mPickable(pick)
+	{
+	}
+	SetPickable() : mInstanceId(0), mPickable(false)
+	{
+	}
+
+	// Instance id first, then the flag.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mPickable);
+	}
+};
+// virtual void setColor( void* instance, const PvdColor& color ) = 0;
+/**
+ * Event carrying an instance id and a color.
+ *
+ * The default constructor zero-initializes mInstanceId so that serializing a
+ * default-constructed event never reads an indeterminate id; mColor keeps whatever PvdColor's
+ * own default constructor produces.
+ */
+struct SetColor : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	PvdColor mColor;
+	SetColor(uint64_t instId, PvdColor color) : mInstanceId(instId), mColor(color)
+	{
+	}
+	SetColor() : mInstanceId(0)
+	{
+	}
+
+	// Instance id first, then the color.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mColor);
+	}
+};
+
+/**
+ * Event carrying an instance id and that instance's top-level flag.
+ *
+ * The default constructor initializes mInstanceId to 0 as well as mIsTopLevel to false, so
+ * that serializing a default-constructed event never reads an indeterminate id.
+ */
+struct SetIsTopLevel : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	bool mIsTopLevel;
+
+	SetIsTopLevel(uint64_t instId, bool topLevel) : mInstanceId(instId), mIsTopLevel(topLevel)
+	{
+	}
+	SetIsTopLevel() : mInstanceId(0), mIsTopLevel(false)
+	{
+	}
+
+	// Instance id first, then the flag.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mIsTopLevel);
+	}
+};
+
+/**
+ * Event carrying a camera name plus position, up and target vectors.
+ *
+ * The default constructor sets the name to NULL and all three vectors to zero, so that
+ * serializing a default-constructed event never reads indeterminate floats.
+ */
+struct SetCamera : public EventSerializeable
+{
+	String mName;
+	PxVec3 mPosition;
+	PxVec3 mUp;
+	PxVec3 mTarget;
+	SetCamera(String name, const PxVec3& pos, const PxVec3& up, const PxVec3& target)
+	: mName(name), mPosition(pos), mUp(up), mTarget(target)
+	{
+	}
+	// PxVec3(float) sets every component to the given value.
+	SetCamera() : mName(NULL), mPosition(0.0f), mUp(0.0f), mTarget(0.0f)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mName);
+		s.streamify(mPosition);
+		s.streamify(mUp);
+		s.streamify(mTarget);
+	}
+};
+
+/**
+ * Event carrying an error code, a message string, a file string and a line number.
+ *
+ * The default constructor sets both strings to NULL and zero-initializes mCode and mLine, so
+ * that serializing a default-constructed event never reads indeterminate values.
+ */
+struct ErrorMessage : public EventSerializeable
+{
+	uint32_t mCode;
+	String mMessage;
+	String mFile;
+	uint32_t mLine;
+
+	ErrorMessage(uint32_t code, String message, String file, uint32_t line)
+	: mCode(code), mMessage(message), mFile(file), mLine(line)
+	{
+	}
+
+	ErrorMessage() : mCode(0), mMessage(NULL), mFile(NULL), mLine(0)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mCode);
+		s.streamify(mMessage);
+		s.streamify(mFile);
+		s.streamify(mLine);
+	}
+};
+
+/**
+ * Event carrying an instance id and a name string.
+ *
+ * The default constructor sets the name to NULL and zero-initializes mInstanceId, so that
+ * serializing a default-constructed event never reads an indeterminate id.
+ */
+struct AddProfileZone : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	String mName;
+	AddProfileZone(uint64_t iid, String nm) : mInstanceId(iid), mName(nm)
+	{
+	}
+	AddProfileZone() : mInstanceId(0), mName(NULL)
+	{
+	}
+
+	// Instance id first, then the name.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mName);
+	}
+};
+
+/**
+ * Event carrying an instance id, a name string, a 16-bit event id and a
+ * compile-time-enabled flag.
+ *
+ * The default constructor initializes every member (NULL name, zero ids, false flag), so
+ * that serializing a default-constructed event never reads indeterminate values. This matches
+ * AddProfileZone, whose default constructor also sets its name to NULL.
+ */
+struct AddProfileZoneEvent : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	String mName;
+	uint16_t mEventId;
+	bool mCompileTimeEnabled;
+	AddProfileZoneEvent(uint64_t iid, String nm, uint16_t eid, bool cte)
+	: mInstanceId(iid), mName(nm), mEventId(eid), mCompileTimeEnabled(cte)
+	{
+	}
+	AddProfileZoneEvent() : mInstanceId(0), mName(NULL), mEventId(0), mCompileTimeEnabled(false)
+	{
+	}
+
+	// Fields are streamed in declaration order.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mName);
+		s.streamify(mEventId);
+		s.streamify(mCompileTimeEnabled);
+	}
+};
+
+/** Event whose only payload is a name string, set to "StreamEnd" on construction. */
+struct StreamEndEvent : public EventSerializeable
+{
+	String mName;
+
+	StreamEndEvent()
+	: mName("StreamEnd")
+	{
+	}
+
+	// The name is the only streamed field.
+	void serialize(PvdEventSerializer& s) { s.streamify(mName); }
+};
+
+/**
+ * Event carrying an instance id and a shift vector.
+ *
+ * The default constructor zero-initializes both members so that serializing a
+ * default-constructed event never reads indeterminate values.
+ */
+struct OriginShift : public EventSerializeable
+{
+	uint64_t mInstanceId;
+	PxVec3 mShift;
+
+	OriginShift(uint64_t iid, const PxVec3& shift) : mInstanceId(iid), mShift(shift)
+	{
+	}
+	// PxVec3(float) sets every component to the given value.
+	OriginShift() : mInstanceId(0), mShift(0.0f)
+	{
+	}
+
+	// Instance id first, then the shift vector.
+	void serialize(PvdEventSerializer& s)
+	{
+		s.streamify(mInstanceId);
+		s.streamify(mShift);
+	}
+};
+} // pvdsdk
+} // physx
+
+#endif // PXPVDSDK_PXPVDCOMMSTREAMEVENTS_H
diff --git a/PxShared/src/pvd/src/PxPvdCommStreamSDKEventTypes.h b/PxShared/src/pvd/src/PxPvdCommStreamSDKEventTypes.h
new file mode 100644
index 0000000..4105730
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdCommStreamSDKEventTypes.h
@@ -0,0 +1,32 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#define THERE_IS_NO_INCLUDE_GUARD_FOR_A_REASON // Marker only: defined here and undefined again at the end of the file.
+
+DECLARE_PVD_COMM_STREAM_SDK_EVENT(SetPauseState) // X-macro entry; DECLARE_PVD_COMM_STREAM_SDK_EVENT is not defined in this file, so the includer must define it first. Presumably the file is included more than once with different definitions - confirm against the includers.
+
+#undef THERE_IS_NO_INCLUDE_GUARD_FOR_A_REASON
diff --git a/PxShared/src/pvd/src/PxPvdCommStreamTypes.h b/PxShared/src/pvd/src/PxPvdCommStreamTypes.h
new file mode 100644
index 0000000..cbfda4d
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdCommStreamTypes.h
@@ -0,0 +1,262 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDCOMMSTREAMTYPES_H
+#define PXPVDSDK_PXPVDCOMMSTREAMTYPES_H
+
+#include "foundation/PxErrorCallback.h"
+#include "pvd/PxPvdTransport.h"
+
+#include "PxPvdRenderBuffer.h"
+#include "PxPvdObjectModelBaseTypes.h"
+#include "PxPvdCommStreamEvents.h"
+#include "PxPvdDataStream.h"
+#include "PsMutex.h"
+
+namespace physx
+{
+namespace profile
+{
+class PxProfileZone;
+class PxProfileMemoryEventBuffer;
+}
+namespace pvdsdk
+{
+struct PvdErrorMessage;
+class PvdObjectModelMetaData;
+
+DEFINE_PVD_TYPE_NAME_MAP(profile::PxProfileZone, "_debugger_", "PxProfileZone")
+DEFINE_PVD_TYPE_NAME_MAP(profile::PxProfileMemoryEventBuffer, "_debugger_", "PxProfileMemoryEventBuffer")
+DEFINE_PVD_TYPE_NAME_MAP(PvdErrorMessage, "_debugger_", "PvdErrorMessage")
+// All event streams are on the 'events' property of objects of these types
+// Builds the namespaced class name used for memory-event-totals objects.
+// NOTE(review): the namespace string here is "_debugger", whereas the type-name maps
+// declared just above use "_debugger_". The literal is runtime data and is kept as-is;
+// confirm which spelling the receiving side expects.
+static inline NamespacedName getMemoryEventTotalsClassName()
+{
+	const NamespacedName className("_debugger", "MemoryEventTotals");
+	return className;
+}
+
+// Abstract interface giving access to the PVD object-model meta data and to a registry
+// of live instances (keyed by their address).
+class PvdOMMetaDataProvider
+{
+  protected:
+	// Protected: objects cannot be deleted through this interface.
+	// NOTE(review): presumably lifetime is managed via addRef()/release() -- confirm in the implementation.
+	virtual ~PvdOMMetaDataProvider()
+	{
+	}
+
+  public:
+	virtual void addRef() = 0;
+	virtual void release() = 0;
+	// Returns the meta data object; every lock() must be paired with an unlock().
+	virtual PvdObjectModelMetaData& lock() = 0;
+	virtual void unlock() = 0;
+	// Registers `instance` as an object of class `clsName`; the bool result reports success.
+	virtual bool createInstance(const NamespacedName& clsName, const void* instance) = 0;
+	// True if `instance` is currently registered.
+	virtual bool isInstanceValid(const void* instance) = 0;
+	// Removes `instance` from the registry.
+	virtual void destroyInstance(const void* instance) = 0;
+	// Returns the class id the instance was registered with.
+	virtual int32_t getInstanceClassType(const void* instance) = 0;
+};
+
+class PvdCommStreamInternalConnection;
+
+// Callback interface notified about connection-level events.
+class PvdConnectionListener
+{
+  protected:
+	// Protected: listeners are not destroyed through this interface.
+	virtual ~PvdConnectionListener()
+	{
+	}
+
+  public:
+	// Called with the connection that has been disconnected.
+	virtual void onDisconnect(PvdCommStreamInternalConnection& connection) = 0;
+	// Called with the address of an instance that has been destroyed.
+	virtual void onInstanceDestroyed(const void* instance) = 0;
+};
+
+// Fixed semantic strings identifying the embedded event streams.
+class PvdCommStreamEmbeddedTypes
+{
+  public:
+	// Semantic string for profile event streams.
+	static const char* getProfileEventStreamSemantic()
+	{
+		const char* const semantic = "profile event stream";
+		return semantic;
+	}
+
+	// Semantic string for memory event streams.
+	static const char* getMemoryEventStreamSemantic()
+	{
+		const char* const semantic = "memory event stream";
+		return semantic;
+	}
+
+	// Semantic string for renderer event streams.
+	static const char* getRendererEventStreamSemantic()
+	{
+		const char* const semantic = "render event stream";
+		return semantic;
+	}
+};
+
+class PvdCommStreamEventBufferClient;
+
+// PvdEventSerializer implementation that writes every value as raw bytes into a
+// TStreamType, which must provide write(const uint8_t*, uint32_t).
+// No byte swapping is done here: values are copied in their in-memory representation.
+template <typename TStreamType>
+struct EventStreamifier : public PvdEventSerializer
+{
+	// Destination stream; held by reference, so it must outlive this object.
+	TStreamType& mBuffer;
+	EventStreamifier(TStreamType& buf) : mBuffer(buf)
+	{
+	}
+
+	// Writes sizeof(TDataType) bytes taken from `type`.
+	template <typename TDataType>
+	void write(const TDataType& type)
+	{
+		mBuffer.write(reinterpret_cast<const uint8_t*>(&type), sizeof(TDataType));
+	}
+	// Writes `count` consecutive items starting at `type` as one byte run.
+	template <typename TDataType>
+	void write(const TDataType* type, uint32_t count)
+	{
+		mBuffer.write(reinterpret_cast<const uint8_t*>(type), count * sizeof(TDataType));
+	}
+
+	// Byte arrays: 32-bit element count followed by the raw bytes.
+	void writeRef(DataRef<const uint8_t> data)
+	{
+		uint32_t amount = static_cast<uint32_t>(data.size());
+		write(amount);
+		write(data.begin(), amount);
+	}
+	// String-handle arrays: 32-bit element count followed by the raw handles.
+	void writeRef(DataRef<StringHandle> data)
+	{
+		uint32_t amount = static_cast<uint32_t>(data.size());
+		write(amount);
+		write(data.begin(), amount);
+	}
+	// Any other element type: 32-bit element count, then each element serializes itself.
+	template <typename TDataType>
+	void writeRef(DataRef<TDataType> data)
+	{
+		uint32_t amount = static_cast<uint32_t>(data.size());
+		write(amount);
+		for(uint32_t idx = 0; idx < amount; ++idx)
+		{
+			// serialize() is called on a non-const reference, hence the const_cast.
+			TDataType& dtype(const_cast<TDataType&>(data[idx]));
+			dtype.serialize(*this);
+		}
+	}
+
+	virtual void streamify(uint16_t& val)
+	{
+		write(val);
+	}
+	virtual void streamify(uint8_t& val)
+	{
+		write(val);
+	}
+	virtual void streamify(uint32_t& val)
+	{
+		write(val);
+	}
+	virtual void streamify(float& val)
+	{
+		write(val);
+	}
+	virtual void streamify(uint64_t& val)
+	{
+		write(val);
+	}
+	// Debug text: color, position and size as raw bytes, then the string (see below).
+	virtual void streamify(PvdDebugText& val)
+	{
+		write(val.color);
+		write(val.position);
+		write(val.size);
+		streamify(val.string);
+	}
+
+	// Strings: 32-bit length followed by that many characters. The length includes the
+	// terminating NUL; a string whose first character is NUL is written as length 0 with
+	// no payload.
+	// NOTE(review): nonNull() presumably maps a NULL pointer to "" -- confirm in its definition.
+	virtual void streamify(String& val)
+	{
+		uint32_t len = 0;
+		String temp = nonNull(val);
+		if(*temp)
+			len = static_cast<uint32_t>(strlen(temp) + 1);
+		write(len);
+		// With len == 0 this writes zero bytes, so `val` is not read.
+		write(val, len);
+	}
+	virtual void streamify(DataRef<const uint8_t>& val)
+	{
+		writeRef(val);
+	}
+	virtual void streamify(DataRef<NameHandleValue>& val)
+	{
+		writeRef(val);
+	}
+	virtual void streamify(DataRef<StreamPropMessageArg>& val)
+	{
+		writeRef(val);
+	}
+	virtual void streamify(DataRef<StringHandle>& val)
+	{
+		writeRef(val);
+	}
+
+  private:
+	// Declared but not defined: the reference member makes assignment meaningless.
+	EventStreamifier& operator=(const EventStreamifier&);
+};
+
+// Stream stand-in that stores nothing and only accumulates the number of bytes
+// that the write() calls would have produced.
+struct MeasureStream
+{
+	// Running total of bytes "written" so far.
+	uint32_t mSize;
+
+	MeasureStream() : mSize(0)
+	{
+	}
+
+	// Accounts for a single value of type TDataType.
+	template <typename TDataType>
+	void write(const TDataType&)
+	{
+		mSize += sizeof(TDataType);
+	}
+
+	// Accounts for `count` consecutive values of type TDataType; the pointer is not read.
+	template <typename TDataType>
+	void write(const TDataType* /*items*/, uint32_t count)
+	{
+		mSize += count * sizeof(TDataType);
+	}
+};
+
+// States a data stream can be in; used to assert that calls arrive in a legal order.
+struct DataStreamState
+{
+	enum Enum
+	{
+		Open = 0,                // no multi-call sequence in progress
+		SetPropertyValue = 1,    // between beginSetPropertyValue and endSetPropertyValue
+		PropertyMessageGroup = 2 // between beginPropertyMessageGroup and endPropertyMessageGroup
+	};
+};
+
+// PvdEventSerializer extended with the operations needed to read events back from a
+// block of bytes supplied through setData().
+class ExtendedEventSerializer : public PvdEventSerializer
+{
+  protected:
+	// Protected: instances are disposed of through release(), not delete.
+	virtual ~ExtendedEventSerializer()
+	{
+	}
+
+  public:
+	// Supplies the bytes the serializer operates on.
+	virtual void setData(DataRef<const uint8_t> eventData) = 0;
+	// True if this serializer performs byte swapping
+	virtual bool performsSwap() = 0;
+
+	// NOTE(review): presumably false once the supplied data was exhausted or malformed -- confirm.
+	virtual bool isGood() = 0;
+
+	virtual void release() = 0;
+
+	// Factory; `swapBytes` selects whether the returned serializer swaps bytes.
+	static ExtendedEventSerializer& createInputSerializer(bool swapBytes);
+};
+
+} // pvdsdk
+} // physx
+#endif // PXPVDSDK_PXPVDCOMMSTREAMTYPES_H
diff --git a/PxShared/src/pvd/src/PxPvdDataStream.cpp b/PxShared/src/pvd/src/PxPvdDataStream.cpp
new file mode 100644
index 0000000..da13140
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdDataStream.cpp
@@ -0,0 +1,870 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "PxPvdDataStream.h"
+#include "PxPvdFoundation.h"
+#include "PxPvdCommStreamEvents.h"
+#include "PxPvdCommStreamEventSink.h"
+#include "PxPvdCommStreamTypes.h"
+#include "PxPvdDataStreamHelpers.h"
+#include "PxPvdObjectModelInternalTypes.h"
+#include "PxPvdImpl.h"
+#include "PsTime.h"
+#include "PsFoundation.h"
+#include "foundation/PxMemory.h"
+
+using namespace physx;
+using namespace physx::pvdsdk;
+using namespace physx::shdfnd;
+
+namespace
+{
+
+// Scope guard for PvdOMMetaDataProvider: lock() is called on construction and
+// unlock() on destruction; operator-> exposes the meta data returned by lock().
+struct ScopedMetaData
+{
+	PvdOMMetaDataProvider& mProvider;
+	PvdObjectModelMetaData& mMeta;
+
+	ScopedMetaData(PvdOMMetaDataProvider& inProvider) : mProvider(inProvider), mMeta(inProvider.lock())
+	{
+	}
+
+	~ScopedMetaData()
+	{
+		mProvider.unlock();
+	}
+
+	// Forwards member access to the locked meta data object.
+	PvdObjectModelMetaData* operator->()
+	{
+		PvdObjectModelMetaData& locked = mMeta;
+		return &locked;
+	}
+
+  private:
+	// Declared but not defined: assignment is not supported.
+	ScopedMetaData& operator=(const ScopedMetaData&);
+};
+
+// Helper used while defining properties: it maintains a stack of name fragments that are
+// concatenated into one NUL-terminated name (e.g. "a.b[c]"), collects named values and
+// property-message arguments, and forwards the finished definitions to a PvdDataStream.
+struct PropertyDefinitionHelper : public PvdPropertyDefinitionHelper
+{
+	// Stream that receives createProperty / createPropertyMessage; set via setStream().
+	PvdDataStream* mStream;
+	PvdOMMetaDataProvider& mProvider;
+	// Concatenated name, including its terminating NUL when non-empty.
+	Array<char> mNameBuffer;
+	// For every pushed fragment: size of mNameBuffer before the push.
+	Array<uint32_t> mNameStack;
+	Array<NamedValue> mNamedValues;
+	Array<PropertyMessageArg> mPropertyMessageArgs;
+
+	PropertyDefinitionHelper(PvdOMMetaDataProvider& provider)
+	: mStream(NULL)
+	, mProvider(provider)
+	, mNameBuffer("PropertyDefinitionHelper::mNameBuffer")
+	, mNameStack("PropertyDefinitionHelper::mNameStack")
+	, mNamedValues("PropertyDefinitionHelper::mNamedValues")
+	, mPropertyMessageArgs("PropertyDefinitionHelper::mPropertyMessageArgs")
+	{
+	}
+	void setStream(PvdDataStream* stream)
+	{
+		mStream = stream;
+	}
+
+	// Appends `str` to the name buffer, overwriting the previous terminating NUL (if any)
+	// and reserving one byte for a new one. The new terminator is NOT written here; callers
+	// do that. A NULL `str` leaves the buffer untouched.
+	inline void appendStrToBuffer(const char* str)
+	{
+		if(str == NULL)
+			return;
+		size_t strLen = strlen(str);
+		size_t endBufOffset = mNameBuffer.size();
+		size_t resizeLen = endBufOffset;
+		// account for null
+		if(mNameBuffer.empty())
+			resizeLen += 1;
+		else
+			endBufOffset -= 1;
+
+		mNameBuffer.resize(static_cast<uint32_t>(resizeLen + strLen));
+		char* endPtr = mNameBuffer.begin() + endBufOffset;
+		PxMemCopy(endPtr, str, static_cast<uint32_t>(strLen));
+	}
+
+	// Writes the terminating NUL into the last byte of the name buffer.
+	// Does nothing for an empty buffer, where back() would be out of bounds.
+	inline void terminateNameBuffer()
+	{
+		if(mNameBuffer.empty() == false)
+			mNameBuffer.back() = 0;
+	}
+
+	// Pushes the fragment `nm`; `appender` is inserted first when a name already exists.
+	virtual void pushName(const char* nm, const char* appender = ".")
+	{
+		size_t nameBufLen = mNameBuffer.size();
+		mNameStack.pushBack(static_cast<uint32_t>(nameBufLen));
+		if(mNameBuffer.empty() == false)
+			appendStrToBuffer(appender);
+		appendStrToBuffer(nm);
+		// If nm was NULL and no name existed yet, the buffer is still empty here.
+		terminateNameBuffer();
+	}
+
+	// Pushes `inName` wrapped in the given brackets; no separator is inserted.
+	virtual void pushBracketedName(const char* inName, const char* leftBracket = "[", const char* rightBracket = "]")
+	{
+		size_t nameBufLen = mNameBuffer.size();
+		mNameStack.pushBack(static_cast<uint32_t>(nameBufLen));
+		appendStrToBuffer(leftBracket);
+		appendStrToBuffer(inName);
+		appendStrToBuffer(rightBracket);
+		// If all three strings were NULL and no name existed yet, the buffer is still empty.
+		terminateNameBuffer();
+	}
+
+	// Removes the most recently pushed fragment; no-op on an empty stack.
+	virtual void popName()
+	{
+		if(mNameStack.empty())
+			return;
+		mNameBuffer.resize(static_cast<uint32_t>(mNameStack.back()));
+		mNameStack.popBack();
+		// The last remaining byte was overwritten by the popped fragment; restore the NUL.
+		terminateNameBuffer();
+	}
+
+	// Returns the full concatenated name, or "" when nothing has been pushed.
+	virtual const char* getTopName()
+	{
+		if(mNameBuffer.size())
+			return mNameBuffer.begin();
+		return "";
+	}
+	virtual void clearNameStack()
+	{
+		mNameBuffer.clear();
+		mNameStack.clear();
+	}
+
+	virtual void addNamedValue(const char* name, uint32_t value)
+	{
+		mNamedValues.pushBack(NamedValue(name, value));
+	}
+	virtual void clearNamedValues()
+	{
+		mNamedValues.clear();
+	}
+
+	virtual DataRef<NamedValue> getNamedValues()
+	{
+		return DataRef<NamedValue>(mNamedValues.begin(), mNamedValues.size());
+	}
+
+	// Creates a property named by the current name stack on the stream, passing along the
+	// collected named values, which are cleared afterwards. Requires setStream() to have
+	// been called with a non-NULL stream.
+	virtual void createProperty(const NamespacedName& clsName, const char* inSemantic, const NamespacedName& dtypeName,
+	                            PropertyType::Enum propType)
+	{
+		mStream->createProperty(clsName, getTopName(), inSemantic, dtypeName, propType, getNamedValues());
+		clearNamedValues();
+	}
+	// Registers `str` in the meta data string table (under the provider lock) and returns
+	// the table's pointer for it.
+	const char* registerStr(const char* str)
+	{
+		ScopedMetaData scopedProvider(mProvider);
+		return scopedProvider->getStringTable().registerStr(str);
+	}
+	// Records a message argument named by the current name stack. The name is registered in
+	// the string table because the name buffer changes with later push/pop calls.
+	virtual void addPropertyMessageArg(const NamespacedName& inDatatype, uint32_t inOffset, uint32_t inSize)
+	{
+		mPropertyMessageArgs.pushBack(PropertyMessageArg(registerStr(getTopName()), inDatatype, inOffset, inSize));
+	}
+	// Sends the collected arguments as a property message definition. Asserts and does
+	// nothing when no arguments were added. The arguments are not cleared here; see
+	// clearPropertyMessageArgs().
+	virtual void addPropertyMessage(const NamespacedName& clsName, const NamespacedName& msgName,
+	                                uint32_t inStructSizeInBytes)
+	{
+		if(mPropertyMessageArgs.empty())
+		{
+			PX_ASSERT(false);
+			return;
+		}
+		mStream->createPropertyMessage(
+		    clsName, msgName, DataRef<PropertyMessageArg>(mPropertyMessageArgs.begin(), mPropertyMessageArgs.size()),
+		    inStructSizeInBytes);
+	}
+	virtual void clearPropertyMessageArgs()
+	{
+		mPropertyMessageArgs.clear();
+	}
+
+  private:
+	// Declared but not defined: assignment is not supported (reference member).
+	PropertyDefinitionHelper& operator=(const PropertyDefinitionHelper&);
+};
+
+// Simple bump allocator made of fixed-size pages. Memory handed out by allocate() is
+// never freed individually; clear() rewinds to the first page and the pages themselves
+// are released in the destructor.
+class PvdMemPool
+{
+	// Array of page pointers (each page is BUFFER_LENGTH bytes)
+	Array<uint8_t*> mMemBuffer;
+	// Bytes already handed out from the current page
+	uint32_t mLength;
+	// Index of the current page inside mMemBuffer
+	uint32_t mBufIndex;
+
+	// 4k for one page
+	static const int BUFFER_LENGTH = 4096;
+	PX_NOCOPY(PvdMemPool)
+  public:
+	PvdMemPool(const char* bufDataName) : mMemBuffer(bufDataName), mLength(0), mBufIndex(0)
+	{
+		// Start out with one page available.
+		grow();
+	}
+
+	~PvdMemPool()
+	{
+		const uint32_t pageCount = mMemBuffer.size();
+		for(uint32_t page = 0; page < pageCount; ++page)
+		{
+			PX_FREE(mMemBuffer[page]);
+		}
+	}
+
+	// Makes the next page current, allocating a new one only when no already-allocated
+	// page follows the current one. The new current page starts out empty.
+	void grow()
+	{
+		const uint32_t nextIndex = mBufIndex + 1;
+		if(nextIndex < mMemBuffer.size())
+		{
+			mBufIndex = nextIndex;
+		}
+		else
+		{
+			uint8_t* Buf = reinterpret_cast<uint8_t*>(PX_ALLOC(BUFFER_LENGTH, "PvdMemPool::mMemBuffer.buf"));
+			mMemBuffer.pushBack(Buf);
+			mBufIndex = mMemBuffer.size() - 1;
+		}
+		mLength = 0;
+	}
+
+	// Returns `length` bytes from the current page, moving on to the next page when the
+	// request does not fit. Requests larger than one page yield NULL.
+	void* allocate(uint32_t length)
+	{
+		const uint32_t pageSize = uint32_t(BUFFER_LENGTH);
+		if(length > pageSize)
+			return NULL;
+
+		if(mLength + length > pageSize)
+			grow();
+
+		uint8_t* const page = mMemBuffer[mBufIndex];
+		void* mem = page + mLength;
+		mLength += length;
+		return mem;
+	}
+
+	// Rewinds to the start of the first page; pages stay allocated for reuse.
+	void clear()
+	{
+		mBufIndex = 0;
+		mLength = 0;
+	}
+};
+struct PvdOutStream : public PvdDataStream, public UserAllocated
+{
+    HashMap<String, uint32_t> mStringHashMap;
+	PvdOMMetaDataProvider& mMetaDataProvider;
+    Array<uint8_t> mTempBuffer;
+	PropertyDefinitionHelper mPropertyDefinitionHelper;
+	DataStreamState::Enum mStreamState;
+
+	ClassDescription mSPVClass;
+	PropertyMessageDescription mMessageDesc;
+	// Set property value and SetPropertyMessage calls require
+	// us to write the data out to a separate buffer
+	// when strings are involved.
+	ForwardingMemoryBuffer mSPVBuffer;
+	uint32_t mEventCount;
+	uint32_t mPropertyMessageSize;
+	bool mConnected;
+	uint64_t mStreamId;
+    Array<PvdCommand*> mPvdCommandArray;
+	PvdMemPool mPvdCommandPool;
+	PxPvdTransport& mTransport;
+
+	// Creates a stream that writes to `transport`, uses `provider` for meta data and tags
+	// its events with `streamId`. The stream starts in the Open state and marked connected.
+	// The string arguments below are the names given to the containers' allocations.
+	PvdOutStream(PxPvdTransport& transport, PvdOMMetaDataProvider& provider, uint64_t streamId)
+	: mStringHashMap("PvdOutStream::mStringHashMap")
+	, mMetaDataProvider(provider)
+	, mTempBuffer("PvdOutStream::mTempBuffer")
+	, mPropertyDefinitionHelper(mMetaDataProvider)
+	, mStreamState(DataStreamState::Open)
+	, mSPVBuffer("PvdCommStreamBufferedEventSink::mSPVBuffer")
+	, mEventCount(0)
+	, mPropertyMessageSize(0)
+	, mConnected(true)
+	, mStreamId(streamId)
+	, mPvdCommandArray("PvdCommStreamBufferedEventSink::mPvdCommandArray")
+	, mPvdCommandPool("PvdCommStreamBufferedEventSink::mPvdCommandPool")
+	, mTransport(transport)
+	{
+		// The helper forwards property definitions back into this stream.
+		mPropertyDefinitionHelper.setStream(this);
+	}
+	// Nothing to do explicitly; members are cleaned up by their own destructors.
+	virtual ~PvdOutStream()
+	{
+	}
+
+	// Destroys this stream through PVD_DELETE; the object must not be used afterwards.
+	virtual void release()
+	{
+		PVD_DELETE(this);
+	}
+
+	// Converts a string into the handle used on the wire. NULL and empty strings map to
+	// handle 0. The first time a string is seen, its handle is obtained from the meta data
+	// string table, a StringHandleEvent is sent, and the mapping is cached locally.
+	StringHandle toStream(String nm)
+	{
+		const bool isEmpty = (nm == NULL) || (*nm == 0);
+		if(isEmpty)
+			return 0;
+
+		const HashMap<String, uint32_t>::Entry* cached = mStringHashMap.find(nm);
+		if(cached != NULL)
+			return cached->second;
+
+		ScopedMetaData metaData(mMetaDataProvider);
+		StringHandle handle = metaData->getStringTable().strToHandle(nm);
+		// Continue with the string table's own pointer for this string.
+		nm = metaData->getStringTable().handleToStr(handle);
+		handlePvdEvent(StringHandleEvent(nm, handle));
+		mStringHashMap.insert(nm, handle);
+		return handle;
+	}
+
+	// Converts both parts of a namespaced name into string handles.
+	// Note: the two inner toStream() calls are function arguments, so the order in which
+	// they run (and thus the order of any StringHandleEvents they send) is unspecified.
+	StreamNamespacedName toStream(const NamespacedName& nm)
+	{
+		return StreamNamespacedName(toStream(nm.mNamespace), toStream(nm.mName));
+	}
+
+	// True when the meta data already contains a class named `nm`.
+	bool isClassExist(const NamespacedName& nm)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		const bool found = metaData->findClass(nm).hasValue();
+		return found;
+	}
+	
+	// Makes sure a class named `nm` exists in the meta data. Always returns true.
+	bool createMetaClass(const NamespacedName& nm)
+	{
+		{
+			ScopedMetaData metaData(mMetaDataProvider);
+			metaData->getOrCreateClass(nm);
+		}
+		return true;
+	}
+
+	// Records in the meta data that `child` derives from `parent`; returns the meta data's result.
+	bool deriveMetaClass(const NamespacedName& parent, const NamespacedName& child)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		const bool derived = metaData->deriveClass(parent, child);
+		return derived;
+	}
+	
+// You will notice that some functions are #if'd out (guarded by PX_DEBUG) throughout this file.
+// This is because they are only called from asserts, which means they aren't
+// called in release.  This causes warnings when building using snc, which break
+// the build.
+#if PX_DEBUG
+
+	// True when class `nm` already has a property called `pname` in the meta data.
+	bool propertyExists(const NamespacedName& nm, String pname)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		const bool found = metaData->findProperty(nm, pname).hasValue();
+		return found;
+	}
+
+#endif
+
+	PvdError boolToError(bool val)
+	{
+		if(val)
+			return PvdErrorType::Success;
+		return PvdErrorType::NetworkError;
+	}
+
+	// PvdMetaDataStream
+	// Creates class `nm` in the local meta data and sends a CreateClass event.
+	// The class is asserted not to exist yet (debug builds only).
+	virtual PvdError createClass(const NamespacedName& nm)
+	{
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+#if PX_DEBUG
+		PX_ASSERT(!isClassExist(nm));
+#endif
+		createMetaClass(nm);
+		const bool sent = handlePvdEvent(CreateClass(toStream(nm)));
+		return boolToError(sent);
+	}
+
+	// Records in the local meta data that `child` derives from `parent` and sends a
+	// DeriveClass event. Both classes are asserted to exist already (debug builds only).
+	// Returns Success if the event was handled, NetworkError otherwise.
+	virtual PvdError deriveClass(const NamespacedName& parent, const NamespacedName& child)
+	{
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+#if PX_DEBUG
+		PX_ASSERT(isClassExist(parent));
+		PX_ASSERT(isClassExist(child));
+#endif
+		// The bool result of the meta data update is not checked here.
+		deriveMetaClass(parent, child);
+		return boolToError(handlePvdEvent(DeriveClass(toStream(parent), toStream(child))));
+	}
+
+	// Returns `numItems` default-constructed TDataType objects placed at the start of
+	// mTempBuffer, which is enlarged when too small. The storage is reused by the next
+	// allocTemp() call, so the result is only valid until then.
+	template <typename TDataType>
+	TDataType* allocTemp(uint32_t numItems)
+	{
+		const uint32_t bytesNeeded = numItems * sizeof(TDataType);
+		if(mTempBuffer.size() < bytesNeeded)
+			mTempBuffer.resize(bytesNeeded);
+
+		TDataType* items = reinterpret_cast<TDataType*>(mTempBuffer.begin());
+		// Nothing is constructed when numItems is 0.
+		PVD_FOREACH(idx, numItems) new (items + idx) TDataType();
+		return items;
+	}
+
+#if PX_DEBUG
+
+	// Property datatypes need to be uniform.
+	// At this point, the data stream cannot handle properties whose datatype requires
+	// destruction (this is the condition checked below).
+	// A struct with a float member and a char member would work.
+	// A struct with a float member and a long member would work (more efficiently).
+	// A datatype is usable for properties when its class description does not
+	// require destruction.
+	bool isValidPropertyDatatype(const NamespacedName& dtypeName)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		ClassDescription description(metaData->findClass(dtypeName));
+		const bool needsDestruction = description.mRequiresDestruction;
+		return needsDestruction == false;
+	}
+
+#endif
+
+	// Creates the property in the local meta data and returns the datatype name to use on
+	// the wire. Properties declared with the String datatype are stored as StringHandle, so
+	// in that case the StringHandle type name is returned instead of `dtypeName`.
+	// Both `clsName` and `dtypeName` must already exist as classes.
+	NamespacedName createMetaProperty(const NamespacedName& clsName, String name, String semantic,
+	                                  const NamespacedName& dtypeName, PropertyType::Enum propertyType)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		NamespacedName wireTypeName = dtypeName;
+		int32_t wireTypeId = metaData->findClass(dtypeName)->mClassId;
+		const bool isString = (wireTypeId == getPvdTypeForType<String>());
+		if(isString)
+		{
+			wireTypeId = getPvdTypeForType<StringHandle>();
+			wireTypeName = getPvdNamespacedNameForType<StringHandle>();
+		}
+		Option<PropertyDescription> created =
+		    metaData->createProperty(metaData->findClass(clsName)->mClassId, name, semantic, wireTypeId, propertyType);
+		PX_ASSERT(created.hasValue());
+		PX_UNUSED(created);
+		return wireTypeName;
+	}
+
+	// Creates property `name` on class `clsName` in the local meta data and sends a
+	// CreateProperty event carrying the optional named values.
+	// A datatype called "VoidPtr" is replaced by "ObjectRef" before anything else happens.
+	// Returns ArgumentError for array properties of string handles, otherwise Success or
+	// NetworkError depending on whether the event was handled.
+	virtual PvdError createProperty(const NamespacedName& clsName, String name, String semantic,
+	                                const NamespacedName& incomingDtypeName, PropertyType::Enum propertyType,
+	                                DataRef<NamedValue> values)
+	{
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+#if PX_DEBUG
+		PX_ASSERT(isClassExist(clsName));
+		PX_ASSERT(propertyExists(clsName, name) == false);
+#endif
+		NamespacedName dtypeName(incomingDtypeName);
+		if(safeStrEq(dtypeName.mName, "VoidPtr"))
+			dtypeName.mName = "ObjectRef";
+#if PX_DEBUG
+		PX_ASSERT(isClassExist(dtypeName));
+		PX_ASSERT(isValidPropertyDatatype(dtypeName));
+#endif
+		// Note: the meta data is updated before the array-of-strings check below, so the
+		// property exists locally even when ArgumentError is returned.
+		NamespacedName typeName = createMetaProperty(clsName, name, semantic, dtypeName, propertyType);
+		// Can't have arrays of strings or arrays of string handles due to the difficulty
+		// of quickly dealing with them on the network receiving side.
+		if(propertyType == PropertyType::Array && safeStrEq(typeName.mName, "StringHandle"))
+		{
+			PX_ASSERT(false);
+			return PvdErrorType::ArgumentError;
+		}
+		// Convert the named values into their wire form using the shared temp buffer.
+		uint32_t numItems = values.size();
+		NameHandleValue* streamValues = allocTemp<NameHandleValue>(numItems);
+		PVD_FOREACH(idx, numItems)
+		streamValues[idx] = NameHandleValue(toStream(values[idx].mName), values[idx].mValue);
+		CreateProperty evt(toStream(clsName), toStream(name), toStream(semantic), toStream(typeName), propertyType,
+		                   DataRef<NameHandleValue>(streamValues, numItems));
+		return boolToError(handlePvdEvent(evt));
+	}
+
+	// Registers the property message in the local meta data; true when the meta data
+	// returned a description for it.
+	bool createMetaPropertyMessage(const NamespacedName& cls, const NamespacedName& msgName,
+	                               DataRef<PropertyMessageArg> entries, uint32_t messageSizeInBytes)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		const bool created = metaData->createPropertyMessage(cls, msgName, entries, messageSizeInBytes).hasValue();
+		return created;
+	}
+#if PX_DEBUG
+
+	// True when a property message called `msgName` is known to the meta data.
+	bool messageExists(const NamespacedName& msgName)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		const bool found = metaData->findPropertyMessage(msgName).hasValue();
+		return found;
+	}
+
+#endif
+
+	// Registers property message `msgName` for class `cls` in the local meta data and sends
+	// a CreatePropertyMessage event describing its entries.
+	// Returns Success if the event was handled, NetworkError otherwise.
+	virtual PvdError createPropertyMessage(const NamespacedName& cls, const NamespacedName& msgName,
+	                                       DataRef<PropertyMessageArg> entries, uint32_t messageSizeInBytes)
+	{
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+#if PX_DEBUG
+		PX_ASSERT(isClassExist(cls));
+		PX_ASSERT(messageExists(msgName) == false);
+#endif
+		// The bool result of the meta data update is not checked here.
+		createMetaPropertyMessage(cls, msgName, entries, messageSizeInBytes);
+		// Convert the entries into their wire form using the shared temp buffer.
+		uint32_t numItems = entries.size();
+		StreamPropMessageArg* streamValues = allocTemp<StreamPropMessageArg>(numItems);
+		PVD_FOREACH(idx, numItems)
+		streamValues[idx] =
+		    StreamPropMessageArg(toStream(entries[idx].mPropertyName), toStream(entries[idx].mDatatypeName),
+		                         entries[idx].mMessageOffset, entries[idx].mByteSize);
+		CreatePropertyMessage evt(toStream(cls), toStream(msgName),
+		                          DataRef<StreamPropMessageArg>(streamValues, numItems), messageSizeInBytes);
+		return boolToError(handlePvdEvent(evt));
+	}
+
+	// Instances are identified on the wire by their address converted to a 64-bit integer.
+	uint64_t toStream(const void* instance)
+	{
+		const uint64_t instanceId = PVD_POINTER_TO_U64(instance);
+		return instanceId;
+	}
+	// Registers `instance` as an object of class `cls` with the meta data provider and
+	// sends a CreateInstance event. The instance is asserted not to be registered yet.
+	virtual PvdError createInstance(const NamespacedName& cls, const void* instance)
+	{
+		PX_ASSERT(!isInstanceValid(instance));
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		const bool registered = mMetaDataProvider.createInstance(cls, instance);
+		PX_ASSERT(registered);
+		(void)registered;
+		const bool sent = handlePvdEvent(CreateInstance(toStream(cls), toStream(instance)));
+		return boolToError(sent);
+	}
+
+	// Asks the meta data provider whether `instance` is currently registered.
+	virtual bool isInstanceValid(const void* instance)
+	{
+		const bool valid = mMetaDataProvider.isInstanceValid(instance);
+		return valid;
+	}
+
+#if PX_DEBUG
+
+	// If the property will fit or is already completely in memory
+	// Debug-only check that a value of type `incomingType` may be assigned to property
+	// `name` of `instance`: the property must exist, and either no marshalling is needed,
+	// marshalling is possible, or (for VoidPtr values) the property is an ObjectRef.
+	bool checkPropertyType(const void* instance, String name, const NamespacedName& incomingType)
+	{
+		int32_t instanceClassId = mMetaDataProvider.getInstanceClassType(instance);
+		ScopedMetaData metaData(mMetaDataProvider);
+		Option<PropertyDescription> property = metaData->findProperty(instanceClassId, name);
+		if(!property.hasValue())
+			return false;
+
+		int32_t propertyTypeId = property->mDatatype;
+		int32_t incomingTypeId = metaData->findClass(incomingType)->mClassId;
+
+		// Raw pointers can only be stored in object-reference properties.
+		if(incomingTypeId == getPvdTypeForType<VoidPtr>())
+			return propertyTypeId == getPvdTypeForType<ObjectRef>();
+
+		MarshalQueryResult query = metaData->checkMarshalling(incomingTypeId, propertyTypeId);
+		return query.needsMarshalling == false || query.canMarshal;
+	}
+
+#endif
+
+	// Prepares property value data for sending. When the datatype contains no pointer
+	// members, `data` is returned unchanged. Otherwise every item is copied into mSPVBuffer
+	// followed by the NUL-terminated text of each of its string members (void-pointer
+	// members are skipped), and a reference to that buffer is returned. The returned data
+	// is then only valid until mSPVBuffer is modified again.
+	// A datatype size of 0 cannot be split into items, so `data` is returned unchanged
+	// instead of dividing by zero.
+	DataRef<const uint8_t> bufferPropertyValue(ClassDescriptionSizeInfo info, DataRef<const uint8_t> data)
+	{
+		uint32_t realSize = info.mByteSize;
+		if(realSize == 0 || info.mPtrOffsets.size() == 0)
+			return data;
+
+		uint32_t numItems = data.size() / realSize;
+		mSPVBuffer.clear();
+		PVD_FOREACH(item, numItems)
+		{
+			const uint8_t* itemPtr = data.begin() + item * realSize;
+			mSPVBuffer.write(itemPtr, realSize);
+			PVD_FOREACH(stringIdx, info.mPtrOffsets.size())
+			{
+				PtrOffset offset(info.mPtrOffsets[stringIdx]);
+				if(offset.mOffsetType == PtrOffsetType::VoidPtrOffset)
+					continue;
+				// Read the pointer with a byte copy; it may not be suitably aligned inside the item.
+				const char* strPtr;
+				physx::intrinsics::memCopy(&strPtr, itemPtr + offset.mOffset, sizeof(char*));
+				strPtr = nonNull(strPtr);
+				uint32_t len = safeStrLen(strPtr) + 1;
+				mSPVBuffer.write(strPtr, len);
+			}
+		}
+		return DataRef<const uint8_t>(mSPVBuffer.begin(), mSPVBuffer.size());
+	}
+
+	// Sends the complete value of property `name` of `instance` in one event.
+	// `data` holds one or more items of type `incomingTypeName`; the item count is derived
+	// from the data size and the type's native size.
+	// Returns ArgumentError when the type has a native size of 0 (the item count cannot be
+	// computed; this also happens when the type lookup yields an empty description in
+	// builds without asserts), otherwise Success or NetworkError.
+	virtual PvdError setPropertyValue(const void* instance, String name, DataRef<const uint8_t> data,
+	                                  const NamespacedName& incomingTypeName)
+	{
+		PX_ASSERT(isInstanceValid(instance));
+#if PX_DEBUG
+		PX_ASSERT(isClassExist(incomingTypeName));
+#endif
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		ClassDescription clsDesc;
+		{
+			// Copy the description so the meta data lock is released before sending.
+			ScopedMetaData meta(mMetaDataProvider);
+			clsDesc = meta->findClass(incomingTypeName);
+		}
+		uint32_t realSize = clsDesc.getNativeSize();
+		if(realSize == 0)
+		{
+			// Guard the division below.
+			PX_ASSERT(false);
+			return PvdErrorType::ArgumentError;
+		}
+		uint32_t numItems = data.size() / realSize;
+		data = bufferPropertyValue(clsDesc.getNativeSizeInfo(), data);
+		SetPropertyValue evt(toStream(instance), toStream(name), data, toStream(incomingTypeName), numItems);
+		return boolToError(handlePvdEvent(evt));
+	}
+
+	// Else if the property is very large (contact reports) you can send it in chunks.
+	// Starts a chunked property update: remembers the datatype description in mSPVClass,
+	// switches the stream into the SetPropertyValue state and sends a BeginSetPropertyValue
+	// event. Follow with appendPropertyValueData() calls and endSetPropertyValue().
+	// Note: the state is switched before the event is sent, so it stays switched even when
+	// NetworkError is returned.
+	virtual PvdError beginSetPropertyValue(const void* instance, String name, const NamespacedName& incomingTypeName)
+	{
+		PX_ASSERT(isInstanceValid(instance));
+#if PX_DEBUG
+		PX_ASSERT(isClassExist(incomingTypeName));
+		PX_ASSERT(checkPropertyType(instance, name, incomingTypeName));
+#endif
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		mStreamState = DataStreamState::SetPropertyValue;
+		{
+			// Copy the description so the meta data lock is released before sending.
+			ScopedMetaData meta(mMetaDataProvider);
+			mSPVClass = meta->findClass(incomingTypeName);
+		}
+		BeginSetPropertyValue evt(toStream(instance), toStream(name), toStream(incomingTypeName));
+		return boolToError(handlePvdEvent(evt));
+	}
+
+	// Sends one chunk of a property update started with beginSetPropertyValue(). The item
+	// count is derived from the chunk size and the native size of the datatype captured in
+	// mSPVClass.
+	// Returns ArgumentError when that native size is 0 (no datatype captured, so the item
+	// count cannot be computed), otherwise Success or NetworkError.
+	virtual PvdError appendPropertyValueData(DataRef<const uint8_t> data)
+	{
+		PX_ASSERT(mStreamState == DataStreamState::SetPropertyValue);
+		uint32_t realSize = mSPVClass.getNativeSize();
+		if(realSize == 0)
+		{
+			// Guard the division below.
+			PX_ASSERT(false);
+			return PvdErrorType::ArgumentError;
+		}
+		uint32_t numItems = data.size() / realSize;
+		data = bufferPropertyValue(mSPVClass.getNativeSizeInfo(), data);
+		return boolToError(handlePvdEvent(AppendPropertyValueData(data, numItems)));
+	}
+	// Finishes a chunked property update: returns the stream to the Open state and sends
+	// an EndSetPropertyValue event.
+	virtual PvdError endSetPropertyValue()
+	{
+		PX_ASSERT(mStreamState == DataStreamState::SetPropertyValue);
+		mStreamState = DataStreamState::Open;
+		const bool sent = handlePvdEvent(EndSetPropertyValue());
+		return boolToError(sent);
+	}
+
+#if PX_DEBUG
+
+	// Debug-only check that the class of `instance` is derived from the class that
+	// property message `msgName` was registered for.
+	bool checkPropertyMessage(const void* instance, const NamespacedName& msgName)
+	{
+		int32_t instanceClassId = mMetaDataProvider.getInstanceClassType(instance);
+		ScopedMetaData metaData(mMetaDataProvider);
+		PropertyMessageDescription description(metaData->findPropertyMessage(msgName));
+		return metaData->isDerivedFrom(instanceClassId, description.mClassId);
+	}
+
+#endif
+
+	// Prepares property message data for sending. Messages without string members are
+	// returned unchanged. Otherwise the message bytes are copied into mSPVBuffer followed
+	// by the NUL-terminated text of every string member, and a reference to that buffer is
+	// returned (valid only until mSPVBuffer is modified again).
+	DataRef<const uint8_t> bufferPropertyMessage(const PropertyMessageDescription& desc, DataRef<const uint8_t> data)
+	{
+		if(desc.mStringOffsets.size() == 0)
+			return data;
+
+		mSPVBuffer.clear();
+		mSPVBuffer.write(data.begin(), data.size());
+		PVD_FOREACH(idx, desc.mStringOffsets.size())
+		{
+			// Read the string pointer out of the message with a byte copy.
+			const char* text;
+			physx::intrinsics::memCopy(&text, data.begin() + desc.mStringOffsets[idx], sizeof(char*));
+			text = nonNull(text);
+			uint32_t textBytes = safeStrLen(text) + 1;
+			mSPVBuffer.write(text, textBytes);
+		}
+		return DataRef<const uint8_t>(mSPVBuffer.begin(), mSPVBuffer.end());
+	}
+
+	// Sends a single property message for `instance`. `data` must contain at least as many
+	// bytes as the registered message size; otherwise ArgumentError is returned.
+	// The meta data stays locked for the duration of the call.
+	virtual PvdError setPropertyMessage(const void* instance, const NamespacedName& msgName, DataRef<const uint8_t> data)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		PX_ASSERT(isInstanceValid(instance));
+#if PX_DEBUG
+		PX_ASSERT(messageExists(msgName));
+		PX_ASSERT(checkPropertyMessage(instance, msgName));
+#endif
+		PropertyMessageDescription description(metaData->findPropertyMessage(msgName));
+		const bool tooShort = data.size() < description.mMessageByteSize;
+		if(tooShort)
+		{
+			PX_ASSERT(false);
+			return PvdErrorType::ArgumentError;
+		}
+		data = bufferPropertyMessage(description, data);
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		SetPropertyMessage evt(toStream(instance), toStream(msgName), data);
+		return boolToError(handlePvdEvent(evt));
+	}
+
+#if PX_DEBUG
+
+	// Debug-only check used before starting a message group: true when the message
+	// has no string members.
+	bool checkBeginPropertyMessageGroup(const NamespacedName& msgName)
+	{
+		ScopedMetaData metaData(mMetaDataProvider);
+		PropertyMessageDescription description(metaData->findPropertyMessage(msgName));
+		const bool hasNoStrings = description.mStringOffsets.size() == 0;
+		return hasNoStrings;
+	}
+
+#endif
+	// If you need to send of lot of identical messages, this avoids a hashtable lookup per message.
+	// Starts a group of identical property messages: caches the message description in
+	// mMessageDesc, switches the stream into the PropertyMessageGroup state and sends a
+	// BeginPropertyMessageGroup event. Follow with sendPropertyMessageFromGroup() calls and
+	// endPropertyMessageGroup().
+	// Note: the state is switched before the event is sent, so it stays switched even when
+	// NetworkError is returned.
+	virtual PvdError beginPropertyMessageGroup(const NamespacedName& msgName)
+	{
+#if PX_DEBUG
+		PX_ASSERT(messageExists(msgName));
+		PX_ASSERT(checkBeginPropertyMessageGroup(msgName));
+#endif
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		mStreamState = DataStreamState::PropertyMessageGroup;
+		// The meta data stays locked until the function returns, i.e. also while sending.
+		ScopedMetaData meta(mMetaDataProvider);
+		mMessageDesc = meta->findPropertyMessage(msgName);
+		return boolToError(handlePvdEvent(BeginPropertyMessageGroup(toStream(msgName))));
+	}
+
+	// Sends one message of the group started with beginPropertyMessageGroup() for
+	// `instance`. `data` must be exactly as large as the registered message size;
+	// otherwise ArgumentError is returned. Returns Success or NetworkError otherwise.
+	virtual PvdError sendPropertyMessageFromGroup(const void* instance, DataRef<const uint8_t> data)
+	{
+		PX_ASSERT(mStreamState == DataStreamState::PropertyMessageGroup);
+		PX_ASSERT(isInstanceValid(instance));
+#if PX_DEBUG
+		PX_ASSERT(checkPropertyMessage(instance, mMessageDesc.mMessageName));
+#endif
+		// The exact-size check also covers the "too short" case, so no separate check is needed.
+		if(mMessageDesc.mMessageByteSize != data.size())
+		{
+			PX_ASSERT(false);
+			return PvdErrorType::ArgumentError;
+		}
+		data = bufferPropertyMessage(mMessageDesc, data);
+		return boolToError(handlePvdEvent(SendPropertyMessageFromGroup(toStream(instance), data)));
+	}
+	// Finishes a property message group: returns the stream to the Open state and sends
+	// an EndPropertyMessageGroup event.
+	virtual PvdError endPropertyMessageGroup()
+	{
+		PX_ASSERT(mStreamState == DataStreamState::PropertyMessageGroup);
+		mStreamState = DataStreamState::Open;
+		const bool sent = handlePvdEvent(EndPropertyMessageGroup());
+		return boolToError(sent);
+	}
+	// Sends a PushBackObjectRef event for property `propName` of `instance`, referring to
+	// the object `data`. Both objects are asserted to be registered instances.
+	virtual PvdError pushBackObjectRef(const void* instance, String propName, const void* data)
+	{
+		PX_ASSERT(isInstanceValid(instance));
+		PX_ASSERT(isInstanceValid(data));
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		PushBackObjectRef evt(toStream(instance), toStream(propName), toStream(data));
+		const bool sent = handlePvdEvent(evt);
+		return boolToError(sent);
+	}
+	// Sends a RemoveObjectRef event for property `propName` of `instance`, referring to
+	// the object `data`. Both objects are asserted to be registered instances.
+	virtual PvdError removeObjectRef(const void* instance, String propName, const void* data)
+	{
+		PX_ASSERT(isInstanceValid(instance));
+		PX_ASSERT(isInstanceValid(data));
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		RemoveObjectRef evt(toStream(instance), toStream(propName), toStream(data));
+		const bool sent = handlePvdEvent(evt);
+		return boolToError(sent);
+	}
+	// Instance elimination.
+	// Unregisters `instance` from the meta data provider and sends a DestroyInstance event.
+	virtual PvdError destroyInstance(const void* instance)
+	{
+		PX_ASSERT(isInstanceValid(instance));
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		mMetaDataProvider.destroyInstance(instance);
+		const bool sent = handlePvdEvent(DestroyInstance(toStream(instance)));
+		return boolToError(sent);
+	}
+
+	// Profiling hooks
+	virtual PvdError beginSection(const void* instance, String name) // Emits a BeginSection event stamped with Time::getCurrentCounterValue().
+	{
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		return boolToError(handlePvdEvent(
+            BeginSection(toStream(instance), toStream(name), Time::getCurrentCounterValue())));
+	}
+
+	virtual PvdError endSection(const void* instance, String name) // Emits an EndSection event stamped with Time::getCurrentCounterValue().
+	{
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		return boolToError(handlePvdEvent(
+            EndSection(toStream(instance), toStream(name), Time::getCurrentCounterValue())));
+	}
+
+	virtual PvdError originShift(const void* scene, PxVec3 shift) // Emits an OriginShift event carrying scene and shift.
+	{
+		PX_ASSERT(mStreamState == DataStreamState::Open);
+		return boolToError(handlePvdEvent(OriginShift(toStream(scene), shift)));
+	}
+
+	virtual void addProfileZone(void* zone, const char* name) // Emits an AddProfileZone event; the result of handlePvdEvent is ignored.
+	{
+		handlePvdEvent(AddProfileZone(toStream(zone), name));
+	}
+	virtual void addProfileZoneEvent(void* zone, const char* name, uint16_t eventId, bool compileTimeEnabled) // Emits an AddProfileZoneEvent event; the result of handlePvdEvent is ignored.
+	{
+		handlePvdEvent(AddProfileZoneEvent(toStream(zone), name, eventId, compileTimeEnabled));
+	}
+
+	// add a variable sized event
+	void addEvent(const EventSerializeable& evt, PvdCommStreamEventTypes::Enum evtType) // Writes evt to the transport, preceded by a one-event EventGroup.
+	{
+		MeasureStream measure; // first pass writes into `measure`; its mSize feeds the group below
+		PvdCommStreamEventSink::writeStreamEvent(evt, evtType, measure);
+        EventGroup evtGroup(measure.mSize, 1, mStreamId, Time::getCurrentCounterValue());
+		EventStreamifier<PxPvdTransport> streamifier(mTransport.lock()); // transport stays locked until unlock() below
+		evtGroup.serialize(streamifier); // the group is serialized before the event itself
+		PvdCommStreamEventSink::writeStreamEvent(evt, evtType, mTransport); // second pass writes the real bytes
+		mTransport.unlock();
+	}
+
+	void setIsTopLevelUIElement(const void* instance, bool topLevel) // Emits a SetIsTopLevel event for instance.
+	{
+		addEvent(SetIsTopLevel(static_cast<uint64_t>(reinterpret_cast<size_t>(instance)), topLevel), // pointer value is widened to a 64-bit id
+		         getCommStreamEventType<SetIsTopLevel>());
+	}
+
+	void sendErrorMessage(uint32_t code, const char* message, const char* file, uint32_t line) // Emits an ErrorMessage event built from the four arguments.
+	{
+		addEvent(ErrorMessage(code, message, file, line), getCommStreamEventType<ErrorMessage>());
+	}
+
+	void updateCamera(const char* name, const PxVec3& origin, const PxVec3& up, const PxVec3& target) // Emits a SetCamera event for the named camera.
+	{
+		addEvent(SetCamera(name, origin, up, target), getCommStreamEventType<SetCamera>());
+	}
+
+	template <typename TEventType> // TEventType must have a getCommStreamEventType<> mapping
+	bool handlePvdEvent(const TEventType& evt) // Forwards evt to addEvent under its stream event type.
+	{
+		addEvent(evt, getCommStreamEventType<TEventType>());
+		return mConnected; // result is the cached connection flag, not the outcome of the write
+	}
+
+	virtual PvdPropertyDefinitionHelper& getPropertyDefinitionHelper() // Returns the stream's helper after clearing its buffered data.
+	{
+		mPropertyDefinitionHelper.clearBufferedData(); // every call hands out a helper with no buffered data
+		return mPropertyDefinitionHelper;
+	}
+
+	virtual bool isConnected() // Returns the stream's mConnected flag.
+	{
+		return mConnected;
+	}
+
+	virtual void* allocateMemForCmd(uint32_t length) // Allocates length bytes from mPvdCommandPool; the pool is cleared by flushPvdCommand().
+	{
+		return mPvdCommandPool.allocate(length);
+	}
+
+	virtual void pushPvdCommand(PvdCommand& cmd) // Queues cmd (by address) until the next flushPvdCommand().
+	{
+		mPvdCommandArray.pushBack(&cmd); // only the pointer is stored; cmd must outlive the queue entry
+	}
+
+	virtual void flushPvdCommand() // Runs and destroys every queued command, then empties the queue and the pool.
+	{
+		const uint32_t pending = mPvdCommandArray.size();
+		for(uint32_t cmdIdx = 0; cmdIdx < pending; ++cmdIdx)
+		{
+			PvdCommand* cmd = mPvdCommandArray[cmdIdx];
+			if(!cmd)
+				continue; // null slots are skipped
+			// Only the destructor is invoked explicitly; no per-command deallocation happens here.
+			cmd->run(*this);
+			cmd->~PvdCommand();
+		}
+		mPvdCommandArray.clear();
+		mPvdCommandPool.clear();
+	}
+
+	PX_NOCOPY(PvdOutStream)
+};
+}
+
+PvdDataStream* PvdDataStream::create(PxPvd* pvd) // Factory: returns a new PvdOutStream for pvd, or NULL when pvd is NULL.
+{
+	if(!pvd)
+	{
+		getFoundation().error(PxErrorCode::eINVALID_PARAMETER, __FILE__, __LINE__, "PvdDataStream::create - pvd must be non-NULL!");
+		return NULL;
+	}
+	// The stream is wired to the transport, meta data provider and next stream id of the PvdImpl.
+	PvdImpl& impl = *static_cast<PvdImpl*>(pvd);
+	return PVD_NEW(PvdOutStream)(*impl.getTransport(), impl.getMetaDataProvider(), impl.getNextStreamId());
+}
diff --git a/PxShared/src/pvd/src/PxPvdDefaultFileTransport.cpp b/PxShared/src/pvd/src/PxPvdDefaultFileTransport.cpp
new file mode 100644
index 0000000..e3499a6
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdDefaultFileTransport.cpp
@@ -0,0 +1,123 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "pvd/PxPvdTransport.h"
+#include "foundation/PxAssert.h"
+
+#include "PxPvdDefaultFileTransport.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+PvdDefaultFileTransport::PvdDefaultFileTransport(const char* name) : mConnected(false), mWrittenData(0), mLocked(false)
+{
+	mFileBuffer = PX_NEW(PsFileBuffer)(name, PxFileBuf::OPEN_WRITE_ONLY); // buffer is created in write-only mode; connect() later checks isOpen()
+}
+
+PvdDefaultFileTransport::~PvdDefaultFileTransport() // Empty: mFileBuffer is closed and deleted in release(), not here.
+{
+}
+
+bool PvdDefaultFileTransport::connect() // "Connecting" a file transport means checking that the file buffer is open.
+{
+	PX_ASSERT(mFileBuffer);
+	mConnected = mFileBuffer->isOpen();
+	return mConnected;
+}
+
+void PvdDefaultFileTransport::disconnect() // Only clears the flag; the file buffer itself is closed in release().
+{
+	mConnected = false;
+}
+
+bool PvdDefaultFileTransport::isConnected() // Returns the flag maintained by connect()/disconnect().
+{
+	return mConnected;
+}
+
+bool PvdDefaultFileTransport::write(const uint8_t* inBytes, uint32_t inLength)
+{
+	PX_ASSERT(mLocked);
+	PX_ASSERT(mFileBuffer);
+	// Writes are refused while mConnected is false.
+	if (!mConnected)
+		return false;
+
+	// Bytes of a short write are still counted, but the call reports failure.
+	const uint32_t written = mFileBuffer->write(inBytes, inLength);
+	mWrittenData += written;
+	return written == inLength;
+}
+
+PxPvdTransport& PvdDefaultFileTransport::lock() // Acquires mMutex and returns this transport; pair with unlock().
+{
+	mMutex.lock();
+	PX_ASSERT(!mLocked); // checked only after the mutex is held
+	mLocked = true;
+	return *this;
+}
+
+void PvdDefaultFileTransport::unlock() // Releases the mutex taken by lock().
+{
+	PX_ASSERT(mLocked);
+	mLocked = false; // flag is cleared while the mutex is still held
+	mMutex.unlock();
+}
+
+void PvdDefaultFileTransport::flush() // No-op for the file transport.
+{
+}
+
+uint64_t PvdDefaultFileTransport::getWrittenDataSize() // Returns the byte total accumulated by write().
+{
+	return mWrittenData;
+}
+
+void PvdDefaultFileTransport::release() // Closes and deletes the file buffer, then destroys this transport.
+{
+	if (mFileBuffer)
+	{
+		mFileBuffer->close();
+		delete mFileBuffer; // NOTE(review): allocated with PX_NEW in the constructor; presumably PsFileBuffer supplies a matching operator delete - confirm
+	}
+	mFileBuffer = NULL;
+	PX_DELETE(this); // the object must not be used after this call
+}
+
+} // namespace pvdsdk
+
+PxPvdTransport* PxDefaultPvdFileTransportCreate(const char* name) // Factory: returns a new PvdDefaultFileTransport for `name`; dispose of it with release().
+{
+	return PX_NEW(pvdsdk::PvdDefaultFileTransport)(name);
+}
+
+} // namespace physx
+
diff --git a/PxShared/src/pvd/src/PxPvdDefaultFileTransport.h b/PxShared/src/pvd/src/PxPvdDefaultFileTransport.h
new file mode 100644
index 0000000..9f4166f
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdDefaultFileTransport.h
@@ -0,0 +1,77 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDDEFAULTFILETRANSPORT_H
+#define PXPVDSDK_PXPVDDEFAULTFILETRANSPORT_H
+
+#include "pvd/PxPvdTransport.h"
+
+#include "PsUserAllocated.h"
+#include "PsFileBuffer.h"
+#include "PsMutex.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+class PvdDefaultFileTransport : public physx::PxPvdTransport, public physx::shdfnd::UserAllocated // PxPvdTransport implementation backed by a PsFileBuffer.
+{
+	PX_NOCOPY(PvdDefaultFileTransport)
+  public:
+	PvdDefaultFileTransport(const char* name); // name is passed to the PsFileBuffer constructor
+	virtual ~PvdDefaultFileTransport();
+
+	virtual bool connect(); // true when the file buffer is open
+	virtual void disconnect();
+	virtual bool isConnected();
+
+	virtual bool write(const uint8_t* inBytes, uint32_t inLength); // true only when all inLength bytes were written
+
+	virtual PxPvdTransport& lock(); // takes mMutex; must be balanced by unlock()
+	virtual void unlock();
+
+	virtual void flush(); // no-op in this implementation
+
+	virtual uint64_t getWrittenDataSize();
+
+	virtual void release(); // closes the file and deletes this object
+
+  private:
+	physx::PsFileBuffer* mFileBuffer; // created in the constructor, closed and deleted in release()
+	bool mConnected; // set by connect() from mFileBuffer->isOpen(), cleared by disconnect()
+	uint64_t mWrittenData; // running total of bytes accepted by write()
+	physx::shdfnd::Mutex mMutex; // held between lock() and unlock()
+	bool mLocked; // for debug (asserted in write/lock/unlock), remove it when finished
+};
+
+} // pvdsdk
+} // physx
+
+#endif // PXPVDSDK_PXPVDDEFAULTFILETRANSPORT_H
diff --git a/PxShared/src/pvd/src/PxPvdDefaultSocketTransport.cpp b/PxShared/src/pvd/src/PxPvdDefaultSocketTransport.cpp
new file mode 100644
index 0000000..48b94b1
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdDefaultSocketTransport.cpp
@@ -0,0 +1,136 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "pvd/PxPvdTransport.h"
+
+#include "PxPvdDefaultSocketTransport.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+PvdDefaultSocketTransport::PvdDefaultSocketTransport(const char* host, int port, unsigned int timeoutInMilliseconds)
+: mHost(host), mPort(uint16_t(port)), mTimeout(timeoutInMilliseconds), mConnected(false), mWrittenData(0), mlocked(false) // mlocked was previously left uninitialised
+{ // host is kept as a raw pointer (not copied), so it must stay valid until connect() has used it; port is truncated to 16 bits
+}
+
+PvdDefaultSocketTransport::~PvdDefaultSocketTransport() // Empty: members clean up through their own destructors.
+{
+}
+
+bool PvdDefaultSocketTransport::connect()
+{
+	// An established connection is reused as-is. Otherwise the socket is
+	// connected to mHost:mPort within mTimeout and, on success, switched
+	// to blocking mode before the transport is marked connected.
+	if(!mConnected && mSocket.connect(mHost, mPort, mTimeout))
+	{
+		mSocket.setBlocking(true);
+		mConnected = true;
+	}
+	return mConnected;
+}
+
+void PvdDefaultSocketTransport::disconnect() // Flushes pending data, closes the socket and clears the connected flag.
+{
+	mSocket.flush(); // flush happens before the socket is torn down
+	mSocket.disconnect();
+	mConnected = false;
+}
+
+bool PvdDefaultSocketTransport::isConnected() // Asks the socket itself; note that write() and connect() consult the cached mConnected flag instead.
+{
+	return mSocket.isConnected();
+}
+
+bool PvdDefaultSocketTransport::write(const uint8_t* inBytes, uint32_t inLength)
+{
+	// Nothing can be sent while the transport is not marked connected.
+	if(!mConnected)
+		return false;
+
+	// An empty payload is trivially "written".
+	if(inLength == 0)
+		return true;
+
+	uint32_t sentNow = 0;
+	uint32_t sentTotal = 0;
+	for(;;)
+	{
+		// A socket may accept fewer bytes than requested, so keep pushing the
+		// remainder; a call that accepts nothing is treated as a disconnect.
+		sentNow = mSocket.write(inBytes, inLength);
+		inBytes += sentNow;
+		inLength -= sentNow;
+		sentTotal += sentNow;
+		if(sentNow == 0 || inLength == 0)
+			break;
+	}
+
+	// The byte counter is only advanced when the whole payload went out.
+	if(sentNow == 0)
+		return false;
+	mWrittenData += sentTotal;
+	return true;
+}
+
+PxPvdTransport& PvdDefaultSocketTransport::lock() // Acquires mMutex and returns this transport; pair with unlock().
+{
+	mMutex.lock();
+	return *this;
+}
+
+void PvdDefaultSocketTransport::unlock() // Releases the mutex taken by lock().
+{
+	mMutex.unlock();
+}
+
+void PvdDefaultSocketTransport::flush() // Delegates to the socket's flush().
+{
+	mSocket.flush();
+}
+
+uint64_t PvdDefaultSocketTransport::getWrittenDataSize() // Returns the byte total accumulated by successful write() calls.
+{
+	return mWrittenData;
+}
+
+void PvdDefaultSocketTransport::release() // Destroys this transport; the object must not be used afterwards.
+{
+	PX_DELETE(this);
+}
+
+} // namespace pvdsdk
+
+PxPvdTransport* PxDefaultPvdSocketTransportCreate(const char* host, int port, unsigned int timeoutInMilliseconds) // Factory: returns a new, not yet connected PvdDefaultSocketTransport; dispose of it with release().
+{
+	return PX_NEW(pvdsdk::PvdDefaultSocketTransport)(host, port, timeoutInMilliseconds);
+}
+
+} // namespace physx
diff --git a/PxShared/src/pvd/src/PxPvdDefaultSocketTransport.h b/PxShared/src/pvd/src/PxPvdDefaultSocketTransport.h
new file mode 100644
index 0000000..b02b934
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdDefaultSocketTransport.h
@@ -0,0 +1,79 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDDEFAULTSOCKETTRANSPORT_H
+#define PXPVDSDK_PXPVDDEFAULTSOCKETTRANSPORT_H
+
+#include "pvd/PxPvdTransport.h"
+
+#include "PsUserAllocated.h"
+#include "PsSocket.h"
+#include "PsMutex.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+class PvdDefaultSocketTransport : public PxPvdTransport, public shdfnd::UserAllocated // PxPvdTransport implementation backed by a shdfnd::Socket.
+{
+	PX_NOCOPY(PvdDefaultSocketTransport)
+  public:
+	PvdDefaultSocketTransport(const char* host, int port, unsigned int timeoutInMilliseconds); // host pointer is stored, not copied
+	virtual ~PvdDefaultSocketTransport();
+
+	virtual bool connect(); // returns true immediately when already connected
+	virtual void disconnect();
+	virtual bool isConnected(); // queries the socket rather than mConnected
+
+	virtual bool write(const uint8_t* inBytes, uint32_t inLength); // loops until everything is sent or the socket accepts nothing
+
+	virtual void flush();
+
+	virtual PxPvdTransport& lock(); // takes mMutex; must be balanced by unlock()
+	virtual void unlock();
+
+	virtual uint64_t getWrittenDataSize();
+
+	virtual void release(); // deletes this object
+
+  private:
+	shdfnd::Socket mSocket;
+	const char* mHost; // raw pointer supplied by the caller; used by connect()
+	uint16_t mPort;
+	unsigned int mTimeout; // passed to Socket::connect; milliseconds per the constructor parameter name
+	bool mConnected; // set by connect(), cleared by disconnect()
+	uint64_t mWrittenData; // running total of bytes sent by successful write() calls
+	shdfnd::Mutex mMutex; // held between lock() and unlock()
+	bool mlocked; // NOTE(review): not referenced by the member functions in PxPvdDefaultSocketTransport.cpp - candidate for removal
+};
+
+} // pvdsdk
+} // physx
+
+#endif // PXPVDSDK_PXPVDDEFAULTSOCKETTRANSPORT_H
diff --git a/PxShared/src/pvd/src/PxPvdFoundation.h b/PxShared/src/pvd/src/PxPvdFoundation.h
new file mode 100644
index 0000000..90fc77f
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdFoundation.h
@@ -0,0 +1,504 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDFOUNDATION_H
+#define PXPVDSDK_PXPVDFOUNDATION_H
+
+#include "foundation/PxVec3.h"
+#include "foundation/PxTransform.h"
+#include "foundation/PxBounds3.h"
+
+#include "PsArray.h"
+#include "PsHashMap.h"
+#include "PsHashSet.h"
+#include "PsPool.h"
+#include "PsString.h"
+
+#include "PxPvdObjectModelBaseTypes.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+extern PxAllocatorCallback* gPvdAllocatorCallback; // allocator used by PvdAllocate / PvdDeleteAndDeallocate (and therefore PVD_NEW / PVD_DELETE)
+
+class ForwardingAllocator : public PxAllocatorCallback // PxAllocatorCallback that forwards every request to shdfnd::getAllocator().
+{
+	void* allocate(size_t size, const char* typeName, const char* filename, int line) // private here; reachable through the PxAllocatorCallback interface
+	{
+		return shdfnd::getAllocator().allocate(size, typeName, filename, line);
+	}
+	void deallocate(void* ptr)
+	{
+		shdfnd::getAllocator().deallocate(ptr);
+	}
+};
+
+class RawMemoryBuffer // Growable byte buffer: [mBegin, mEnd) holds data, [mBegin, mCapacityEnd) is the allocation.
+{
+	uint8_t* mBegin;
+	uint8_t* mEnd;
+	uint8_t* mCapacityEnd;
+	const char* mBufDataName; // name handed to PX_ALLOC whenever the buffer grows
+
+  public:
+	RawMemoryBuffer(const char* name) : mBegin(0), mEnd(0), mCapacityEnd(0),mBufDataName(name)
+	{
+		PX_UNUSED(mBufDataName);
+	}
+	~RawMemoryBuffer()
+	{
+		if(mBegin)
+			PX_FREE(mBegin);
+	}
+	uint32_t size() const // number of bytes written so far
+	{
+		return static_cast<uint32_t>(mEnd - mBegin);
+	}
+	uint32_t capacity() const // number of bytes currently allocated
+	{
+		return static_cast<uint32_t>(mCapacityEnd - mBegin);
+	}
+	uint8_t* begin()
+	{
+		return mBegin;
+	}
+	uint8_t* end()
+	{
+		return mEnd;
+	}
+	const uint8_t* begin() const
+	{
+		return mBegin;
+	}
+	const uint8_t* end() const
+	{
+		return mEnd;
+	}
+	void clear() // forgets the contents but keeps the allocation
+	{
+		mEnd = mBegin;
+	}
+	const char* cStr() // Returns the contents as a C string, appending one NUL byte when needed; NULL while nothing was ever allocated.
+	{
+		if(mEnd && (*mEnd != 0))
+			write(uint8_t(0)); // explicit uint8_t: a bare 0 is an int and would select write<int>, appending sizeof(int) bytes
+		return reinterpret_cast<const char*>(mBegin);
+	}
+	uint32_t write(uint8_t inValue) // appends a single byte; returns the number of bytes written
+	{
+		*growBuf(1) = inValue;
+		return 1;
+	}
+
+	template <typename TDataType>
+	uint32_t write(const TDataType& inValue) // appends the object representation of inValue byte by byte
+	{
+		const uint8_t* __restrict readPtr = reinterpret_cast<const uint8_t*>(&inValue);
+		uint8_t* __restrict writePtr = growBuf(sizeof(TDataType));
+		for(uint32_t idx = 0; idx < sizeof(TDataType); ++idx)
+			writePtr[idx] = readPtr[idx];
+		return sizeof(TDataType);
+	}
+
+	template <typename TDataType>
+	uint32_t write(const TDataType* inValue, uint32_t inLength) // appends inLength elements; returns the byte count
+	{
+		uint32_t writeSize = inLength * sizeof(TDataType);
+		if(inValue && inLength)
+		{
+			physx::intrinsics::memCopy(growBuf(writeSize), inValue, writeSize);
+		}
+		if(inLength && !inValue)
+		{
+			PX_ASSERT(false);
+			// You can't not write something, because that will cause
+			// the receiving end to crash.
+			// Emit exactly writeSize zero bytes so the stream matches the returned size.
+			writeZeros(writeSize);
+		}
+		return writeSize;
+	}
+
+	uint8_t* growBuf(uint32_t inAmount) // extends the data by inAmount bytes and returns a pointer to the new region
+	{
+		uint32_t offset = size();
+		uint32_t newSize = offset + inAmount;
+		reserve(newSize); // may reallocate, so the result is recomputed from mBegin below
+		mEnd += inAmount;
+		return mBegin + offset;
+	}
+	void writeZeros(uint32_t inAmount) // appends inAmount zero bytes
+	{
+		uint32_t offset = size();
+		growBuf(inAmount);
+		physx::intrinsics::memZero(begin() + offset, inAmount);
+	}
+	void reserve(uint32_t newSize) // grows the allocation once newSize reaches the current capacity
+	{
+		uint32_t currentSize = size();
+		if(newSize && newSize >= capacity())
+		{
+			uint32_t newDataSize = newSize > 4096 ? newSize + (newSize >> 2) : newSize*2; // x2 up to 4096 bytes, x1.25 above
+			uint8_t* newData = static_cast<uint8_t*>(PX_ALLOC(newDataSize, mBufDataName));
+			if(mBegin)
+			{
+				physx::intrinsics::memCopy(newData, mBegin, currentSize);
+				PX_FREE(mBegin);
+			}
+			mBegin = newData;
+			mEnd = mBegin + currentSize;
+			mCapacityEnd = mBegin + newDataSize;
+		}
+	}
+};
+
+struct ForwardingMemoryBuffer : public RawMemoryBuffer // RawMemoryBuffer with operator<< overloads that append values as text.
+{
+	ForwardingMemoryBuffer(const char* bufDataName) : RawMemoryBuffer(bufDataName)
+	{
+	}
+
+	ForwardingMemoryBuffer& operator<<(const char* inString) // appends the characters of inString without a terminator; NULL and "" are ignored
+	{
+		if(inString && *inString)
+		{
+			uint32_t len = static_cast<uint32_t>(strlen(inString));
+			write(inString, len);
+		}
+		return *this;
+	}
+
+	template <typename TDataType>
+	inline ForwardingMemoryBuffer& toStream(const char* inFormat, const TDataType inData) // formats inData with inFormat into a 128-byte scratch buffer and appends the result
+	{
+		char buffer[128] = { 0 };
+		shdfnd::snprintf(buffer, 128, inFormat, inData);
+		*this << buffer;
+		return *this;
+	}
+
+	inline ForwardingMemoryBuffer& operator<<(bool inData) // written as "true" / "false"
+	{
+		*this << (inData ? "true" : "false");
+		return *this;
+	}
+	inline ForwardingMemoryBuffer& operator<<(int32_t inData)
+	{
+		return toStream("%d", inData);
+	}
+	inline ForwardingMemoryBuffer& operator<<(uint16_t inData)
+	{
+		return toStream("%u", uint32_t(inData));
+	}
+	inline ForwardingMemoryBuffer& operator<<(uint8_t inData) // written as a number, not as a character
+	{
+		return toStream("%u", uint32_t(inData));
+	}
+	inline ForwardingMemoryBuffer& operator<<(char inData)
+	{
+		return toStream("%c", inData);
+	}
+	inline ForwardingMemoryBuffer& operator<<(uint32_t inData)
+	{
+		return toStream("%u", inData);
+	}
+	inline ForwardingMemoryBuffer& operator<<(uint64_t inData) // "%llu" + cast: the former "%I64u" is an MSVC-only conversion
+	{
+		return toStream("%llu", static_cast<unsigned long long>(inData));
+	}
+	inline ForwardingMemoryBuffer& operator<<(int64_t inData) // "%lld" + cast: the former "%I64d" is an MSVC-only conversion
+	{
+		return toStream("%lld", static_cast<long long>(inData));
+	}
+	inline ForwardingMemoryBuffer& operator<<(const void* inData) // pointers are written as their unsigned 64-bit integer value
+	{
+		return *this << static_cast<uint64_t>(reinterpret_cast<size_t>(inData));
+	}
+	inline ForwardingMemoryBuffer& operator<<(float inData)
+	{
+		return toStream("%g", double(inData));
+	}
+	inline ForwardingMemoryBuffer& operator<<(double inData)
+	{
+		return toStream("%g", inData);
+	}
+	inline ForwardingMemoryBuffer& operator<<(const PxVec3& inData) // "x y z", space separated
+	{
+		*this << inData[0];
+		*this << " ";
+		*this << inData[1];
+		*this << " ";
+		*this << inData[2];
+		return *this;
+	}
+
+	inline ForwardingMemoryBuffer& operator<<(const PxQuat& inData) // "x y z w", space separated
+	{
+		*this << inData.x;
+		*this << " ";
+		*this << inData.y;
+		*this << " ";
+		*this << inData.z;
+		*this << " ";
+		*this << inData.w;
+		return *this;
+	}
+
+	inline ForwardingMemoryBuffer& operator<<(const PxTransform& inData) // rotation first, then position
+	{
+		*this << inData.q;
+		*this << " ";
+		*this << inData.p;
+		return *this;
+	}
+
+	inline ForwardingMemoryBuffer& operator<<(const PxBounds3& inData) // minimum first, then maximum
+	{
+		*this << inData.minimum;
+		*this << " ";
+		*this << inData.maximum;
+		return *this;
+	}
+
+};
+
+template <typename TObjectType, typename TGetSetIndexOp, typename TSetSetIndexOp> // the two functors read / write the set index stored inside each object
+class InvasiveSet // Set of object pointers where each object remembers its own slot; UINT32_MAX means "not a member".
+{
+    shdfnd::Array<TObjectType*> mSet;
+
+	InvasiveSet(const InvasiveSet& other); // declared but not defined: copying is disabled
+	InvasiveSet& operator=(const InvasiveSet& other);
+
+  public:
+	InvasiveSet(const char* allocName) : mSet(allocName)
+	{
+	}
+
+	bool insert(TObjectType& inObject) // adds inObject unless its index says it is already a member; true when added
+	{
+		uint32_t currentIdx = TGetSetIndexOp()(inObject);
+		if(currentIdx == UINT32_MAX)
+		{
+			TSetSetIndexOp()(inObject, mSet.size()); // the new slot is the current end of the array
+			mSet.pushBack(&inObject);
+			return true;
+		}
+		return false;
+	}
+
+	bool remove(TObjectType& inObject) // removes inObject by moving the last element into its slot; true when removed
+	{
+		uint32_t currentIdx = TGetSetIndexOp()(inObject);
+		if(currentIdx != UINT32_MAX)
+		{
+			TObjectType* theEnd = mSet.back();
+			TObjectType* theObj = &inObject;
+			if(theEnd != theObj)
+			{
+				TSetSetIndexOp()(*theEnd, currentIdx); // the moved element takes over the vacated index
+				mSet[currentIdx] = theEnd;
+			}
+			mSet.popBack();
+			TSetSetIndexOp()(inObject, UINT32_MAX);
+			return true;
+		}
+		return false;
+	}
+
+	bool contains(TObjectType& inObject) // membership is decided solely by the index stored in the object
+	{
+		return TGetSetIndexOp()(inObject) != UINT32_MAX;
+	}
+
+	void clear() // resets the stored index of every member, then empties the array
+	{
+		for(uint32_t idx = 0; idx < mSet.size(); ++idx)
+			TSetSetIndexOp()(*(mSet[idx]), UINT32_MAX);
+		mSet.clear();
+	}
+
+	TObjectType* operator[](uint32_t idx)
+	{
+		return mSet[idx];
+	}
+	const TObjectType* operator[](uint32_t idx) const
+	{
+		return mSet[idx];
+	}
+	uint32_t size() const
+	{
+		return mSet.size();
+	}
+	TObjectType** begin()
+	{
+		return mSet.begin();
+	}
+	TObjectType** end()
+	{
+		return mSet.end();
+	}
+	const TObjectType** begin() const // NOTE(review): a const Array presumably yields TObjectType* const*, which does not convert to const TObjectType** - confirm this overload compiles when instantiated
+	{
+		return mSet.begin();
+	}
+	const TObjectType** end() const // NOTE(review): same conversion concern as the const begin() overload
+	{
+		return mSet.end();
+	}
+	const TObjectType* back() const
+	{
+		return mSet.back();
+	}
+	TObjectType* back()
+	{
+		return mSet.back();
+	}
+};
+
+template <typename TDataType> // TDataType only determines the allocation size
+inline void* PvdAllocate(const char* typeName, const char* file, int line) // Allocates sizeof(TDataType) raw bytes from gPvdAllocatorCallback; no constructor is run.
+{
+	PX_ASSERT(gPvdAllocatorCallback);
+	return gPvdAllocatorCallback->allocate(sizeof(TDataType), typeName, file, line);
+}
+
+template <typename TDataType>
+inline void PvdDeleteAndDeallocate(TDataType* inDType) // Destroys *inDType and returns its memory to gPvdAllocatorCallback.
+{
+	PX_ASSERT(gPvdAllocatorCallback);
+	if(!inDType)
+		return;
+	// Explicit destructor call first, then the raw memory goes back to the allocator.
+	inDType->~TDataType();
+	gPvdAllocatorCallback->deallocate(inDType);
+}
+}
+}
+
+#define PVD_NEW(dtype) new (PvdAllocate<dtype>(#dtype, __FILE__, __LINE__)) dtype // placement-new into PvdAllocate memory; follow with the constructor arguments
+#define PVD_DELETE(obj) PvdDeleteAndDeallocate(obj); // note: the expansion already ends with ';'
+//#define PVD_NEW(dtype) PX_NEW(dtype)
+//#define PVD_DELETE(obj) PX_DELETE(obj)
+#define PVD_FOREACH(varname, stop) for(uint32_t varname = 0; varname < stop; ++varname) // loop header only; the statement that follows is the body
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+template <typename TKeyType, typename TValueType, typename THashType, typename TBufType, typename TOperator>
+uint32_t getMapKeysOp(shdfnd::HashMap<TKeyType, TValueType, THashType>& map, TBufType* buffer, uint32_t bufSize,
+                      uint32_t startIdx, TOperator op) // Copies op(key) for up to bufSize keys, skipping the first startIdx entries; returns the number copied.
+{
+	const uint32_t numItems = static_cast<uint32_t>(map.size());
+	if(numItems == 0 || bufSize == 0)
+		return 0;
+
+	typedef typename shdfnd::HashMap<TKeyType, TValueType, THashType>::Iterator MapIterator;
+	startIdx = PxMin(numItems - 1, startIdx); // an out-of-range start is clamped to the last entry
+	uint32_t copied = 0;
+	for(MapIterator iter = map.getIterator(); !iter.done() && bufSize; ++iter)
+	{
+		if(startIdx)
+		{
+			--startIdx; // still skipping towards the requested start
+			continue;
+		}
+		buffer[copied] = op(iter->first);
+		++copied;
+		--bufSize;
+	}
+	return copied;
+}
+
+struct IdOp // Identity functor: returns a copy of its argument.
+{
+	template <typename TDataType>
+	TDataType operator()(const TDataType& item)
+	{
+		return item;
+	}
+};
+
+template <typename TKeyType, typename TValueType, typename THashType>
+uint32_t getMapKeys(shdfnd::HashMap<TKeyType, TValueType, THashType>& map, TKeyType* buffer, uint32_t bufSize, uint32_t startIdx) // Copies the map's keys unchanged into buffer; see getMapKeysOp for the paging rules.
+{
+	return getMapKeysOp(map, buffer, bufSize, startIdx, IdOp());
+}
+
+struct DerefOp // Dereferencing functor: returns a copy of the pointed-to object.
+{
+	template <typename TDataType>
+	TDataType operator()(const TDataType* item) // item is dereferenced unconditionally, so it must not be NULL
+	{
+		return *item;
+	}
+};
+
+template <typename TKeyType, typename TValueType, typename TBufType, typename TOp>
+uint32_t getMapValues(shdfnd::HashMap<TKeyType, TValueType>& map, TBufType* buffer, uint32_t bufSize, uint32_t startIdx, TOp op) // Copies op(value) for up to bufSize values, skipping the first startIdx entries; returns the number copied.
+{
+	const uint32_t numItems = static_cast<uint32_t>(map.size());
+	if(numItems == 0 || bufSize == 0)
+		return 0;
+	typedef typename shdfnd::HashMap<TKeyType, TValueType>::Iterator MapIterator;
+	startIdx = PxMin(numItems - 1, startIdx); // an out-of-range start is clamped to the last entry
+	uint32_t copied = 0;
+	for(MapIterator iter = map.getIterator(); !iter.done() && bufSize; ++iter)
+	{
+		if(startIdx)
+		{
+			--startIdx; // still skipping towards the requested start
+			continue;
+		}
+		buffer[copied] = op(iter->second);
+		++copied;
+		--bufSize;
+	}
+	return copied;
+}
+
+template <typename TValueType, typename TBufType>
+uint32_t getArrayEntries(shdfnd::Array<TValueType>& data, TBufType* buffer, uint32_t bufSize, uint32_t startIdx) // Copies up to bufSize entries of data, starting at startIdx, into buffer; returns the number copied.
+{
+	const uint32_t numItems = static_cast<uint32_t>(data.size());
+	if(numItems == 0 || bufSize == 0)
+		return 0;
+	// An out-of-range start is clamped to the last entry, so at least one item is copied.
+	const uint32_t first = PxMin(numItems - 1, startIdx);
+	const uint32_t available = PxMin(numItems - first, bufSize);
+	for(uint32_t idx = 0; idx < available; ++idx)
+		buffer[idx] = data[first + idx];
+	return available;
+}
+#define PVD_POINTER_TO_U64(ptr) static_cast<uint64_t>(reinterpret_cast<size_t>(ptr)) // pointer value widened to 64 bits via size_t
+}
+}
+#endif // PXPVDSDK_PXPVDFOUNDATION_H
diff --git a/PxShared/src/pvd/src/PxPvdImpl.cpp b/PxShared/src/pvd/src/PxPvdImpl.cpp
new file mode 100644
index 0000000..9fa82a0
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdImpl.cpp
@@ -0,0 +1,405 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "pvd/PxPvdTransport.h"
+
+#include "PxPvdImpl.h"
+#include "PxPvdFoundation.h"
+#include "PxPvdClient.h"
+#include "PxPvdMemClient.h"
+#include "PxPvdProfileZoneClient.h"
+#include "PxPvdCommStreamTypes.h"
+#include "PxProfileZoneManager.h"
+#include "PxProfileZone.h"
+
+#include "PsFoundation.h"
+
+#if PX_NVTX
+#include "nvToolsExt.h"
+#endif
+
+namespace
+{
+	// Name passed to PxProfileZone::createProfileZone() in PvdImpl::connect().
+	const char* gSdkName = "PhysXSDK";
+}
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Profile name provider that advertises no events: the PxProfileNames it
+// returns has eventCount set to 0.
+class CmEventNameProvider : public physx::profile::PxProfileNameProvider
+{
+public:
+	physx::profile::PxProfileNames getProfileNames() const
+	{
+		physx::profile::PxProfileNames noNames;
+		noNames.eventCount = 0;
+		return noNames;
+	}
+};
+
+// Single provider instance; PvdImpl::connect() queries it when creating the profile zone.
+CmEventNameProvider gProfileNameProvider;
+
+// Registers three classes on the stream (profile zone, profile memory event
+// buffer, user renderer). Each one gets a uint8_t array property named
+// "events", tagged with the matching embedded-type semantic.
+void initializeModelTypes(PvdDataStream& stream)
+{
+	typedef profile::PxProfileZone TZone;
+	typedef profile::PxProfileMemoryEventBuffer TMemoryBuffer;
+
+	// Profile zone events
+	stream.createClass<TZone>();
+	stream.createProperty<TZone, uint8_t>("events", PvdCommStreamEmbeddedTypes::getProfileEventStreamSemantic(),
+	                                      PropertyType::Array);
+
+	// Memory events
+	stream.createClass<TMemoryBuffer>();
+	stream.createProperty<TMemoryBuffer, uint8_t>("events", PvdCommStreamEmbeddedTypes::getMemoryEventStreamSemantic(),
+	                                              PropertyType::Array);
+
+	// Renderer events
+	stream.createClass<PvdUserRenderer>();
+	stream.createProperty<PvdUserRenderer, uint8_t>("events", PvdCommStreamEmbeddedTypes::getRendererEventStreamSemantic(),
+	                                                PropertyType::Array);
+}
+
+// Shared instance and its reference count; managed by PvdImpl::initialize()
+// and PvdImpl::release().
+PvdImpl* PvdImpl::sInstance = NULL;
+uint32_t PvdImpl::sRefCount = 0;
+
+// Builds a disconnected PvdImpl. No transport, metadata provider, memory
+// client or profile zone exists yet; those are set up in connect(). The
+// profile zone manager and the profile zone client are created here and live
+// until the destructor.
+PvdImpl::PvdImpl()
+: mPvdTransport(NULL)
+, mSharedMetaProvider(NULL)
+, mMemClient(NULL)
+, mIsConnected(false)
+, mIsNVTXSupportEnabled(true)
+, mNVTXContext(0)
+, mNextStreamId(1)
+, mProfileClient(NULL)
+, mProfileZone(NULL)
+{
+	// mProfileZoneManager is not in the initializer list; it is assigned here.
+	mProfileZoneManager = &physx::profile::PxProfileZoneManager::createProfileZoneManager(&physx::shdfnd::getAllocator());
+	mProfileClient = PVD_NEW(PvdProfileZoneClient)(*this);
+}
+
+// Tears the instance down: clears the profiler callback if profiling was
+// requested, disconnects, then releases the profile zone manager and the
+// profile zone client created by the constructor.
+PvdImpl::~PvdImpl()
+{
+	if(mFlags & PxPvdInstrumentationFlag::ePROFILE)
+		PxSetProfilerCallback(NULL);
+
+	disconnect();
+
+	if(mProfileZoneManager)
+	{
+		mProfileZoneManager->release();
+		mProfileZoneManager = NULL;
+	}
+
+	PVD_DELETE(mProfileClient);
+	mProfileClient = NULL;
+}
+
+// Connects to PVD through the given transport.
+//
+// Reports an error and returns false if already connected. Otherwise stores
+// the flags and the transport, asks the transport to connect, and on success
+// creates the shared metadata provider, sends the stream initialization,
+// registers the model types, creates the clients selected by the flags,
+// notifies every client, hooks up the profile zone and installs this object as
+// the profiler callback. Returns whether the transport connected.
+bool PvdImpl::connect(PxPvdTransport& transport, PxPvdInstrumentationFlags flags)
+{
+	if(mIsConnected)
+	{
+		physx::shdfnd::getFoundation().error(PxErrorCode::eINVALID_PARAMETER, __FILE__, __LINE__, "PxPvd::connect - recall connect! Should call disconnect before re-connect.");
+		return false;
+	}
+
+	// Flags and transport are stored even if the connection attempt fails.
+	mFlags = flags;
+	mPvdTransport = &transport;
+
+	mIsConnected = mPvdTransport->connect();
+	if(!mIsConnected)
+		return false;
+
+	mSharedMetaProvider = PVD_NEW(MetaDataProvider);
+	sendTransportInitialization();
+
+	// A temporary data stream is only needed to register the model types.
+	PvdDataStream* typeStream = PvdDataStream::create(this);
+	initializeModelTypes(*typeStream);
+	typeStream->release();
+
+	if(mFlags & PxPvdInstrumentationFlag::eMEMORY)
+	{
+		mMemClient = PVD_NEW(PvdMemClient)(*this);
+		mPvdClients.pushBack(mMemClient);
+	}
+
+	if((mFlags & PxPvdInstrumentationFlag::ePROFILE) && mProfileZoneManager)
+	{
+		mPvdClients.pushBack(mProfileClient);
+		mProfileZone = &physx::profile::PxProfileZone::createProfileZone(&physx::shdfnd::getAllocator(), gSdkName, gProfileNameProvider.getProfileNames());
+	}
+
+	// Tell every registered client that the connection is up.
+	for(uint32_t clientIdx = 0; clientIdx < mPvdClients.size(); clientIdx++)
+		mPvdClients[clientIdx]->onPvdConnected();
+
+	if(mProfileZone)
+	{
+		mProfileZoneManager->addProfileZoneHandler(*mProfileClient);
+		mProfileZoneManager->addProfileZone(*mProfileZone);
+	}
+
+	if(mFlags & PxPvdInstrumentationFlag::ePROFILE)
+		PxSetProfilerCallback(this);
+
+	return true;
+}
+
+// Undoes connect().
+//
+// The profile zone (if any) is detached from the zone manager and released
+// first, and the profile client is removed from the client list. If a
+// connection is active, every remaining client is notified, the memory client
+// is removed and destroyed, the shared metadata provider is released, the
+// transport is disconnected and the object registrar is cleared.
+void PvdImpl::disconnect()
+{
+	if(mProfileZone)
+	{
+		mProfileZoneManager->removeProfileZoneHandler(*mProfileClient);
+		mProfileZoneManager->removeProfileZone(*mProfileZone);
+		mProfileZone->release();
+		mProfileZone = NULL;
+		removeClient(mProfileClient);
+	}
+
+	if(!mIsConnected)
+		return;
+
+	for(uint32_t clientIdx = 0; clientIdx < mPvdClients.size(); clientIdx++)
+		mPvdClients[clientIdx]->onPvdDisconnected();
+
+	if(mMemClient)
+	{
+		removeClient(mMemClient);
+		// Clear the member before deleting so the deallocation itself is not tracked.
+		PvdMemClient* doomedClient = mMemClient;
+		mMemClient = NULL;
+		PVD_DELETE(doomedClient);
+	}
+
+	mSharedMetaProvider->release();
+	mPvdTransport->disconnect();
+	mObjectRegistrar.clear();
+	mIsConnected = false;
+}
+
+// Flushes every registered client, then (if a profile zone exists) flushes the
+// zone's event-id/name map followed by its profile events.
+void PvdImpl::flush()
+{
+	for(uint32_t clientIdx = 0; clientIdx < mPvdClients.size(); clientIdx++)
+		mPvdClients[clientIdx]->flush();
+
+	if(!mProfileZone)
+		return;
+
+	mProfileZone->flushEventIdNameMap();
+	mProfileZone->flushProfileEvents();
+}
+
+// Reports the connection state. Without a transport the answer is always
+// false. Otherwise returns the cached flag when useCachedStatus is true, or
+// asks the transport directly when it is false.
+bool PvdImpl::isConnected(bool useCachedStatus)
+{
+	if(!mPvdTransport)
+		return false;
+
+	if(useCachedStatus)
+		return mIsConnected;
+
+	return mPvdTransport->isConnected();
+}
+
+// Returns the transport pointer stored by connect(); NULL until connect() has
+// been called. disconnect() does not reset it.
+PxPvdTransport* PvdImpl::getTransport()
+{
+	return mPvdTransport;
+}
+
+// Returns the instrumentation flags stored by the most recent connect() call.
+PxPvdInstrumentationFlags PvdImpl::getInstrumentationFlags()
+{
+	return mFlags;
+}
+
+// Serializes a default-constructed StreamInitialization record into the
+// transport. Called from connect() right after the transport has connected.
+void PvdImpl::sendTransportInitialization()
+{
+	StreamInitialization init;
+	// The streamifier wraps whatever lock() returns; the matching unlock()
+	// follows once serialization is done.
+	EventStreamifier<PxPvdTransport> stream(mPvdTransport->lock());
+	init.serialize(stream);
+	mPvdTransport->unlock();
+}
+
+// Registers a client. A client that is already in the list is ignored. If a
+// connection is currently active the new client is notified immediately via
+// onPvdConnected().
+void PvdImpl::addClient(PvdClient* client)
+{
+	PX_ASSERT(client);
+
+	for(uint32_t clientIdx = 0; clientIdx < mPvdClients.size(); clientIdx++)
+		if(mPvdClients[clientIdx] == client)
+			return;
+
+	mPvdClients.pushBack(client);
+	if(mIsConnected)
+		client->onPvdConnected();
+}
+
+// Removes every occurrence of `client` from the client list, calling
+// onPvdDisconnected() on it for each occurrence removed. Does nothing if the
+// client is not registered.
+void PvdImpl::removeClient(PvdClient* client)
+{
+	uint32_t clientIdx = 0;
+	while(clientIdx < mPvdClients.size())
+	{
+		if(client == mPvdClients[clientIdx])
+		{
+			client->onPvdDisconnected();
+			// remove() moves a different entry into this slot, so the index
+			// must not advance here or that entry would never be examined.
+			mPvdClients.remove(clientIdx);
+		}
+		else
+		{
+			++clientIdx;
+		}
+	}
+}
+
+// Allocation listener hook: forwards the event to the memory client when one
+// exists, otherwise does nothing.
+void PvdImpl::onAllocation(size_t inSize, const char* inType, const char* inFile, int inLine, void* inAddr)
+{
+	if(!mMemClient)
+		return;
+
+	mMemClient->onAllocation(inSize, inType, inFile, inLine, inAddr);
+}
+
+// Deallocation listener hook: forwards the event to the memory client when one
+// exists, otherwise does nothing.
+void PvdImpl::onDeallocation(void* inAddr)
+{
+	if(!mMemClient)
+		return;
+
+	mMemClient->onDeallocation(inAddr);
+}
+
+// Returns the shared metadata provider by reference. The pointer is
+// dereferenced without a check; it is only assigned inside connect(), and
+// disconnect() releases the provider without resetting the pointer.
+PvdOMMetaDataProvider& PvdImpl::getMetaDataProvider()
+{
+	return *mSharedMetaProvider;
+}
+
+// Adds inItem to the object registrar and returns the registrar's addItem() result.
+bool PvdImpl::registerObject(const void* inItem)
+{
+	return mObjectRegistrar.addItem(inItem);
+}
+
+
+// Forwards inItem to the object registrar's decItem() and returns its result.
+bool PvdImpl::unRegisterObject(const void* inItem)
+{
+	return mObjectRegistrar.decItem(inItem);
+}
+
+// Hands out stream ids by pre-incrementing the counter. The counter starts at
+// 1 in the constructor, so the first id returned is 2.
+uint64_t PvdImpl::getNextStreamId()
+{
+	return ++mNextStreamId;
+}
+
+// Takes a reference on the shared instance, creating it when the reference
+// count is zero. Returns true if the instance pointer is non-NULL afterwards.
+bool PvdImpl::initialize()
+{
+	if(sRefCount == 0)
+		sInstance = PVD_NEW(PvdImpl)();
+
+	++sRefCount;
+	return sInstance != NULL;
+}
+
+// Drops one reference on the shared instance; when the count reaches zero the
+// instance is deleted and the pointer reset. A call with a zero count is a
+// no-op.
+void PvdImpl::release()
+{
+	if(sRefCount == 0)
+		return;
+
+	--sRefCount;
+	if(sRefCount != 0)
+		return;
+
+	PVD_DELETE(sInstance);
+	sInstance = NULL;
+}
+
+// Returns the shared instance pointer; NULL before initialize() and after the
+// final release().
+PvdImpl* PvdImpl::getInstance()
+{
+	return sInstance;
+}
+
+
+/**************************************************************************************************************************
+Instrumented profiling events
+***************************************************************************************************************************/
+
+// Id passed as the extra startEvent()/stopEvent() argument for detached zones
+// in zoneStart()/zoneEnd().
+static const uint32_t CrossThreadId = 99999789;
+
+// Profiler callback: marks the beginning of a profiling zone.
+//
+// When a profile zone exists, the event is started on it; detached zones pass
+// CrossThreadId as an extra argument. When built with PX_NVTX and NVTX support
+// is enabled, a non-detached zone pushes an NVTX range and a detached zone
+// emits an NVTX marker instead. Always returns NULL.
+void* PvdImpl::zoneStart(const char* eventName, bool detached, uint64_t contextId)
+{
+	if(mProfileZone)
+	{
+		const uint16_t eventId = mProfileZone->getEventIdForName(eventName);
+		if(!detached)
+			mProfileZone->startEvent(eventId, contextId);
+		else
+			mProfileZone->startEvent(eventId, contextId, CrossThreadId);
+	}
+#if PX_NVTX
+	if(mIsNVTXSupportEnabled)
+	{
+		if(!detached)
+		{
+			nvtxRangePush(eventName);
+		}
+		else
+		{
+			// TODO : Need to use the nvtxRangeStart API for cross thread events
+			nvtxEventAttributes_t marker;
+			memset(&marker, 0, sizeof(marker));
+			marker.version = NVTX_VERSION;
+			marker.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+			marker.colorType = NVTX_COLOR_ARGB;
+			marker.color = 0xFF00FF00; // opaque green marks a detached start
+			marker.messageType = NVTX_MESSAGE_TYPE_ASCII;
+			marker.message.ascii = eventName;
+			nvtxMarkEx(&marker);
+		}
+	}
+#endif
+	return NULL;
+}
+
+// Profiler callback: marks the end of a profiling zone.
+//
+// Mirrors zoneStart(): the event is stopped on the profile zone when one
+// exists (detached zones pass CrossThreadId). With PX_NVTX and NVTX support
+// enabled, a non-detached zone pops the NVTX range and a detached zone emits
+// an NVTX marker. The profilerData argument is ignored.
+void PvdImpl::zoneEnd(void* /*profilerData*/, const char* eventName, bool detached, uint64_t contextId)
+{
+	if(mProfileZone)
+	{
+		const uint16_t eventId = mProfileZone->getEventIdForName(eventName);
+		if(!detached)
+			mProfileZone->stopEvent(eventId, contextId);
+		else
+			mProfileZone->stopEvent(eventId, contextId, CrossThreadId);
+	}
+#if PX_NVTX
+	if(mIsNVTXSupportEnabled)
+	{
+		if(!detached)
+		{
+			nvtxRangePop();
+		}
+		else
+		{
+			nvtxEventAttributes_t marker;
+			memset(&marker, 0, sizeof(marker));
+			marker.version = NVTX_VERSION;
+			marker.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+			marker.colorType = NVTX_COLOR_ARGB;
+			marker.color = 0xFFFF0000; // opaque red marks a detached end
+			marker.messageType = NVTX_MESSAGE_TYPE_ASCII;
+			marker.message.ascii = eventName;
+			nvtxMarkEx(&marker);
+		}
+	}
+#endif
+}
+} // pvdsdk
+
+} // physx
diff --git a/PxShared/src/pvd/src/PxPvdImpl.h b/PxShared/src/pvd/src/PxPvdImpl.h
new file mode 100644
index 0000000..64d4e16
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdImpl.h
@@ -0,0 +1,221 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDIMPL_H
+#define PXPVDSDK_PXPVDIMPL_H
+
+#include "foundation/PxProfiler.h"
+
+#include "PsAllocator.h"
+#include "PsPvd.h"
+#include "PsArray.h"
+#include "PsMutex.h"
+#include "PxPvdCommStreamTypes.h"
+#include "PxPvdFoundation.h"
+#include "PxPvdObjectModelMetaData.h"
+#include "PxPvdObjectRegistrar.h"
+
+namespace physx
+{
+
+namespace profile
+{
+	class PxProfileZoneManager;
+}
+
+namespace pvdsdk
+{
+class PvdMemClient;
+class PvdProfileZoneClient;
+
+// Reference-counted PvdOMMetaDataProvider implementation.
+//
+// Owns a reference to a PvdObjectModelMetaData object and keeps a map from
+// instance pointer to class id. All accessors serialize on mMutex; lock() and
+// unlock() expose the same mutex to callers that need direct access to the
+// metadata object.
+struct MetaDataProvider : public PvdOMMetaDataProvider, public shdfnd::UserAllocated
+{
+	typedef shdfnd::Mutex::ScopedLock TScopedLockType;
+	typedef shdfnd::HashMap<const void*, int32_t> TInstTypeMap;
+	PvdObjectModelMetaData& mMetaData; // reference taken in the constructor, released in the destructor
+	shdfnd::Mutex mMutex;              // guards mRefCount, mTypeMap and metadata access
+	uint32_t mRefCount;                // starts at 0
+	TInstTypeMap mTypeMap;             // instance pointer -> class id
+
+	MetaDataProvider()
+	: mMetaData(PvdObjectModelMetaData::create()), mRefCount(0), mTypeMap("MetaDataProvider::mTypeMap")
+	{
+		mMetaData.addRef();
+	}
+	virtual ~MetaDataProvider()
+	{
+		mMetaData.release();
+	}
+
+	// Adds one reference.
+	virtual void addRef()
+	{
+		TScopedLockType locker(mMutex);
+		++mRefCount;
+	}
+	// Drops one reference (never below zero) and deletes this object when the
+	// count is zero afterwards. Since the count starts at 0, a release()
+	// without any prior addRef() destroys the object.
+	virtual void release()
+	{
+		// The decision must be taken while the mutex is held: reading
+		// mRefCount after unlocking would race with concurrent
+		// addRef()/release() calls and could delete the object twice.
+		bool lastReference;
+		{
+			TScopedLockType locker(mMutex);
+			if(mRefCount)
+				--mRefCount;
+			lastReference = (mRefCount == 0);
+		}
+		// The scoped lock is gone here; the mutex is a member and must not be
+		// held while the object is destroyed.
+		if(lastReference)
+			PVD_DELETE(this);
+	}
+	// Locks the provider and returns the metadata object; pair with unlock().
+	virtual PvdObjectModelMetaData& lock()
+	{
+		mMutex.lock();
+		return mMetaData;
+	}
+	virtual void unlock()
+	{
+		mMutex.unlock();
+	}
+
+	// Records `instance` as an instance of class clsName. Returns false when
+	// the class is not known to the metadata, true otherwise.
+	virtual bool createInstance(const NamespacedName& clsName, const void* instance)
+	{
+		TScopedLockType locker(mMutex);
+		Option<ClassDescription> cls(mMetaData.findClass(clsName));
+		if(cls.hasValue() == false)
+			return false;
+		int32_t instType = cls->mClassId;
+		mTypeMap.insert(instance, instType);
+		return true;
+	}
+	// Returns true when `instance` is present in the instance map. Debug
+	// builds additionally look the recorded class up in the metadata.
+	virtual bool isInstanceValid(const void* instance)
+	{
+		TScopedLockType locker(mMutex);
+		ClassDescription classDesc;
+		bool retval = mTypeMap.find(instance) != NULL;
+#if PX_DEBUG
+		if(retval)
+			classDesc = mMetaData.getClass(mTypeMap.find(instance)->second);
+#endif
+		return retval;
+	}
+	// Removes `instance` from the instance map.
+	virtual void destroyInstance(const void* instance)
+	{
+		TScopedLockType locker(mMutex);
+		mTypeMap.erase(instance);
+	}
+	// Returns the class id recorded for `instance`, or -1 when it is unknown.
+	virtual int32_t getInstanceClassType(const void* instance)
+	{
+		TScopedLockType locker(mMutex);
+		const TInstTypeMap::Entry* entry = mTypeMap.find(instance);
+		if(entry)
+			return entry->second;
+		return -1;
+	}
+
+  private:
+	// Not copyable: declared but never defined.
+	MetaDataProvider& operator=(const MetaDataProvider&);
+	MetaDataProvider(const MetaDataProvider&);
+};
+
+//////////////////////////////////////////////////////////////////////////
+/*!
+PvdImpl is the realization of PxPvd.
+It implements the interface methods and provides richer functionality for advanced users or internal clients (such as
+PhysX or APEX), including handler notification for clients.
+*/
+//////////////////////////////////////////////////////////////////////////
+class PvdImpl : public PsPvd, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(PvdImpl)
+
+    typedef shdfnd::Mutex::ScopedLock TScopedLockType;
+	// Member-function pointer types matching onAllocation()/onDeallocation().
+	typedef void (PvdImpl::*TAllocationHandler)(size_t size, const char* typeName, const char* filename, int line,
+	                                            void* allocatedMemory);
+	typedef void (PvdImpl::*TDeallocationHandler)(void* allocatedMemory);
+
+  public:
+	PvdImpl();
+	virtual ~PvdImpl();
+	// Drops one reference on the shared instance (see initialize()).
+	void release();
+
+	// Connection management.
+	bool connect(PxPvdTransport& transport, PxPvdInstrumentationFlags flags);
+	void disconnect();
+	bool isConnected(bool useCachedStatus = true);
+	void flush();
+
+	PxPvdTransport* getTransport();
+	PxPvdInstrumentationFlags getInstrumentationFlags();
+
+	// Client registration; clients receive connect/disconnect/flush notifications.
+	void addClient(PvdClient* client);
+	void removeClient(PvdClient* client);
+
+	PvdOMMetaDataProvider& getMetaDataProvider();
+
+	bool registerObject(const void* inItem);
+	bool unRegisterObject(const void* inItem);
+
+	//AllocationListener
+	void onAllocation(size_t size, const char* typeName, const char* filename, int line, void* allocatedMemory);
+	void onDeallocation(void* addr);
+
+	uint64_t getNextStreamId();
+
+	// Reference-counted access to the shared instance.
+	static bool initialize();
+	static PvdImpl* getInstance();
+
+	// Profiling
+
+	virtual void* zoneStart(const char* eventName, bool detached, uint64_t contextId);
+
+	virtual void zoneEnd(void* profilerData, const char *eventName, bool detached, uint64_t contextId);
+
+  private:
+	void sendTransportInitialization();
+
+	PxPvdTransport*						mPvdTransport;
+	physx::shdfnd::Array<PvdClient*>	mPvdClients;
+
+	MetaDataProvider*					mSharedMetaProvider; // shared between clients
+	ObjectRegistrar						mObjectRegistrar;
+
+	PvdMemClient*						mMemClient;
+
+	PxPvdInstrumentationFlags			mFlags;
+	bool								mIsConnected;
+	bool								mIsNVTXSupportEnabled;
+	uint32_t							mNVTXContext;
+	uint64_t							mNextStreamId;
+	physx::profile::PxProfileZoneManager*mProfileZoneManager;
+	PvdProfileZoneClient*				mProfileClient;
+	physx::profile::PxProfileZone*		mProfileZone;
+	static PvdImpl*						sInstance;
+	static uint32_t						sRefCount;
+};
+
+} // namespace pvdsdk
+}
+
+#endif // PXPVDSDK_PXPVDIMPL_H
diff --git a/PxShared/src/pvd/src/PxPvdInternalByteStreams.h b/PxShared/src/pvd/src/PxPvdInternalByteStreams.h
new file mode 100644
index 0000000..1fd5ddd
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdInternalByteStreams.h
@@ -0,0 +1,147 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDINTERNALBYTESTREAMS_H
+#define PXPVDSDK_PXPVDINTERNALBYTESTREAMS_H
+
+#include "PxPvdByteStreams.h"
+#include "PxPvdFoundation.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+// PvdOutputStream that appends everything written to it to an in-memory
+// ForwardingMemoryBuffer.
+struct MemPvdOutputStream : public PvdOutputStream
+{
+	ForwardingMemoryBuffer mBuffer;
+
+	MemPvdOutputStream(const char* memName) : mBuffer(memName)
+	{
+	}
+
+	// Appends len bytes to the buffer; always reports success.
+	virtual bool write(const uint8_t* buffer, uint32_t len)
+	{
+		mBuffer.write(buffer, len);
+		return true;
+	}
+
+	// Grows the buffer by len bytes and reads from inStream straight into the
+	// new region. If the stream delivers a different byte count, the whole
+	// region is zeroed and false is returned.
+	virtual bool directCopy(PvdInputStream& inStream, uint32_t len)
+	{
+		const uint32_t writePos = mBuffer.size();
+		mBuffer.growBuf(len);
+
+		uint32_t received = len;
+		inStream.read(mBuffer.begin() + writePos, received);
+
+		const bool complete = (received == len);
+		if(!complete)
+			physx::intrinsics::memZero(mBuffer.begin() + writePos, len);
+		return complete;
+	}
+
+	// Start of the accumulated bytes.
+	const uint8_t* begin() const
+	{
+		return mBuffer.begin();
+	}
+	// Number of accumulated bytes.
+	uint32_t size() const
+	{
+		return mBuffer.size();
+	}
+	// Clears the underlying buffer.
+	void clear()
+	{
+		mBuffer.clear();
+	}
+	// Read-only view over the buffer's [begin, end) range.
+	DataRef<const uint8_t> toRef() const
+	{
+		return DataRef<const uint8_t>(mBuffer.begin(), mBuffer.end());
+	}
+};
+
+// PvdInputStream that reads from a [mBegin, mEnd) byte range it does not own.
+// A read that asks for more than is left marks the stream as bad (mGood ==
+// false); a bad stream reports size 0.
+struct MemPvdInputStream : public PvdInputStream
+{
+	const uint8_t* mBegin;
+	const uint8_t* mEnd;
+	bool mGood;
+
+	// Reads the bytes accumulated in an output stream.
+	MemPvdInputStream(const MemPvdOutputStream& stream)
+	: mBegin(stream.mBuffer.begin()), mEnd(stream.mBuffer.end()), mGood(true)
+	{
+	}
+
+	// Reads the given range; defaults to an empty stream.
+	MemPvdInputStream(const uint8_t* beg = NULL, const uint8_t* end = NULL) : mBegin(beg), mEnd(end), mGood(true)
+	{
+	}
+
+	// Bytes left to read, or 0 once the stream has gone bad.
+	uint32_t size() const
+	{
+		if(!mGood)
+			return 0;
+		return static_cast<uint32_t>(mEnd - mBegin);
+	}
+	bool isGood() const
+	{
+		return mGood;
+	}
+
+	// Points the stream at a new range. The good/bad flag is left as it is.
+	void setup(uint8_t* start, uint8_t* stop)
+	{
+		mBegin = start;
+		mEnd = stop;
+	}
+
+	// Hands out a pointer into the underlying range instead of copying. On
+	// return len holds the number of bytes actually available at buffer. A
+	// zero-length request or a bad stream yields buffer == NULL and len == 0.
+	void nocopyRead(uint8_t*& buffer, uint32_t& len)
+	{
+		if(!mGood || len == 0)
+		{
+			len = 0;
+			buffer = NULL;
+			return;
+		}
+		// The stream is known to be good from here on.
+		const uint32_t requested = len;
+		len = PxMin(requested, size());
+		if(len != requested)
+			mGood = false;
+		buffer = const_cast<uint8_t*>(mBegin);
+		mBegin += len;
+	}
+
+	// Copies up to len bytes into buffer and zero-fills whatever could not be
+	// supplied. On return len holds the number of bytes copied. A zero-length
+	// request returns true without looking at the stream state; otherwise the
+	// result is the (possibly updated) good flag.
+	virtual bool read(uint8_t* buffer, uint32_t& len)
+	{
+		if(len == 0)
+			return true;
+
+		const uint32_t requested = len;
+		len = PxMin(requested, size());
+
+		physx::intrinsics::memCopy(buffer, mBegin, len);
+		mBegin += len;
+		if(len < requested)
+			physx::intrinsics::memZero(buffer + len, requested - len);
+		if(len != requested)
+			mGood = false;
+		return mGood;
+	}
+};
+}
+}
+#endif // PXPVDSDK_PXPVDINTERNALBYTESTREAMS_H
diff --git a/PxShared/src/pvd/src/PxPvdMarshalling.h b/PxShared/src/pvd/src/PxPvdMarshalling.h
new file mode 100644
index 0000000..0aeaee4
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdMarshalling.h
@@ -0,0 +1,220 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDMARSHALLING_H
+#define PXPVDSDK_PXPVDMARSHALLING_H
+
+#include "foundation/PxIntrinsics.h"
+
+#include "PxPvdObjectModelBaseTypes.h"
+#include "PxPvdBits.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Define marshalling
+
+// Primary template: used for every type pair that has no PVD_TYPE_MARSHALLER
+// specialization. canMarshal is a bool here, whereas the specializations
+// declare it as uint32_t; getMarshalOperators() overloads on that member's
+// type to tell supported conversions from unsupported ones.
+template <typename TSmallerType, typename TLargerType>
+struct PvdMarshalling
+{
+	bool canMarshal;
+	PvdMarshalling() : canMarshal(false)
+	{
+	}
+};
+
+// Converts one value: reads an smtype from srcData, static_casts it to lgtype
+// and writes the result to destData. Both accesses go through memCopy on local
+// copies rather than through typed pointers into the byte buffers.
+template <typename smtype, typename lgtype>
+static inline void marshalSingleT(const uint8_t* srcData, uint8_t* destData)
+{
+	smtype source;
+	physx::intrinsics::memCopy(&source, srcData, sizeof(source));
+
+	const lgtype converted = static_cast<lgtype>(source);
+	physx::intrinsics::memCopy(destData, &converted, sizeof(converted));
+}
+
+// Converts a packed run of smtype values into a packed run of lgtype values.
+// numBytes is the size of the source run in bytes; the source advances by
+// sizeof(smtype) and the destination by sizeof(lgtype) per element.
+template <typename smtype, typename lgtype>
+static inline void marshalBlockT(const uint8_t* srcData, uint8_t* destData, uint32_t numBytes)
+{
+	const uint8_t* const srcEnd = srcData + numBytes;
+	while(srcData < srcEnd)
+	{
+		marshalSingleT<smtype, lgtype>(srcData, destData);
+		srcData += sizeof(smtype);
+		destData += sizeof(lgtype);
+	}
+}
+
+// Declares a PvdMarshalling<smtype, lgtype> specialization for a supported
+// conversion. It exposes marshalSingle/marshalBlock, which forward to the
+// marshalSingleT/marshalBlockT templates. Its canMarshal member is a uint32_t
+// (bool in the primary template) so that getMarshalOperators() resolves to the
+// overload that installs these functions.
+#define PVD_TYPE_MARSHALLER(smtype, lgtype)                                                                            \
+	template <>                                                                                                        \
+	struct PvdMarshalling<smtype, lgtype>                                                                              \
+	{                                                                                                                  \
+		uint32_t canMarshal;                                                                                           \
+		static void marshalSingle(const uint8_t* srcData, uint8_t* destData)                                           \
+		{                                                                                                              \
+			marshalSingleT<smtype, lgtype>(srcData, destData);                                                         \
+		}                                                                                                              \
+		static void marshalBlock(const uint8_t* srcData, uint8_t* destData, uint32_t numBytes)                         \
+		{                                                                                                              \
+			marshalBlockT<smtype, lgtype>(srcData, destData, numBytes);                                                \
+		}                                                                                                              \
+	};
+
+// Marshalling table: one PvdMarshalling specialization per supported (source type, destination type) conversion.
+PVD_TYPE_MARSHALLER(int8_t, int16_t)
+PVD_TYPE_MARSHALLER(int8_t, uint16_t)
+PVD_TYPE_MARSHALLER(int8_t, int32_t)
+PVD_TYPE_MARSHALLER(int8_t, uint32_t)
+PVD_TYPE_MARSHALLER(int8_t, int64_t)
+PVD_TYPE_MARSHALLER(int8_t, uint64_t)
+PVD_TYPE_MARSHALLER(int8_t, PvdF32)
+PVD_TYPE_MARSHALLER(int8_t, PvdF64)
+
+PVD_TYPE_MARSHALLER(uint8_t, int16_t)
+PVD_TYPE_MARSHALLER(uint8_t, uint16_t)
+PVD_TYPE_MARSHALLER(uint8_t, int32_t)
+PVD_TYPE_MARSHALLER(uint8_t, uint32_t)
+PVD_TYPE_MARSHALLER(uint8_t, int64_t)
+PVD_TYPE_MARSHALLER(uint8_t, uint64_t)
+PVD_TYPE_MARSHALLER(uint8_t, PvdF32)
+PVD_TYPE_MARSHALLER(uint8_t, PvdF64)
+
+PVD_TYPE_MARSHALLER(int16_t, int32_t)
+PVD_TYPE_MARSHALLER(int16_t, uint32_t)
+PVD_TYPE_MARSHALLER(int16_t, int64_t)
+PVD_TYPE_MARSHALLER(int16_t, uint64_t)
+PVD_TYPE_MARSHALLER(int16_t, PvdF32)
+PVD_TYPE_MARSHALLER(int16_t, PvdF64)
+
+PVD_TYPE_MARSHALLER(uint16_t, int32_t)
+PVD_TYPE_MARSHALLER(uint16_t, uint32_t)
+PVD_TYPE_MARSHALLER(uint16_t, int64_t)
+PVD_TYPE_MARSHALLER(uint16_t, uint64_t)
+PVD_TYPE_MARSHALLER(uint16_t, PvdF32)
+PVD_TYPE_MARSHALLER(uint16_t, PvdF64)
+
+PVD_TYPE_MARSHALLER(int32_t, int64_t)
+PVD_TYPE_MARSHALLER(int32_t, uint64_t)
+PVD_TYPE_MARSHALLER(int32_t, PvdF64)
+PVD_TYPE_MARSHALLER(int32_t, PvdF32)
+
+PVD_TYPE_MARSHALLER(uint32_t, int64_t)
+PVD_TYPE_MARSHALLER(uint32_t, uint64_t)
+PVD_TYPE_MARSHALLER(uint32_t, PvdF64)
+PVD_TYPE_MARSHALLER(uint32_t, PvdF32)
+
+PVD_TYPE_MARSHALLER(PvdF32, PvdF64)
+PVD_TYPE_MARSHALLER(PvdF32, uint32_t)
+PVD_TYPE_MARSHALLER(PvdF32, int32_t)
+
+PVD_TYPE_MARSHALLER(uint64_t, PvdF64)
+PVD_TYPE_MARSHALLER(int64_t, PvdF64)
+PVD_TYPE_MARSHALLER(PvdF64, uint64_t)
+PVD_TYPE_MARSHALLER(PvdF64, int64_t)
+
+// Overload chosen when the marshaller's canMarshal member is a bool, i.e. the
+// primary PvdMarshalling template: the conversion is unsupported, so the
+// output operators are left untouched and false is returned.
+template <typename TMarshaller>
+static inline bool getMarshalOperators(TSingleMarshaller&, TBlockMarshaller&, TMarshaller&, bool)
+{
+	return false;
+}
+
+// Overload chosen when the marshaller's canMarshal member is a uint32_t, i.e.
+// a PVD_TYPE_MARSHALLER specialization: installs the specialization's static
+// marshalSingle/marshalBlock functions and returns true. The value of the
+// last argument is ignored; only its type matters.
+template <typename TMarshaller>
+static inline bool getMarshalOperators(TSingleMarshaller& single, TBlockMarshaller& block, TMarshaller&, uint32_t)
+{
+	single = TMarshaller::marshalSingle;
+	block = TMarshaller::marshalBlock;
+	return true;
+}
+
+// Looks up the marshalling functions for a compile-time (smtype, lgtype) pair.
+// Both outputs are reset to NULL first. Returns true and fills them in when a
+// PvdMarshalling specialization exists for the pair, false otherwise.
+template <typename smtype, typename lgtype>
+static inline bool getMarshalOperators(TSingleMarshaller& single, TBlockMarshaller& block)
+{
+	single = NULL;
+	block = NULL;
+	// Value-initialized so canMarshal has a defined value in the specializations,
+	// which declare no constructor.
+	PvdMarshalling<smtype, lgtype> marshaller = PvdMarshalling<smtype, lgtype>();
+	// The static type of canMarshal (bool vs uint32_t) selects the overload.
+	return getMarshalOperators(single, block, marshaller, marshaller.canMarshal);
+}
+
+// Resolves the destination type from its runtime PvdBaseType id and forwards
+// to the two-type overload with smtype as the source. Returns false, leaving
+// the outputs untouched, for any id not listed in the switch.
+template <typename smtype>
+static inline bool getMarshalOperators(TSingleMarshaller& single, TBlockMarshaller& block, int32_t lgtypeId)
+{
+	switch(lgtypeId)
+	{
+	case PvdBaseType::PvdI8: // int8_t:
+		return getMarshalOperators<smtype, int8_t>(single, block);
+	case PvdBaseType::PvdU8: // uint8_t:
+		return getMarshalOperators<smtype, uint8_t>(single, block);
+	case PvdBaseType::PvdI16: // int16_t:
+		return getMarshalOperators<smtype, int16_t>(single, block);
+	case PvdBaseType::PvdU16: // uint16_t:
+		return getMarshalOperators<smtype, uint16_t>(single, block);
+	case PvdBaseType::PvdI32: // int32_t:
+		return getMarshalOperators<smtype, int32_t>(single, block);
+	case PvdBaseType::PvdU32: // uint32_t:
+		return getMarshalOperators<smtype, uint32_t>(single, block);
+	case PvdBaseType::PvdI64: // int64_t:
+		return getMarshalOperators<smtype, int64_t>(single, block);
+	case PvdBaseType::PvdU64: // uint64_t:
+		return getMarshalOperators<smtype, uint64_t>(single, block);
+	case PvdBaseType::PvdF32:
+		return getMarshalOperators<smtype, PvdF32>(single, block);
+	case PvdBaseType::PvdF64:
+		return getMarshalOperators<smtype, PvdF64>(single, block);
+	}
+	return false;
+}
+
+// Fully runtime lookup: resolves the source type from smtypeId, then defers to
+// the single-template overload to resolve the destination type from lgtypeId.
+// Returns false, leaving the outputs untouched, for any source id not listed
+// in the switch.
+static inline bool getMarshalOperators(TSingleMarshaller& single, TBlockMarshaller& block, int32_t smtypeId,
+                                       int32_t lgtypeId)
+{
+	switch(smtypeId)
+	{
+	case PvdBaseType::PvdI8: // int8_t:
+		return getMarshalOperators<int8_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdU8: // uint8_t:
+		return getMarshalOperators<uint8_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdI16: // int16_t:
+		return getMarshalOperators<int16_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdU16: // uint16_t:
+		return getMarshalOperators<uint16_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdI32: // int32_t:
+		return getMarshalOperators<int32_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdU32: // uint32_t:
+		return getMarshalOperators<uint32_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdI64: // int64_t:
+		return getMarshalOperators<int64_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdU64: // uint64_t:
+		return getMarshalOperators<uint64_t>(single, block, lgtypeId);
+	case PvdBaseType::PvdF32:
+		return getMarshalOperators<PvdF32>(single, block, lgtypeId);
+	case PvdBaseType::PvdF64:
+		return getMarshalOperators<PvdF64>(single, block, lgtypeId);
+	}
+	return false;
+}
+}
+}
+
+#endif // PXPVDSDK_PXPVDMARSHALLING_H
diff --git a/PxShared/src/pvd/src/PxPvdMemClient.cpp b/PxShared/src/pvd/src/PxPvdMemClient.cpp
new file mode 100644
index 0000000..05b7899
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdMemClient.cpp
@@ -0,0 +1,134 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "pvd/PxPvdTransport.h"
+#include "foundation/PxProfiler.h"
+
+#include "PxPvdImpl.h"
+#include "PxPvdMemClient.h"
+#include "PxProfileMemory.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+PvdMemClient::PvdMemClient(PvdImpl& pvd)
+: mSDKPvd(pvd)
+, mPvdDataStream(NULL)
+, mIsConnected(false)
+, mMemEventBuffer(profile::PxProfileMemoryEventBuffer::createMemoryEventBuffer(*gPvdAllocatorCallback))
+{
+}
+
+PvdMemClient::~PvdMemClient()
+{
+	mSDKPvd.removeClient(this);
+	if(mPvdDataStream && mMemEventBuffer.hasClients()) // stream is NULL unless connected; never dereference it unchecked
+		mPvdDataStream->destroyInstance(&mMemEventBuffer);
+	mMemEventBuffer.release(); // hands back the buffer obtained from createMemoryEventBuffer() in the constructor
+}
+
+PvdDataStream* PvdMemClient::getDataStream()
+{
+	return mPvdDataStream;
+}
+
+PvdUserRenderer* PvdMemClient::getUserRender()
+{
+	PX_ASSERT(0);
+	return NULL;
+}
+
+void PvdMemClient::setObjectRegistrar(ObjectRegistrar*)
+{
+}
+
+bool PvdMemClient::isConnected() const
+{
+	return mIsConnected;
+}
+
+void PvdMemClient::onPvdConnected()
+{
+	if(mIsConnected) // already connected: keep the existing data stream
+		return;
+	mIsConnected = true;
+
+	mPvdDataStream = PvdDataStream::create(&mSDKPvd);
+	mPvdDataStream->createInstance(&mMemEventBuffer); // the buffer's address is the instance handleBufferFlush() writes "events" to
+	mMemEventBuffer.addClient(*this); // presumably makes buffer flushes arrive in handleBufferFlush() -- confirm in PxProfileMemory
+}
+
+void PvdMemClient::onPvdDisconnected()
+{
+	if(!mIsConnected) // never connected, or already torn down
+		return;
+	mIsConnected = false;
+
+	flush(); // push pending events out while mPvdDataStream is still valid
+
+	mMemEventBuffer.removeClient(*this);
+	mPvdDataStream->release();
+	mPvdDataStream = NULL; // handleBufferFlush() checks this pointer before using it
+}
+
+void PvdMemClient::onAllocation(size_t inSize, const char* inType, const char* inFile, int inLine, void* inAddr)
+{
+	// Record the allocation in the event buffer; the scoped lock is released when the function returns.
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+	mMemEventBuffer.onAllocation(inSize, inType, inFile, inLine, inAddr);
+}
+
+void PvdMemClient::onDeallocation(void* inAddr)
+{
+	// Record the deallocation in the event buffer; the scoped lock is released when the function returns.
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+	mMemEventBuffer.onDeallocation(inAddr);
+}
+
+void PvdMemClient::flush()
+{
+	// Flush buffered events while holding the same lock onAllocation()/onDeallocation() take.
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+	mMemEventBuffer.flushProfileEvents();
+}
+
+void PvdMemClient::handleBufferFlush(const uint8_t* inData, uint32_t inLength)
+{
+	if(mPvdDataStream) // NULL before onPvdConnected() and after onPvdDisconnected(): data flushed then is not forwarded
+	    mPvdDataStream->setPropertyValue(&mMemEventBuffer, "events", inData, inLength);
+}
+
+void PvdMemClient::handleClientRemoved()
+{
+}
+
+} // pvd
+} // physx
diff --git a/PxShared/src/pvd/src/PxPvdMemClient.h b/PxShared/src/pvd/src/PxPvdMemClient.h
new file mode 100644
index 0000000..37ac4ff
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdMemClient.h
@@ -0,0 +1,85 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDMEMCLIENT_H
+#define PXPVDSDK_PXPVDMEMCLIENT_H
+
+#include "PxPvdClient.h"
+#include "PsHashMap.h"
+#include "PsMutex.h"
+#include "PsBroadcast.h"
+#include "PxProfileEventBufferClient.h"
+#include "PxProfileMemory.h"
+
+namespace physx
+{
+class PvdDataStream;
+
+namespace pvdsdk
+{
+class PvdImpl;
+class PvdMemClient : public PvdClient,                   
+                     public profile::PxProfileEventBufferClient,
+                     public shdfnd::UserAllocated
+{
+	PX_NOCOPY(PvdMemClient)
+  public:
+	PvdMemClient(PvdImpl& pvd);
+	virtual ~PvdMemClient();
+
+	bool isConnected() const;
+	void onPvdConnected();
+	void onPvdDisconnected();
+	void flush(); // flushes the memory event buffer while holding mMutex
+
+	PvdDataStream* getDataStream(); // returns mPvdDataStream, which is NULL unless connected
+	PvdUserRenderer* getUserRender(); // not provided by this client: asserts and returns NULL
+	void setObjectRegistrar(ObjectRegistrar*); // intentionally empty for this client
+	void sendMemEvents(); // NOTE(review): no definition in PxPvdMemClient.cpp -- confirm it is unused
+
+	// memory events: both calls are forwarded to mMemEventBuffer while holding mMutex
+	void onAllocation(size_t size, const char* typeName, const char* filename, int line, void* allocatedMemory);
+	void onDeallocation(void* addr);
+
+  private:
+	PvdImpl& mSDKPvd;
+	PvdDataStream* mPvdDataStream; // created in onPvdConnected(); released and reset to NULL in onPvdDisconnected()
+	bool mIsConnected;
+
+	// memory profiling
+	shdfnd::Mutex mMutex; // onAllocation/onDeallocation can be called from different threads
+	profile::PxProfileMemoryEventBuffer& mMemEventBuffer;
+	void handleBufferFlush(const uint8_t* inData, uint32_t inLength); // writes the data to the "events" property
+	void handleClientRemoved(); // intentionally empty
+};
+
+} // namespace pvdsdk
+} // namespace physx
+
+#endif // PXPVDSDK_PXPVDMEMCLIENT_H
diff --git a/PxShared/src/pvd/src/PxPvdObjectModel.h b/PxShared/src/pvd/src/PxPvdObjectModel.h
new file mode 100644
index 0000000..f4858df
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdObjectModel.h
@@ -0,0 +1,437 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDOBJECTMODEL_H
+#define PXPVDSDK_PXPVDOBJECTMODEL_H
+
+#include "PsBasicTemplates.h"
+#include "PxPvdObjectModelMetaData.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+#if PX_VC == 11 || PX_VC == 12 || PX_VC == 14
+#pragma warning(push)
+#pragma warning(disable : 4435) // 'class1' : Object layout under /vd2 will change due to virtual base 'class2'
+#endif
+
+class PvdInputStream;
+class PvdOutputStream;
+
+struct InstanceDescription // value type: instance id, class id, instance pointer and an alive flag
+{
+	int32_t mId;
+	int32_t mClassId;
+	void* mInstPtr;
+	bool mAlive;
+
+	InstanceDescription(int32_t id, int32_t classId, void* inst, bool alive)
+	: mId(id), mClassId(classId), mInstPtr(inst), mAlive(alive)
+	{
+	}
+	InstanceDescription() : mId(-1), mClassId(-1), mInstPtr(NULL), mAlive(false) // "no instance": ids -1, NULL, not alive
+	{
+	}
+	operator void*() // yields mInstPtr for a live instance; asserts and yields NULL otherwise
+	{
+		PX_ASSERT(mAlive);
+		if(mAlive)
+			return mInstPtr;
+		return NULL;
+	}
+	operator int32_t() // yields the instance id regardless of mAlive
+	{
+		return mId;
+	}
+};
+
+typedef physx::shdfnd::Pair<int32_t, int32_t> InstancePropertyPair;
+
+class PvdObjectModelBase
+{
+  protected:
+	virtual ~PvdObjectModelBase()
+	{
+	}
+
+  public:
+	virtual void addRef() = 0;
+	virtual void release() = 0;
+	virtual void* idToPtr(int32_t instId) const = 0;
+	virtual int32_t ptrToId(void* instPtr) const = 0;
+	virtual InstanceDescription idToDescriptor(int32_t instId) const = 0;
+	virtual InstanceDescription ptrToDescriptor(void* instPtr) const = 0;
+	virtual Option<ClassDescription> getClassOf(void* instId) const = 0;
+	virtual const PvdObjectModelMetaData& getMetaData() const = 0;
+};
+
+class PvdObjectModelMutator : public virtual PvdObjectModelBase
+{
+  protected:
+	virtual ~PvdObjectModelMutator()
+	{
+	}
+
+  public:
+	// If the instance is alive, this destroys any arrays and sets the instance back to its initial state.
+	virtual InstanceDescription createInstance(int32_t clsId, int32_t instId) = 0;
+	virtual InstanceDescription createInstance(int32_t clsId) = 0;
+	// Instances that are pinned are not removed from the system, ever.
+	// This means that createInstance, pinInstance, deleteInstance
+	// can be called in this order and you can still call getClassOf, etc. on the instances.
+	// The instances will never be removed from memory if they are pinned, so use at your
+	// careful discretion.
+	virtual void pinInstance(void* instId) = 0;
+	virtual void unPinInstance(void* instId) = 0;
+	// When capturing, all events in a section should be updated at once; otherwise the data may be parsed
+	// incompletely.
+	virtual void recordCompletedInstances() = 0;
+
+	virtual void destroyInstance(void* instId) = 0;
+	virtual int32_t getNextInstanceHandleValue() const = 0;
+	// reserve a set of instance handle values by getting the current, adding an amount to it
+	// and setting the value.  You can never set the value lower than it already is, it only climbs.
+	virtual void setNextInstanceHandleValue(int32_t hdlValue) = 0;
+	// If incoming type is provided, then we may be able to marshal simple types
+	// This works for arrays, it just completely replaces the entire array.
+	// Because if this, it is an error of the property identifier
+	virtual bool setPropertyValue(void* instId, int32_t propId, const uint8_t* data, uint32_t dataLen,
+	                              int32_t incomingType) = 0;
+	// Set a set of properties defined by a property message
+	virtual bool setPropertyMessage(void* instId, int32_t msgId, const uint8_t* data, uint32_t dataLen) = 0;
+	// insert an element(s) into array index.  If index > numElements, element(s) is(are) appended.
+	virtual bool insertArrayElement(void* instId, int32_t propId, int32_t index, const uint8_t* data, uint32_t dataLen,
+	                                int32_t incomingType = -1) = 0;
+	virtual bool removeArrayElement(void* instId, int32_t propId, int32_t index) = 0;
+	// Add this array element to the end if it doesn't already exist in the array.
+	// The option is false if there was an error with the function call.
+	// The integer has no value if nothing was added, else it tells you the index
+	// where the item was added.  Comparison is done using memcmp.
+	virtual Option<int32_t> pushBackArrayElementIf(void* instId, int32_t propId, const uint8_t* data, uint32_t dataLen,
+	                                               int32_t incomingType = -1) = 0;
+	// Remove an array element if it exists in the array.
+	// The option is false if there was an error with the function call.
+	// the integer has no value if the item wasn't found, else it tells you the index where
+	// the item resided.  Comparison is memcmp.
+	virtual Option<int32_t> removeArrayElementIf(void* instId, int32_t propId, const uint8_t* data, uint32_t dataLen,
+	                                             int32_t incomingType = -1) = 0;
+	virtual bool setArrayElementValue(void* instId, int32_t propId, int32_t propIdx, const uint8_t* data,
+	                                  uint32_t dataLen, int32_t incomingType) = 0;
+
+	virtual void originShift(void* instId, PxVec3 shift) = 0;
+
+	InstanceDescription createInstance(const NamespacedName& name) // by-name overload: resolves the class id, then defers
+	{
+		// NOTE(review): findClass() result is dereferenced without a hasValue() check -- confirm name is always registered
+		return createInstance(getMetaData().findClass(name)->mClassId);
+	}
+	template <typename TDataType>
+	bool setPropertyValue(void* instId, const char* propName, const TDataType* dtype, uint32_t count)
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propName));
+		if(!descOpt.hasValue())
+		{
+			PX_ASSERT(false);
+			return false;
+		}
+		const PropertyDescription& prop(descOpt);
+		Option<ClassDescription> incomingCls(getMetaData().findClass(getPvdNamespacedNameForType<TDataType>()));
+		if(incomingCls.hasValue())
+			return setPropertyValue(instId, prop.mPropertyId, reinterpret_cast<const uint8_t*>(dtype),
+			                        sizeof(*dtype) * count, incomingCls.getValue().mClassId);
+		return false;
+	}
+
+	// Simplest possible setPropertyValue
+	template <typename TDataType>
+	bool setPropertyValue(void* instId, const char* propName, const TDataType& dtype)
+	{
+		return setPropertyValue(instId, propName, &dtype, 1);
+	}
+
+	template <typename TDataType>
+	bool setPropertyMessage(void* instId, const TDataType& msg)
+	{
+		Option<PropertyMessageDescription> msgId =
+		    getMetaData().findPropertyMessage(getPvdNamespacedNameForType<TDataType>());
+		if(msgId.hasValue() == false)
+			return false;
+		return setPropertyMessage(instId, msgId.getValue().mMessageId, reinterpret_cast<const uint8_t*>(&msg),
+		                          sizeof(msg));
+	}
+	template <typename TDataType>
+	bool insertArrayElement(void* instId, const char* propName, int32_t idx, const TDataType& dtype)
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propName));
+		if(!descOpt.hasValue())
+		{
+			PX_ASSERT(false);
+			return false;
+		}
+		const PropertyDescription& prop(descOpt);
+		Option<ClassDescription> incomingCls(getMetaData().findClass(getPvdNamespacedNameForType<TDataType>()));
+		if(incomingCls.hasValue())
+		{
+			return insertArrayElement(instId, prop.mPropertyId, idx, reinterpret_cast<const uint8_t*>(&dtype),
+			                          sizeof(dtype), incomingCls.getValue().mClassId);
+		}
+		return false;
+	}
+
+	bool removeArrayElement(void* instId, const char* propName, int32_t idx)
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propName));
+		if(!descOpt.hasValue())
+		{
+			PX_ASSERT(false);
+			return false;
+		}
+		const PropertyDescription& prop(descOpt);
+		return removeArrayElement(instId, prop.mPropertyId, idx);
+	}
+	template <typename TDataType>
+	Option<int32_t> pushBackArrayElementIf(void* instId, const char* pname, const TDataType& item)
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, pname));
+		if(!descOpt.hasValue())
+		{
+			PX_ASSERT(false);
+			return None();
+		}
+		const PropertyDescription& prop(descOpt);
+		Option<ClassDescription> incomingCls(getMetaData().findClass(getPvdNamespacedNameForType<TDataType>()));
+		if(incomingCls.hasValue() && (incomingCls.getValue().mClassId == prop.mDatatype))
+		{
+			return pushBackArrayElementIf(instId, prop.mPropertyId, reinterpret_cast<const uint8_t*>(&item),
+			                              sizeof(item), incomingCls.getValue().mClassId);
+		}
+		return None();
+	}
+	template <typename TDataType>
+	Option<int32_t> removeArrayElementIf(void* instId, const char* propId, const TDataType& item)
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propId));
+		if(!descOpt.hasValue())
+		{
+			PX_ASSERT(false);
+			return None();
+		}
+		const PropertyDescription& prop(descOpt);
+		Option<ClassDescription> incomingCls(getMetaData().findClass(getPvdNamespacedNameForType<TDataType>()));
+		if(incomingCls.hasValue() && (incomingCls.getValue().mClassId == prop.mDatatype))
+		{
+			return removeArrayElementIf(instId, prop.mPropertyId, reinterpret_cast<const uint8_t*>(&item), sizeof(item),
+			                            incomingCls.getValue().mClassId);
+		}
+		return None();
+	}
+	template <typename TDataType>
+	bool setArrayElementValue(void* instId, const char* propName, int32_t propIdx, const TDataType& item) // item is only read
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propName));
+		if(!descOpt.hasValue()) // propName is not a property of the instance's class
+		{
+			PX_ASSERT(false);
+			return false;
+		}
+		const PropertyDescription& prop(descOpt);
+		Option<ClassDescription> incomingCls(getMetaData().findClass(getPvdNamespacedNameForType<TDataType>()));
+		if(incomingCls.hasValue() && (incomingCls.getValue().mClassId == prop.mDatatype)) // exact type match only
+			return setArrayElementValue(instId, prop.mPropertyId, propIdx, reinterpret_cast<const uint8_t*>(&item),
+			                            sizeof(item), incomingCls.getValue().mClassId);
+		PX_ASSERT(false); // TDataType is unregistered or differs from the property's datatype
+		return false;
+	}
+};
+
+class PvdObjectModelReader : public virtual PvdObjectModelBase
+{
+  protected:
+	virtual ~PvdObjectModelReader()
+	{
+	}
+
+  public:
+	// Return the byte size of a possible nested property
+	virtual uint32_t getPropertyByteSize(void* instId, int32_t propId) = 0;
+	uint32_t getPropertyByteSize(void* instId, String propName)
+	{
+		int32_t propId = getMetaData().findProperty(getClassOf(instId)->mClassId, propName)->mPropertyId;
+		return getPropertyByteSize(instId, propId);
+	}
+	// Return the value of a possible nested property
+	virtual uint32_t getPropertyValue(void* instId, int32_t propId, uint8_t* outData, uint32_t outDataLen) = 0;
+	// Get the actual raw database memory.  This is subject to change drastically if the object gets deleted.
+	virtual DataRef<uint8_t> getRawPropertyValue(void* instId, int32_t propId) = 0;
+
+	DataRef<uint8_t> getRawPropertyValue(void* instId, const char* propName)
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propName));
+		if(!descOpt.hasValue())
+		{
+			PX_ASSERT(false);
+			return 0;
+		}
+		return getRawPropertyValue(instId, descOpt->mPropertyId);
+	}
+
+	template <typename TDataType>
+	DataRef<TDataType> getTypedRawPropertyValue(void* instId, int32_t propId)
+	{
+		DataRef<uint8_t> propVal = getRawPropertyValue(instId, propId);
+		return DataRef<TDataType>(reinterpret_cast<const TDataType*>(propVal.begin()),
+		                          propVal.size() / sizeof(TDataType));
+	}
+
+	template <typename TDataType>
+	DataRef<TDataType> getTypedRawPropertyValue(void* instId, const char* propName)
+	{
+		DataRef<uint8_t> propVal = getRawPropertyValue(instId, propName);
+		return DataRef<TDataType>(reinterpret_cast<const TDataType*>(propVal.begin()),
+		                          propVal.size() / sizeof(TDataType));
+	}
+
+	template <typename TDataType>
+	uint32_t getPropertyValue(void* instId, const char* propName, TDataType* outBuffer, uint32_t outNumBufferItems)
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propName));
+		if(!descOpt.hasValue())
+		{
+			PX_ASSERT(false);
+			return 0;
+		}
+		const PropertyDescription& prop(descOpt);
+		uint32_t desired = outNumBufferItems * sizeof(TDataType);
+		return getPropertyValue(instId, prop.mPropertyId, reinterpret_cast<uint8_t*>(outBuffer), desired) /
+		       sizeof(TDataType);
+	}
+
+	template <typename TDataType>
+	Option<TDataType> getPropertyValue(void* instId, const char* propName)
+	{
+		TDataType retval;
+		if(getPropertyValue(instId, propName, &retval, 1) == 1)
+			return retval;
+		return None();
+	}
+
+	// Get this one item out of the array
+	// return array[idx]
+	virtual uint32_t getPropertyValue(void* instId, int32_t propId, int inArrayIndex, uint8_t* outData,
+	                                  uint32_t outDataLen) = 0;
+	// Get this sub element of one item out of the array
+	// return array[idx].a
+	virtual uint32_t getPropertyValue(void* instId, int32_t propId, int inArrayIndex, int nestedProperty,
+	                                  uint8_t* outData, uint32_t outDataLen) = 0;
+
+	// Get a set of properties defined by a property message
+	virtual bool getPropertyMessage(void* instId, int32_t msgId, uint8_t* data, uint32_t dataLen) const = 0;
+
+	template <typename TDataType>
+	bool getPropertyMessage(void* instId, TDataType& msg)
+	{
+		Option<PropertyMessageDescription> msgId(
+		    getMetaData().findPropertyMessage(getPvdNamespacedNameForType<TDataType>()));
+		if(msgId.hasValue() == false)
+			return false;
+		return getPropertyMessage(instId, msgId.getValue().mMessageId, reinterpret_cast<uint8_t*>(&msg), sizeof(msg));
+	}
+
+	// clearing the array is performed with a set property value call with no data.
+	virtual uint32_t getNbArrayElements(void* instId, int32_t propId) = 0;
+	uint32_t getNbArrayElements(void* instId, const char* propName) // by-name overload of the id-based virtual above
+	{
+		ClassDescription cls(getClassOf(instId));
+		Option<PropertyDescription> descOpt(getMetaData().findProperty(cls.mClassId, propName));
+		if(!descOpt.hasValue()) // propName is not a property of the instance's class
+		{
+			PX_ASSERT(false);
+			return 0; // a count, not a bool: same value the old `return false` produced
+		}
+		const PropertyDescription& prop(descOpt);
+		return getNbArrayElements(instId, prop.mPropertyId);
+	}
+
+	// Write this instance out.  Offset is set as the instances last write offset.
+	// This offset is cleared if the object is changed.
+	// If offset doesn't have a value, then the instance isn't changed.
+	virtual void writeInstance(void* instId, PvdOutputStream& stream) = 0;
+
+	virtual uint32_t getNbInstances() const = 0;
+	virtual uint32_t getInstances(InstanceDescription* outBuffer, uint32_t count, uint32_t startIndex = 0) const = 0;
+
+	// Get the list of updated objects since the last time someone cleared the updated instance list.
+	virtual uint32_t getNbUpdatedInstances() const = 0;
+	virtual uint32_t getUpdatedInstances(InstanceDescription* outBuffer, uint32_t count, uint32_t startIndex = 0) = 0;
+	// Must be called for instances to be released.  Only instances that aren't live nor are they updated
+	// are valid.
+	virtual void clearUpdatedInstances() = 0;
+};
+
+class PvdObjectModel : public PvdObjectModelMutator, public PvdObjectModelReader
+{
+  protected:
+	virtual ~PvdObjectModel()
+	{
+	}
+
+  public:
+	virtual void destroyAllInstances() = 0;
+	virtual bool setPropertyValueToDefault(void* instId, int32_t propId) = 0;
+	// Read an instance data and put a copy of the data in the output stream.
+	static bool readInstance(PvdInputStream& inStream, PvdOutputStream& outStream);
+	virtual InstanceDescription readInstance(DataRef<const uint8_t> writtenData) = 0;
+	// Set just this property from this serialized instance.
+	// Expects the instance to be alive, just like setPropertyValue
+	virtual bool readInstanceProperty(DataRef<const uint8_t> writtenData, int32_t propId) = 0;
+
+	virtual void recordCompletedInstances() = 0;
+
+	// OriginShift seekback support
+	virtual uint32_t getNbShifted() = 0;
+	virtual void getShiftedPair(InstancePropertyPair* outData, uint32_t count) = 0;
+	virtual void clearShiftedPair() = 0;
+	virtual void shiftObject(void* instId, int32_t propId, PxVec3 shift) = 0;
+	static PvdObjectModel& create(physx::PxAllocatorCallback& callback, PvdObjectModelMetaData& metaData,
+	                              bool isCapture = false);
+};
+
+#if PX_VC == 11 || PX_VC == 12 || PX_VC == 14
+#pragma warning(pop)
+#endif
+}
+}
+#endif // PXPVDSDK_PXPVDOBJECTMODEL_H
diff --git a/PxShared/src/pvd/src/PxPvdObjectModelInternalTypeDefs.h b/PxShared/src/pvd/src/PxPvdObjectModelInternalTypeDefs.h
new file mode 100644
index 0000000..eca7858
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdObjectModelInternalTypeDefs.h
@@ -0,0 +1,32 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#define THERE_IS_NO_INCLUDE_GUARD_HERE_FOR_A_REASON
+
+DECLARE_INTERNAL_PVD_TYPE(ArrayData)
+
+#undef THERE_IS_NO_INCLUDE_GUARD_HERE_FOR_A_REASON
diff --git a/PxShared/src/pvd/src/PxPvdObjectModelInternalTypes.h b/PxShared/src/pvd/src/PxPvdObjectModelInternalTypes.h
new file mode 100644
index 0000000..3344140
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdObjectModelInternalTypes.h
@@ -0,0 +1,171 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDOBJECTMODELINTERNALTYPES_H
+#define PXPVDSDK_PXPVDOBJECTMODELINTERNALTYPES_H
+
+#include "foundation/PxMemory.h"
+#include "PxPvdObjectModelBaseTypes.h"
+#include "PsArray.h"
+#include "PxPvdFoundation.h"
+#include "PxPvdObjectModel.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Enumeration of the PVD-internal data types. The enumerators between None and Last are
+// generated by including the internal type list with DECLARE_INTERNAL_PVD_TYPE expanding to
+// "type,".
+struct PvdInternalType
+{
+	enum Enum
+	{
+		None = 0,
+// Each DECLARE_INTERNAL_PVD_TYPE(type) entry of the included file becomes one enumerator.
+#define DECLARE_INTERNAL_PVD_TYPE(type) type,
+#include "PxPvdObjectModelInternalTypeDefs.h"
+		Last // follows the final generated enumerator
+#undef DECLARE_INTERNAL_PVD_TYPE
+	};
+};
+
+// The internal type ids must not run past PvdBaseType::InternalStop.
+PX_COMPILE_TIME_ASSERT(uint32_t(PvdInternalType::Last) <= uint32_t(PvdBaseType::InternalStop));
+
+// Primary template of the C++ type -> PvdInternalType mapping. It deliberately has no
+// BaseTypeEnum member, only a placeholder field, so code that asks for BaseTypeEnum of a type
+// without a specialization does not compile.
+template <typename T>
+struct DataTypeToPvdTypeMap
+{
+	bool compile_error;
+};
+// Primary template of the PvdInternalType -> C++ type mapping. It has no TDataType typedef,
+// so code that asks for TDataType of an enumerator without a specialization does not compile.
+template <PvdInternalType::Enum>
+struct PvdTypeToDataTypeMap
+{
+	bool compile_error;
+};
+
+// For every type listed in PxPvdObjectModelInternalTypeDefs.h this generates three
+// specializations:
+//  - DataTypeToPvdTypeMap<type>:            C++ type -> PvdInternalType enumerator (BaseTypeEnum)
+//  - PvdTypeToDataTypeMap<enumerator>:      PvdInternalType enumerator -> C++ type (TDataType)
+//  - PvdDataTypeToNamespacedNameMap<type>:  name of the type inside the
+//                                           "physx3_debugger_internal" namespace
+// The constructor is declared with the injected class name (no template argument list);
+// spelling it as a template-id is not accepted in C++20 and newer.
+#define DECLARE_INTERNAL_PVD_TYPE(type)                                                                                \
+	template <>                                                                                                        \
+	struct DataTypeToPvdTypeMap<type>                                                                                  \
+	{                                                                                                                  \
+		enum Enum                                                                                                      \
+		{                                                                                                              \
+			BaseTypeEnum = PvdInternalType::type                                                                       \
+		};                                                                                                             \
+	};                                                                                                                 \
+	template <>                                                                                                        \
+	struct PvdTypeToDataTypeMap<PvdInternalType::type>                                                                 \
+	{                                                                                                                  \
+		typedef type TDataType;                                                                                        \
+	};                                                                                                                 \
+	template <>                                                                                                        \
+	struct PvdDataTypeToNamespacedNameMap<type>                                                                        \
+	{                                                                                                                  \
+		NamespacedName Name;                                                                                           \
+		PvdDataTypeToNamespacedNameMap() : Name("physx3_debugger_internal", #type)                                     \
+		{                                                                                                              \
+		}                                                                                                              \
+	};
+#include "PxPvdObjectModelInternalTypeDefs.h"
+#undef DECLARE_INTERNAL_PVD_TYPE
+
+// Builds a DataRef from the begin/end pointers of a foundation array.
+template <typename TDataType, typename TAlloc>
+DataRef<TDataType> toDataRef(const shdfnd::Array<TDataType, TAlloc>& data)
+{
+	const TDataType* first = data.begin();
+	const TDataType* last = data.end();
+	return DataRef<TDataType>(first, last);
+}
+
+// Element-wise comparison of two string lists: true when both have the same number of entries
+// and every pair of entries compares equal under the single-string safeStrEq overload.
+static inline bool safeStrEq(const DataRef<String>& lhs, const DataRef<String>& rhs)
+{
+	const uint32_t numStrs = lhs.size();
+	if(rhs.size() != numStrs)
+		return false;
+	// Advance while the entries match; stopping early means a mismatch was found.
+	uint32_t idx = 0;
+	while(idx < numStrs && safeStrEq(lhs[idx], rhs[idx]))
+		++idx;
+	return idx == numStrs;
+}
+
+// Returns a freshly PX_ALLOC'ed, NUL-terminated copy of str. The input is passed through
+// nonNull() before it is measured and copied.
+static inline char* copyStr(const char* str)
+{
+	const char* src = nonNull(str);
+	const uint32_t numChars = static_cast<uint32_t>(strlen(src));
+	// One extra byte for the terminator.
+	char* dst = reinterpret_cast<char*>(PX_ALLOC(numChars + 1, "string"));
+	PxMemCopy(dst, src, numChars);
+	dst[numChars] = 0;
+	return dst;
+}
+
+// Used for predictable bit fields: stores a TInputType value in a TNumBits wide field of a
+// TDataType word, with the field starting TOffset bits above the least significant bit.
+template <typename TDataType, uint8_t TNumBits, uint8_t TOffset, typename TInputType>
+struct BitMaskSetter
+{
+	// Mask that covers the field at its final position (createMask() shifted up by TOffset).
+	static TDataType createOffsetMask()
+	{
+		return static_cast<TDataType>(createMask() << TOffset);
+	}
+	// Mask with the low TNumBits bits set.
+	static TDataType createMask()
+	{
+		return static_cast<TDataType>((1 << TNumBits) - 1);
+	}
+	// Replaces the field inside inCurrent with inData; all bits outside the field are kept.
+	// inData must fit into TNumBits bits (checked by assertion only).
+	void setValue(TDataType& inCurrent, TInputType inData)
+	{
+		PX_ASSERT(inData < (1 << TNumBits));
+
+		// Create a mask to remove the current value.
+		TDataType theMask = static_cast<TDataType>(~createOffsetMask());
+		// Clear out current value.
+		inCurrent = inCurrent & theMask;
+		// Create the new value. This is a value conversion, so static_cast is the right tool
+		// (reinterpret_cast between different integer types does not compile). Converting to
+		// TDataType before shifting keeps the shift in the destination width.
+		TDataType theAddition = static_cast<TDataType>(static_cast<TDataType>(inData) << TOffset);
+		// or it into the existing value.
+		inCurrent = inCurrent | theAddition;
+	}
+
+	// Extracts the field from inCurrent and converts it back to TInputType.
+	TInputType getValue(TDataType inCurrent)
+	{
+		return static_cast<TInputType>((inCurrent >> TOffset) & createMask());
+	}
+};
+
+// Reads the array property propName of instanceDesc into dataBuffer and returns a DataRef over
+// the buffer, typed as TObjType. The buffer is grown when it is too small and never shrunk.
+// Returns NULL when the property occupies zero bytes.
+template <typename TObjType>
+DataRef<TObjType> getArray(shdfnd::Array<uint8_t>& dataBuffer, PvdObjectModelReader& reader, InstanceDescription instanceDesc,
+                           String propName)
+{
+	const int32_t propertyId =
+	    reader.getMetaData().findProperty(reader.getClassOf(instanceDesc)->mClassId, propName)->mPropertyId;
+	const uint32_t byteCount = reader.getPropertyByteSize(instanceDesc.mInstPtr, propertyId);
+	const uint32_t itemCount = reader.getNbArrayElements(instanceDesc.mInstPtr, propertyId);
+	if(byteCount == 0)
+		return NULL;
+	if(dataBuffer.size() < byteCount)
+		dataBuffer.resize(byteCount);
+
+	// The returned DataRef points into dataBuffer, so it is only valid while the buffer is.
+	uint8_t* rawBytes = dataBuffer.begin();
+	reader.getPropertyValue(instanceDesc, propertyId, rawBytes, byteCount);
+	return DataRef<TObjType>(reinterpret_cast<TObjType*>(rawBytes), itemCount);
+}
+}
+}
+#endif // PXPVDSDK_PXPVDOBJECTMODELINTERNALTYPES_H
diff --git a/PxShared/src/pvd/src/PxPvdObjectModelMetaData.cpp b/PxShared/src/pvd/src/PxPvdObjectModelMetaData.cpp
new file mode 100644
index 0000000..9971d00
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdObjectModelMetaData.cpp
@@ -0,0 +1,1515 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#include "PxPvdObjectModel.h"
+#include "PxPvdObjectModelBaseTypes.h"
+#include "PxPvdObjectModelInternalTypes.h"
+#include "PxPvdObjectModelMetaData.h"
+#include "PxPvdFoundation.h"
+#include "PsMutex.h"
+#include "PxPvdByteStreams.h"
+#include "PxPvdInternalByteStreams.h"
+#include "PxPvdMarshalling.h"
+
+using namespace physx;
+using namespace pvdsdk;
+using namespace shdfnd;
+
+namespace
+{
+
+// PropertyDescription extended with an owned list of named values and a serialize() routine.
+struct PropDescImpl : public PropertyDescription, public UserAllocated
+{
+    Array<NamedValue> mValueNames;
+	// Copies inBase and replaces mName with the pointer returned by table.registerStr().
+	PropDescImpl(const PropertyDescription& inBase, StringTable& table)
+	: PropertyDescription(inBase), mValueNames("NamedValue")
+	{
+		mName = table.registerStr(mName);
+	}
+	// Default construction leaves the base description default-initialized.
+	PropDescImpl() : mValueNames("NamedValue")
+	{
+	}
+
+	// Passes every field to serializer.streamify() in a fixed order.
+	// NOTE(review): the order presumably defines the stream layout, so it must not be
+	// rearranged - confirm against the serializer implementations.
+	template <typename TSerializer>
+	void serialize(TSerializer& serializer)
+	{
+		serializer.streamify(mOwnerClassName);
+		serializer.streamify(mOwnerClassId);
+		serializer.streamify(mSemantic);
+		serializer.streamify(mDatatype);
+		serializer.streamify(mDatatypeName);
+		serializer.streamify(mPropertyType);
+		serializer.streamify(mPropertyId);
+		serializer.streamify(m32BitOffset);
+		serializer.streamify(m64BitOffset);
+		serializer.streamify(mValueNames);
+		serializer.streamify(mName);
+	}
+};
+
+// ClassDescription extended with owned storage: the list of property implementations and the
+// pointer-offset tables for the 32 bit and 64 bit layouts. The mPtrOffsets DataRefs of the base
+// class are kept pointing at the owned arrays.
+struct ClassDescImpl : public ClassDescription, public UserAllocated
+{
+	Array<PropDescImpl*> mPropImps;
+	Array<PtrOffset> m32OffsetArray;
+	Array<PtrOffset> m64OffsetArray;
+	// Copies inBase, including the contents of its pointer-offset lists.
+	ClassDescImpl(const ClassDescription& inBase)
+	: ClassDescription(inBase)
+	, mPropImps("PropDescImpl*")
+	, m32OffsetArray("ClassDescImpl::m32OffsetArray")
+	, m64OffsetArray("ClassDescImpl::m64OffsetArray")
+	{
+		// At this point mPtrOffsets still refers to inBase's storage; copy the entries over...
+		PVD_FOREACH(idx, get32BitSizeInfo().mPtrOffsets.size())
+		{
+			m32OffsetArray.pushBack(get32BitSizeInfo().mPtrOffsets[idx]);
+		}
+		PVD_FOREACH(idx, get64BitSizeInfo().mPtrOffsets.size())
+		{
+			m64OffsetArray.pushBack(get64BitSizeInfo().mPtrOffsets[idx]);
+		}
+		// ...and then rebind the DataRefs to the owned copies, so this object no longer depends
+		// on the lifetime of the description it was created from.
+		bindPtrOffsets();
+	}
+	ClassDescImpl()
+	: mPropImps("PropDescImpl*")
+	, m32OffsetArray("ClassDescImpl::m32OffsetArray")
+	, m64OffsetArray("ClassDescImpl::m64OffsetArray")
+	{
+	}
+	// Linear search by name; returns NULL when the class has no property called name.
+	PropDescImpl* findProperty(String name)
+	{
+		PVD_FOREACH(idx, mPropImps.size())
+		{
+			if(safeStrEq(mPropImps[idx]->mName, name))
+				return mPropImps[idx];
+		}
+		return NULL;
+	}
+	// Appends prop to the property list; the pointer is stored as-is.
+	void addProperty(PropDescImpl* prop)
+	{
+		mPropImps.pushBack(prop);
+	}
+
+	// Records one pointer offset for both layouts and refreshes the DataRefs, because
+	// pushBack may have moved the array storage.
+	void addPtrOffset(PtrOffsetType::Enum type, uint32_t offset32, uint32_t offset64)
+	{
+		m32OffsetArray.pushBack(PtrOffset(type, offset32));
+		m64OffsetArray.pushBack(PtrOffset(type, offset64));
+		bindPtrOffsets();
+	}
+
+	// Passes every field to the serializer in a fixed order, then refreshes the DataRefs since
+	// streamify may have changed the offset arrays.
+	template <typename TSerializer>
+	void serialize(TSerializer& serializer)
+	{
+		serializer.streamify(mName);
+		serializer.streamify(mClassId);
+		serializer.streamify(mBaseClass);
+		serializer.streamify(mPackedUniformWidth);
+		serializer.streamify(mPackedClassType);
+		serializer.streamify(mLocked);
+		serializer.streamify(mRequiresDestruction);
+		serializer.streamify(get32BitSize());
+		serializer.streamify(get32BitSizeInfo().mDataByteSize);
+		serializer.streamify(get32BitSizeInfo().mAlignment);
+		serializer.streamify(get64BitSize());
+		serializer.streamify(get64BitSizeInfo().mDataByteSize);
+		serializer.streamify(get64BitSizeInfo().mAlignment);
+		serializer.streamifyLinks(mPropImps);
+		serializer.streamify(m32OffsetArray);
+		serializer.streamify(m64OffsetArray);
+		bindPtrOffsets();
+	}
+
+  private:
+	// Points both mPtrOffsets DataRefs at the current contents of the owned offset arrays.
+	void bindPtrOffsets()
+	{
+		get32BitSizeInfo().mPtrOffsets = DataRef<PtrOffset>(m32OffsetArray.begin(), m32OffsetArray.end());
+		get64BitSizeInfo().mPtrOffsets = DataRef<PtrOffset>(m64OffsetArray.begin(), m64OffsetArray.end());
+	}
+};
+
+// StringTable implementation. Every registered string is copied once (mStrings owns the copies)
+// and is given a numeric handle; handle 0 and the empty string stand for "no string".
+class StringTableImpl : public StringTable, public UserAllocated
+{
+	HashMap<const char*, char*> mStrings;       // owned copies, keyed by themselves
+	uint32_t mNextStrHandle;                     // next handle value to hand out
+	HashMap<uint32_t, char*> mHandleToStr;       // handle -> owned copy
+	HashMap<const char*, uint32_t> mStrToHandle; // owned copy -> handle
+
+  public:
+	StringTableImpl()
+	: mStrings("StringTableImpl::mStrings")
+	, mNextStrHandle(1)
+	, mHandleToStr("StringTableImpl::mHandleToStr")
+	, mStrToHandle("StringTableImpl::mStrToHandle")
+	{
+	}
+	// Returns the next unused handle value and advances the counter.
+	uint32_t nextHandleValue()
+	{
+		return mNextStrHandle++;
+	}
+	// Frees every owned string copy.
+	virtual ~StringTableImpl()
+	{
+		for(HashMap<const char*, char*>::Iterator iter = mStrings.getIterator(); !iter.done(); ++iter)
+			PX_FREE(iter->second);
+		mStrings.clear();
+	}
+	virtual uint32_t getNbStrs()
+	{
+		return mStrings.size();
+	}
+	// Copies up to bufLen string pointers into outStrs, skipping the first startIdx entries of
+	// the map iteration. Returns the number of entries it intended to write.
+	virtual uint32_t getStrs(const char** outStrs, uint32_t bufLen, uint32_t startIdx = 0)
+	{
+		startIdx = PxMin(getNbStrs(), startIdx);
+		uint32_t numStrs(PxMin(getNbStrs() - startIdx, bufLen));
+		HashMap<const char*, char*>::Iterator iter(mStrings.getIterator());
+		// startIdx was clamped to the map size above, so this skip cannot run off the end.
+		for(uint32_t idx = 0; idx < startIdx; ++idx, ++iter)
+			;
+		for(uint32_t idx = 0; idx < numStrs && !iter.done(); ++idx, ++iter)
+			outStrs[idx] = iter->second;
+		return numStrs;
+	}
+	// Associates str with the given handle in both lookup directions.
+	void addStringHandle(char* str, uint32_t hdl)
+	{
+		mHandleToStr.insert(hdl, str);
+		mStrToHandle.insert(str, hdl);
+	}
+
+	// Associates str with a newly allocated handle and returns that handle.
+	uint32_t addStringHandle(char* str)
+	{
+		uint32_t theNewHandle = nextHandleValue();
+		addStringHandle(str, theNewHandle);
+		return theNewHandle;
+	}
+	// Returns the owned copy of str, creating it when necessary. outAdded is set to true only
+	// when a new copy was created; it is left untouched otherwise.
+	const char* doRegisterStr(const char* str, bool& outAdded)
+	{
+		PX_ASSERT(isMeaningful(str));
+		const HashMap<const char*, char*>::Entry* entry(mStrings.find(str));
+		if(entry == NULL)
+		{
+			outAdded = true;
+			char* retval(copyStr(str));
+			mStrings.insert(retval, retval);
+			return retval;
+		}
+		return entry->second;
+	}
+	// Registers str and, if it was new, assigns it a handle. Strings for which isMeaningful()
+	// is false are not stored; "" is returned for them.
+	virtual const char* registerStr(const char* str, bool& outAdded)
+	{
+		outAdded = false;
+		if(isMeaningful(str) == false)
+			return "";
+		const char* retval = doRegisterStr(str, outAdded);
+		if(outAdded)
+			addStringHandle(const_cast<char*>(retval));
+		return retval;
+	}
+
+	// Registers both parts of nm and returns a name that points at the registered strings.
+	NamespacedName registerName(const NamespacedName& nm)
+	{
+		return NamespacedName(registerStr(nm.mNamespace), registerStr(nm.mName));
+	}
+	// Convenience overload for callers that do not care whether the string was new.
+	const char* registerStr(const char* str)
+	{
+		bool ignored;
+		return registerStr(str, ignored);
+	}
+
+	// Returns the handle of str, registering the string first when it has no handle yet.
+	// Strings for which isMeaningful() is false map to handle 0.
+	virtual StringHandle strToHandle(const char* str)
+	{
+		if(isMeaningful(str) == false)
+			return 0;
+		const HashMap<const char*, uint32_t>::Entry* entry(mStrToHandle.find(str));
+		if(entry)
+			return entry->second;
+		bool added = false;
+		const char* registeredStr = doRegisterStr(str, added);
+		uint32_t theNewHandle = addStringHandle(const_cast<char*>(registeredStr));
+		PX_ASSERT(mStrToHandle.find(str));
+		PX_ASSERT(added);
+		return theNewHandle;
+	}
+
+	// Returns the string for hdl; handle 0 and unknown handles yield "".
+	virtual const char* handleToStr(uint32_t hdl)
+	{
+		if(hdl == 0)
+			return "";
+		const HashMap<uint32_t, char*>::Entry* entry(mHandleToStr.find(hdl));
+		if(entry)
+			return entry->second;
+		// unregistered handle...
+		return "";
+	}
+
+	// Stream layout: entry count, next handle value, then per entry the handle, the byte length
+	// including the terminator, and the bytes themselves.
+	void write(PvdOutputStream& stream)
+	{
+		uint32_t numStrs = static_cast<uint32_t>(mHandleToStr.size());
+		stream << numStrs;
+		stream << mNextStrHandle;
+		for(HashMap<uint32_t, char*>::Iterator iter = mHandleToStr.getIterator(); !iter.done(); ++iter)
+		{
+			stream << iter->first;
+			uint32_t len = static_cast<uint32_t>(strlen(iter->second) + 1);
+			stream << len;
+			stream.write(reinterpret_cast<uint8_t*>(iter->second), len);
+		}
+	}
+
+	// Reads back what write() produced. The handle maps are rebuilt from scratch; string copies
+	// that already exist in mStrings are reused.
+	template <typename TReader>
+	void read(TReader& stream)
+	{
+		mHandleToStr.clear();
+		mStrToHandle.clear();
+		uint32_t numStrs;
+		stream >> numStrs;
+		stream >> mNextStrHandle;
+		Array<uint8_t> readBuffer("StringTable::read::readBuffer");
+		for(uint32_t idx = 0; idx < numStrs; ++idx)
+		{
+			uint32_t handleValue;
+			uint32_t bufLen;
+			stream >> handleValue;
+			stream >> bufLen;
+			// write() never emits a zero-length record (the terminator is always included).
+			// Should one show up anyway there is no payload to register; skipping it leaves the
+			// handle unknown, which handleToStr() reports as "".
+			if(bufLen == 0)
+				continue;
+			if(readBuffer.size() < bufLen)
+				readBuffer.resize(bufLen);
+			stream.read(readBuffer.begin(), bufLen);
+			// The final byte of a well-formed record is the terminator; enforce it so that a
+			// damaged stream cannot make the string functions run past the buffer.
+			readBuffer[bufLen - 1] = 0;
+			bool ignored = false;
+			const char* newStr = doRegisterStr(reinterpret_cast<const char*>(readBuffer.begin()), ignored);
+			addStringHandle(const_cast<char*>(newStr), handleValue);
+		}
+	}
+
+	virtual void release()
+	{
+		PVD_DELETE(this);
+	}
+
+  private:
+	// Declared but not defined: assignment is not supported.
+	StringTableImpl& operator=(const StringTableImpl&);
+};
+
+// Hash/equality policy for NamespacedName keys: both parts are hashed and compared as strings.
+struct NamespacedNameHasher
+{
+	// XOR of the hashes of the namespace and the name.
+	uint32_t operator()(const NamespacedName& nm)
+	{
+		Hash<const char*> strHash;
+		const uint32_t namespaceHash = strHash(nm.mNamespace);
+		const uint32_t nameHash = strHash(nm.mName);
+		return namespaceHash ^ nameHash;
+	}
+	// Two names are equal when both the namespace and the name match.
+	bool equal(const NamespacedName& lhs, const NamespacedName& rhs)
+	{
+		if(!safeStrEq(lhs.mNamespace, rhs.mNamespace))
+			return false;
+		return safeStrEq(lhs.mName, rhs.mName);
+	}
+};
+
+// Key type made of a class name plus a property name (used with ClassPropertyNameHasher).
+// Only the pointers are stored; the strings themselves are not copied here.
+struct ClassPropertyName
+{
+	NamespacedName mName; // name of the class
+	String mPropName;     // name of the property
+	ClassPropertyName(const NamespacedName& name = NamespacedName(), String propName = "")
+	: mName(name), mPropName(propName)
+	{
+	}
+};
+
+// Hash/equality policy for ClassPropertyName keys.
+struct ClassPropertyNameHasher
+{
+	// XOR of the class-name hash and the property-name hash.
+	uint32_t operator()(const ClassPropertyName& nm)
+	{
+		const uint32_t classHash = NamespacedNameHasher()(nm.mName);
+		const uint32_t propHash = Hash<const char*>()(nm.mPropName);
+		return classHash ^ propHash;
+	}
+	// Equal when the class names match and the property names match.
+	bool equal(const ClassPropertyName& lhs, const ClassPropertyName& rhs)
+	{
+		if(!NamespacedNameHasher().equal(lhs.mName, rhs.mName))
+			return false;
+		return safeStrEq(lhs.mPropName, rhs.mPropName);
+	}
+};
+
+// PropertyMessageEntry with an added serialize() routine; it introduces no new data members.
+struct PropertyMessageEntryImpl : public PropertyMessageEntry
+{
+	// Copies an existing entry.
+	PropertyMessageEntryImpl(const PropertyMessageEntry& data) : PropertyMessageEntry(data)
+	{
+	}
+	PropertyMessageEntryImpl()
+	{
+	}
+	// Passes every field to serializer.streamify() in a fixed order.
+	// NOTE(review): the order presumably defines the stream layout - confirm before changing it.
+	template <typename TSerializerType>
+	void serialize(TSerializerType& serializer)
+	{
+		serializer.streamify(mDatatypeName);
+		serializer.streamify(mDatatypeId);
+		serializer.streamify(mMessageOffset);
+		serializer.streamify(mByteSize);
+		serializer.streamify(mDestByteSize);
+		serializer.streamify(mProperty);
+	}
+};
+
+// PropertyMessageDescription with owned storage. The entries are held twice: as
+// PropertyMessageEntryImpl (the form that gets serialized) and as plain PropertyMessageEntry
+// (the form mProperties refers to).
+struct PropertyMessageDescriptionImpl : public PropertyMessageDescription, public UserAllocated
+{
+	Array<PropertyMessageEntryImpl> mEntryImpls;
+	Array<PropertyMessageEntry> mEntries;
+	Array<uint32_t> mStringOffsetArray;
+
+	// Copies the base description; the owned arrays start out empty.
+	PropertyMessageDescriptionImpl(const PropertyMessageDescription& data)
+	: PropertyMessageDescription(data)
+	, mEntryImpls("PropertyMessageDescriptionImpl::mEntryImpls")
+	, mEntries("PropertyMessageDescriptionImpl::mEntries")
+	, mStringOffsetArray("PropertyMessageDescriptionImpl::mStringOffsets")
+	{
+	}
+
+	PropertyMessageDescriptionImpl()
+	: mEntryImpls("PropertyMessageDescriptionImpl::mEntryImpls")
+	, mEntries("PropertyMessageDescriptionImpl::mEntries")
+	, mStringOffsetArray("PropertyMessageDescriptionImpl::mStringOffsets")
+	{
+	}
+
+	~PropertyMessageDescriptionImpl()
+	{
+	}
+
+	// Appends entry to both lists and re-points mProperties at the plain-entry list.
+	void addEntry(const PropertyMessageEntryImpl& entry)
+	{
+		mEntryImpls.pushBack(entry);
+		const PropertyMessageEntry& plainEntry = entry;
+		mEntries.pushBack(plainEntry);
+		mProperties = DataRef<PropertyMessageEntry>(mEntries.begin(), mEntries.end());
+	}
+
+	// Streams all fields in a fixed order. If the two entry lists end up with different
+	// lengths afterwards, the plain list is rebuilt from the Impl list. Finally the DataRefs
+	// of the base class are re-pointed at the owned arrays.
+	template <typename TSerializerType>
+	void serialize(TSerializerType& serializer)
+	{
+		serializer.streamify(mClassName);
+		serializer.streamify(mClassId); // No other class has this id, it is DB-unique
+		serializer.streamify(mMessageName);
+		serializer.streamify(mMessageId);
+		serializer.streamify(mMessageByteSize);
+		serializer.streamify(mEntryImpls);
+		serializer.streamify(mStringOffsetArray);
+
+		const uint32_t implCount = static_cast<uint32_t>(mEntryImpls.size());
+		if(mEntries.size() != implCount)
+		{
+			mEntries.clear();
+			uint32_t idx = 0;
+			while(idx < implCount)
+			{
+				mEntries.pushBack(mEntryImpls[idx]);
+				++idx;
+			}
+		}
+		mProperties = DataRef<PropertyMessageEntry>(mEntries.begin(), mEntries.end());
+		mStringOffsets = DataRef<uint32_t>(mStringOffsetArray.begin(), mStringOffsetArray.end());
+	}
+
+  private:
+	// Declared but not defined: assignment is not supported.
+	PropertyMessageDescriptionImpl& operator=(const PropertyMessageDescriptionImpl&);
+};
+
+struct PvdObjectModelMetaDataImpl : public PvdObjectModelMetaData, public UserAllocated
+{
+	typedef HashMap<NamespacedName, ClassDescImpl*, NamespacedNameHasher> TNameToClassMap;
+	typedef HashMap<ClassPropertyName, PropDescImpl*, ClassPropertyNameHasher> TNameToPropMap;
+	typedef HashMap<NamespacedName, PropertyMessageDescriptionImpl*, NamespacedNameHasher> TNameToPropertyMessageMap;
+
+	TNameToClassMap mNameToClasses;
+	TNameToPropMap mNameToProperties;
+	Array<ClassDescImpl*> mClasses;
+	Array<PropDescImpl*> mProperties;
+    StringTableImpl* mStringTable;
+	TNameToPropertyMessageMap mPropertyMessageMap;
+	Array<PropertyMessageDescriptionImpl*> mPropertyMessages;
+	int32_t mNextClassId;
+	uint32_t mRefCount;
+
+	// Creates empty containers and a new StringTableImpl. Class ids start at 1 and the
+	// reference count at 0. The built-in classes are not created here; initialize() does that.
+	PvdObjectModelMetaDataImpl()
+	: mNameToClasses("NamespacedName->ClassDescImpl*")
+	, mNameToProperties("ClassPropertyName->PropDescImpl*")
+	, mClasses("ClassDescImpl*")
+	, mProperties("PropDescImpl*")
+    , mStringTable(PVD_NEW(StringTableImpl)())
+	, mPropertyMessageMap("PropertyMessageMap")
+	, mPropertyMessages("PvdObjectModelMetaDataImpl::mPropertyMessages")
+	, mNextClassId(1)
+	, mRefCount(0)
+	{
+	}
+
+  private:
+	PvdObjectModelMetaDataImpl& operator=(const PvdObjectModelMetaDataImpl&);
+
+  public:
+	// Hands out the current class id and advances the counter by one.
+	int32_t nextClassId()
+	{
+		const int32_t assignedId = mNextClassId;
+		++mNextClassId;
+		return assignedId;
+	}
+	// Registers the built-in classes: the internal ArrayData type, the scalar types, the
+	// pointer-like types and the math/colour aggregates. Every class is locked once it has been
+	// set up. The assertions document the layout that createProperty() is expected to produce.
+	void initialize()
+	{
+		// Create the default classes.
+		{
+			ClassDescImpl& aryData = getOrCreateClassImpl(getPvdNamespacedNameForType<ArrayData>(),
+			                                              DataTypeToPvdTypeMap<ArrayData>::BaseTypeEnum);
+			aryData.get32BitSize() = sizeof(ArrayData);
+			aryData.get32BitSizeInfo().mAlignment = sizeof(void*);
+			aryData.get64BitSize() = sizeof(ArrayData);
+			aryData.get64BitSizeInfo().mAlignment = sizeof(void*);
+			aryData.mLocked = true;
+		}
+// Scalar types: size and alignment are sizeof(type) in both layouts, and the class is its own
+// packed type.
+#define CREATE_BASIC_PVD_CLASS(type)                                                                                   \
+	{                                                                                                                  \
+		ClassDescImpl& cls = getOrCreateClassImpl(getPvdNamespacedNameForType<type>(), getPvdTypeForType<type>());     \
+		cls.get32BitSize() = sizeof(type);                                                                             \
+		cls.get32BitSizeInfo().mAlignment = sizeof(type);                                                              \
+		cls.get64BitSize() = sizeof(type);                                                                             \
+		cls.get64BitSizeInfo().mAlignment = sizeof(type);                                                              \
+		cls.mLocked = true;                                                                                            \
+		cls.mPackedUniformWidth = sizeof(type);                                                                        \
+		cls.mPackedClassType = getPvdTypeForType<type>();                                                              \
+	}
+		CREATE_BASIC_PVD_CLASS(int8_t)
+		CREATE_BASIC_PVD_CLASS(uint8_t)
+		CREATE_BASIC_PVD_CLASS(bool)
+		CREATE_BASIC_PVD_CLASS(int16_t)
+		CREATE_BASIC_PVD_CLASS(uint16_t)
+		CREATE_BASIC_PVD_CLASS(int32_t)
+		CREATE_BASIC_PVD_CLASS(uint32_t)
+		CREATE_BASIC_PVD_CLASS(int64_t)
+		CREATE_BASIC_PVD_CLASS(uint64_t)
+		CREATE_BASIC_PVD_CLASS(float)
+		CREATE_BASIC_PVD_CLASS(double)
+#undef CREATE_BASIC_PVD_CLASS
+
+// Pointer-like types: 4 bytes in the 32 bit layout, 8 bytes in the 64 bit layout, with a single
+// pointer offset of the given kind at offset 0.
+#define CREATE_PTR_TYPE_PVD_CLASS(type, ptrType)                                                                       \
+	{                                                                                                                  \
+		ClassDescImpl& cls = getOrCreateClassImpl(getPvdNamespacedNameForType<type>(), getPvdTypeForType<type>());     \
+		cls.get32BitSize() = 4;                                                                                        \
+		cls.get32BitSizeInfo().mAlignment = 4;                                                                         \
+		cls.get64BitSize() = 8;                                                                                        \
+		cls.get64BitSizeInfo().mAlignment = 8;                                                                         \
+		cls.mLocked = true;                                                                                            \
+		cls.addPtrOffset(PtrOffsetType::ptrType, 0, 0);                                                                \
+	}
+
+		CREATE_PTR_TYPE_PVD_CLASS(String, StringOffset)
+		CREATE_PTR_TYPE_PVD_CLASS(VoidPtr, VoidPtrOffset)
+		CREATE_PTR_TYPE_PVD_CLASS(StringHandle, StringOffset)
+		CREATE_PTR_TYPE_PVD_CLASS(ObjectRef, VoidPtrOffset)
+
+// Undefine the helper that was actually defined above, so it does not leak past this function.
+#undef CREATE_PTR_TYPE_PVD_CLASS
+
+		int32_t fltClassType = getPvdTypeForType<float>();
+		int32_t u32ClassType = getPvdTypeForType<uint32_t>();
+		int32_t v3ClassType = getPvdTypeForType<PxVec3>();
+		int32_t v4ClassType = getPvdTypeForType<PxVec4>();
+		int32_t qtClassType = getPvdTypeForType<PxQuat>();
+		// PvdColor: four uint8_t channels.
+		{
+			ClassDescImpl& cls =
+			    getOrCreateClassImpl(getPvdNamespacedNameForType<PvdColor>(), getPvdTypeForType<PvdColor>());
+			createProperty(cls.mClassId, "r", "", getPvdTypeForType<uint8_t>(), PropertyType::Scalar);
+			createProperty(cls.mClassId, "g", "", getPvdTypeForType<uint8_t>(), PropertyType::Scalar);
+			createProperty(cls.mClassId, "b", "", getPvdTypeForType<uint8_t>(), PropertyType::Scalar);
+			createProperty(cls.mClassId, "a", "", getPvdTypeForType<uint8_t>(), PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 1);
+			PX_ASSERT(cls.get32BitSize() == 4);
+			PX_ASSERT(cls.get64BitSizeInfo().mAlignment == 1);
+			PX_ASSERT(cls.get64BitSize() == 4);
+			PX_ASSERT(cls.mPackedUniformWidth == 1);
+			PX_ASSERT(cls.mPackedClassType == getPvdTypeForType<uint8_t>());
+			cls.mLocked = true;
+		}
+
+		// PxVec2: two floats.
+		{
+			ClassDescImpl& cls = getOrCreateClassImpl(getPvdNamespacedNameForType<PxVec2>(), getPvdTypeForType<PxVec2>());
+			createProperty(cls.mClassId, "x", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "y", "", fltClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 8);
+			PX_ASSERT(cls.get64BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get64BitSize() == 8);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+		// PxVec3: three floats.
+		{
+			ClassDescImpl& cls = getOrCreateClassImpl(getPvdNamespacedNameForType<PxVec3>(), getPvdTypeForType<PxVec3>());
+			createProperty(cls.mClassId, "x", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "y", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "z", "", fltClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 12);
+			PX_ASSERT(cls.get64BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get64BitSize() == 12);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+		// PxVec4: four floats.
+		{
+			ClassDescImpl& cls = getOrCreateClassImpl(getPvdNamespacedNameForType<PxVec4>(), getPvdTypeForType<PxVec4>());
+			createProperty(cls.mClassId, "x", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "y", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "z", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "w", "", fltClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 16);
+			PX_ASSERT(cls.get64BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get64BitSize() == 16);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+
+		// PxQuat: four floats.
+		{
+			ClassDescImpl& cls = getOrCreateClassImpl(getPvdNamespacedNameForType<PxQuat>(), getPvdTypeForType<PxQuat>());
+			createProperty(cls.mClassId, "x", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "y", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "z", "", fltClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "w", "", fltClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 16);
+			PX_ASSERT(cls.get64BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get64BitSize() == 16);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+
+		// PxBounds3: two PxVec3.
+		{
+			ClassDescImpl& cls =
+			    getOrCreateClassImpl(getPvdNamespacedNameForType<PxBounds3>(), getPvdTypeForType<PxBounds3>());
+			createProperty(cls.mClassId, "minimum", "", v3ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "maximum", "", v3ClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 24);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+
+		// PxTransform: a PxQuat followed by a PxVec3.
+		{
+			ClassDescImpl& cls =
+			    getOrCreateClassImpl(getPvdNamespacedNameForType<PxTransform>(), getPvdTypeForType<PxTransform>());
+			createProperty(cls.mClassId, "q", "", qtClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "p", "", v3ClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 28);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+
+		// PxMat33: three PxVec3 columns.
+		{
+			ClassDescImpl& cls =
+			    getOrCreateClassImpl(getPvdNamespacedNameForType<PxMat33>(), getPvdTypeForType<PxMat33>());
+			createProperty(cls.mClassId, "column0", "", v3ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "column1", "", v3ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "column2", "", v3ClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 36);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+
+		// PxMat44: four PxVec4 columns.
+		{
+			ClassDescImpl& cls =
+			    getOrCreateClassImpl(getPvdNamespacedNameForType<PxMat44>(), getPvdTypeForType<PxMat44>());
+			createProperty(cls.mClassId, "column0", "", v4ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "column1", "", v4ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "column2", "", v4ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "column3", "", v4ClassType, PropertyType::Scalar);
+			PX_ASSERT(cls.get32BitSizeInfo().mAlignment == 4);
+			PX_ASSERT(cls.get32BitSize() == 64);
+			PX_ASSERT(cls.mPackedUniformWidth == 4);
+			PX_ASSERT(cls.mPackedClassType == fltClassType);
+			cls.mLocked = true;
+		}
+
+		// U32Array4: four uint32_t values.
+		{
+			ClassDescImpl& cls =
+			    getOrCreateClassImpl(getPvdNamespacedNameForType<U32Array4>(), getPvdTypeForType<U32Array4>());
+			createProperty(cls.mClassId, "d0", "", u32ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "d1", "", u32ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "d2", "", u32ClassType, PropertyType::Scalar);
+			createProperty(cls.mClassId, "d3", "", u32ClassType, PropertyType::Scalar);
+			cls.mLocked = true;
+		}
+	}
+	// Releases the string table, then frees every class, property and
+	// property-message description held by this object and empties the arrays.
+	virtual ~PvdObjectModelMetaDataImpl()
+	{
+		mStringTable->release();
+
+		// Class slots may be empty (NULL), so only the populated ones are freed.
+		PVD_FOREACH(classIdx, mClasses.size())
+		{
+			if(mClasses[classIdx] != NULL)
+			{
+				PVD_DELETE(mClasses[classIdx]);
+			}
+		}
+		mClasses.clear();
+
+		PVD_FOREACH(propIdx, mProperties.size())
+		{
+			PVD_DELETE(mProperties[propIdx]);
+		}
+		mProperties.clear();
+
+		PVD_FOREACH(msgIdx, mPropertyMessages.size())
+		{
+			PVD_DELETE(mPropertyMessages[msgIdx]);
+		}
+		mPropertyMessages.clear();
+	}
+
+	// Returns the class stored in slot `idx`. When the slot is empty (or beyond
+	// the end of mClasses) a new class named `nm` is created there, registered
+	// in the name map, and mNextClassId is bumped past `idx` if necessary.
+	//
+	// nm  - requested class name; only used when a new class has to be created.
+	// idx - class id, which doubles as the index into mClasses.
+	ClassDescImpl& getOrCreateClassImpl(const NamespacedName& nm, int32_t idx)
+	{
+		ClassDescImpl* impl(getClassImpl(idx));
+		if(impl)
+			return *impl;
+		// Build the name from strings handed back by the string table so that the
+		// stored name does not rely on the caller keeping its buffers alive.
+		NamespacedName safeName(mStringTable->registerStr(nm.mNamespace), mStringTable->registerStr(nm.mName));
+		// Pad with empty slots so that `idx` becomes a valid index.
+		while(idx >= int32_t(mClasses.size()))
+			mClasses.pushBack(NULL);
+		ClassDescImpl* created = PVD_NEW(ClassDescImpl)(ClassDescription(safeName, idx));
+		mClasses[uint32_t(idx)] = created;
+		// Key the lookup map with the registered name, not with `nm`: the map
+		// keeps the key for as long as the class exists, whereas `nm` is only
+		// guaranteed to be valid for the duration of this call. This also matches
+		// read(), which keys the map with the class's own stored name.
+		mNameToClasses.insert(safeName, created);
+		mNextClassId = PxMax(mNextClassId, idx + 1);
+		return *created;
+	}
+
+	// Finds the class registered under `nm`; if there is none, creates it using
+	// the id returned by nextClassId().
+	ClassDescImpl& getOrCreateClassImpl(const NamespacedName& nm)
+	{
+		ClassDescImpl* existing = findClassImpl(nm);
+		if(existing == NULL)
+			return getOrCreateClassImpl(nm, nextClassId());
+		return *existing;
+	}
+	// Public wrapper around getOrCreateClassImpl(); hands the class description
+	// back by value.
+	virtual ClassDescription getOrCreateClass(const NamespacedName& nm)
+	{
+		ClassDescImpl& cls = getOrCreateClassImpl(nm);
+		return cls;
+	}
+	// Makes `child` derive from `parent`; both classes are looked up by name and
+	// created if they do not exist yet.
+	//
+	// On success the parent is locked and the child takes over the parent's
+	// 32/64-bit size info, packing info, destruction flag and pointer-offset
+	// arrays, and every property of the parent chain becomes findable under the
+	// child's name.
+	//
+	// Returns false, leaving the child untouched, when the child already has a
+	// base class; returns true otherwise.
+	virtual bool deriveClass(const NamespacedName& parent, const NamespacedName& child)
+	{
+		ClassDescImpl& p(getOrCreateClassImpl(parent));
+		ClassDescImpl& c(getOrCreateClassImpl(child));
+
+		if(c.mBaseClass >= 0)
+		{
+			// Deriving twice is only expected when the parent is the same one.
+			PX_ASSERT(c.mBaseClass == p.mClassId);
+			return false;
+		}
+		p.mLocked = true;
+		c.mBaseClass = p.mClassId;
+		// The child starts out with exactly the parent's layout.
+		c.get32BitSizeInfo() = p.get32BitSizeInfo();
+		c.get64BitSizeInfo() = p.get64BitSizeInfo();
+		c.mPackedClassType = p.mPackedClassType;
+		c.mPackedUniformWidth = p.mPackedUniformWidth;
+		c.mRequiresDestruction = p.mRequiresDestruction;
+		c.m32OffsetArray = p.m32OffsetArray;
+		c.m64OffsetArray = p.m64OffsetArray;
+		// Add all the parent properties to this class in the global name map.
+		// The walk covers the parent and each of its ancestors in turn.
+		for(ClassDescImpl* parent0 = &p; parent0 != NULL; parent0 = getClassImpl(parent0->mBaseClass))
+		{
+			PVD_FOREACH(idx, parent0->mPropImps.size())
+			mNameToProperties.insert(ClassPropertyName(c.mName, parent0->mPropImps[idx]->mName), parent0->mPropImps[idx]);
+
+			// Stop once the root of the hierarchy has been processed.
+			if(parent0->mBaseClass < 0)
+				break;
+		}
+
+		return true;
+	}
+	// Name -> class lookup. Yields NULL when nothing is registered under `nm`.
+	ClassDescImpl* findClassImpl(const NamespacedName& nm) const
+	{
+		const TNameToClassMap::Entry* found = mNameToClasses.find(nm);
+		if(found == NULL)
+			return NULL;
+		return found->second;
+	}
+	// Looks a class up by name; the returned Option is empty when no such class
+	// exists.
+	virtual Option<ClassDescription> findClass(const NamespacedName& nm) const
+	{
+		ClassDescImpl* cls = findClassImpl(nm);
+		if(cls == NULL)
+			return Option<ClassDescription>();
+		return *cls;
+	}
+
+	// Id -> class lookup. Negative ids and ids past the end of mClasses yield
+	// NULL; an in-range id yields whatever the slot holds (which may be NULL).
+	ClassDescImpl* getClassImpl(int32_t classId) const
+	{
+		if(classId >= 0)
+		{
+			const uint32_t slot = uint32_t(classId);
+			if(slot < mClasses.size())
+				return mClasses[slot];
+		}
+		return NULL;
+	}
+
+	// Returns the description of the class with the given id, or an empty
+	// Option when the id does not refer to a class.
+	virtual Option<ClassDescription> getClass(int32_t classId) const
+	{
+		ClassDescImpl* cls = getClassImpl(classId);
+		if(cls == NULL)
+			return None();
+		return *cls;
+	}
+
+	// Pointer flavour of getClass(): NULL when the id does not refer to a class.
+	virtual ClassDescription* getClassPtr(int32_t classId) const
+	{
+		ClassDescImpl* cls = getClassImpl(classId);
+		return cls;
+	}
+
+	// Returns the base class of `classId`. The Option is empty when the class is
+	// unknown or when its mBaseClass does not resolve to a class.
+	virtual Option<ClassDescription> getParentClass(int32_t classId) const
+	{
+		ClassDescImpl* cls = getClassImpl(classId);
+		if(cls)
+			return getClass(cls->mBaseClass);
+		return None();
+	}
+
+	// Sets mLocked on the class with the given id. An unknown id trips the
+	// assert and is otherwise ignored.
+	virtual void lockClass(int32_t classId)
+	{
+		ClassDescImpl* cls = getClassImpl(classId);
+		PX_ASSERT(cls);
+		if(cls == NULL)
+			return;
+		cls->mLocked = true;
+	}
+	// Counts the populated (non-NULL) slots of mClasses.
+	virtual uint32_t getNbClasses() const
+	{
+		uint32_t populated = 0;
+		PVD_FOREACH(slot, mClasses.size())
+		{
+			if(mClasses[slot])
+				++populated;
+		}
+		return populated;
+	}
+
+	// Copies up to `requestCount` class descriptions into `outClasses`, skipping
+	// the first `startIndex` existing classes. Empty slots in mClasses are not
+	// counted and not copied. Returns the number of descriptions written.
+	virtual uint32_t getClasses(ClassDescription* outClasses, uint32_t requestCount, uint32_t startIndex = 0) const
+	{
+		uint32_t liveClasses(getNbClasses());
+		uint32_t toSkip = PxMin(liveClasses, startIndex);
+		uint32_t wanted = PxMin(requestCount, liveClasses - toSkip);
+
+		// Advance past the first `toSkip` populated slots.
+		uint32_t slot = 0;
+		for(; toSkip != 0; ++slot)
+		{
+			if(mClasses[slot] != NULL)
+				--toSkip;
+		}
+
+		// Copy populated slots until enough were written or the array ends.
+		uint32_t slotCount = static_cast<uint32_t>(mClasses.size());
+		uint32_t written = 0;
+		for(; written < wanted && slot < slotCount; ++slot)
+		{
+			if(mClasses[slot] == NULL)
+				continue;
+			outClasses[written] = *mClasses[slot];
+			++written;
+		}
+		return written;
+	}
+
+	// Appends a member laid out as `src` to the layout described by `dest`.
+	// dest's alignment is raised to at least src's, its data size grows by the
+	// aligned member, and its total size is re-rounded to its own alignment.
+	// Despite the name, the return value is the byte offset of the new member.
+	uint32_t updateByteSizeAndGetPropertyAlignment(ClassDescriptionSizeInfo& dest, const ClassDescriptionSizeInfo& src)
+	{
+		uint32_t memberAlignment = src.mAlignment;
+		uint32_t memberOffset = align(dest.mDataByteSize, memberAlignment);
+
+		dest.mAlignment = PxMax(dest.mAlignment, memberAlignment);
+		dest.mDataByteSize = memberOffset + src.mByteSize;
+		dest.mByteSize = align(dest.mDataByteSize, dest.mAlignment);
+		return memberOffset;
+	}
+
+	// Appends every pointer offset of `src`, shifted by `offset`, to `destArray`
+	// and then points destInfo.mPtrOffsets at the full contents of `destArray`.
+	void transferPtrOffsets(ClassDescriptionSizeInfo& destInfo, Array<PtrOffset>& destArray,
+	                        const Array<PtrOffset>& src, uint32_t offset)
+	{
+		PVD_FOREACH(srcIdx, src.size())
+		{
+			destArray.pushBack(PtrOffset(src[srcIdx].mOffsetType, src[srcIdx].mOffset + offset));
+		}
+		// Refresh the view; it has to be rebuilt after the array has grown.
+		destInfo.mPtrOffsets = DataRef<PtrOffset>(destArray.begin(), destArray.end());
+	}
+
+	// Adds a property called `name` of type `datatype` to the class `classId`.
+	//
+	// classId      - class receiving the property; must exist and must not be locked.
+	// name         - property name; must not already exist on the class.
+	// semantic     - passed through unchanged into the property description.
+	// datatype     - class id of the property's type; must exist, must not be the
+	//                String type and must not require destruction.
+	// propertyType - Scalar or Array; array properties are laid out as ArrayData.
+	//
+	// On success the owning class's 32/64-bit layout, pointer offsets, destruction
+	// flag and packing info are updated, the datatype class is locked, and the new
+	// description is returned. Every rejected request asserts and returns an
+	// empty Option.
+	virtual Option<PropertyDescription> createProperty(int32_t classId, String name, String semantic, int32_t datatype,
+	                                                   PropertyType::Enum propertyType)
+	{
+		ClassDescImpl* cls(getClassImpl(classId));
+		PX_ASSERT(cls);
+		if(!cls)
+			return None();
+		// Locked classes cannot receive further properties.
+		if(cls->mLocked)
+		{
+            PX_ASSERT(false);
+			return None();
+		}
+		PropDescImpl* impl(cls->findProperty(name));
+		// duplicate property definition
+		if(impl)
+		{
+			PX_ASSERT(false);
+			return None();
+		}
+		// String-typed properties are rejected.
+		if(datatype == getPvdTypeForType<String>())
+		{
+			PX_ASSERT(false);
+			return None();
+		}
+		// The datatype for this property must already have been declared.
+		ClassDescImpl* propDType(getClassImpl(datatype));
+		PX_ASSERT(propDType);
+		if(!propDType)
+			return None();
+		// Capture the declared datatype's name and packing info now; propDType
+		// may be redirected to the ArrayData class below.
+		NamespacedName propClsName(propDType->mName);
+		int32_t propPackedWidth = propDType->mPackedUniformWidth;
+		int32_t propPackedType = propDType->mPackedClassType;
+		// The implications of properties being complex types aren't major
+		//*until* you start trying to undo a property event that set values
+		// of those complex types.  Then things just get too complex.
+		if(propDType->mRequiresDestruction)
+		{
+			PX_ASSERT(false);
+			return None();
+		}
+		// propDType->mRequiresDestruction is known to be false at this point, so
+		// this effectively carries over the owning class's current flag.
+		bool requiresDestruction = propDType->mRequiresDestruction || cls->mRequiresDestruction;
+
+		if(propertyType == PropertyType::Array)
+		{
+			// Array properties use the layout of the ArrayData class instead of
+			// the element type's layout, and always require destruction.
+			int32_t tempId = DataTypeToPvdTypeMap<ArrayData>::BaseTypeEnum;
+			propDType = getClassImpl(tempId);
+			PX_ASSERT(propDType);
+			if(!propDType)
+				return None();
+			requiresDestruction = true;
+		}
+		// Grow the owning class's 32- and 64-bit layouts and obtain the member offsets.
+		uint32_t offset32 = updateByteSizeAndGetPropertyAlignment(cls->get32BitSizeInfo(), propDType->get32BitSizeInfo());
+		uint32_t offset64 = updateByteSizeAndGetPropertyAlignment(cls->get64BitSizeInfo(), propDType->get64BitSizeInfo());
+		transferPtrOffsets(cls->get32BitSizeInfo(), cls->m32OffsetArray, propDType->m32OffsetArray, offset32);
+		transferPtrOffsets(cls->get64BitSizeInfo(), cls->m64OffsetArray, propDType->m64OffsetArray, offset64);
+		propDType->mLocked = true; // Can't add members to the property type.
+		cls->mRequiresDestruction = requiresDestruction;
+		// The property id is the property's index in mProperties.
+		int32_t propId = int32_t(mProperties.size());
+		PropertyDescription newDesc(cls->mName, cls->mClassId, name, semantic, datatype, propClsName, propertyType,
+		                            propId, offset32, offset64);
+        mProperties.pushBack(PVD_NEW(PropDescImpl)(newDesc, *mStringTable));
+		mNameToProperties.insert(ClassPropertyName(cls->mName, mProperties.back()->mName), mProperties.back());
+		cls->addProperty(mProperties.back());
+		bool firstProp = cls->mPropImps.size() == 1;
+
+		if(firstProp)
+		{
+			// The first property defines the class's packing info outright.
+			cls->mPackedUniformWidth = propPackedWidth;
+			cls->mPackedClassType = propPackedType;
+		}
+		else
+		{
+			// Later properties can only invalidate the packing info: the class
+			// stays packed only while every property agrees on type and width and
+			// the data size remains a multiple of that width.
+			bool packed = (propPackedWidth > 0) && (cls->get32BitSizeInfo().mDataByteSize % propPackedWidth) == 0;
+			if(cls->mPackedClassType >= 0) // maybe uncheck packed class type
+			{
+				if(propPackedType < 0 || cls->mPackedClassType != propPackedType
+				                             // Object refs require conversion from stream to db id
+				   ||
+				   datatype == getPvdTypeForType<ObjectRef>()
+				       // Strings also require conversion from stream to db id.
+				   ||
+				   datatype == getPvdTypeForType<StringHandle>() || packed == false)
+					cls->mPackedClassType = -1;
+			}
+			if(cls->mPackedUniformWidth >= 0) // maybe uncheck packed class width
+			{
+				if(propPackedWidth < 0 || cls->mPackedUniformWidth != propPackedWidth
+				                              // object refs, because they require special treatment during parsing,
+				                              // cannot be packed
+				   ||
+				   datatype == getPvdTypeForType<ObjectRef>()
+				       // Likewise, string handles are special and cannot be packed
+				       // (the original remark here was left unfinished).
+				   ||
+				   datatype == getPvdTypeForType<StringHandle>() || packed == false)
+					cls->mPackedUniformWidth = -1; // invalid packed width.
+			}
+		}
+		return *mProperties.back();
+	}
+
+	// (class name, property name) -> property lookup; NULL when not registered.
+	PropDescImpl* findPropImpl(const NamespacedName& clsName, String prop) const
+	{
+		const TNameToPropMap::Entry* found = mNameToProperties.find(ClassPropertyName(clsName, prop));
+		if(found == NULL)
+			return NULL;
+		return found->second;
+	}
+	// Finds a property by owning class name and property name; the Option is
+	// empty when there is no such property.
+	virtual Option<PropertyDescription> findProperty(const NamespacedName& cls, String propName) const
+	{
+		PropDescImpl* found = findPropImpl(cls, propName);
+		if(found == NULL)
+			return None();
+		return *found;
+	}
+
+	// Finds a property by owning class id and property name. An unknown class
+	// id asserts; the Option is empty when the class or the property is missing.
+	virtual Option<PropertyDescription> findProperty(int32_t clsId, String propName) const
+	{
+		ClassDescImpl* owner = getClassImpl(clsId);
+		PX_ASSERT(owner);
+		if(owner == NULL)
+			return None();
+
+		PropDescImpl* found = findPropImpl(owner->mName, propName);
+		if(found == NULL)
+			return None();
+		return *found;
+	}
+
+	// Id -> property lookup. Negative or out-of-range ids assert and yield NULL.
+	PropDescImpl* getPropertyImpl(int32_t propId) const
+	{
+		PX_ASSERT(propId >= 0);
+		if(propId < 0)
+			return NULL;
+
+		const uint32_t slot = uint32_t(propId);
+		if(slot < mProperties.size())
+			return mProperties[slot];
+
+		PX_ASSERT(false);
+		return NULL;
+	}
+
+	// Returns the description of the property with the given id, or an empty
+	// Option when the id is invalid.
+	virtual Option<PropertyDescription> getProperty(int32_t propId) const
+	{
+		PropDescImpl* prop = getPropertyImpl(propId);
+		if(prop == NULL)
+			return None();
+		return *prop;
+	}
+
+	// Replaces the named values of property `propId` with a copy of `values`.
+	// Does nothing when the property id is invalid.
+	virtual void setNamedPropertyValues(DataRef<NamedValue> values, int32_t propId)
+	{
+		PropDescImpl* prop = getPropertyImpl(propId);
+		if(prop == NULL)
+			return;
+
+		prop->mValueNames.resize(values.size());
+		PVD_FOREACH(valueIdx, values.size())
+		{
+			prop->mValueNames[valueIdx] = values[valueIdx];
+		}
+	}
+
+	// Returns a view of the named values stored on property `propId`, or an
+	// empty DataRef when the property id is invalid.
+	virtual DataRef<NamedValue> getNamedPropertyValues(int32_t propId) const
+	{
+		PropDescImpl* prop = getPropertyImpl(propId);
+		if(prop == NULL)
+			return DataRef<NamedValue>();
+		return toDataRef(prop->mValueNames);
+	}
+
+	// Total number of properties of a class, including those of all its base
+	// classes. Returns 0 for an unknown class id.
+	virtual uint32_t getNbProperties(int32_t classId) const
+	{
+		uint32_t total = 0;
+		ClassDescImpl* cls = getClassImpl(classId);
+		while(cls)
+		{
+			total += cls->mPropImps.size();
+			// A negative base class id marks the root of the hierarchy.
+			if(cls->mBaseClass < 0)
+				break;
+			cls = getClassImpl(cls->mBaseClass);
+		}
+		return total;
+	}
+
+	// Recursive worker for getProperties(). Base classes are visited first so
+	// that properties come out in base-to-derived order. The three reference
+	// parameters act as a cursor shared by all recursion levels: `startIdx` is
+	// the number of properties still to skip, `numItems` the remaining capacity
+	// and `outBuffer` the next write position. Returns the number of
+	// descriptions written by this level and all levels below it.
+	uint32_t getPropertiesImpl(int32_t classId, PropertyDescription*& outBuffer, uint32_t& numItems,
+	                           uint32_t& startIdx) const
+	{
+		ClassDescImpl* cls = getClassImpl(classId);
+		if(cls == NULL)
+			return 0;
+
+		uint32_t writtenByBases = 0;
+		if(cls->mBaseClass >= 0)
+			writtenByBases = getPropertiesImpl(cls->mBaseClass, outBuffer, numItems, startIdx);
+
+		// Skip what is left of startIdx, then copy as much as still fits.
+		uint32_t firstLocal = PxMin(cls->mPropImps.size(), startIdx);
+		uint32_t localCount = PxMin(numItems, cls->mPropImps.size() - firstLocal);
+		PVD_FOREACH(copyIdx, localCount)
+		{
+			outBuffer[copyIdx] = *cls->mPropImps[firstLocal + copyIdx];
+		}
+
+		// Advance the shared cursor for the caller one level up.
+		startIdx -= firstLocal;
+		numItems -= localCount;
+		outBuffer += localCount;
+		return writtenByBases + localCount;
+	}
+
+	// Copies up to `numItems` property descriptions of class `classId` (base
+	// class properties first) into `outBuffer`, skipping the first `startIdx`.
+	// Returns the number of descriptions written.
+	virtual uint32_t getProperties(int32_t classId, PropertyDescription* outBuffer, uint32_t numItems,
+	                               uint32_t startIdx) const
+	{
+		// The by-value parameters double as the mutable cursor of the worker.
+		uint32_t written = getPropertiesImpl(classId, outBuffer, numItems, startIdx);
+		return written;
+	}
+
+	// Determines whether a value of class `srcClsId` can be stored into a
+	// property of class `dstClsId`, and whether a conversion is needed to do so.
+	//
+	// Returns a default-constructed MarshalQueryResult (after asserting) when
+	// either class is unknown, when the classes differ and one of them has no
+	// packed uniform width, or when the widths differ and no marshaller exists.
+	// Identical class ids yield a result with both flags false.
+	virtual MarshalQueryResult checkMarshalling(int32_t srcClsId, int32_t dstClsId) const
+	{
+		Option<ClassDescription> propTypeOpt(getClass(dstClsId));
+		if(propTypeOpt.hasValue() == false)
+		{
+			PX_ASSERT(false);
+			return MarshalQueryResult();
+		}
+		const ClassDescription& propType(propTypeOpt);
+
+		Option<ClassDescription> incomingTypeOpt(getClass(srcClsId));
+		if(incomingTypeOpt.hasValue() == false)
+		{
+			PX_ASSERT(false);
+			return MarshalQueryResult();
+		}
+		const ClassDescription& incomingType(incomingTypeOpt);
+		// Can only marshal simple things at this point in time.
+		bool needsMarshalling = false;
+		bool canMarshal = false;
+		// `single` is filled in by getMarshalOperators() but only `block` is
+		// passed on in the result.
+		TSingleMarshaller single = NULL;
+		TBlockMarshaller block = NULL;
+		if(incomingType.mClassId != propType.mClassId)
+		{
+			// Check that marshalling is even possible.
+			if((incomingType.mPackedUniformWidth >= 0 && propType.mPackedUniformWidth >= 0) == false)
+			{
+				PX_ASSERT(false);
+				return MarshalQueryResult();
+			}
+
+			int32_t srcType = incomingType.mPackedClassType;
+			int32_t dstType = propType.mPackedClassType;
+
+			int32_t srcWidth = incomingType.mPackedUniformWidth;
+			int32_t dstWidth = propType.mPackedUniformWidth;
+			canMarshal = getMarshalOperators(single, block, srcType, dstType);
+			if(srcWidth == dstWidth)
+				needsMarshalling = canMarshal; // If the types are the same width, we assume we can convert between some
+			                                   // of them seamlessly (uint16_t, int16_t)
+			else
+			{
+				needsMarshalling = true;
+				// If we can't marshall and we have to then we can't set the property value.
+				// This indicates that the src and dest are different properties and we don't
+				// know how to convert between them.
+				if(!canMarshal)
+				{
+					PX_ASSERT(false);
+					return MarshalQueryResult();
+				}
+			}
+		}
+		return MarshalQueryResult(srcClsId, dstClsId, canMarshal, needsMarshalling, block);
+	}
+
+	// Message name -> property message lookup; NULL when not registered.
+	PropertyMessageDescriptionImpl* findPropertyMessageImpl(const NamespacedName& messageName) const
+	{
+		const TNameToPropertyMessageMap::Entry* found = mPropertyMessageMap.find(messageName);
+		if(found == NULL)
+			return NULL;
+		return found->second;
+	}
+
+	// Message id -> property message lookup; NULL for ids outside
+	// [0, number of messages).
+	PropertyMessageDescriptionImpl* getPropertyMessageImpl(int32_t msg) const
+	{
+		const int32_t messageCount = int32_t(mPropertyMessages.size());
+		if(msg < 0 || msg >= messageCount)
+			return NULL;
+		return mPropertyMessages[uint32_t(msg)];
+	}
+
+	// Registers a property message: a fixed-size record whose entries each map
+	// a byte range of the message onto a property of class `clsName`.
+	//
+	// clsName     - class the message applies to; must already exist.
+	// messageName - unique name of the message; must not be registered yet.
+	// entries     - one PropertyMessageArg per property carried by the message.
+	// messageSize - total size of the message in bytes.
+	//
+	// Returns the new description, or an empty Option (after asserting) when the
+	// name is taken, the class is unknown, or any entry fails validation. The
+	// half-built message is destroyed on failure.
+	virtual Option<PropertyMessageDescription> createPropertyMessage(const NamespacedName& clsName,
+	                                                                 const NamespacedName& messageName,
+	                                                                 DataRef<PropertyMessageArg> entries,
+	                                                                 uint32_t messageSize)
+	{
+		PropertyMessageDescriptionImpl* existing(findPropertyMessageImpl(messageName));
+		if(existing)
+		{
+			PX_ASSERT(false);
+			return None();
+		}
+		ClassDescImpl* cls = findClassImpl(clsName);
+		PX_ASSERT(cls);
+		if(!cls)
+			return None();
+		// The message id is the message's index in mPropertyMessages.
+		int32_t msgId = int32_t(mPropertyMessages.size());
+		PropertyMessageDescriptionImpl* newMessage = PVD_NEW(PropertyMessageDescriptionImpl)(
+            PropertyMessageDescription(mStringTable->registerName(clsName), cls->mClassId,
+                                       mStringTable->registerName(messageName), msgId, messageSize));
+		uint32_t calculatedSize = 0;
+		PVD_FOREACH(idx, entries.size())
+		{
+			PropertyMessageArg entry(entries[idx]);
+			// The entry's datatype must be a known class.
+			ClassDescImpl* dtypeCls = findClassImpl(entry.mDatatypeName);
+			if(dtypeCls == NULL)
+			{
+				PX_ASSERT(false);
+				goto DestroyNewMessage;
+			}
+			// The entry must reserve at least the 32-bit size of its datatype.
+			ClassDescriptionSizeInfo dtypeInfo(dtypeCls->get32BitSizeInfo());
+			uint32_t incomingSize = dtypeInfo.mByteSize;
+			if(entry.mByteSize < incomingSize)
+			{
+				PX_ASSERT(false);
+				goto DestroyNewMessage;
+			}
+
+			// No entry may extend past the declared message size.
+			calculatedSize = PxMax(calculatedSize, entry.mMessageOffset + entry.mByteSize);
+			if(calculatedSize > messageSize)
+			{
+				PX_ASSERT(false);
+				goto DestroyNewMessage;
+			}
+
+			// The named property must exist on the target class ...
+			Option<PropertyDescription> propName(findProperty(cls->mClassId, entry.mPropertyName));
+			if(propName.hasValue() == false)
+			{
+				PX_ASSERT(false);
+				goto DestroyNewMessage;
+			}
+
+			// ... and its datatype must resolve to a class.
+			Option<ClassDescription> propCls(getClass(propName.getValue().mDatatype));
+			if(propCls.hasValue() == false)
+			{
+				PX_ASSERT(false);
+				goto DestroyNewMessage;
+			}
+
+			PropertyMessageEntryImpl newEntry(PropertyMessageEntry(
+			    propName, dtypeCls->mName, dtypeCls->mClassId, entry.mMessageOffset, incomingSize, dtypeInfo.mByteSize));
+			newMessage->addEntry(newEntry);
+
+			// Record where string entries sit inside the message.
+			if(newEntry.mDatatypeId == getPvdTypeForType<String>())
+				newMessage->mStringOffsetArray.pushBack(entry.mMessageOffset);
+
+			// property messages cannot be marshalled at this time.
+			if(newEntry.mDatatypeId != getPvdTypeForType<String>() && newEntry.mDatatypeId != getPvdTypeForType<VoidPtr>())
+			{
+				MarshalQueryResult marshalInfo = checkMarshalling(newEntry.mDatatypeId, newEntry.mProperty.mDatatype);
+				if(marshalInfo.needsMarshalling)
+				{
+					PX_ASSERT(false);
+					goto DestroyNewMessage;
+				}
+			}
+		}
+
+		if(newMessage)
+		{
+			// All entries validated: publish the string offsets and register the
+			// message under its id and name.
+			newMessage->mStringOffsets =
+			    DataRef<uint32_t>(newMessage->mStringOffsetArray.begin(), newMessage->mStringOffsetArray.end());
+			mPropertyMessages.pushBack(newMessage);
+			mPropertyMessageMap.insert(messageName, newMessage);
+			return *newMessage;
+		}
+
+	// Failure path shared by every validation error above.
+	DestroyNewMessage:
+		if(newMessage)
+			PVD_DELETE(newMessage);
+
+		return None();
+	}
+	// Finds a property message by name; the Option is empty when no message
+	// with that name is registered.
+	virtual Option<PropertyMessageDescription> findPropertyMessage(const NamespacedName& msgName) const
+	{
+		PropertyMessageDescriptionImpl* found = findPropertyMessageImpl(msgName);
+		if(found == NULL)
+			return None();
+		return *found;
+	}
+
+	// Returns the property message with the given id, or an empty Option when
+	// the id is out of range.
+	virtual Option<PropertyMessageDescription> getPropertyMessage(int32_t msgId) const
+	{
+		PropertyMessageDescriptionImpl* found = getPropertyMessageImpl(msgId);
+		if(found == NULL)
+			return None();
+		return *found;
+	}
+
+	// Number of registered property messages.
+	virtual uint32_t getNbPropertyMessages() const
+	{
+		uint32_t messageCount = mPropertyMessages.size();
+		return messageCount;
+	}
+
+	// Copies up to `bufLen` property message descriptions, starting with message
+	// number `startIdx`, into `msgBuf`. Both values are clamped to the number of
+	// registered messages. Returns the number of descriptions written.
+	virtual uint32_t getPropertyMessages(PropertyMessageDescription* msgBuf, uint32_t bufLen, uint32_t startIdx = 0) const
+	{
+		uint32_t first = PxMin(startIdx, getNbPropertyMessages());
+		uint32_t count = PxMin(bufLen, getNbPropertyMessages() - first);
+		PVD_FOREACH(outIdx, count)
+		{
+			msgBuf[outIdx] = *mPropertyMessages[first + outIdx];
+		}
+		return count;
+	}
+
+	// Serialization visitor used by write(). Each streamify() overload writes
+	// one kind of value to mStream; objects that expose serialize() call back
+	// into these overloads. MetaDataReader provides the matching read side, so
+	// the order and width of every write here has to stay in step with it.
+	struct MetaDataWriter
+	{
+		const PvdObjectModelMetaDataImpl& mMetaData;
+		PvdOutputStream& mStream;
+		MetaDataWriter(const PvdObjectModelMetaDataImpl& meta, PvdOutputStream& stream)
+		: mMetaData(meta), mStream(stream)
+		{
+		}
+
+		// Names are written as two string-table handles: namespace, then name.
+		void streamify(NamespacedName& type)
+		{
+            mStream << mMetaData.mStringTable->strToHandle(type.mNamespace);
+            mStream << mMetaData.mStringTable->strToHandle(type.mName);
+		}
+		// Strings are written as their string-table handle, not as characters.
+		void streamify(String& type)
+		{
+            mStream << mMetaData.mStringTable->strToHandle(type);
+		}
+		void streamify(int32_t& type)
+		{
+			mStream << type;
+		}
+		void streamify(uint32_t& type)
+		{
+			mStream << type;
+		}
+		void streamify(uint8_t type)
+		{
+			mStream << type;
+		}
+		// Booleans are written as a uint8_t.
+		void streamify(bool type)
+		{
+			streamify( uint8_t(type));
+		}
+		// Enum values are written as a uint32_t.
+		void streamify(PropertyType::Enum type)
+		{
+			uint32_t val = static_cast<uint32_t>(type);
+			mStream << val;
+		}
+		// Value first, then name.
+		void streamify(NamedValue& type)
+		{
+			streamify(type.mValue);
+			streamify(type.mName);
+		}
+		// "Links" are references to properties stored elsewhere: only the
+		// property id is written.
+		void streamifyLinks(PropDescImpl* prop)
+		{
+			streamify(prop->mPropertyId);
+		}
+		// A PropertyDescription by value is likewise written as just its id.
+		void streamify(PropertyDescription& prop)
+		{
+			streamify(prop.mPropertyId);
+		}
+		void streamify(PropertyMessageEntryImpl& prop)
+		{
+			prop.serialize(*this);
+		}
+		// Offset type (as uint32_t) first, then the offset itself.
+		void streamify(PtrOffset& off)
+		{
+			uint32_t type = off.mOffsetType;
+			mStream << type;
+			mStream << off.mOffset;
+		}
+		// Owned objects: a 1/0 existence marker followed, when present, by the
+		// object's own serialize() output.
+		template <typename TDataType>
+		void streamify(TDataType* type)
+		{
+			int32_t existMarker = type ? 1 : 0;
+			mStream << existMarker;
+			if(type)
+				type->serialize(*this);
+		}
+		// Arrays: element count as uint32_t, then each element in order.
+		template <typename TArrayType>
+		void streamify(const Array<TArrayType>& type)
+		{
+			mStream << static_cast<uint32_t>(type.size());
+			PVD_FOREACH(idx, type.size()) streamify(const_cast<TArrayType&>(type[idx]));
+		}
+		// Arrays of links: element count as uint32_t, then each element via
+		// streamifyLinks().
+		template <typename TArrayType>
+		void streamifyLinks(const Array<TArrayType>& type)
+		{
+			mStream << static_cast<uint32_t>(type.size());
+			PVD_FOREACH(idx, type.size()) streamifyLinks(const_cast<TArrayType&>(type[idx]));
+		}
+
+	  private:
+		// Declared but not defined: the reference members make assignment impossible.
+		MetaDataWriter& operator=(const MetaDataWriter&);
+	};
+
+	// Deserialization visitor used by read(); the mirror image of
+	// MetaDataWriter. Each streamify() overload reads one kind of value from
+	// mStream into its argument, so the order and width of every read has to
+	// stay in step with the writer.
+	template <typename TStreamType>
+	struct MetaDataReader
+	{
+		PvdObjectModelMetaDataImpl& mMetaData;
+		TStreamType& mStream;
+		MetaDataReader(PvdObjectModelMetaDataImpl& meta, TStreamType& stream) : mMetaData(meta), mStream(stream)
+		{
+		}
+
+		// Namespace first, then name.
+		void streamify(NamespacedName& type)
+		{
+			streamify(type.mNamespace);
+			streamify(type.mName);
+		}
+
+		// Strings arrive as string-table handles and are resolved through the
+		// owning meta data's string table.
+		void streamify(String& type)
+		{
+			uint32_t handle;
+			mStream >> handle;
+            type = mMetaData.mStringTable->handleToStr(handle);
+		}
+		void streamify(int32_t& type)
+		{
+			mStream >> type;
+		}
+		void streamify(uint32_t& type)
+		{
+			mStream >> type;
+		}
+		// Booleans arrive as a uint8_t; any non-zero value means true.
+		void streamify(bool& type)
+		{
+			uint8_t data;
+			mStream >> data;
+			type = data ? true : false;
+		}
+
+		// Enum values arrive as a uint32_t.
+		void streamify(PropertyType::Enum& type)
+		{
+			uint32_t val;
+			mStream >> val;
+			type = static_cast<PropertyType::Enum>(val);
+		}
+		// Value first, then name.
+		void streamify(NamedValue& type)
+		{
+			streamify(type.mValue);
+			streamify(type.mName);
+		}
+		void streamify(PropertyMessageEntryImpl& type)
+		{
+			type.serialize(*this);
+		}
+		// Offset type (as uint32_t) first, then the offset itself.
+		void streamify(PtrOffset& off)
+		{
+			uint32_t type;
+			mStream >> type;
+			mStream >> off.mOffset;
+			off.mOffsetType = static_cast<PtrOffsetType::Enum>(type);
+		}
+		// Links arrive as a property id and are resolved against the properties
+		// already present in the meta data; an invalid id resolves to NULL.
+		void streamifyLinks(PropDescImpl*& prop)
+		{
+			int32_t propId;
+			streamify(propId);
+			prop = mMetaData.getPropertyImpl(propId);
+		}
+		// A PropertyDescription by value arrives as its id and is then replaced
+		// by the description looked up under that id.
+		void streamify(PropertyDescription& prop)
+		{
+			streamify(prop.mPropertyId);
+			prop = mMetaData.getProperty(prop.mPropertyId);
+		}
+		// Owned objects: an existence marker followed, when non-zero, by the
+		// object's data. The object is allocated here with PVD_NEW.
+		template <typename TDataType>
+		void streamify(TDataType*& type)
+		{
+			uint32_t existMarker;
+			mStream >> existMarker;
+			if(existMarker)
+			{
+				TDataType* newType = PVD_NEW(TDataType)();
+				newType->serialize(*this);
+				type = newType;
+			}
+			else
+				type = NULL;
+		}
+		// Arrays: element count as uint32_t, then each element in order. The
+		// array is resized to the count taken from the stream.
+		template <typename TArrayType>
+		void streamify(Array<TArrayType>& type)
+		{
+			uint32_t typeSize;
+			mStream >> typeSize;
+			type.resize(typeSize);
+			PVD_FOREACH(idx, type.size()) streamify(type[idx]);
+		}
+		// Arrays of links: element count as uint32_t, then each element via
+		// streamifyLinks().
+		template <typename TArrayType>
+		void streamifyLinks(Array<TArrayType>& type)
+		{
+			uint32_t typeSize;
+			mStream >> typeSize;
+			type.resize(typeSize);
+			PVD_FOREACH(idx, type.size()) streamifyLinks(type[idx]);
+		}
+
+	  private:
+		// Declared but not defined: the reference members make assignment impossible.
+		MetaDataReader& operator=(const MetaDataReader&);
+	};
+
+	// Serializes the complete meta data to `stream`. The layout is: object
+	// model version, next class id, string table, properties, classes,
+	// property messages. read() consumes the data in the same order.
+	virtual void write(PvdOutputStream& stream) const
+	{
+		stream << getCurrentPvdObjectModelVersion();
+		stream << mNextClassId;
+        mStringTable->write(stream);
+		MetaDataWriter writer(*this, stream);
+		// Properties go first: classes and messages refer to them by id.
+		writer.streamify(mProperties);
+		writer.streamify(mClasses);
+		writer.streamify(mPropertyMessages);
+	}
+
+	// Loads meta data previously produced by write() from `stream` and then
+	// rebuilds the three name lookup maps, which are not part of the stream.
+	template <typename TReaderType>
+	void read(TReaderType& stream)
+	{
+		// The version is consumed from the stream but not otherwise used here.
+		uint32_t version;
+		stream >> version;
+		stream >> mNextClassId;
+        mStringTable->read(stream);
+		MetaDataReader<TReaderType> reader(*this, stream);
+		// Same order as write(): properties, classes, property messages.
+		reader.streamify(mProperties);
+		reader.streamify(mClasses);
+		reader.streamify(mPropertyMessages);
+
+		mNameToClasses.clear();
+		mNameToProperties.clear();
+		mPropertyMessageMap.clear();
+		PVD_FOREACH(i, mClasses.size())
+		{
+			ClassDescImpl* cls(mClasses[i]);
+			// Class slots may be empty.
+			if(cls == NULL)
+				continue;
+			mNameToClasses.insert(cls->mName, mClasses[i]);
+			// Register every property of the class, inherited ones included,
+			// under the class's own name. Properties are fetched in batches of 16.
+			uint32_t propCount = getNbProperties(cls->mClassId);
+			PropertyDescription descs[16];
+			uint32_t offset = 0;
+			for(uint32_t idx = 0; idx < propCount; idx = offset)
+			{
+				uint32_t numProps = getProperties(cls->mClassId, descs, 16, offset);
+				offset += numProps;
+				for(uint32_t propIdx = 0; propIdx < numProps; ++propIdx)
+				{
+					PropDescImpl* prop = getPropertyImpl(descs[propIdx].mPropertyId);
+					if(prop)
+						mNameToProperties.insert(ClassPropertyName(cls->mName, prop->mName), prop);
+				}
+			}
+		}
+		PVD_FOREACH(idx, mPropertyMessages.size())
+		mPropertyMessageMap.insert(mPropertyMessages[idx]->mMessageName, mPropertyMessages[idx]);
+	}
+
+	// Produces an independent copy by serializing this object into a memory
+	// stream and creating a new meta data object from that stream.
+	virtual PvdObjectModelMetaData& clone() const
+	{
+		MemPvdOutputStream serialized("PvdObjectModelMetaData::clone");
+		write(serialized);
+
+		MemPvdInputStream source(serialized);
+		return create(source);
+	}
+
+	// Gives access to the string table used by this meta data object.
+	virtual StringTable& getStringTable() const
+	{
+		StringTable& table = *mStringTable;
+		return table;
+	}
+	// Adds one reference to this object; release() removes one again.
+	virtual void addRef()
+	{
+		++mRefCount;
+	}
+	// Drops one reference (never going below zero) and destroys the object as
+	// soon as the count is zero.
+	virtual void release()
+	{
+		if(mRefCount)
+		{
+			--mRefCount;
+		}
+		if(!mRefCount)
+		{
+			PVD_DELETE(this);
+		}
+	}
+};
+}
+
+// Version number written at the start of serialized meta data.
+uint32_t PvdObjectModelMetaData::getCurrentPvdObjectModelVersion()
+{
+	const uint32_t currentVersion = 1;
+	return currentVersion;
+}
+
+// Allocates a new meta data object and runs initialize() on it.
+PvdObjectModelMetaData& PvdObjectModelMetaData::create()
+{
+	PvdObjectModelMetaDataImpl* metaData = PVD_NEW(PvdObjectModelMetaDataImpl)();
+	metaData->initialize();
+	return *metaData;
+}
+
+// Allocates a new meta data object and fills it from serialized data in
+// `stream`; initialize() is not run in this case.
+PvdObjectModelMetaData& PvdObjectModelMetaData::create(PvdInputStream& stream)
+{
+	PvdObjectModelMetaDataImpl* metaData = PVD_NEW(PvdObjectModelMetaDataImpl)();
+	metaData->read(stream);
+	return *metaData;
+}
+
+// Allocates a new, empty string table implementation.
+StringTable& StringTable::create()
+{
+	StringTableImpl* table = PVD_NEW(StringTableImpl)();
+	return *table;
+}
diff --git a/PxShared/src/pvd/src/PxPvdObjectModelMetaData.h b/PxShared/src/pvd/src/PxPvdObjectModelMetaData.h
new file mode 100644
index 0000000..7357708
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdObjectModelMetaData.h
@@ -0,0 +1,495 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#ifndef PXPVDSDK_PXPVDOBJECTMODELMETADATA_H
+#define PXPVDSDK_PXPVDOBJECTMODELMETADATA_H
+
+#include "foundation/PxAssert.h"
+#include "PxPvdObjectModelBaseTypes.h"
+#include "PxPvdBits.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+class PvdInputStream;
+class PvdOutputStream;
+
+struct PropertyDescription // one property of a class: owning class, name, datatype and data offsets
+{
+	NamespacedName mOwnerClassName;
+	int32_t mOwnerClassId;
+	String mName;
+	String mSemantic;
+	// The datatype this property corresponds to.
+	int32_t mDatatype;
+	// The name of the datatype
+	NamespacedName mDatatypeName;
+	// Scalar or array.
+	PropertyType::Enum mPropertyType;
+	// No other property under any class has this id, it is DB-unique.
+	int32_t mPropertyId;
+	// Offset in bytes into the object's data section where this property starts (32-bit layout, going by the name).
+	uint32_t m32BitOffset;
+	// Offset in bytes into the object's data section where this property starts (64-bit layout, going by the name).
+	uint32_t m64BitOffset;
+
+	PropertyDescription(const NamespacedName& clsName, int32_t classId, String name, String semantic, int32_t datatype,
+	                    const NamespacedName& datatypeName, PropertyType::Enum propType, int32_t propId,
+	                    uint32_t offset32, uint32_t offset64)
+	: mOwnerClassName(clsName)
+	, mOwnerClassId(classId)
+	, mName(name)
+	, mSemantic(semantic)
+	, mDatatype(datatype)
+	, mDatatypeName(datatypeName)
+	, mPropertyType(propType)
+	, mPropertyId(propId)
+	, m32BitOffset(offset32)
+	, m64BitOffset(offset64)
+	{
+	}
+	PropertyDescription() // "empty" description: ids and datatype are -1, names are "", offsets are 0
+	: mOwnerClassId(-1)
+	, mName("")
+	, mSemantic("")
+	, mDatatype(-1)
+	, mPropertyType(PropertyType::Unknown)
+	, mPropertyId(-1)
+	, m32BitOffset(0)
+	, m64BitOffset(0)
+
+	{
+	}
+
+	virtual ~PropertyDescription()
+	{
+	}
+};
+
+struct PtrOffsetType // scoping struct for the tag stored in PtrOffset::mOffsetType
+{
+	enum Enum
+	{
+		UnknownOffset, // value used by PtrOffset's default constructor
+		VoidPtrOffset,
+		StringOffset
+	};
+};
+
+struct PtrOffset // pairs an offset with the PtrOffsetType tag describing it
+{
+	PtrOffsetType::Enum mOffsetType;
+	uint32_t mOffset;
+	PtrOffset(PtrOffsetType::Enum type, uint32_t offset) : mOffsetType(type), mOffset(offset)
+	{
+	}
+	PtrOffset() : mOffsetType(PtrOffsetType::UnknownOffset), mOffset(0) // defaults: unknown kind at offset 0
+	{
+	}
+};
+
+inline uint32_t align(uint32_t offset, uint32_t alignment)
+{
+	// Rounds offset up with the add-and-mask trick; the assert checks the result is a multiple of alignment.
+	const uint32_t unaligned = offset;
+	offset = (offset + (alignment - 1)) & ~(alignment - 1);
+	PX_ASSERT(offset >= unaligned && (offset % alignment) == 0);
+	(void)unaligned; // only read by the assert; keeps builds without asserts warning-free
+	return offset;
+}
+
+struct ClassDescriptionSizeInfo // size and alignment of a class's data section for one layout
+{
+	// The size of the data section of this object, padded to alignment.
+	uint32_t mByteSize;
+	// The last data member goes to here.
+	uint32_t mDataByteSize;
+	// Alignment in bytes of the data section of this object.
+	uint32_t mAlignment;
+	// the offsets of string handles in the binary value of this class
+	DataRef<PtrOffset> mPtrOffsets;
+	ClassDescriptionSizeInfo() : mByteSize(0), mDataByteSize(0), mAlignment(0) // sizes and alignment start at 0
+	{
+	}
+};
+
+struct ClassDescription // one class known to the object model: identity, base class and size info
+{
+	NamespacedName mName;
+	// No other class has this id, it is DB-unique
+	int32_t mClassId;
+	// Only single derivation supported.
+	int32_t mBaseClass;
+	// If this class has properties that are of uniform type, then we note that.
+	// This means that when deserializing an array of these objects we can just use
+	// a single function to endian convert the entire mess at once.
+	int32_t mPackedUniformWidth;
+	// If this class is composed uniformly of members of a given type
+	// Or all of its properties are composed uniformly of members of
+	// a given type, then this class's packed type is that type.
+	// PxTransform's packed type would be float.
+	int32_t mPackedClassType;
+	// 0: 32Bit 1: 64Bit
+	ClassDescriptionSizeInfo mSizeInfo[2];
+	// No further property additions allowed.
+	bool mLocked;
+	// True when this datatype has an array on it that needs to be
+	// separately deleted.
+	bool mRequiresDestruction;
+
+	ClassDescription(NamespacedName name, int32_t id) // named class with no base (-1), unlocked, unpacked
+	: mName(name)
+	, mClassId(id)
+	, mBaseClass(-1)
+	, mPackedUniformWidth(-1)
+	, mPackedClassType(-1)
+	, mLocked(false)
+	, mRequiresDestruction(false)
+	{
+	}
+	ClassDescription() // same defaults as above, with mClassId == -1 and a default-constructed name
+	: mClassId(-1), mBaseClass(-1), mPackedUniformWidth(-1), mPackedClassType(-1), mLocked(false), mRequiresDestruction(false)
+	{
+	}
+	virtual ~ClassDescription()
+	{
+	}
+
+	ClassDescriptionSizeInfo& get32BitSizeInfo()
+	{
+		return mSizeInfo[0];
+	}
+	ClassDescriptionSizeInfo& get64BitSizeInfo()
+	{
+		return mSizeInfo[1];
+	}
+	uint32_t& get32BitSize() // mutable reference: callers can assign the 32-bit byte size through it
+	{
+		return get32BitSizeInfo().mByteSize;
+	}
+	uint32_t& get64BitSize() // mutable reference: callers can assign the 64-bit byte size through it
+	{
+		return get64BitSizeInfo().mByteSize;
+	}
+
+	uint32_t get32BitSize() const
+	{
+		return mSizeInfo[0].mByteSize;
+	}
+	const ClassDescriptionSizeInfo& getNativeSizeInfo() const
+	{
+		return mSizeInfo[(sizeof(void*) >> 2) - 1]; // sizeof(void*) == 4 -> index 0, == 8 -> index 1
+	}
+	uint32_t getNativeSize() const
+	{
+		return getNativeSizeInfo().mByteSize;
+	}
+};
+
+struct MarshalQueryResult // value returned by PvdObjectModelMetaData::checkMarshalling
+{
+	int32_t srcType;
+	int32_t dstType;
+	// If canMarshal != needsMarshalling we have a problem.
+	bool canMarshal;
+	bool needsMarshalling;
+	// Non null if marshalling is possible.
+	TBlockMarshaller marshaller;
+	MarshalQueryResult(int32_t _srcType = -1, int32_t _dstType = -1, bool _canMarshal = false, bool _needs = false,
+	                   TBlockMarshaller _m = NULL)
+	: srcType(_srcType), dstType(_dstType), canMarshal(_canMarshal), needsMarshalling(_needs), marshaller(_m)
+	{ // with all defaults: types -1, both flags false, no marshaller
+	}
+};
+
+struct PropertyMessageEntry // one property slot inside a property message
+{
+	PropertyDescription mProperty;
+	NamespacedName mDatatypeName;
+	// datatype of the data in the message.
+	int32_t mDatatypeId;
+	// where in the message this property starts.
+	uint32_t mMessageOffset;
+	// size of this entry object
+	uint32_t mByteSize;
+
+	// NOTE(review): original comment was cut off; presumably the destination size when no array properties are involved.
+	uint32_t mDestByteSize;
+
+	PropertyMessageEntry(PropertyDescription propName, NamespacedName dtypeName, int32_t dtype, uint32_t messageOff,
+	                     uint32_t byteSize, uint32_t destByteSize)
+	: mProperty(propName)
+	, mDatatypeName(dtypeName)
+	, mDatatypeId(dtype)
+	, mMessageOffset(messageOff)
+	, mByteSize(byteSize)
+	, mDestByteSize(destByteSize)
+	{
+	}
+	PropertyMessageEntry() : mDatatypeId(-1), mMessageOffset(0), mByteSize(0), mDestByteSize(0)
+	{ // datatype id -1, all sizes and offsets 0; mProperty and mDatatypeName are default-constructed
+	}
+};
+
+// Describes a message struct that carries a subset of the properties on an object.
+struct PropertyMessageDescription
+{
+	NamespacedName mClassName;
+	// No other class has this id, it is DB-unique
+	int32_t mClassId;
+	NamespacedName mMessageName;
+	int32_t mMessageId;
+	DataRef<PropertyMessageEntry> mProperties;
+	uint32_t mMessageByteSize;
+	// Offsets into the property message where const char* items are.
+	DataRef<uint32_t> mStringOffsets;
+	PropertyMessageDescription(const NamespacedName& nm, int32_t clsId, const NamespacedName& msgName, int32_t msgId,
+	                           uint32_t msgSize)
+	: mClassName(nm), mClassId(clsId), mMessageName(msgName), mMessageId(msgId), mMessageByteSize(msgSize)
+	{ // mProperties and mStringOffsets are left default-constructed by this constructor
+	}
+	PropertyMessageDescription() : mClassId(-1), mMessageId(-1), mMessageByteSize(0)
+	{ // ids -1, byte size 0
+	}
+	virtual ~PropertyMessageDescription()
+	{
+	}
+};
+
+class StringTable // abstract string registry; instances come from StringTable::create()
+{
+  protected:
+	virtual ~StringTable() // protected: code outside the hierarchy cannot delete through this interface
+	{
+	}
+
+  public:
+	virtual uint32_t getNbStrs() = 0;
+	virtual uint32_t getStrs(const char** outStrs, uint32_t bufLen, uint32_t startIdx = 0) = 0;
+	virtual const char* registerStr(const char* str, bool& outAdded) = 0;
+	const char* registerStr(const char* str) // convenience overload that discards the outAdded flag
+	{
+		bool ignored;
+		return registerStr(str, ignored);
+	}
+	virtual StringHandle strToHandle(const char* str) = 0;
+	virtual const char* handleToStr(uint32_t hdl) = 0;
+	virtual void release() = 0;
+
+	static StringTable& create();
+};
+
+struct None // empty tag type; Option's constructor taking None builds an Option without a value
+{
+};
+
+template <typename T>
+class Option // a T plus a flag saying whether it was set; T is always stored, so it must be default-constructible
+{
+	T mValue;
+	bool mHasValue;
+
+  public:
+	Option(const T& val) : mValue(val), mHasValue(true) // implicit on purpose: a T converts to a set Option
+	{
+	}
+	Option(None nothing = None()) : mHasValue(false) // also the default constructor; mValue is default-constructed
+	{
+		(void)nothing;
+	}
+	Option(const Option& other) : mValue(other.mValue), mHasValue(other.mHasValue)
+	{
+	}
+	Option& operator=(const Option& other)
+	{
+		mValue = other.mValue;
+		mHasValue = other.mHasValue;
+		return *this;
+	}
+	bool hasValue() const
+	{
+		return mHasValue;
+	}
+	const T& getValue() const
+	{
+		PX_ASSERT(hasValue()); // assert only: the stored T is still returned when no value was set
+		return mValue;
+	}
+	T& getValue()
+	{
+		PX_ASSERT(hasValue()); // assert only: the stored T is still returned when no value was set
+		return mValue;
+	}
+	operator const T&() const
+	{
+		return getValue();
+	}
+	operator T&()
+	{
+		return getValue();
+	}
+	T* operator->() // same assert as getValue(); callers should test hasValue() first
+	{
+		return &getValue();
+	}
+	const T* operator->() const
+	{
+		return &getValue();
+	}
+};
+
+/**
+ *	Registry of PVD classes, their properties and property messages.
+ *	Create new classes and add properties to some existing ones.
+ *	The default classes are created already: the simple types
+ *	(uint8_t, int8_t, etc ) along with the basic math types
+ *	(PxVec3, PxQuat, PxTransform, PxMat33, PxMat34, PxMat44).
+ */
+class PvdObjectModelMetaData
+{
+  protected:
+	virtual ~PvdObjectModelMetaData()
+	{
+	}
+
+  public:
+	virtual ClassDescription getOrCreateClass(const NamespacedName& nm) = 0;
+	// Get or create the parent, lock it, then derive the (get-or-created) child from it.
+	virtual bool deriveClass(const NamespacedName& parent, const NamespacedName& child) = 0;
+	virtual Option<ClassDescription> findClass(const NamespacedName& nm) const = 0;
+	template <typename TDataType>
+	Option<ClassDescription> findClass()
+	{
+		return findClass(getPvdNamespacedNameForType<TDataType>());
+	}
+	virtual Option<ClassDescription> getClass(int32_t classId) const = 0;
+	virtual ClassDescription* getClassPtr(int32_t classId) const = 0;
+
+	virtual Option<ClassDescription> getParentClass(int32_t classId) const = 0;
+	// True when classId equals parentClass or has parentClass somewhere in its base-class chain.
+	bool isDerivedFrom(int32_t classId, int32_t parentClass) const
+	{
+		if(classId == parentClass)
+			return true;
+		ClassDescription* p = getClassPtr(classId); // start at classId itself so an unknown id is never dereferenced
+		while(p != NULL)
+		{
+			if(p->mClassId == parentClass)
+				return true;
+			p = getClassPtr(p->mBaseClass);
+		}
+		return false;
+	}
+
+	virtual void lockClass(int32_t classId) = 0;
+
+	virtual uint32_t getNbClasses() const = 0;
+	virtual uint32_t getClasses(ClassDescription* outClasses, uint32_t requestCount, uint32_t startIndex = 0) const = 0;
+
+	// Create a nested property.
+	// This way you can have obj.p.x without explicitly defining the class p.
+	virtual Option<PropertyDescription> createProperty(int32_t classId, String name, String semantic, int32_t datatype,
+	                                                   PropertyType::Enum propertyType = PropertyType::Scalar) = 0;
+	Option<PropertyDescription> createProperty(NamespacedName clsId, String name, String semantic, NamespacedName dtype,
+	                                           PropertyType::Enum propertyType = PropertyType::Scalar)
+	{
+		return createProperty(findClass(clsId)->mClassId, name, semantic, findClass(dtype)->mClassId, propertyType);
+	}
+	Option<PropertyDescription> createProperty(NamespacedName clsId, String name, NamespacedName dtype,
+	                                           PropertyType::Enum propertyType = PropertyType::Scalar)
+	{
+		return createProperty(findClass(clsId)->mClassId, name, "", findClass(dtype)->mClassId, propertyType);
+	}
+	Option<PropertyDescription> createProperty(int32_t clsId, String name, int32_t dtype,
+	                                           PropertyType::Enum propertyType = PropertyType::Scalar)
+	{
+		return createProperty(clsId, name, "", dtype, propertyType);
+	}
+	template <typename TDataType>
+	Option<PropertyDescription> createProperty(int32_t clsId, String name, String semantic = "",
+	                                           PropertyType::Enum propertyType = PropertyType::Scalar)
+	{
+		return createProperty(clsId, name, semantic, getPvdNamespacedNameForType<TDataType>(), propertyType);
+	}
+	virtual Option<PropertyDescription> findProperty(const NamespacedName& cls, String prop) const = 0;
+	virtual Option<PropertyDescription> findProperty(int32_t clsId, String prop) const = 0;
+	virtual Option<PropertyDescription> getProperty(int32_t propId) const = 0;
+	virtual void setNamedPropertyValues(DataRef<NamedValue> values, int32_t propId) = 0;
+	// for enumerations and flags.
+	virtual DataRef<NamedValue> getNamedPropertyValues(int32_t propId) const = 0;
+
+	virtual uint32_t getNbProperties(int32_t classId) const = 0;
+	virtual uint32_t getProperties(int32_t classId, PropertyDescription* outBuffer, uint32_t bufCount,
+	                               uint32_t startIdx = 0) const = 0;
+
+	// Check that propId names a property owned by clsId or by one of its base classes.
+	// Returns the property description on success and an Option without a value otherwise.
+	Option<PropertyDescription> resolvePropertyPath(int32_t clsId, const int32_t propId) const
+	{
+		Option<PropertyDescription> prop(getProperty(propId));
+		if(prop.hasValue() == false)
+			return prop;
+		if(isDerivedFrom(clsId, prop.getValue().mOwnerClassId) == false)
+			return None();
+		return prop;
+	}
+	// Does one cls id differ marshalling to another and if so return the functions to do it.
+	virtual MarshalQueryResult checkMarshalling(int32_t srcClsId, int32_t dstClsId) const = 0;
+
+	// messages and classes are stored in separate maps, so a property message can have the same name as a class.
+	virtual Option<PropertyMessageDescription> createPropertyMessage(const NamespacedName& cls,
+	                                                                 const NamespacedName& msgName,
+	                                                                 DataRef<PropertyMessageArg> entries,
+	                                                                 uint32_t messageSize) = 0;
+	virtual Option<PropertyMessageDescription> findPropertyMessage(const NamespacedName& msgName) const = 0;
+	virtual Option<PropertyMessageDescription> getPropertyMessage(int32_t msgId) const = 0;
+
+	virtual uint32_t getNbPropertyMessages() const = 0;
+	virtual uint32_t getPropertyMessages(PropertyMessageDescription* msgBuf, uint32_t bufLen,
+	                                     uint32_t startIdx = 0) const = 0;
+
+	virtual StringTable& getStringTable() const = 0;
+
+	virtual void write(PvdOutputStream& stream) const = 0;
+	void save(PvdOutputStream& stream) const // non-virtual alias for write()
+	{
+		write(stream);
+	}
+
+	virtual PvdObjectModelMetaData& clone() const = 0;
+
+	virtual void addRef() = 0;
+	virtual void release() = 0;
+
+	static uint32_t getCurrentPvdObjectModelVersion();
+	static PvdObjectModelMetaData& create();
+	static PvdObjectModelMetaData& create(PvdInputStream& stream);
+};
+}
+}
+#endif // PXPVDSDK_PXPVDOBJECTMODELMETADATA_H
diff --git a/PxShared/src/pvd/src/PxPvdObjectRegistrar.cpp b/PxShared/src/pvd/src/PxPvdObjectRegistrar.cpp
new file mode 100644
index 0000000..67667f9
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdObjectRegistrar.cpp
@@ -0,0 +1,80 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PxPvdObjectRegistrar.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+bool ObjectRegistrar::addItem(const void* inItem)
+{
+	// Adds one reference to inItem. Returns true only when the item was not tracked before.
+	physx::shdfnd::Mutex::ScopedLock lock(mRefCountMapLock);
+
+	// One lookup serves both the test and the increment (same const_cast pattern as decItem).
+	const physx::shdfnd::HashMap<const void*, uint32_t>::Entry* entry = mRefCountMap.find(inItem);
+	if(entry)
+	{
+		++const_cast<uint32_t&>(entry->second);
+		return false;
+	}
+
+	mRefCountMap.insert(inItem, 1);
+	return true;
+}
+
+bool ObjectRegistrar::decItem(const void* inItem)
+{
+	// Drops one reference to inItem; returns true only when that removed the item from the map.
+	physx::shdfnd::Mutex::ScopedLock lock(mRefCountMapLock);
+	const physx::shdfnd::HashMap<const void*, uint32_t>::Entry* found = mRefCountMap.find(inItem);
+	if(!found)
+		return false;
+
+	uint32_t& count = const_cast<uint32_t&>(found->second);
+	if(count != 0)
+		--count;
+	if(count != 0)
+		return false;
+
+	// Last reference gone: forget the item.
+	mRefCountMap.erase(inItem);
+	return true;
+}
+
+void ObjectRegistrar::clear()
+{
+	physx::shdfnd::Mutex::ScopedLock lock(mRefCountMapLock); // same lock addItem/decItem take
+	mRefCountMap.clear(); // forgets every tracked item regardless of its count
+}
+
+} // pvdsdk
+} // physx
diff --git a/PxShared/src/pvd/src/PxPvdObjectRegistrar.h b/PxShared/src/pvd/src/PxPvdObjectRegistrar.h
new file mode 100644
index 0000000..dbd9ebc
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdObjectRegistrar.h
@@ -0,0 +1,71 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDOBJECTREGISTRAR_H
+#define PXPVDSDK_PXPVDOBJECTREGISTRAR_H
+
+/** \addtogroup pvd
+@{
+*/
+
+#include "PsHashMap.h"
+#include "PsMutex.h"
+
+#if !PX_DOXYGEN
+namespace physx
+{
+namespace pvdsdk
+{
+#endif
+class ObjectRegistrar // per-pointer reference counts kept in a map guarded by a mutex
+{
+	PX_NOCOPY(ObjectRegistrar)
+  public:
+	ObjectRegistrar()
+	{
+	}
+	virtual ~ObjectRegistrar()
+	{
+	}
+
+	bool addItem(const void* inItem); // true when inItem was not tracked yet (its count starts at 1)
+	bool decItem(const void* inItem); // true when the count reached 0 and the entry was erased
+	void clear();                     // drops all entries
+
+  private:
+	physx::shdfnd::HashMap<const void*, uint32_t> mRefCountMap; // item pointer -> reference count
+	physx::shdfnd::Mutex mRefCountMapLock;                       // taken by addItem, decItem and clear
+};
+#if !PX_DOXYGEN
+} // pvdsdk
+} // physx
+#endif
+
+/** @} */
+#endif // PXPVDSDK_PXPVDOBJECTREGISTRAR_H
diff --git a/PxShared/src/pvd/src/PxPvdProfileZoneClient.cpp b/PxShared/src/pvd/src/PxPvdProfileZoneClient.cpp
new file mode 100644
index 0000000..8d8582c
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdProfileZoneClient.cpp
@@ -0,0 +1,173 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "pvd/PxPvdTransport.h"
+
+#include "PxPvdImpl.h"
+#include "PxPvdProfileZoneClient.h"
+#include "PxProfileZone.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+struct ProfileZoneClient : public profile::PxProfileZoneClient, public shdfnd::UserAllocated
+{
+	profile::PxProfileZone& mZone; // zone this client is attached to
+	PvdDataStream& mStream;        // stream the zone's data is forwarded to
+
+	ProfileZoneClient(profile::PxProfileZone& zone, PvdDataStream& stream) : mZone(zone), mStream(stream)
+	{
+	}
+
+	~ProfileZoneClient()
+	{
+		mZone.removeClient(*this); // detach from the zone before this object goes away
+	}
+
+	virtual void createInstance()
+	{
+		mStream.addProfileZone(&mZone, mZone.getName());
+		mStream.createInstance(&mZone);
+		mZone.addClient(*this);
+		profile::PxProfileNames names(mZone.getProfileNames());
+		PVD_FOREACH(idx, names.eventCount) // forward every event name the zone already reports
+		{
+			handleEventAdded(names.events[idx]);
+		}
+	}
+
+	virtual void handleEventAdded(const profile::PxProfileEventName& inName)
+	{
+		mStream.addProfileZoneEvent(&mZone, inName.name, inName.eventId.eventId, inName.eventId.compileTimeEnabled);
+	}
+
+	virtual void handleBufferFlush(const uint8_t* inData, uint32_t inLength)
+	{
+		mStream.setPropertyValue(&mZone, "events", inData, inLength); // buffer is sent as the "events" property
+	}
+
+	virtual void handleClientRemoved()
+	{
+		mStream.destroyInstance(&mZone);
+	}
+
+  private:
+	ProfileZoneClient& operator=(const ProfileZoneClient&); // declared only: assignment is disabled
+};
+}
+}
+
+using namespace physx;
+using namespace pvdsdk;
+
+PvdProfileZoneClient::PvdProfileZoneClient(PvdImpl& pvd) : mSDKPvd(pvd), mPvdDataStream(NULL), mIsConnected(false)
+{ // starts disconnected with no data stream; the stream is created in onPvdConnected()
+}
+
+PvdProfileZoneClient::~PvdProfileZoneClient()
+{
+	mSDKPvd.removeClient(this);
+	// All zones should have been removed by now.
+	PX_ASSERT(mProfileZoneClients.size() == 0);
+}
+
+PvdDataStream* PvdProfileZoneClient::getDataStream()
+{
+	return mPvdDataStream; // NULL until onPvdConnected() and again after onPvdDisconnected()
+}
+
+PvdUserRenderer* PvdProfileZoneClient::getUserRender()
+{
+	PX_ASSERT(0); // this client has no user renderer: calling this asserts
+	return NULL;
+}
+
+void PvdProfileZoneClient::setObjectRegistrar(ObjectRegistrar*)
+{ // no-op: the registrar argument is ignored
+}
+
+bool PvdProfileZoneClient::isConnected() const
+{
+	return mIsConnected; // set in onPvdConnected(), cleared in onPvdDisconnected()
+}
+
+void PvdProfileZoneClient::onPvdConnected()
+{
+	// A repeated notification while already connected is ignored.
+	if(!mIsConnected)
+	{
+		mIsConnected = true;
+		mPvdDataStream = PvdDataStream::create(&mSDKPvd);
+	}
+}
+
+void PvdProfileZoneClient::onPvdDisconnected()
+{
+	if(mIsConnected)
+	{
+		// Clear the flag first, then flush the zones while the stream still exists.
+		mIsConnected = false;
+		flush();
+		mPvdDataStream->release();
+		mPvdDataStream = NULL;
+	}
+}
+
+void PvdProfileZoneClient::flush()
+{
+	PVD_FOREACH(idx, mProfileZoneClients.size()) // every registered zone; mMutex is not taken here
+	mProfileZoneClients[idx]->mZone.flushProfileEvents();
+}
+
+void PvdProfileZoneClient::onZoneAdded(profile::PxProfileZone& zone)
+{
+	PX_ASSERT(mIsConnected);
+	ProfileZoneClient* added = PVD_NEW(ProfileZoneClient)(zone, *mPvdDataStream);
+	// Stream setup and list growth happen under mMutex; the scoped lock releases it on return.
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+	added->createInstance();
+	mProfileZoneClients.pushBack(added);
+}
+
+void PvdProfileZoneClient::onZoneRemoved(profile::PxProfileZone& zone)
+{
+	// Hold mMutex across the search as well: onZoneAdded may grow the array from another thread.
+	shdfnd::Mutex::ScopedLock lock(mMutex);
+	for(uint32_t i = 0; i < mProfileZoneClients.size(); i++)
+	{
+		if(&zone == &mProfileZoneClients[i]->mZone)
+		{
+			ProfileZoneClient* client = mProfileZoneClients[i];
+			mProfileZoneClients.replaceWithLast(i); // order is not preserved; the last entry fills the gap
+			PVD_DELETE(client);                      // the destructor detaches the client from its zone
+			return;
+		}
+	}
+}
diff --git a/PxShared/src/pvd/src/PxPvdProfileZoneClient.h b/PxShared/src/pvd/src/PxPvdProfileZoneClient.h
new file mode 100644
index 0000000..4484997
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdProfileZoneClient.h
@@ -0,0 +1,77 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef PXPVDSDK_PXPVDPROFILEZONECLIENT_H
+#define PXPVDSDK_PXPVDPROFILEZONECLIENT_H
+#include "PxPvdClient.h"
+#include "PsHashMap.h"
+#include "PsMutex.h"
+#include "PxProfileZoneManager.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+class PvdImpl;
+class PvdDataStream;
+
+struct ProfileZoneClient;
+
+class PvdProfileZoneClient : public PvdClient, public profile::PxProfileZoneHandler, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(PvdProfileZoneClient)
+  public:
+	PvdProfileZoneClient(PvdImpl& pvd);
+	virtual ~PvdProfileZoneClient();
+
+	bool isConnected() const;
+	void onPvdConnected();
+	void onPvdDisconnected();
+	void flush();
+
+	PvdDataStream* getDataStream();
+	PvdUserRenderer* getUserRender();
+	void setObjectRegistrar(ObjectRegistrar*);
+
+	// PxProfileZoneHandler
+	void onZoneAdded(profile::PxProfileZone& inSDK);
+	void onZoneRemoved(profile::PxProfileZone& inSDK);
+
+  private:
+	shdfnd::Mutex mMutex; // onZoneAdded can be called from different threads
+	PvdImpl& mSDKPvd;
+	PvdDataStream* mPvdDataStream;	
+	physx::shdfnd::Array<ProfileZoneClient*> mProfileZoneClients;
+	bool mIsConnected;
+};
+
+} // namespace pvdsdk
+} // namespace physx
+
+#endif // PXPVDSDK_PXPVDPROFILEZONECLIENT_H
diff --git a/PxShared/src/pvd/src/PxPvdUserRenderImpl.h b/PxShared/src/pvd/src/PxPvdUserRenderImpl.h
new file mode 100644
index 0000000..04574e9
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdUserRenderImpl.h
@@ -0,0 +1,411 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#ifndef PXPVDSDK_PXPVDUSERRENDERIMPL_H
+#define PXPVDSDK_PXPVDUSERRENDERIMPL_H
+
+#include "PxPvdUserRenderer.h"
+
+namespace physx
+{
+namespace pvdsdk
+{
+
+// Numeric identifiers for the user-render event types. Unknown is 0; the
+// remaining enumerators are generated, in list order, from the
+// DECLARE_PVD_IMMEDIATE_RENDER_TYPE entries in PxPvdUserRenderTypes.h.
+struct PvdUserRenderTypes
+{
+	enum Enum
+	{
+		Unknown = 0,
+// Each list entry expands to one enumerator; the NO_COMMA form drops the
+// trailing comma for the entry that uses it.
+#define DECLARE_PVD_IMMEDIATE_RENDER_TYPE(type) type,
+#define DECLARE_PVD_IMMEDIATE_RENDER_TYPE_NO_COMMA(type) type
+#include "PxPvdUserRenderTypes.h"
+#undef DECLARE_PVD_IMMEDIATE_RENDER_TYPE_NO_COMMA
+#undef DECLARE_PVD_IMMEDIATE_RENDER_TYPE
+	};
+};
+
+// Abstract serializer for user-render events. Concrete subclasses implement
+// the pure-virtual primitives (reading or writing); the non-virtual helpers
+// below decompose compound types into those primitives. Every streamify takes
+// its argument by reference so the same call can either emit or fill the value.
+class RenderSerializer
+{
+  protected:
+	virtual ~RenderSerializer() {}
+
+  public:
+	// Primitives supplied by the concrete serializer.
+	virtual void streamify(uint64_t& val) = 0;
+	virtual void streamify(float& val) = 0;
+	virtual void streamify(uint32_t& val) = 0;
+	virtual void streamify(uint8_t& val) = 0;
+	virtual void streamify(DataRef<uint8_t>& val) = 0;
+	virtual void streamify(DataRef<PvdDebugPoint>& val) = 0;
+	virtual void streamify(DataRef<PvdDebugLine>& val) = 0;
+	virtual void streamify(DataRef<PvdDebugTriangle>& val) = 0;
+	virtual void streamify(PvdDebugText& val) = 0;
+	virtual bool isGood() = 0;
+	virtual uint32_t hasData() = 0;
+
+	// The event type travels as a single byte.
+	void streamify(PvdUserRenderTypes::Enum& val)
+	{
+		uint8_t rawTag = static_cast<uint8_t>(val);
+		streamify(rawTag);
+		val = static_cast<PvdUserRenderTypes::Enum>(rawTag);
+	}
+
+	// Vector components go out in index order 0, 1, 2.
+	void streamify(PxVec3& val)
+	{
+		streamify(val[0]);
+		streamify(val[1]);
+		streamify(val[2]);
+	}
+
+	// Colour channels go out in r, g, b, a order.
+	void streamify(PvdColor& val)
+	{
+		streamify(val.r);
+		streamify(val.g);
+		streamify(val.b);
+		streamify(val.a);
+	}
+
+	// Rotation (x, y, z, w) first, then translation (x, y, z).
+	void streamify(PxTransform& val)
+	{
+		PxQuat& rotation = val.q;
+		streamify(rotation.x);
+		streamify(rotation.y);
+		streamify(rotation.z);
+		streamify(rotation.w);
+
+		PxVec3& translation = val.p;
+		streamify(translation.x);
+		streamify(translation.y);
+		streamify(translation.z);
+	}
+
+	// Booleans travel as one byte: 1 for true, 0 for false; any non-zero byte reads back as true.
+	void streamify(bool& val)
+	{
+		uint8_t asByte = val ? uint8_t(1) : uint8_t(0);
+		streamify(asByte);
+		val = (asByte != 0);
+	}
+};
+
+// Event whose payload is a single DataRef of TBulkRenderType items.
+template <typename TBulkRenderType>
+struct BulkRenderEvent
+{
+	DataRef<TBulkRenderType> mData;
+
+	BulkRenderEvent() {}
+	BulkRenderEvent(const TBulkRenderType* items, uint32_t itemCount) : mData(items, itemCount) {}
+
+	// Streams the item range through the serializer.
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(mData);
+	}
+};
+// Event carrying a 64-bit instance id.
+struct SetInstanceIdRenderEvent
+{
+	uint64_t mInstanceId;
+
+	// Leaves mInstanceId unset; it is filled in by serialize() when reading.
+	SetInstanceIdRenderEvent() {}
+	SetInstanceIdRenderEvent(uint64_t instanceId) : mInstanceId(instanceId) {}
+
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(mInstanceId);
+	}
+};
+// Bulk event whose items are PvdDebugPoint.
+struct PointsRenderEvent : BulkRenderEvent<PvdDebugPoint>
+{
+	PointsRenderEvent() {}
+	PointsRenderEvent(const PvdDebugPoint* points, uint32_t pointCount)
+	: BulkRenderEvent<PvdDebugPoint>(points, pointCount)
+	{
+	}
+};
+// Bulk event whose items are PvdDebugLine.
+struct LinesRenderEvent : BulkRenderEvent<PvdDebugLine>
+{
+	LinesRenderEvent() {}
+	LinesRenderEvent(const PvdDebugLine* lines, uint32_t lineCount)
+	: BulkRenderEvent<PvdDebugLine>(lines, lineCount)
+	{
+	}
+};
+// Bulk event whose items are PvdDebugTriangle.
+struct TrianglesRenderEvent : BulkRenderEvent<PvdDebugTriangle>
+{
+	TrianglesRenderEvent() {}
+	TrianglesRenderEvent(const PvdDebugTriangle* triangles, uint32_t triangleCount)
+	: BulkRenderEvent<PvdDebugTriangle>(triangles, triangleCount)
+	{
+	}
+};
+// Event bundling three ranges: points, lines and triangles.
+struct DebugRenderEvent
+{
+	DataRef<PvdDebugPoint> mPointData;
+	DataRef<PvdDebugLine> mLineData;
+	DataRef<PvdDebugTriangle> mTriangleData;
+
+	DebugRenderEvent() {}
+
+	DebugRenderEvent(const PvdDebugPoint* points, uint32_t numPoints, const PvdDebugLine* lines, uint32_t numLines,
+	                 const PvdDebugTriangle* triangles, uint32_t numTriangles)
+	: mPointData(points, numPoints), mLineData(lines, numLines), mTriangleData(triangles, numTriangles)
+	{
+	}
+
+	// Streaming order is points, lines, triangles.
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(mPointData);
+		stream.streamify(mLineData);
+		stream.streamify(mTriangleData);
+	}
+};
+
+// Event carrying one PvdDebugText.
+struct TextRenderEvent
+{
+	PvdDebugText mText;
+
+	TextRenderEvent() {}
+
+	// Copies the four fields individually; the string pointer is copied, not the characters.
+	TextRenderEvent(const PvdDebugText& source)
+	{
+		mText.color = source.color;
+		mText.position = source.position;
+		mText.size = source.size;
+		mText.string = source.string;
+	}
+
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(mText);
+	}
+};
+
+// Event carrying a parent and a child transform.
+struct JointFramesRenderEvent
+{
+	PxTransform parent;
+	PxTransform child;
+
+	JointFramesRenderEvent() {}
+	JointFramesRenderEvent(const PxTransform& parentFrame, const PxTransform& childFrame)
+	: parent(parentFrame), child(childFrame)
+	{
+	}
+
+	// Streaming order is parent, then child.
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(parent);
+		stream.streamify(child);
+	}
+};
+// Event carrying two transforms, a float value and an active flag.
+struct LinearLimitRenderEvent
+{
+	PxTransform t0;
+	PxTransform t1;
+	float value;
+	bool active;
+
+	LinearLimitRenderEvent() {}
+
+	LinearLimitRenderEvent(const PxTransform& frame0, const PxTransform& frame1, float limitValue, bool isActive)
+	: t0(frame0), t1(frame1), value(limitValue), active(isActive)
+	{
+	}
+
+	// Streaming order is t0, t1, value, active.
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(t0);
+		stream.streamify(t1);
+		stream.streamify(value);
+		stream.streamify(active);
+	}
+};
+// Event carrying a transform, lower and upper float bounds and an active flag.
+struct AngularLimitRenderEvent
+{
+	PxTransform t0;
+	float lower;
+	float upper;
+	bool active;
+
+	AngularLimitRenderEvent() {}
+
+	AngularLimitRenderEvent(const PxTransform& frame, float lowerBound, float upperBound, bool isActive)
+	: t0(frame), lower(lowerBound), upper(upperBound), active(isActive)
+	{
+	}
+
+	// Streaming order is t0, lower, upper, active.
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(t0);
+		stream.streamify(lower);
+		stream.streamify(upper);
+		stream.streamify(active);
+	}
+};
+// Event carrying a transform, y/z swing values and an active flag.
+struct LimitConeRenderEvent
+{
+	PxTransform t;
+	float ySwing;
+	float zSwing;
+	bool active;
+
+	LimitConeRenderEvent() {}
+
+	LimitConeRenderEvent(const PxTransform& frame, float swingY, float swingZ, bool isActive)
+	: t(frame), ySwing(swingY), zSwing(swingZ), active(isActive)
+	{
+	}
+
+	// Streaming order is t, ySwing, zSwing, active.
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(t);
+		stream.streamify(ySwing);
+		stream.streamify(zSwing);
+		stream.streamify(active);
+	}
+};
+// Event carrying a transform, an angle and an active flag.
+struct DoubleConeRenderEvent
+{
+	PxTransform t;
+	float angle;
+	bool active;
+
+	DoubleConeRenderEvent() {}
+
+	DoubleConeRenderEvent(const PxTransform& frame, float coneAngle, bool isActive)
+	: t(frame), angle(coneAngle), active(isActive)
+	{
+	}
+
+	// Streaming order is t, angle, active.
+	void serialize(RenderSerializer& stream)
+	{
+		stream.streamify(t);
+		stream.streamify(angle);
+		stream.streamify(active);
+	}
+};
+
+// Default mapping: the data type serializes itself through its own serialize() member.
+template <typename TDataType>
+struct RenderSerializerMap
+{
+	void serialize(RenderSerializer& serializer, TDataType& item)
+	{
+		item.serialize(serializer);
+	}
+};
+// Bytes have no serialize() member, so they are streamed directly.
+template <>
+struct RenderSerializerMap<uint8_t>
+{
+	void serialize(RenderSerializer& serializer, uint8_t& byteValue)
+	{
+		serializer.streamify(byteValue);
+	}
+};
+
+// Field-wise streaming of a debug point: position, then colour.
+template <>
+struct RenderSerializerMap<PvdDebugPoint>
+{
+	void serialize(RenderSerializer& serializer, PvdDebugPoint& point)
+	{
+		serializer.streamify(point.pos);
+		serializer.streamify(point.color);
+	}
+};
+
+// Field-wise streaming of a debug line: (pos, colour) for endpoint 0, then endpoint 1.
+template <>
+struct RenderSerializerMap<PvdDebugLine>
+{
+	void serialize(RenderSerializer& serializer, PvdDebugLine& line)
+	{
+		serializer.streamify(line.pos0);
+		serializer.streamify(line.color0);
+
+		serializer.streamify(line.pos1);
+		serializer.streamify(line.color1);
+	}
+};
+
+// Field-wise streaming of a debug triangle: (pos, colour) for vertices 0, 1 and 2 in turn.
+template <>
+struct RenderSerializerMap<PvdDebugTriangle>
+{
+	void serialize(RenderSerializer& serializer, PvdDebugTriangle& triangle)
+	{
+		serializer.streamify(triangle.pos0);
+		serializer.streamify(triangle.color0);
+
+		serializer.streamify(triangle.pos1);
+		serializer.streamify(triangle.color1);
+
+		serializer.streamify(triangle.pos2);
+		serializer.streamify(triangle.color2);
+	}
+};
+
+// Primary template for the event-type -> enum mapping. It deliberately has no
+// EnumVal member, so using it with a type that has no specialisation fails to
+// compile wherever EnumVal is referenced.
+template <typename TDataType>
+struct PvdTypeToRenderType
+{
+	bool compile_error;
+};
+
+// For every entry X in PxPvdUserRenderTypes.h, specialise PvdTypeToRenderType
+// for XRenderEvent so that EnumVal equals PvdUserRenderTypes::X.
+#define DECLARE_PVD_IMMEDIATE_RENDER_TYPE(type)                                                                        \
+	template <>                                                                                                        \
+	struct PvdTypeToRenderType<type##RenderEvent>                                                                      \
+	{                                                                                                                  \
+		enum Enum                                                                                                      \
+		{                                                                                                              \
+			EnumVal = PvdUserRenderTypes::type                                                                         \
+		};                                                                                                             \
+	};
+
+#include "PxPvdUserRenderTypes.h"
+#undef DECLARE_PVD_IMMEDIATE_RENDER_TYPE
+
+// Returns the PvdUserRenderTypes tag associated with the event type TDataType.
+template <typename TDataType>
+PvdUserRenderTypes::Enum getPvdRenderTypeFromType()
+{
+	typedef PvdTypeToRenderType<TDataType> TypeMapping;
+	return static_cast<PvdUserRenderTypes::Enum>(TypeMapping::EnumVal);
+}
+
+// Receiver interface for parsed render events. The destructor is protected, so
+// instances cannot be deleted through this interface.
+class PvdUserRenderHandler
+{
+  protected:
+	virtual ~PvdUserRenderHandler()
+	{
+	}
+
+  public:
+// Declares one pure-virtual handleRenderEvent overload per entry in
+// PxPvdUserRenderTypes.h, taking the matching <type>RenderEvent.
+#define DECLARE_PVD_IMMEDIATE_RENDER_TYPE(type) virtual void handleRenderEvent(const type##RenderEvent& evt) = 0;
+
+#include "PxPvdUserRenderTypes.h"
+#undef DECLARE_PVD_IMMEDIATE_RENDER_TYPE
+};
+
+// Interface for decoding a serialized render-event byte range and dispatching
+// each event to a PvdUserRenderHandler. The destructor is protected; use
+// release() to dispose of an instance obtained from create().
+class PvdUserRenderParser
+{
+  protected:
+	virtual ~PvdUserRenderParser()
+	{
+	}
+
+  public:
+	virtual void release() = 0;
+	// Decodes the events contained in data and forwards them to handler.
+	virtual void parseData(DataRef<const uint8_t> data, PvdUserRenderHandler& handler) = 0;
+
+	// Creates a parser; swapBytes selects the byte-swapping reader variant.
+	static PvdUserRenderParser& create(bool swapBytes);
+};
+}
+}
+
+#endif // PXPVDSDK_PXPVDUSERRENDERIMPL_H
diff --git a/PxShared/src/pvd/src/PxPvdUserRenderTypes.h b/PxShared/src/pvd/src/PxPvdUserRenderTypes.h
new file mode 100644
index 0000000..6a47abb
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdUserRenderTypes.h
@@ -0,0 +1,46 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// X-macro list of render event types. This header is included several times
+// with different definitions of DECLARE_PVD_IMMEDIATE_RENDER_TYPE, which is
+// why it has no include guard.
+#define THERE_IS_NO_INCLUDE_GUARD_HERE_FOR_A_REASON
+
+// Includers that do not care about the trailing comma get the plain macro for the last entry.
+#ifndef DECLARE_PVD_IMMEDIATE_RENDER_TYPE_NO_COMMA
+#define DECLARE_PVD_IMMEDIATE_RENDER_TYPE_NO_COMMA DECLARE_PVD_IMMEDIATE_RENDER_TYPE
+#endif
+
+// NOTE: the order of these entries determines the PvdUserRenderTypes enum
+// values, which are written to the event stream as the type tag.
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(SetInstanceId)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(Points)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(Lines)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(Triangles)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(JointFrames)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(LinearLimit)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(AngularLimit)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(LimitCone)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(DoubleCone)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE(Text)
+DECLARE_PVD_IMMEDIATE_RENDER_TYPE_NO_COMMA(Debug)
+
+#undef DECLARE_PVD_IMMEDIATE_RENDER_TYPE_NO_COMMA
+#undef THERE_IS_NO_INCLUDE_GUARD_HERE_FOR_A_REASON
diff --git a/PxShared/src/pvd/src/PxPvdUserRenderer.cpp b/PxShared/src/pvd/src/PxPvdUserRenderer.cpp
new file mode 100644
index 0000000..784d115
--- /dev/null
+++ b/PxShared/src/pvd/src/PxPvdUserRenderer.cpp
@@ -0,0 +1,460 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+#include "PxPvdUserRenderer.h"
+#include "PxPvdUserRenderImpl.h"
+#include "PxPvdFoundation.h"
+#include "PxPvdInternalByteStreams.h"
+#include "PxPvdBits.h"
+#include "PxPvdCommStreamTypes.h"
+#include <stdarg.h>
+
+using namespace physx;
+using namespace physx::pvdsdk;
+
+namespace
+{
+
+// RenderSerializer implementation that writes values to TStreamType as raw
+// bytes in host byte order. TStreamType must provide
+// write(const uint8_t*, uint32_t).
+template <typename TStreamType>
+struct RenderWriter : public RenderSerializer
+{
+	TStreamType& mStream;
+	RenderWriter(TStreamType& stream) : mStream(stream)
+	{
+	}
+
+	// Writes count consecutive TDataType objects as one raw byte run.
+	template <typename TDataType>
+	void write(const TDataType* val, uint32_t count)
+	{
+		uint32_t numBytes = count * sizeof(TDataType);
+		mStream.write(reinterpret_cast<const uint8_t*>(val), numBytes);
+	}
+
+	// Writes a single object as raw bytes.
+	template <typename TDataType>
+	void write(const TDataType& val)
+	{
+		write(&val, 1);
+	}
+
+	// Writes a 32-bit element count followed by the elements (nothing more when the range is empty).
+	template <typename TDataType>
+	void writeRef(DataRef<TDataType>& val)
+	{
+		uint32_t amount = val.size();
+		write(amount);
+		if(amount)
+			write(val.begin(), amount);
+	}
+
+	virtual void streamify(uint64_t& val)
+	{
+		write(val);
+	}
+	virtual void streamify(uint32_t& val)
+	{
+		write(val);
+	}
+	virtual void streamify(float& val)
+	{
+		write(val);
+	}
+	virtual void streamify(uint8_t& val)
+	{
+		write(val);
+	}
+	virtual void streamify(DataRef<uint8_t>& val)
+	{
+		writeRef(val);
+	}
+
+	// Writes color, position, size, then a 32-bit length (including the
+	// terminating NUL) followed by the characters.
+	virtual void streamify(PvdDebugText& val)
+	{
+		write(val.color);
+		write(val.position);
+		write(val.size);
+
+		// A null string is sent as an empty string instead of being passed to strlen.
+		const char* text = val.string ? val.string : "";
+
+		// The length counts the terminating NUL, so it is always at least 1.
+		uint32_t amount = static_cast<uint32_t>(strlen(text)) + 1;
+		write(amount);
+		write(text, amount);
+	}
+
+	virtual void streamify(DataRef<PvdDebugPoint>& val)
+	{
+		writeRef(val);
+	}
+	virtual void streamify(DataRef<PvdDebugLine>& val)
+	{
+		writeRef(val);
+	}
+	virtual void streamify(DataRef<PvdDebugTriangle>& val)
+	{
+		writeRef(val);
+	}
+
+	// A writer never has pending input.
+	virtual uint32_t hasData()
+	{
+		return false;
+	}
+	virtual bool isGood()
+	{
+		return true;
+	}
+
+  private:
+	// Not assignable: the struct holds a reference member.
+	RenderWriter& operator=(const RenderWriter&);
+};
+
+// PvdUserRenderer implementation: every draw/visualize call is serialized
+// into mBuffer, and the accumulated bytes are handed to the registered client
+// when the buffer is flushed.
+struct UserRenderer : public PvdUserRenderer
+{
+	ForwardingMemoryBuffer mBuffer;
+	uint32_t mBufferCapacity; // size threshold at which handleEvent flushes
+	RendererEventClient* mClient;
+
+	UserRenderer(uint32_t bufferFullAmount)
+	: mBuffer("UserRenderBuffer"), mBufferCapacity(bufferFullAmount), mClient(NULL)
+	{
+	}
+	virtual ~UserRenderer() {}
+
+	virtual void release()
+	{
+		PVD_DELETE(this);
+	}
+
+	// Appends the event's type tag and payload to mBuffer, then flushes once
+	// the buffer size has reached mBufferCapacity.
+	template <typename TEventType>
+	void handleEvent(TEventType evt)
+	{
+		RenderWriter<ForwardingMemoryBuffer> bufferWriter(mBuffer);
+		// Go through the base interface so its non-virtual streamify helpers are visible.
+		RenderSerializer& serializer = bufferWriter;
+
+		PvdUserRenderTypes::Enum typeTag = getPvdRenderTypeFromType<TEventType>();
+		serializer.streamify(typeTag);
+		evt.serialize(serializer);
+
+		const bool reachedCapacity = mBuffer.size() >= mBufferCapacity;
+		if(reachedCapacity)
+			flushRenderEvents();
+	}
+
+	virtual void setInstanceId(const void* iid)
+	{
+		handleEvent(SetInstanceIdRenderEvent(PVD_POINTER_TO_U64(iid)));
+	}
+
+	// Draw these points associated with this instance
+	virtual void drawPoints(const PvdDebugPoint* points, uint32_t count)
+	{
+		handleEvent(PointsRenderEvent(points, count));
+	}
+
+	// Draw these lines associated with this instance
+	virtual void drawLines(const PvdDebugLine* lines, uint32_t count)
+	{
+		handleEvent(LinesRenderEvent(lines, count));
+	}
+
+	// Draw these triangles associated with this instance
+	virtual void drawTriangles(const PvdDebugTriangle* triangles, uint32_t count)
+	{
+		handleEvent(TrianglesRenderEvent(triangles, count));
+	}
+
+	virtual void drawText(const PvdDebugText& text)
+	{
+		handleEvent(TextRenderEvent(text));
+	}
+
+	// Queues points, lines and triangles together as a single Debug event.
+	virtual void drawRenderbuffer(const PvdDebugPoint* pointData, uint32_t pointCount, const PvdDebugLine* lineData,
+	                              uint32_t lineCount, const PvdDebugTriangle* triangleData, uint32_t triangleCount)
+	{
+		handleEvent(DebugRenderEvent(pointData, pointCount, lineData, lineCount, triangleData, triangleCount));
+	}
+
+	// Constraint visualization routines
+	virtual void visualizeJointFrames(const PxTransform& parent, const PxTransform& child)
+	{
+		handleEvent(JointFramesRenderEvent(parent, child));
+	}
+
+	virtual void visualizeLinearLimit(const PxTransform& t0, const PxTransform& t1, float value, bool active)
+	{
+		handleEvent(LinearLimitRenderEvent(t0, t1, value, active));
+	}
+
+	virtual void visualizeAngularLimit(const PxTransform& t0, float lower, float upper, bool active)
+	{
+		handleEvent(AngularLimitRenderEvent(t0, lower, upper, active));
+	}
+
+	virtual void visualizeLimitCone(const PxTransform& t, float ySwing, float zSwing, bool active)
+	{
+		handleEvent(LimitConeRenderEvent(t, ySwing, zSwing, active));
+	}
+
+	virtual void visualizeDoubleCone(const PxTransform& t, float angle, bool active)
+	{
+		handleEvent(DoubleConeRenderEvent(t, angle, active));
+	}
+
+	// Hand the buffered bytes to the client (if one is set), then clear the immediate buffer.
+	virtual void flushRenderEvents()
+	{
+		if(mClient)
+			mClient->handleBufferFlush(mBuffer.begin(), mBuffer.size());
+		mBuffer.clear();
+	}
+
+	virtual void setClient(RendererEventClient* client)
+	{
+		mClient = client;
+	}
+
+  private:
+	UserRenderer& operator=(const UserRenderer&);
+};
+
+// RenderSerializer implementation that reads values from a byte range without
+// byte swapping. Variable-sized payloads (arrays, text) are copied into
+// mBuffer, and the resulting DataRef / string pointer refers to that storage.
+template <bool swapBytes>
+struct RenderReader : public RenderSerializer
+{
+	MemPvdInputStream mStream;
+	ForwardingMemoryBuffer& mBuffer;
+
+	RenderReader(ForwardingMemoryBuffer& buf) : mBuffer(buf)
+	{
+	}
+
+	// Points the input stream at the given byte range.
+	void setData(DataRef<const uint8_t> data)
+	{
+		mStream.setup(const_cast<uint8_t*>(data.begin()), const_cast<uint8_t*>(data.end()));
+	}
+	virtual void streamify(uint32_t& val)
+	{
+		mStream >> val;
+	}
+	virtual void streamify(uint64_t& val)
+	{
+		mStream >> val;
+	}
+	virtual void streamify(float& val)
+	{
+		mStream >> val;
+	}
+	virtual void streamify(uint8_t& val)
+	{
+		mStream >> val;
+	}
+
+	// Reads a 32-bit element count followed by that many raw elements.
+	template <typename TDataType>
+	void readRef(DataRef<TDataType>& val)
+	{
+		// Start from zero so the count is well defined even if the stream
+		// cannot supply it (same convention as the text path below).
+		uint32_t count = 0;
+		mStream >> count;
+		uint32_t numBytes = sizeof(TDataType) * count;
+
+		TDataType* dataPtr = reinterpret_cast<TDataType*>(mBuffer.growBuf(numBytes));
+		mStream.read(reinterpret_cast<uint8_t*>(dataPtr), numBytes);
+		val = DataRef<TDataType>(dataPtr, count);
+	}
+
+	virtual void streamify(DataRef<PvdDebugPoint>& val)
+	{
+		readRef(val);
+	}
+	virtual void streamify(DataRef<PvdDebugLine>& val)
+	{
+		readRef(val);
+	}
+	virtual void streamify(DataRef<PvdDebugTriangle>& val)
+	{
+		readRef(val);
+	}
+
+	// Reads color, position, size, then a 32-bit length and that many
+	// characters; val.string ends up pointing into mBuffer.
+	virtual void streamify(PvdDebugText& val)
+	{
+		mStream >> val.color;
+		mStream >> val.position;
+		mStream >> val.size;
+
+		uint32_t len = 0;
+		mStream >> len;
+
+		uint8_t* dataPtr = mBuffer.growBuf(len);
+		mStream.read(dataPtr, len);
+		val.string = reinterpret_cast<const char*>(dataPtr);
+	}
+	virtual void streamify(DataRef<uint8_t>& val)
+	{
+		readRef(val);
+	}
+	virtual bool isGood()
+	{
+		return mStream.isGood();
+	}
+
+	// Non-zero while unread bytes remain in the stream.
+	virtual uint32_t hasData()
+	{
+		return uint32_t(mStream.size() > 0);
+	}
+
+  private:
+	// Not assignable: the struct holds a reference member.
+	RenderReader& operator=(const RenderReader&);
+};
+
+// Byte-swapping specialisation of RenderReader: every multi-byte scalar read
+// from the stream is passed through swapBytes(). Arrays are decoded element by
+// element via RenderSerializerMap so each field is swapped individually.
+template <>
+struct RenderReader<true> : public RenderSerializer
+{
+	MemPvdInputStream mStream;
+	ForwardingMemoryBuffer& mBuffer;
+	RenderReader(ForwardingMemoryBuffer& buf) : mBuffer(buf)
+	{
+	}
+
+	// Points the input stream at the given byte range.
+	void setData(DataRef<const uint8_t> data)
+	{
+		mStream.setup(const_cast<uint8_t*>(data.begin()), const_cast<uint8_t*>(data.end()));
+	}
+
+	// Reads one value and swaps its bytes in place.
+	template <typename TDataType>
+	void read(TDataType& val)
+	{
+		mStream >> val;
+		swapBytes(val);
+	}
+	virtual void streamify(uint64_t& val)
+	{
+		read(val);
+	}
+	virtual void streamify(uint32_t& val)
+	{
+		read(val);
+	}
+	virtual void streamify(float& val)
+	{
+		read(val);
+	}
+	virtual void streamify(uint8_t& val)
+	{
+		read(val);
+	}
+
+	// Reads a swapped 32-bit element count, then decodes each element field by field.
+	template <typename TDataType>
+	void readRef(DataRef<TDataType>& val)
+	{
+		// Start from zero so the count is well defined even if the stream
+		// cannot supply it.
+		uint32_t count = 0;
+		mStream >> count;
+		swapBytes(count);
+		uint32_t numBytes = sizeof(TDataType) * count;
+
+		TDataType* dataPtr = reinterpret_cast<TDataType*>(mBuffer.growBuf(numBytes));
+		PVD_FOREACH(idx, count)
+		RenderSerializerMap<TDataType>().serialize(*this, dataPtr[idx]);
+		val = DataRef<TDataType>(dataPtr, count);
+	}
+
+	virtual void streamify(DataRef<PvdDebugPoint>& val)
+	{
+		readRef(val);
+	}
+	virtual void streamify(DataRef<PvdDebugLine>& val)
+	{
+		readRef(val);
+	}
+	virtual void streamify(DataRef<PvdDebugTriangle>& val)
+	{
+		readRef(val);
+	}
+
+	// Reads color, position, size, then a 32-bit length and that many
+	// characters; val.string ends up pointing into mBuffer.
+	virtual void streamify(PvdDebugText& val)
+	{
+		// Dispatch through the base interface: the streamify overloads declared
+		// in this struct hide the base-class helpers, and going through the
+		// base decomposes each field into scalars that are byte-swapped by the
+		// virtual overloads above, like every other path in this reader.
+		RenderSerializer& self = *this;
+		self.streamify(val.color);
+		self.streamify(val.position);
+		self.streamify(val.size);
+
+		// The length prefix must be swapped as well, otherwise a foreign-endian
+		// length would be used to size and fill the buffer.
+		uint32_t len = 0;
+		read(len);
+
+		// Character data is a byte sequence and needs no swapping.
+		uint8_t* dataPtr = mBuffer.growBuf(len);
+		mStream.read(dataPtr, len);
+		val.string = reinterpret_cast<const char*>(dataPtr);
+	}
+	virtual void streamify(DataRef<uint8_t>& val)
+	{
+		readRef(val);
+	}
+	virtual bool isGood()
+	{
+		return mStream.isGood();
+	}
+
+	// Non-zero while unread bytes remain in the stream.
+	virtual uint32_t hasData()
+	{
+		return uint32_t(mStream.size() > 0);
+	}
+
+  private:
+	// Not assignable: the struct holds a reference member.
+	RenderReader& operator=(const RenderReader&);
+};
+
+// PvdUserRenderParser implementation. Owns the scratch buffer that the reader
+// uses for variable-sized event payloads; swapBytes selects the reader variant.
+template <bool swapBytes>
+struct Parser : public PvdUserRenderParser
+{
+	ForwardingMemoryBuffer mBuffer;
+	RenderReader<swapBytes> mReader;
+	// mBuffer is declared before mReader, so it is constructed before the reader takes a reference to it.
+	Parser() : mBuffer("PvdUserRenderParser::mBuffer"), mReader(mBuffer)
+	{
+	}
+
+	void release()
+	{
+		PVD_DELETE(this);
+	}
+
+	// Decodes events from data one at a time: reads the one-byte type tag,
+	// deserializes the matching <type>RenderEvent and passes it to handler.
+	// Stops when the stream is exhausted or reports an error, or when an
+	// Unknown tag is encountered.
+	void parseData(DataRef<const uint8_t> data, PvdUserRenderHandler& handler)
+	{
+		mReader.setData(data);
+		// Use the base interface so its non-virtual streamify helpers are visible.
+		RenderSerializer& serializer(mReader);
+		while(serializer.isGood() && serializer.hasData())
+		{
+			// The scratch buffer is reset per event, so payload pointers handed
+			// to the handler are only valid during that handleRenderEvent call.
+			mReader.mBuffer.clear();
+			PvdUserRenderTypes::Enum evtType = PvdUserRenderTypes::Unknown;
+			serializer.streamify(evtType);
+			switch(evtType)
+			{
+// One case per entry in PxPvdUserRenderTypes.h.
+#define DECLARE_PVD_IMMEDIATE_RENDER_TYPE(type)                                                                        \
+	case PvdUserRenderTypes::type:                                                                                     \
+	{                                                                                                                  \
+		type##RenderEvent evt;                                                                                         \
+		evt.serialize(serializer);                                                                                     \
+		handler.handleRenderEvent(evt);                                                                                \
+	}                                                                                                                  \
+	break;
+#include "PxPvdUserRenderTypes.h"
+#undef DECLARE_PVD_IMMEDIATE_RENDER_TYPE
+			case PvdUserRenderTypes::Unknown:
+				PX_ASSERT(false);
+				return;
+			}
+		}
+		PX_ASSERT(serializer.isGood());
+		return;
+	}
+
+	PX_NOCOPY(Parser<swapBytes>)
+};
+}
+
+// Factory: returns a newly allocated parser, byte-swapping or not as requested.
+// The caller disposes of it with release().
+PvdUserRenderParser& PvdUserRenderParser::create(bool swapBytes)
+{
+	if(!swapBytes)
+		return *PVD_NEW(Parser<false>);
+
+	return *PVD_NEW(Parser<true>);
+}
+
+// Factory: allocates a UserRenderer whose flush threshold is bufferSize.
+PvdUserRenderer* PvdUserRenderer::create(uint32_t bufferSize)
+{
+	UserRenderer* renderer = PVD_NEW(UserRenderer)(bufferSize);
+	return renderer;
+}
+
diff --git a/PxShared/src/pvd/src/windows/PxWindowsPvdDelayLoadHook.cpp b/PxShared/src/pvd/src/windows/PxWindowsPvdDelayLoadHook.cpp
new file mode 100644
index 0000000..a8c6df0
--- /dev/null
+++ b/PxShared/src/pvd/src/windows/PxWindowsPvdDelayLoadHook.cpp
@@ -0,0 +1,82 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+
+#include "pvd/windows/PxWindowsPvdDelayLoadHook.h"
+#include "windows/PsWindowsInclude.h"
+#include "windows/PsWindowsLoadLibrary.h"
+
+// Prior to Visual Studio 2015 Update 3, these hooks were non-const.
+#define DELAYIMP_INSECURE_WRITABLE_HOOKS
+#include <delayimp.h>
+
+// Hook object passed to foundationDliNotePreLoadLibrary by delayHook; NULL until set.
+static const physx::PxFoundationDelayLoadHook* gDelayLoadHook = NULL;
+
+// Stores the hook used for subsequent delay-load notifications. Passing NULL
+// clears the hook. The pointer is stored as-is; no copy is made.
+void physx::PxPvdSetFoundationDelayLoadHook(const physx::PxFoundationDelayLoadHook* hook)
+{
+	gDelayLoadHook = hook;
+}
+
+using namespace physx;
+
+#pragma comment(lib, "delayimp")
+
+// Delay-load notification callback. Only the pre-LoadLibrary notification is
+// acted on: it is forwarded, together with the registered hook, to
+// foundationDliNotePreLoadLibrary. Every other notification returns NULL.
+FARPROC WINAPI delayHook(unsigned dliNotify, PDelayLoadInfo pdli)
+{
+	if (dliNotify == dliNotePreLoadLibrary)
+		return physx::shdfnd::foundationDliNotePreLoadLibrary(pdli->szDll, gDelayLoadHook);
+
+	// dliStartProcessing, dliNotePreGetProcAddress, dliFailLoadLib,
+	// dliFailGetProc, dliNoteEndProcessing and any unknown code: no override.
+	return NULL;
+}
+
+// Installs delayHook as the delay-load notification hook for this module.
+PfnDliHook __pfnDliNotifyHook2 = delayHook;
diff --git a/PxShared/src/task/src/TaskManager.cpp b/PxShared/src/task/src/TaskManager.cpp
new file mode 100644
index 0000000..ffcbfcd
--- /dev/null
+++ b/PxShared/src/task/src/TaskManager.cpp
@@ -0,0 +1,733 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+
+#include "task/PxTask.h"
+#include "task/PxTaskDefine.h"
+#include "foundation/PxErrors.h"
+
+#include "PsThread.h"
+#include "PsAtomic.h"
+#include "PsMutex.h"
+#include "PsHashMap.h"
+#include "PsArray.h"
+#include "PsAllocator.h"
+
+
+#if PX_SUPPORT_PXTASK_PROFILING
+#include "foundation/PxProfiler.h"
+#endif
+
+#define DOT_LOG 0
+
+// for information on generating tasks graphs see this wiki page
+// https://wiki.nvidia.com/engwiki/index.php/PhysX/sdk/InternalDoc_Example_TaskGraph
+#if DOT_LOG
+#include "stdio.h"
+#define LOG_FRAME_NUM 60
+static int  framenum;
+static FILE *logfile;
+
+static const char* logFilename = "pxtask-graph.txt";
+__declspec(thread) static physx::PxBaseTask* currentTask;
+
+// Reinterprets the bits of 'b' as a value of type A by initialising the B
+// member of a local union and reading back the A member. Only compiled when
+// DOT_LOG is enabled, where it turns task pointers into integers that are
+// printed as graph node names.
+// NOTE(review): when sizeof(A) < sizeof(B) (e.g. uint32_t from a 64-bit
+// pointer) only part of the value is read back - confirm this is acceptable
+// for node ids.
+template<class A, class B> PX_FORCE_INLINE A PxTaskUnionCast(B b)
+{
+	union AB
+	{
+		AB(B bb)
+			: _b(bb)
+		{
+		}
+		B _b;
+		A _a;
+	} u(b);
+	return u._a;
+}
+#endif
+
+#define LOCK()  shdfnd::Mutex::ScopedLock __lock__(mMutex)
+
+namespace physx
+{
+    const int EOL = -1;
+	typedef shdfnd::HashMap<const char *, PxTaskID> PxTaskNameToIDMap;
+
+	// One entry of the dependency table. Rows belonging to the same task are
+	// chained through mNextDep, forming a singly linked list inside the array.
+	struct PxTaskDepTableRow
+	{
+		PxTaskID    mTaskID;	// task whose ref count is decremented when the owning task resolves
+		int       mNextDep;	// index of the next row in the chain, or EOL at the end
+	};
+	typedef shdfnd::Array<PxTaskDepTableRow> PxTaskDepTable;
+
+	// One row of the task table: the task pointer, its outstanding reference
+	// count, its dispatch type, and the head/tail indices of its dependency
+	// chain inside the shared PxTaskDepTable.
+	class PxTaskTableRow
+	{
+	public:
+		// A new row starts with a single reference (startSimulation() removes
+		// it) and an empty dependency chain. mTask and mType get defined
+		// defaults so a row never carries indeterminate values, even before
+		// the submitter fills them in.
+		PxTaskTableRow()
+			: mTask( NULL )
+			, mRefCount( 1 )
+			, mType( PxTaskType::TT_NOT_PRESENT )
+			, mStartDep( EOL )
+			, mLastDep( EOL )
+		{
+		}
+
+		// Appends 'taskID' to this row's dependency chain. A new row is pushed
+		// onto 'depTable' and linked after the current tail (or becomes the
+		// head when the chain is empty).
+		void addDependency( PxTaskDepTable& depTable, PxTaskID taskID )
+		{
+			// index the new row will occupy once pushed
+			int newDep = int(depTable.size());
+			PxTaskDepTableRow row;
+			row.mTaskID = taskID;
+			row.mNextDep = EOL;
+			depTable.pushBack( row );
+
+			if( mLastDep == EOL )
+			{
+				// first dependency: head and tail are the same row
+				mStartDep = mLastDep = newDep;
+			}
+			else
+			{
+				// link the previous tail to the new row, then advance the tail
+				depTable[ uint32_t(mLastDep) ].mNextDep = newDep;
+				mLastDep = newDep;
+			}
+		}
+
+		PxTask *    mTask;				// NULL while only the name is registered
+		volatile int mRefCount;			// modified through shdfnd::atomicIncrement/Decrement
+		PxTaskType::Enum mType;
+		int       mStartDep;			// first row of the dependency chain, or EOL
+		int       mLastDep;				// last row of the dependency chain, or EOL
+	};
+	typedef shdfnd::Array<PxTaskTableRow> PxTaskTable;
+
+
+/* Implementation of PxTaskManager abstract API */
+// Concrete task manager. Tasks are stored in mTaskTable (indexed by PxTaskID),
+// their outgoing dependency edges in mDepTable, and named tasks are looked up
+// through mName2IDmap. Methods that use the LOCK() macro serialise on mMutex.
+class PxTaskMgr : public PxTaskManager, public shdfnd::UserAllocated
+{
+	PX_NOCOPY(PxTaskMgr)
+public:
+	PxTaskMgr(PxErrorCallback& , PxCpuDispatcher*, PxGpuDispatcher*);
+	~PxTaskMgr();
+
+	// Dispatcher accessors: plain pointer stores/loads, no locking.
+	void     setCpuDispatcher( PxCpuDispatcher& ref )
+	{
+		mCpuDispatcher = &ref;
+	}
+
+	void     setGpuDispatcher( PxGpuDispatcher& ref )
+	{
+		mGpuDispatcher = &ref;
+	}
+
+	PxCpuDispatcher* getCpuDispatcher() const
+	{
+		return mCpuDispatcher;
+	}
+
+	PxGpuDispatcher* getGpuDispatcher() const
+	{
+		return mGpuDispatcher;
+	}
+
+	// Per-frame life cycle and completion notification.
+	void	resetDependencies();
+	void	startSimulation();
+	void	stopSimulation();
+	void	taskCompleted( PxTask& task );
+
+	// Task registration and lookup.
+	PxTaskID  getNamedTask( const char *name );
+	PxTaskID  submitNamedTask( PxTask *task, const char *name, PxTaskType::Enum type = PxTaskType::TT_CPU );
+	PxTaskID  submitUnnamedTask( PxTask& task, PxTaskType::Enum type = PxTaskType::TT_CPU );
+	PxTask*   getTaskFromID( PxTaskID );
+
+	// Internal dispatch helpers; both return whether a GPU group is open.
+	bool    dispatchTask( PxTaskID taskID, bool gpuGroupStart );
+	bool    resolveRow( PxTaskID taskID, bool gpuGroupStart );
+
+	void    release();
+
+	// Dependency edges between a task and another task id.
+	void	finishBefore( PxTask& task, PxTaskID taskID );
+	void	startAfter( PxTask& task, PxTaskID taskID );
+
+	// Reference counting on task-table rows.
+	void	addReference( PxTaskID taskID );
+	void	decrReference( PxTaskID taskID );
+	int32_t	getReference( PxTaskID taskID ) const;
+
+	// Reference counting on light CPU tasks (not stored in the task table).
+	void	decrReference( PxLightCpuTask& lighttask );
+	void	addReference( PxLightCpuTask& lighttask );		
+
+	void	emitStartEvent( PxBaseTask& basetask, uint32_t threadId=0);
+	void	emitStopEvent( PxBaseTask& basetask, uint32_t threadId=0);
+
+	PxErrorCallback&			mErrorCallback;	// receives warnings from dispatchTask()
+	PxCpuDispatcher           *mCpuDispatcher;
+	PxGpuDispatcher           *mGpuDispatcher;		
+	PxTaskNameToIDMap          mName2IDmap;		// task name -> PxTaskID
+	volatile int			 mPendingTasks;		// incremented on submit, decremented in resolveRow()
+    shdfnd::Mutex            mMutex;			// taken by the LOCK() macro
+
+	PxTaskDepTable				 mDepTable;		// dependency rows, chained per task
+	PxTaskTable				 mTaskTable;		// one row per submitted or named task
+
+	shdfnd::Array<PxTaskID>	 mStartDispatch;	// scratch list filled and emptied by startSimulation()
+
+
+#if DOT_LOG
+	// Graphviz logging helpers, compiled only when DOT_LOG is non-zero.
+	static void debugGraphEnd();
+	static void debugGraphEdge(PxBaseTask* prev, uint32_t prevIndex, uint32_t prevType, PxBaseTask* next, uint32_t nextIndex, uint32_t nextType, uint32_t weight);
+	static void debugGraphBegin(const char* filename);
+#endif
+	};
+
+// Factory for the concrete PxTaskMgr. The instance is allocated with PX_NEW
+// and destroyed by PxTaskMgr::release(); both dispatcher pointers are passed
+// through to the constructor unchanged.
+PxTaskManager* PxTaskManager::createTaskManager(PxErrorCallback& errorCallback, PxCpuDispatcher* cpuDispatcher, PxGpuDispatcher* gpuDispatcher)
+{
+	return PX_NEW(PxTaskMgr)(errorCallback, cpuDispatcher, gpuDispatcher);
+}
+
+// Stores the error callback and dispatcher pointers and starts with no pending
+// tasks. The three arrays are given names through PX_DEBUG_EXP; the name map
+// and the mutex are default-constructed.
+PxTaskMgr::PxTaskMgr(PxErrorCallback& errorCallback, PxCpuDispatcher* cpuDispatcher, PxGpuDispatcher* gpuDispatcher)
+	: mErrorCallback (errorCallback)
+	, mCpuDispatcher( cpuDispatcher )
+    , mGpuDispatcher( gpuDispatcher )	
+	, mPendingTasks( 0 )
+	, mDepTable(PX_DEBUG_EXP("PxTaskDepTable"))
+	, mTaskTable(PX_DEBUG_EXP("PxTaskTable"))	
+	, mStartDispatch(PX_DEBUG_EXP("StartDispatch"))
+{
+}
+
+
+#if DOT_LOG
+// Opens 'filename' for writing as the task-graph log and emits the Graphviz
+// preamble. If the file cannot be opened, logfile stays NULL and nothing is
+// written.
+void PxTaskMgr::debugGraphBegin(const char* filename)
+{
+	logfile = fopen(filename, "w");
+	if (!logfile)
+	{
+		return;
+	}
+
+	fprintf(logfile, "digraph tasks {\n");
+	fprintf(logfile, "dpi=300;\n");
+	fprintf(logfile, "node [width=.3, height=0.8 style=\"rounded, filled\"];");
+}
+// Terminates the Graphviz digraph, closes the log file and resets logfile to
+// NULL. Does nothing when no log file is open.
+void PxTaskMgr::debugGraphEnd()
+{
+	if (!logfile)
+	{
+		return;
+	}
+
+	fprintf(logfile, "}\n");
+	fclose(logfile);
+	logfile = NULL;
+}
+
+// Writes Graphviz node declarations for 'prev' and 'next' and, when weight > 0
+// and both tasks are non-NULL, an edge prev->next with that weight. Node names
+// are built from the task pointer (via PxTaskUnionCast) followed by the index.
+// Output is serialised on the mutex of the task manager that owns 'next'.
+// Does nothing when no log file is open.
+void PxTaskMgr::debugGraphEdge(PxBaseTask* prev, uint32_t prevIndex, uint32_t prevType, PxBaseTask* next, uint32_t nextIndex, uint32_t nextType, uint32_t weight)
+{
+	PX_ASSERT(next);
+
+	// Values compared against prevType/nextType; only eSpuNode changes the node shape.
+	enum Type
+	{
+		eCpuNode,
+		eSpuNode,
+		eJoinNode
+	};
+
+	if (logfile)
+	{
+		// lock
+		// NOTE(review): 'next' is dereferenced here before the NULL checks
+		// below; only the PX_ASSERT above guards it.
+		PxTaskMgr& mgr = static_cast<PxTaskMgr&>(*next->getTaskManager());
+		shdfnd::Mutex::ScopedLock lock(mgr.mMutex);
+
+		// check both task and their task manager is valid
+		if (prev && prev->mTm)
+			fprintf(logfile, "{node [shape=%s,label=\"%s\"] t%d%d};\n", (prevType==eSpuNode)?"box,fillcolor=lightblue":"ellipse,fillcolor=lightgrey", prev->getName(), PxTaskUnionCast<uint32_t>(prev), prevIndex);
+		
+		if (next && next->mTm)
+			fprintf(logfile, "{node [shape=%s,label=\"%s\"] t%d%d};\n", (nextType==eSpuNode)?"box,fillcolor=lightblue":"ellipse,fillcolor=lightgrey", next->getName(), PxTaskUnionCast<uint32_t>(next), nextIndex);
+
+		if (weight > 0 && prev && next)
+			fprintf(logfile, "t%d%d->t%d%d [weight=%d];\n", PxTaskUnionCast<uint32_t>(prev), prevIndex, PxTaskUnionCast<uint32_t>(next), nextIndex, weight);
+	}
+}
+#endif
+
+
+// Nothing to do explicitly; members clean up through their own destructors.
+PxTaskMgr::~PxTaskMgr()
+{
+}
+
+// Destroys this task manager with PX_DELETE, matching the PX_NEW in
+// createTaskManager(). The object must not be used after this call.
+void PxTaskMgr::release()
+{
+	PX_DELETE(this);
+}
+
+// Drops one reference from a light CPU task. When the count reaches zero the
+// task is submitted to the CPU dispatcher, or released directly if no CPU
+// dispatcher is set. With DOT_LOG enabled, an edge from the current task to
+// 'lighttask' is logged (weight 2 when this was the last reference, else 1).
+void PxTaskMgr::decrReference(PxLightCpuTask& lighttask)
+{
+	/* This does not need a lock! */
+	const bool lastReference = ( shdfnd::atomicDecrement(&lighttask.mRefCount) == 0 );
+
+	if (lastReference)
+	{
+		PX_ASSERT(mCpuDispatcher);
+		if (!mCpuDispatcher)
+		{
+			lighttask.release();
+		}
+		else
+		{
+			mCpuDispatcher->submitTask(lighttask);
+		}
+	}
+
+#if DOT_LOG	
+	debugGraphEdge(currentTask, 0, 0, &lighttask, 0, 0, lastReference ? 2u : 1u);
+#endif
+}
+
+// Adds one reference to a light CPU task by atomically incrementing its
+// mRefCount; mMutex is not taken.
+void PxTaskMgr::addReference(PxLightCpuTask& lighttask)
+{
+	/* This does not need a lock! */
+	shdfnd::atomicIncrement(&lighttask.mRefCount);
+}
+
+// Start notification for 'basetask'. With DOT_LOG enabled it records the task
+// in the thread-local 'currentTask' (used as the source node of logged graph
+// edges); with PX_SUPPORT_PXTASK_PROFILING enabled it starts a cross-thread
+// profile zone named after the task. 'threadId' is not used.
+void PxTaskMgr::emitStartEvent(PxBaseTask& basetask, uint32_t threadId)
+{
+#if DOT_LOG
+	currentTask = &basetask;
+#endif
+
+	// Keep both parameters referenced so builds without DOT_LOG and profiling
+	// do not warn about unused arguments.
+	PxBaseTask* tmp = &basetask;
+	PX_UNUSED(tmp);
+	PX_UNUSED(threadId);
+
+	/* This does not need a lock! */
+#if PX_SUPPORT_PXTASK_PROFILING
+	//PX_COMPILE_TIME_ASSERT(sizeof(PxProfileEventId::mEventId) == sizeof(PxBaseTask::mEventID));
+	PX_PROFILE_START_CROSSTHREAD(basetask.getName(),0);
+#endif
+}
+
+// Stop notification for 'basetask'. With PX_SUPPORT_PXTASK_PROFILING enabled
+// it stops the cross-thread profile zone named after the task; otherwise it
+// does nothing. 'threadId' is not used.
+void PxTaskMgr::emitStopEvent(PxBaseTask& basetask, uint32_t threadId)
+{
+	// Keep both parameters referenced so non-profiling builds do not warn
+	// about unused arguments.
+	PxBaseTask* tmp = &basetask;
+	PX_UNUSED(tmp);
+	PX_UNUSED(threadId);
+
+	/* This does not need a lock! */
+#if PX_SUPPORT_PXTASK_PROFILING
+	//PX_COMPILE_TIME_ASSERT(sizeof(PxProfileEventId::mEventId) == sizeof(PxBaseTask::mEventID));
+	PX_PROFILE_STOP_CROSSTHREAD(basetask.getName(),0);
+#endif
+}
+
+/*
+ * Called by the owner (Scene) at the start of every frame, before
+ * asking for tasks to be submitted.
+ */
+// Discards the whole task graph: clears the task table, the dependency table
+// and the name map, and zeroes the pending-task counter. With DOT_LOG enabled
+// it also closes any open graph log and opens a new one on frame LOG_FRAME_NUM.
+void PxTaskMgr::resetDependencies()
+{
+#if DOT_LOG
+	// finish the graph of the previous frame, if one was being written
+	if( logfile )
+	{
+		debugGraphEnd();
+	}
+	// start logging exactly once, on the configured frame
+	if( framenum++ == LOG_FRAME_NUM )
+	{
+		debugGraphBegin(logFilename);
+	}
+#endif
+
+	// only valid if you don't resubmit named tasks, this is true for the SDK
+	PX_ASSERT( mPendingTasks == 0 );
+	PX_ASSERT( mCpuDispatcher != NULL );
+
+	mTaskTable.clear();
+	mDepTable.clear();
+	mName2IDmap.clear();
+
+	mPendingTasks = 0;
+}
+
+/* 
+ * Called by the owner (Scene) to start simulating the task graph.
+ * Dispatch all tasks with refCount == 1
+ */
+// Kicks off the task graph. The GPU dispatcher (if any) is notified first.
+// Then the initial reference held by every non-completed row is removed, and
+// every row whose count reached zero is dispatched. If any GPU task was
+// dispatched, the GPU group is closed afterwards.
+void PxTaskMgr::startSimulation()
+{
+	PX_ASSERT( mCpuDispatcher );
+
+	if( mGpuDispatcher )
+	{
+		mGpuDispatcher->startSimulation();
+	}
+
+	/* Handle empty task graph */
+	if( mPendingTasks == 0 )
+	{
+		return;
+	}
+
+	// Pass 1: drop the start reference and collect the rows that became ready.
+	for( PxTaskID taskID = 0 ; taskID < mTaskTable.size() ; ++taskID )
+	{
+		PxTaskTableRow& taskRow = mTaskTable[ taskID ];
+		if( taskRow.mType != PxTaskType::TT_COMPLETED &&
+			shdfnd::atomicDecrement( &taskRow.mRefCount ) == 0 )
+		{
+			mStartDispatch.pushBack(taskID);
+		}
+	}
+
+	// Pass 2: dispatch the collected rows, remembering whether a GPU group was opened.
+	bool gpuGroupOpen = false;
+	for( uint32_t n = 0; n < mStartDispatch.size(); ++n )
+	{
+		if( dispatchTask( mStartDispatch[n], gpuGroupOpen ) )
+		{
+			gpuGroupOpen = true;
+		}
+	}
+
+	// empty the scratch list without touching its storage
+	mStartDispatch.forceSize_Unsafe(0);
+
+	if( mGpuDispatcher && gpuGroupOpen )
+	{
+		mGpuDispatcher->finishGroup();
+	}
+}
+
+// Forwards the stop notification to the GPU dispatcher when one is set;
+// otherwise does nothing.
+void PxTaskMgr::stopSimulation()
+{
+	if( mGpuDispatcher )
+	{
+		mGpuDispatcher->stopSimulation();
+	}
+}
+
+// Returns the id registered for 'name'. If the name is unknown, a placeholder
+// row of type TT_NOT_PRESENT (no task attached) is created through
+// submitNamedTask() and its id is returned.
+PxTaskID PxTaskMgr::getNamedTask( const char *name )
+{
+	const PxTaskNameToIDMap::Entry* entry;
+	{
+		// hold the lock only for the lookup; submitNamedTask() locks on its own
+		LOCK();
+		entry = mName2IDmap.find( name );
+	}
+
+	if( !entry )
+	{
+		// create named entry in task table, without a task
+		return submitNamedTask( NULL, name, PxTaskType::TT_NOT_PRESENT );
+	}
+
+	return entry->second;
+}
+
+// Returns the task pointer stored in row 'id' of the task table, read under
+// the mutex. The result is NULL for rows that have no task attached.
+// 'id' is not range-checked here.
+PxTask* PxTaskMgr::getTaskFromID( PxTaskID id )
+{
+	LOCK(); // todo: reader lock necessary?
+	return mTaskTable[ id ].mTask;
+}
+
+
+/* If called at runtime, must be thread-safe */
+// Registers 'task' under 'name' and returns its task id.
+//  - If the name is already known, the existing id is returned; when 'task' is
+//    non-NULL it is attached to that pre-registered row and the row's type is
+//    set to 'type'.
+//  - Otherwise a new row is appended, the name is mapped to its index and the
+//    pending-task counter is incremented.
+// 'task' may be NULL (getNamedTask() does this with TT_NOT_PRESENT) to reserve
+// an id for a task that is submitted later.
+PxTaskID PxTaskMgr::submitNamedTask( PxTask *task, const char *name, PxTaskType::Enum type )
+{
+    // bind the task to this manager before taking the lock
+    if( task )
+    {
+        task->mTm = this;
+        task->submitted();
+    }
+
+    LOCK();
+
+	const PxTaskNameToIDMap::Entry *ret = mName2IDmap.find( name );
+    if( ret )
+    {
+		PxTaskID prereg = ret->second;
+		if( task )
+		{
+			/* name was registered for us by a dependent task */
+			PX_ASSERT( !mTaskTable[ prereg ].mTask );
+			PX_ASSERT( mTaskTable[ prereg ].mType == PxTaskType::TT_NOT_PRESENT );
+			mTaskTable[ prereg ].mTask = task;
+			mTaskTable[ prereg ].mType = type;
+			task->mTaskID = prereg;
+		}
+		return prereg;
+    }
+    else
+    {
+        shdfnd::atomicIncrement(&mPendingTasks);
+        // the new row's id is the index it will occupy after pushBack below
+        PxTaskID id = static_cast<PxTaskID>(mTaskTable.size());
+        mName2IDmap[ name ] = id;
+        if( task )
+		{
+            task->mTaskID = id;
+		}
+        PxTaskTableRow r;
+        r.mTask = task;
+        r.mType = type;
+#if DOT_LOG
+		// NOTE(review): the TT_GPU and default branches dereference 'task';
+		// they assume it is non-NULL whenever type != TT_NOT_PRESENT.
+		if( logfile )
+		{
+			if( type == PxTaskType::TT_GPU )
+			{
+				fprintf(logfile, "{node [shape=box,label=\"%s\"] t%d0};\n", task->getName(), PxTaskUnionCast<uint32_t>(task));
+			}
+			else if (type == PxTaskType::TT_NOT_PRESENT)
+			{
+				fprintf(logfile, "{node [shape=invhouse,label=\"%s\"] t%d0};\n", name, PxTaskUnionCast<uint32_t>(task));
+			}
+			else
+			{
+				fprintf(logfile, "{node [label=\"%s\"] t%d0};\n", task->getName(), PxTaskUnionCast<uint32_t>(task));
+			}
+		}
+#endif
+		mTaskTable.pushBack(r);
+        return id;
+    }
+}
+
+/*
+ * Add an unnamed task to the task table
+ */
+// Appends 'task' to the task table without registering a name and returns the
+// id assigned to it (its row index). The pending-task counter is incremented
+// and the task is bound to this manager before the lock is taken.
+PxTaskID PxTaskMgr::submitUnnamedTask( PxTask& task, PxTaskType::Enum type )
+{
+    shdfnd::atomicIncrement(&mPendingTasks);
+
+	task.mTm = this;
+    task.submitted();
+    
+	LOCK();
+    // the id is the index the row will occupy after pushBack below
+    task.mTaskID = static_cast<PxTaskID>(mTaskTable.size());
+    PxTaskTableRow r;
+    r.mTask = &task;
+    r.mType = type;
+#if DOT_LOG
+	if( logfile )
+	{
+		if( type == PxTaskType::TT_GPU )
+		{
+			fprintf(logfile, "{node [shape=box,label=\"%s\"] t%d0};\n", task.getName(), PxTaskUnionCast<uint32_t>(&task));
+		}
+		else
+		{
+			fprintf(logfile, "{node [label=\"%s\"] t%d0};\n", task.getName(), PxTaskUnionCast<uint32_t>(&task));
+		}
+	}
+#endif
+    mTaskTable.pushBack(r);
+    return task.mTaskID;
+}
+
+
+/* Called by worker threads (or cooperating application threads) when a
+ * PxTask has completed.  Propagate dependencies, decrementing all
+ * referenced tasks' refCounts.  If any of those reach zero, activate
+ * those tasks.
+ */
+void PxTaskMgr::taskCompleted( PxTask& task )
+{
+    LOCK();
+    // resolveRow() starts with gpuGroupStart == false and only returns true
+    // after dispatchTask() submitted a GPU task, which it does only when
+    // mGpuDispatcher is non-NULL - so the dereference below is safe.
+    if( resolveRow( task.mTaskID, false ) )
+	{
+        mGpuDispatcher->finishGroup();
+	}
+}
+
+/* ================== Private Functions ======================= */
+
+/*
+ * Add a dependency to force 'task' to complete before the
+ * referenced 'taskID' is allowed to be dispatched.
+ */
+void PxTaskMgr::finishBefore( PxTask& task, PxTaskID taskID )
+{
+    LOCK();
+	PX_ASSERT( mTaskTable[ taskID ].mType != PxTaskType::TT_COMPLETED );
+
+#if DOT_LOG
+	if( logfile )
+	{
+		fprintf(logfile, "t%d0->t%d0;\n", PxTaskUnionCast<uint32_t>(&task), PxTaskUnionCast<uint32_t>(mTaskTable[ taskID ].mTask));
+	}
+#endif
+
+    // Record the edge task -> taskID on task's row, and give taskID one more
+    // reference; that reference is removed when task's row is resolved.
+    mTaskTable[ task.mTaskID ].addDependency( mDepTable, taskID );
+	shdfnd::atomicIncrement( &mTaskTable[ taskID ].mRefCount );
+}
+
+
+/*
+ * Add a dependency to force 'task' to wait for the referenced 'taskID'
+ * to complete before it is allowed to be dispatched.
+ */
+void PxTaskMgr::startAfter( PxTask& task, PxTaskID taskID )
+{
+    LOCK();
+	PX_ASSERT( mTaskTable[ taskID ].mType != PxTaskType::TT_COMPLETED );
+
+#if DOT_LOG
+	if( logfile )
+	{
+		fprintf(logfile, "t%d0->t%d0;\n",	PxTaskUnionCast<uint32_t>(mTaskTable[ taskID ].mTask), PxTaskUnionCast<uint32_t>(&task));
+	}
+#endif
+
+    // Record the edge taskID -> task on taskID's row, and give task one more
+    // reference; that reference is removed when taskID's row is resolved.
+    mTaskTable[ taskID ].addDependency( mDepTable, task.mTaskID );
+	shdfnd::atomicIncrement( &mTaskTable[ task.mTaskID ].mRefCount );
+}
+
+
+// Adds one reference to the task-table row 'taskID'. The increment is atomic
+// and performed while holding the mutex. 'taskID' is not range-checked here.
+void PxTaskMgr::addReference( PxTaskID taskID )
+{
+    LOCK();
+    shdfnd::atomicIncrement( &mTaskTable[ taskID ].mRefCount );
+}
+
+/*
+ * Remove one reference count from a task.  Intended for use by the
+ * GPU dispatcher, to remove reference counts when CUDA events are
+ * resolved.  Must be done here to make it thread safe.
+ */
+void PxTaskMgr::decrReference( PxTaskID taskID )
+{
+    LOCK();
+
+#if DOT_LOG	
+	debugGraphEdge(currentTask, 0, 0, mTaskTable[ taskID ].mTask, 0, 0, 1);
+#endif
+
+    // Dispatch only when this call removed the last reference. dispatchTask()
+    // reports true only after submitting to a non-NULL GPU dispatcher, so the
+    // finishGroup() call below cannot dereference NULL.
+    const bool lastReference = ( shdfnd::atomicDecrement( &mTaskTable[ taskID ].mRefCount ) == 0 );
+    if( lastReference && dispatchTask( taskID, false ) )
+    {
+        mGpuDispatcher->finishGroup();
+    }
+}
+
+// Returns the current reference count of row 'taskID'. The value is read
+// without taking the mutex, so it may already be stale when the caller sees it.
+int32_t PxTaskMgr::getReference(PxTaskID taskID) const
+{
+	return mTaskTable[ taskID ].mRefCount;
+}
+
+/*
+ * A task has completed, decrement all dependencies and submit tasks
+ * that are ready to run.  Signal simulation end if there are no more
+ * pending tasks.
+ */
+// Walks the dependency chain of row 'taskID'. Each dependent row has its
+// reference count decremented and is dispatched when the count reaches zero.
+// Afterwards the pending-task counter is decremented once for this row.
+// Returns 'gpuGroupStart' OR-ed with the result of every dispatchTask() call.
+// This function does not take the mutex itself; the callers visible in this
+// file (taskCompleted, dispatchTask) hold it.
+bool PxTaskMgr::resolveRow( PxTaskID taskID, bool gpuGroupStart )
+{
+    int depRow = mTaskTable[ taskID ].mStartDep;
+
+	// stream index of the completed task; stays 0 when the row has no task
+	uint32_t streamIndex = 0;
+	// becomes true once one dependent GPU task has inherited streamIndex
+	bool syncRequired = false;
+	if( mTaskTable[ taskID ].mTask )
+	{
+		streamIndex = mTaskTable[ taskID ].mTask->mStreamIndex;
+	}
+
+    while( depRow != EOL )
+    {
+        PxTaskDepTableRow& row = mDepTable[ uint32_t(depRow) ];
+        PxTaskTableRow& dtt = mTaskTable[ row.mTaskID ];
+
+		// pass stream index to (up to one) dependent GPU task
+		if( dtt.mTask && dtt.mType == PxTaskType::TT_GPU && streamIndex )
+		{
+			if( dtt.mTask->mStreamIndex )
+			{
+				// dependent already has a stream of its own: request a pre-sync
+				PX_ASSERT( dtt.mTask->mStreamIndex != streamIndex );
+				dtt.mTask->mPreSyncRequired = true;
+			}
+			else if( syncRequired )
+			{
+				// stream was already handed to an earlier dependent
+				dtt.mTask->mPreSyncRequired = true;
+			}
+			else
+			{
+				dtt.mTask->mStreamIndex = streamIndex;
+				/* only one forward task gets to use this stream */
+				syncRequired = true;
+			}
+		}
+
+        if( !shdfnd::atomicDecrement( &dtt.mRefCount ) )
+		{
+			gpuGroupStart |= dispatchTask( row.mTaskID, gpuGroupStart );
+		}
+
+        depRow = row.mNextDep;
+    }
+
+    // this row is done: only the counter is updated here, nothing is signalled
+    shdfnd::atomicDecrement( &mPendingTasks );
+    return gpuGroupStart;
+}
+
+/*
+ * Submit a ready task to its appropriate dispatcher.
+ */
+// Hands row 'taskID' to the dispatcher matching its type and marks the row
+// TT_COMPLETED so it cannot be dispatched again.
+//   TT_CPU         - submitted to the CPU dispatcher.
+//   TT_GPU         - on PX_WINDOWS_FAMILY with a GPU dispatcher: opens a group
+//                    if 'gpuGroupStart' is false, then submits; otherwise a
+//                    "No GPU dispatcher" warning is reported.
+//   TT_NOT_PRESENT - no task attached; its dependents are resolved directly.
+//   anything else  - warning, then dependents are resolved.
+// Returns false (with a warning) if the row was already TT_COMPLETED, else
+// the updated gpuGroupStart flag (true once a GPU group has been opened).
+// NOTE(review): callers such as taskCompleted() already hold mMutex when they
+// reach this function, so the LOCK() below presumably relies on shdfnd::Mutex
+// being recursive - confirm.
+bool PxTaskMgr::dispatchTask( PxTaskID taskID, bool gpuGroupStart )
+{
+	LOCK(); // todo: reader lock necessary?
+    // NOTE(review): 'tt' is held across submitTask()/resolveRow(); if either
+    // can append to mTaskTable the reference may dangle - confirm.
+    PxTaskTableRow& tt = mTaskTable[ taskID ];
+
+    // prevent re-submission
+    if( tt.mType == PxTaskType::TT_COMPLETED )
+    {		
+		mErrorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "PxTask dispatched twice", __FILE__, __LINE__);
+        return false;
+    }
+
+    switch ( tt.mType )
+    {
+    case PxTaskType::TT_CPU:
+        // NOTE(review): mCpuDispatcher is not NULL-checked here, unlike in
+        // decrReference(PxLightCpuTask&).
+        mCpuDispatcher->submitTask( *tt.mTask );
+        break;
+
+	case PxTaskType::TT_GPU:
+#if PX_WINDOWS_FAMILY
+        if( mGpuDispatcher )
+        {
+			if( !gpuGroupStart )
+			{
+				mGpuDispatcher->startGroup();
+			}
+			mGpuDispatcher->submitTask( *tt.mTask );
+			gpuGroupStart = true;
+		}
+		else
+#endif
+		{
+			mErrorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "No GPU dispatcher", __FILE__, __LINE__);
+		}
+		break;
+
+    case PxTaskType::TT_NOT_PRESENT:
+		/* No task registered with this taskID, resolve its dependencies */
+		PX_ASSERT(!tt.mTask);
+		//shdfnd::getFoundation().error(PX_INFO, "unregistered task resolved");
+        gpuGroupStart |= resolveRow( taskID, gpuGroupStart );
+		break;
+	// TT_COMPLETED cannot reach the switch because of the early return above
+	case PxTaskType::TT_COMPLETED:
+    default:
+        mErrorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "Unknown task type", __FILE__, __LINE__);
+        gpuGroupStart |= resolveRow( taskID, gpuGroupStart );
+        break;
+    }
+
+    tt.mType = PxTaskType::TT_COMPLETED;
+    return gpuGroupStart;
+}
+
+}// end physx namespace
author	mtamis <[email protected]>	2017-02-15 16:06:25 +0100
committer	mtamis <[email protected]>	2017-02-15 16:06:25 +0100
commit	85305930aeeb1d513e23522bd91f29ba81aa6d14 (patch)
tree	45f1bb20a45a300d1fef107e436cac95602a0e57 /PxShared/src
download	nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.tar.xz nvcloth-85305930aeeb1d513e23522bd91f29ba81aa6d14.zip