| field | value | date |
|---|---|---|
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2 | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz, physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2')
5 files changed, 1174 insertions, 0 deletions
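The five headers added below are the SSE2 backends of the LowLevelCloth SIMD wrappers. Throughout them, comparison intrinsics produce per-lane all-ones/all-zero masks, `select` blends two vectors with an xor/and/xor sequence, and `allTrue`/`anyTrue` reduce a mask with `_mm_movemask_ps`. The following standalone sketch shows those three idioms with raw `<emmintrin.h>` intrinsics; the clamp scenario and all variable names are illustrative and not part of the patch.

```cpp
// Standalone illustration of the SSE2 idioms used in Simd4f.h / Simd4i.h below.
// Not part of the patch; the clamp example and names are made up.
#include <emmintrin.h>
#include <cstdio>

// select(mask, a, b): per lane, a where the mask is all-ones, b where it is zero.
// Same xor/and/xor formulation as the select() in Simd4f.h.
static __m128 select(__m128 mask, __m128 a, __m128 b)
{
    return _mm_xor_ps(b, _mm_and_ps(mask, _mm_xor_ps(b, a)));
}

int main()
{
    __m128 v     = _mm_setr_ps(0.5f, 2.0f, -1.0f, 3.0f);
    __m128 limit = _mm_set1_ps(1.0f);

    // Comparisons return a lane mask (0xffffffff or 0x00000000 per lane).
    __m128 tooBig = _mm_cmpgt_ps(v, limit);

    // Branch-free clamp: keep 'limit' where tooBig is set, 'v' elsewhere.
    __m128 clamped = select(tooBig, limit, v);

    // allTrue / anyTrue reduce the mask exactly as in the headers:
    // movemask collects the sign bit of every lane into a 4-bit integer.
    int anyClamped = _mm_movemask_ps(tooBig);          // non-zero if any lane exceeded
    int allClamped = _mm_movemask_ps(tooBig) == 0xf;   // 1 only if every lane exceeded

    float out[4];
    _mm_storeu_ps(out, clamped);
    std::printf("%g %g %g %g  any=%d all=%d\n",
                out[0], out[1], out[2], out[3], anyClamped != 0, allClamped);
    // prints: 0.5 1 -1 1  any=1 all=0
    return 0;
}
```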
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h new file mode 100644 index 00000000..3f04750f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Simd4f() const +{ + return _mm_set1_ps(v); +} + +inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const +{ + return reinterpret_cast<const Simd4f&>(v); +} + +template <> +inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const +{ + return _mm_setzero_ps(); +} + +template <> +inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const +{ + return _mm_set1_ps(1.0f); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0x80000000)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0xffffffff)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(-1)); +} + +template <> +inline Simd4fFactory<const float*>::operator Simd4f() const +{ + return _mm_loadu_ps(v); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(v.ptr); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4f>::operator Simd4f() const +{ + return _mm_andnot_ps(v, _mm_castsi128_ps(_mm_set1_epi32(-1))); +} + +Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v) +{ + return _mm_andnot_ps(complement.v, v); +} + +Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement) +{ + return _mm_andnot_ps(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f operator==(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpeq_ps(v0, v1); +} + +Simd4f operator<(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmplt_ps(v0, v1); +} + +Simd4f operator<=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmple_ps(v0, v1); +} + +Simd4f operator>(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpgt_ps(v0, v1); +} + +Simd4f operator>=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpge_ps(v0, v1); +} + +ComplementExpr<Simd4f> operator~(const Simd4f& v) +{ + return 
ComplementExpr<Simd4f>(v); +} + +Simd4f operator&(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_and_ps(v0, v1); +} + +Simd4f operator|(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_or_ps(v0, v1); +} + +Simd4f operator^(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_xor_ps(v0, v1); +} + +Simd4f operator<<(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator>>(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator+(const Simd4f& v) +{ + return v; +} + +Simd4f operator+(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_add_ps(v0, v1); +} + +Simd4f operator-(const Simd4f& v) +{ + return _mm_sub_ps(_mm_setzero_ps(), v); +} + +Simd4f operator-(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_sub_ps(v0, v1); +} + +Simd4f operator*(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_mul_ps(v0, v1); +} + +Simd4f operator/(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_div_ps(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f simd4f(const Simd4i& v) +{ + return _mm_castsi128_ps(v); +} + +float (&array(Simd4f& v))[4] +{ + return reinterpret_cast<float(&)[4]>(v); +} + +const float (&array(const Simd4f& v))[4] +{ + return reinterpret_cast<const float(&)[4]>(v); +} + +void store(float* ptr, Simd4f const& v) +{ + _mm_storeu_ps(ptr, v); +} + +void storeAligned(float* ptr, Simd4f const& v) +{ + _mm_store_ps(ptr, v); +} + +void storeAligned(float* ptr, unsigned int offset, Simd4f const& v) +{ + _mm_store_ps(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4f splat(Simd4f const& v) +{ + return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1) +{ + return _mm_xor_ps(v1, _mm_and_ps(mask, _mm_xor_ps(v1, v0))); +} + +Simd4f abs(const Simd4f& v) +{ + return _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), v); +} + +Simd4f floor(const Simd4f& v) +{ + // SSE 4.1: return _mm_floor_ps(v); + Simd4i i = _mm_cvttps_epi32(v); + return _mm_cvtepi32_ps(_mm_sub_epi32(i, _mm_srli_epi32(i, 31))); +} + +Simd4f max(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_max_ps(v0, v1); +} + +Simd4f min(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_min_ps(v0, v1); +} + +Simd4f recip(const Simd4f& v) +{ + return _mm_rcp_ps(v); +} + +template <int n> +Simd4f recipT(const Simd4f& v) +{ + Simd4f two = simd4f(2.0f); + Simd4f recipV = recip(v); + for(int i = 0; i < n; ++i) + recipV = recipV * (two - v * recipV); + return recipV; +} + +Simd4f sqrt(const Simd4f& v) +{ + return _mm_sqrt_ps(v); +} + +Simd4f rsqrt(const Simd4f& v) +{ + return _mm_rsqrt_ps(v); +} + +template <int n> +Simd4f rsqrtT(const Simd4f& v) +{ + Simd4f halfV = v * simd4f(0.5f); + Simd4f threeHalf = simd4f(1.5f); + Simd4f rsqrtV = rsqrt(v); + for(int i = 0; i < n; ++i) + rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV); + return rsqrtV; +} + +Simd4f exp2(const Simd4f& v) +{ + // http://www.netlib.org/cephes/ + + Simd4f limit = simd4f(127.4999f); + Simd4f x = min(max(-limit, v), limit); + + // separate into integer and fractional part + + Simd4f fx = x + simd4f(0.5f); + Simd4i ix = _mm_sub_epi32(_mm_cvttps_epi32(fx), _mm_srli_epi32(_mm_castps_si128(fx), 31)); + fx = x - Simd4f(_mm_cvtepi32_ps(ix)); + + // exp2(fx) ~ 1 + 
2*P(fx) / (Q(fx) - P(fx)) + + Simd4f fx2 = fx * fx; + + Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) + + fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f))); + Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2); + + Simd4f exp2fx = px * recip(qx - px); + exp2fx = simd4f(_1) + exp2fx + exp2fx; + + // exp2(ix) + + Simd4f exp2ix = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix, _mm_set1_epi32(0x7f)), 23)); + + return exp2fx * exp2ix; +} + +Simd4f log2(const Simd4f& v) +{ + // todo: fast approximate implementation like exp2 + Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2) + const float* ptr = array(v); + return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale; +} + +Simd4f dot3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f tmp = v0 * v1; + return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp); +} + +Simd4f cross3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f t0 = _mm_shuffle_ps(v0, v0, 0xc9); // w z y x -> w x z y + Simd4f t1 = _mm_shuffle_ps(v1, v1, 0xc9); + Simd4f tmp = v0 * t1 - t0 * v1; + return _mm_shuffle_ps(tmp, tmp, 0xc9); +} + +void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w) +{ + _MM_TRANSPOSE4_PS(x, y, z, w); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 == v1); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 == v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 > v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 > v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 >= v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 >= v1); +} + +int allTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v) == 0xf; +} + +int anyTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h new file mode 100644 index 00000000..d4a70a02 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Simd4i() const +{ + return _mm_set1_epi32(v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const +{ + return reinterpret_cast<const Simd4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const +{ + return _mm_set1_epi32(i); +} + +template <> +inline Simd4iFactory<detail::IntType<0> >::operator Simd4i() const +{ + return _mm_setzero_si128(); +} + +template <> +inline Simd4iFactory<const int*>::operator Simd4i() const +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(v)); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(v.ptr)); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4i>::operator Simd4i() const +{ + return _mm_andnot_si128(v, _mm_set1_epi32(0xffffffff)); +} + +Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v) +{ + return _mm_andnot_si128(complement.v, v); +} + +Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement) +{ + return _mm_andnot_si128(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpeq_epi32(v0, v1); +} + +Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmplt_epi32(v0, v1); +} + +Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpgt_epi32(v0, v1); +} + +ComplementExpr<Simd4i> operator~(const Simd4i& v) +{ + return ComplementExpr<Simd4i>(v); +} + +Simd4i operator&(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_and_si128(v0, v1); +} + +Simd4i operator|(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_or_si128(v0, v1); +} + +Simd4i operator^(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v0, v1); +} + +Simd4i operator<<(const Simd4i& v, int shift) +{ + return _mm_slli_epi32(v, shift); +} + +Simd4i operator>>(const Simd4i& v, int shift) +{ + return _mm_srli_epi32(v, shift); +} + +Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_add_epi32(v0, v1); +} + +Simd4i simdi::operator-(const Simd4i& v) +{ + return _mm_sub_epi32(_mm_setzero_si128(), v); +} + +Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_sub_epi32(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simd4i(const Simd4f& v) +{ + return _mm_castps_si128(v); +} + +int (&simdi::array(Simd4i& v))[4] +{ + return reinterpret_cast<int(&)[4]>(v); +} + +const int (&simdi::array(const Simd4i& v))[4] +{ + return reinterpret_cast<const int(&)[4]>(v); +} + +void store(int* ptr, const Simd4i& v) +{ + 
_mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, unsigned int offset, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4i splat(const Simd4i& v) +{ + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v1, _mm_and_si128(mask, _mm_xor_si128(v1, v0))); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator==(v0, v1)); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator>(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator>(v0, v1)); +} + +int allTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)) == 0xf; +} + +int anyTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h new file mode 100644 index 00000000..e54edde7 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// SSE + SSE2 (don't include intrin.h!) 
+#include <emmintrin.h> + +#if defined(_MSC_VER) + +typedef __m128 Simd4f; +typedef __m128i Simd4i; + +#else + +struct Simd4f +{ + Simd4f() + { + } + Simd4f(__m128 x) : m128(x) + { + } + + operator __m128&() + { + return m128; + } + operator const __m128&() const + { + return m128; + } + + private: + __m128 m128; +}; + +struct Simd4i +{ + Simd4i() + { + } + Simd4i(__m128i x) : m128i(x) + { + } + + operator __m128i&() + { + return m128i; + } + operator const __m128i&() const + { + return m128i; + } + + private: + __m128i m128i; +}; + +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h new file mode 100644 index 00000000..0750fcf5 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifdef PX_GCC_FAMILY +#include <xmmintrin.h> // _BitScanForward +#else +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> // _BitScanForward +#pragma warning(pop) +#endif + +namespace nvidia +{ +namespace cloth +{ + +uint32_t findBitSet(uint32_t mask) +{ +#if defined(_MSC_VER) + unsigned long result; + _BitScanForward(&result, unsigned long(mask)); + return result; +#else + return __builtin_ffs(mask) - 1; +#endif +} + +Simd4i intFloor(const Simd4f& v) +{ + Simd4i i = _mm_cvttps_epi32(v); + return simdi::operator-(i, _mm_srli_epi32(simd4i(v), 31)); +} + +Simd4i horizontalOr(Simd4i mask) +{ + Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y + return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z +} + +Gather<Simd4i>::Gather(const Simd4i& index) +{ + mSelectQ = _mm_srai_epi32(index << 29, 31); + mSelectD = _mm_srai_epi32(index << 30, 31); + mSelectW = _mm_srai_epi32(index << 31, 31); + mOutOfRange = simdi::operator>(index ^ sIntSignBit, sSignedMask); +} + +Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const +{ + // more efficient with _mm_shuffle_epi8 (SSSE3) + Simd4i lo = ptr[0], hi = ptr[1]; + Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo)); + Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo)); + Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi)); + Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi)); + Simd4i m0123 = select(mSelectD, m23, m01); + Simd4i m4567 = select(mSelectD, m67, m45); + return select(mSelectQ, m4567, m0123) & ~mOutOfRange; +} + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h new file mode 100644 index 00000000..382812bb --- /dev/null +++ 
b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma warning(push) +#pragma warning(disable:4127) // Disable the nag warning 'conditional expression is constant' + +template <bool useMultiplier> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + + __m128 stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + } + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + float* p0i = posIt + iIt[0] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p2i = posIt + iIt[4] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p3j = posIt + iIt[7] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v2i = _mm_load_ps(p2i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v3j = _mm_load_ps(p3j); + + __m128 h0ij = _mm_add_ps(v0j, _mm_mul_ps(v0i, sMinusOneXYZOneW)); + __m128 h1ij = _mm_add_ps(v1j, _mm_mul_ps(v1i, sMinusOneXYZOneW)); + __m128 h2ij = _mm_add_ps(v2j, _mm_mul_ps(v2i, sMinusOneXYZOneW)); + __m128 h3ij = _mm_add_ps(v3j, _mm_mul_ps(v3i, sMinusOneXYZOneW)); + + __m128 a = _mm_unpacklo_ps(h0ij, h2ij); + __m128 b = _mm_unpackhi_ps(h0ij, h2ij); + __m128 c = _mm_unpacklo_ps(h1ij, h3ij); + __m128 d = _mm_unpackhi_ps(h1ij, h3ij); + + __m128 hxij = _mm_unpacklo_ps(a, c); + __m128 hyij = _mm_unpackhi_ps(a, c); + __m128 hzij = _mm_unpacklo_ps(b, d); + __m128 vwij = _mm_unpackhi_ps(b, d); + + __m128 rij = _mm_load_ps(rIt); + __m128 e2ij = _mm_add_ps( + sEpsilon, _mm_add_ps(_mm_mul_ps(hxij, hxij), _mm_add_ps(_mm_mul_ps(hyij, hyij), _mm_mul_ps(hzij, hzij)))); + __m128 mask = _mm_cmpnle_ps(rij, sEpsilon); + __m128 erij = _mm_and_ps(_mm_sub_ps(sOne, _mm_mul_ps(rij, _mm_rsqrt_ps(e2ij))), mask); + + if(useMultiplier) + { + erij = _mm_sub_ps(erij, _mm_mul_ps(multiplier, _mm_max_ps(compressionLimit, _mm_min_ps(erij, stretchLimit)))); + } + __m128 exij = _mm_mul_ps(erij, _mm_mul_ps(stiffness, _mm_rcp_ps(_mm_add_ps(sEpsilon, vwij)))); + + __m128 exlo = _mm_and_ps(sMaskXY, exij); + __m128 exhi = _mm_andnot_ps(sMaskXY, exij); + + __m128 f0ij = _mm_mul_ps(h0ij, _mm_shuffle_ps(exlo, exlo, 0xc0)); + __m128 f1ij = _mm_mul_ps(h1ij, _mm_shuffle_ps(exlo, exlo, 0xd5)); + __m128 f2ij = _mm_mul_ps(h2ij, _mm_shuffle_ps(exhi, exhi, 0x2a)); + __m128 f3ij = _mm_mul_ps(h3ij, _mm_shuffle_ps(exhi, exhi, 0x3f)); + + __m128 u0i = _mm_add_ps(v0i, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0i, v0i, 0xff))); 
+ __m128 u0j = _mm_sub_ps(v0j, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0j, v0j, 0xff))); + __m128 u1i = _mm_add_ps(v1i, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1i, v1i, 0xff))); + __m128 u1j = _mm_sub_ps(v1j, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1j, v1j, 0xff))); + __m128 u2i = _mm_add_ps(v2i, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2i, v2i, 0xff))); + __m128 u2j = _mm_sub_ps(v2j, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2j, v2j, 0xff))); + __m128 u3i = _mm_add_ps(v3i, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3i, v3i, 0xff))); + __m128 u3j = _mm_sub_ps(v3j, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3j, v3j, 0xff))); + + _mm_store_ps(p0i, u0i); + _mm_store_ps(p0j, u0j); + _mm_store_ps(p1i, u1i); + _mm_store_ps(p1j, u1j); + _mm_store_ps(p2i, u2i); + _mm_store_ps(p2j, u2j); + _mm_store_ps(p3i, u3i); + _mm_store_ps(p3j, u3j); + } +} + +#if PX_X86 + +// clang-format:disable + +// asm blocks in static condition blocks don't get removed, specialize +template <> +void solveConstraints<false>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + __m128 htmp[4]; + float* ptmp[8]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */ + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */ + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */ + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */ + + movaps xmm7, sMinusOneXYZOneW + mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */ + mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */ + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */ + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */ + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */ + + mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */ + mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */ + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + + unpcklps xmm0, xmm2 /* a */ + unpckhps xmm4, xmm2 /* b */ + unpcklps xmm1, xmm3 /* c */ + unpckhps xmm5, xmm3 /* d */ + + movaps xmm2, xmm0 + movaps xmm6, xmm4 + + unpcklps xmm0, xmm1 /* hxij */ + unpckhps xmm2, xmm1 /* hyij */ + unpcklps xmm4, xmm5 /* hzij */ + unpckhps xmm6, xmm5 /* vwij */ + + movaps xmm7, sEpsilon + movaps xmm5, sOne + movaps xmm3, stiffness + movaps xmm1, XMMWORD PTR [edx] /* rij */ + + mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */ + mulps xmm2, xmm2 __asm addps xmm0, xmm2 + mulps xmm4, xmm4 __asm addps xmm0, xmm4 + + rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */ + cmpnleps xmm1, xmm7 /* mask */ + subps xmm5, xmm0 __asm andps xmm5, xmm1 + addps xmm6, xmm7 __asm rcpps xmm6, xmm6 + + mulps xmm6, xmm3 __asm 
mulps xmm6, xmm5 /* exij */ + + movaps xmm7, sMaskXY + andps xmm7, xmm6 /* exlo */ + xorps xmm6, xmm7 /* exhi */ + + movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */ + movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */ + movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */ + movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */ + + pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */ + pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */ + pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */ + pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */ + + mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +template <> +void solveConstraints<true>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + __m128 stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + __m128 compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + __m128 multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + __m128 htmp[4]; + float* ptmp[8]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */ + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */ + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */ + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */ + + movaps xmm7, sMinusOneXYZOneW + mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */ + mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* 
h1ij */ + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */ + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */ + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */ + + mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */ + mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */ + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + + unpcklps xmm0, xmm2 /* a */ + unpckhps xmm4, xmm2 /* b */ + unpcklps xmm1, xmm3 /* c */ + unpckhps xmm5, xmm3 /* d */ + + movaps xmm2, xmm0 + movaps xmm6, xmm4 + + unpcklps xmm0, xmm1 /* hxij */ + unpckhps xmm2, xmm1 /* hyij */ + unpcklps xmm4, xmm5 /* hzij */ + unpckhps xmm6, xmm5 /* vwij */ + + movaps xmm7, sEpsilon + movaps xmm5, sOne + movaps xmm3, stiffness + movaps xmm1, XMMWORD PTR [edx] /* rij */ + + mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */ + mulps xmm2, xmm2 __asm addps xmm0, xmm2 + mulps xmm4, xmm4 __asm addps xmm0, xmm4 + + rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */ + cmpnleps xmm1, xmm7 /* mask */ + subps xmm5, xmm0 __asm andps xmm5, xmm1 + addps xmm6, xmm7 __asm rcpps xmm6, xmm6 + + movaps xmm0, stretchLimit /* multiplier block */ + movaps xmm1, compressionLimit + movaps xmm2, multiplier + minps xmm0, xmm5 + maxps xmm1, xmm0 + mulps xmm2, xmm1 + subps xmm5, xmm2 + + mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */ + + movaps xmm7, sMaskXY + andps xmm7, xmm6 /* exlo */ + xorps xmm6, xmm7 /* exhi */ + + movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */ + movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */ + movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */ + movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */ + + pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */ + pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */ + pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */ + pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */ + + mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, 
xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +// clang-format:enable + +#endif + +#pragma warning(pop) |
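For readers who do not want to decode the intrinsics or the x86 inline-assembly specializations, the sketch below gives a scalar reading of what `solveConstraints` computes per distance constraint, based on the SSE path above. The particle layout (x, y, z with inverse mass in w), the `eps` value standing in for `sEpsilon`, and every name in the sketch are assumptions for illustration; the real constants and types live elsewhere in LowLevelCloth and are not part of this diff.

```cpp
// Scalar sketch of the per-constraint update in solveConstraints (illustrative only).
// Assumes particles are stored as x, y, z, w with w = inverse mass, matching how the
// SSE path scales the correction by the splatted w lane; 'eps' stands in for sEpsilon.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct Particle { float x, y, z, w; };   // w = inverse mass (assumption)

void solveConstraintsScalar(Particle* p, const float* rest, const float* restEnd,
                            const uint16_t* indices, float stiffness,
                            bool useMultiplier, float multiplier,
                            float stretchLimit, float compressionLimit)
{
    const float eps = 1.192092896e-07f;  // placeholder for sEpsilon (defined elsewhere)

    for(; rest != restEnd; ++rest, indices += 2)
    {
        Particle& pi = p[indices[0]];
        Particle& pj = p[indices[1]];

        // h = pj - pi; the SSE code also sums the w lanes, yielding wi + wj.
        float hx = pj.x - pi.x, hy = pj.y - pi.y, hz = pj.z - pi.z;
        float wSum = pi.w + pj.w;

        float e2 = eps + hx * hx + hy * hy + hz * hz;

        // er = 1 - rest/|h|, zeroed when the rest length is ~0 (the cmpnle mask).
        float er = *rest > eps ? 1.0f - *rest / std::sqrt(e2) : 0.0f;

        if(useMultiplier)
            er -= multiplier * std::max(compressionLimit, std::min(er, stretchLimit));

        float ex = er * stiffness / (eps + wSum);

        // Each endpoint moves along h, weighted by its own inverse mass.
        pi.x += hx * ex * pi.w;  pi.y += hy * ex * pi.w;  pi.z += hz * ex * pi.w;
        pj.x -= hx * ex * pj.w;  pj.y -= hy * ex * pj.w;  pj.z -= hz * ex * pj.w;
    }
}
```

The actual kernels batch four constraints per iteration and use the `rsqrtps`/`rcpps` approximations where this sketch divides exactly, so the numerical results differ slightly in the low bits.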