1 files changed, 340 insertions, 0 deletions
diff --git a/NvCloth/src/avx/SwSolveConstraints.cpp b/NvCloth/src/avx/SwSolveConstraints.cpp
new file mode 100644
index 0000000..db250b7
--- /dev/null
+++ b/NvCloth/src/avx/SwSolveConstraints.cpp
@@ -0,0 +1,340 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#include <intrin.h>
+#pragma warning(pop)
+
+#pragma warning(disable : 4127) // conditional expression is constant
+
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+
+namespace avx
+{
+__m128 sMaskYZW;
+__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY;
+
+void initialize()
+{
+	sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0));
+	sOne = _mm256_set1_ps(1.0f);
+	sEpsilon = _mm256_set1_ps(1.192092896e-07f);
+	sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f);
+	sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0));
+}
+
+template <uint32_t>
+__m256 fmadd_ps(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+}
+template <uint32_t>
+__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
+}
+#if _MSC_VER >= 1700
+template <>
+__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_fmadd_ps(a, b, c);
+}
+template <>
+__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c)
+{
+	return _mm256_fnmadd_ps(a, b, c);
+}
+#endif
+
+template <uint32_t>
+__m256 exp2(const __m256& v)
+{
+	// http://www.netlib.org/cephes/
+
+
+	__m256 x = _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(-127.4999f), v), _mm256_set1_ps(127.4999f));
+
+	// separate into integer and fractional part
+
+	__m256 fx = _mm256_add_ps(x,_mm256_set1_ps(0.5f));
+	__m128 fx0 = _mm256_extractf128_ps(fx,0);
+	__m128 fx1 = _mm256_extractf128_ps(fx,1);
+
+	__m128i ix0 = _mm_sub_epi32(_mm_cvttps_epi32(fx0), _mm_srli_epi32(_mm_castps_si128(fx0), 31));
+	__m128i ix1 = _mm_sub_epi32(_mm_cvttps_epi32(fx1), _mm_srli_epi32(_mm_castps_si128(fx1), 31));
+	__m256i ix = _mm256_loadu2_m128i(&ix0,&ix1);
+	fx = _mm256_sub_ps(x,_mm256_cvtepi32_ps(ix));
+
+	// exp2(fx) ~ 1 + 2 * P(fx) / (Q(fx) - P(fx))
+
+	__m256 fx2 = _mm256_mul_ps(fx,fx);
+
+	__m256 px = _mm256_mul_ps(fx,
+						_mm256_add_ps(_mm256_add_ps(
+							_mm256_set1_ps(1.51390680115615096133e+3f),
+							_mm256_mul_ps(fx2, _mm256_set1_ps(2.02020656693165307700e+1f))),
+							_mm256_mul_ps(fx2, _mm256_set1_ps(2.30933477057345225087e-2f))
+						)
+				);
+
+
+
+	__m256 qx = _mm256_add_ps(
+					_mm256_set1_ps(4.36821166879210612817e+3f),
+					_mm256_mul_ps(
+						fx2,
+						_mm256_add_ps(_mm256_set1_ps(2.33184211722314911771e+2f), fx2)
+					)
+				);
+
+	__m256 exp2fx = _mm256_mul_ps(px,_mm256_rcp_ps(_mm256_sub_ps(qx, px)));
+	exp2fx = _mm256_add_ps(_mm256_add_ps(sOne, exp2fx), exp2fx);
+
+	// exp2(ix)
+
+	__m128 exp2ix0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix0, _mm_set1_epi32(0x7f)), 23));
+	__m128 exp2ix1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix1, _mm_set1_epi32(0x7f)), 23));
+	__m256 exp2ix = _mm256_loadu2_m128((float*)&exp2ix0,(float*)&exp2ix1);
+
+	return _mm256_mul_ps(exp2fx, exp2ix);
+}
+#if _MSC_VER >= 1700
+//AVX2
+template <>
+__m256 exp2<2>(const __m256& v)
+{
+	// http://www.netlib.org/cephes/
+
+
+	__m256 x = _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(-127.4999f), v), _mm256_set1_ps(127.4999f));
+
+	// separate into integer and fractional part
+
+	__m256 fx = _mm256_add_ps(x,_mm256_set1_ps(0.5f));
+	__m256i ix = _mm256_sub_epi32(_mm256_cvttps_epi32(fx), _mm256_srli_epi32(_mm256_castps_si256(fx), 31));
+	fx = _mm256_sub_ps(x,_mm256_cvtepi32_ps(ix));
+
+	// exp2(fx) ~ 1 + 2 * P(fx) / (Q(fx) - P(fx))
+
+	__m256 fx2 = _mm256_mul_ps(fx,fx);
+
+	__m256 px = _mm256_mul_ps(fx,
+						_mm256_add_ps(_mm256_add_ps(
+							_mm256_set1_ps(1.51390680115615096133e+3f),
+							_mm256_mul_ps(fx2, _mm256_set1_ps(2.02020656693165307700e+1f))),
+							_mm256_mul_ps(fx2, _mm256_set1_ps(2.30933477057345225087e-2f))
+						)
+				);
+
+
+
+	__m256 qx = _mm256_add_ps(
+					_mm256_set1_ps(4.36821166879210612817e+3f),
+					_mm256_mul_ps(
+						fx2,
+						_mm256_add_ps(_mm256_set1_ps(2.33184211722314911771e+2f), fx2)
+					)
+				);
+
+	__m256 exp2fx = _mm256_mul_ps(px,_mm256_rcp_ps(_mm256_sub_ps(qx, px)));
+	exp2fx = _mm256_add_ps(_mm256_add_ps(sOne, exp2fx), exp2fx);
+
+	// exp2(ix)
+
+	__m256 exp2ix = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ix, _mm256_set1_epi32(0x7f)), 23));
+
+	return _mm256_mul_ps(exp2fx, exp2ix);
+}
+#endif
+
+// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster
+template <bool useMultiplier, uint32_t avx>
+void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict stIt, const float* __restrict rEnd,
+                      const uint16_t* __restrict iIt, const __m128& stiffnessEtc, const __m128& stiffnessExponent)
+{
+	__m256 stiffness, stretchLimit, compressionLimit, multiplier;
+
+	if (useMultiplier)
+	{
+		//	  A0          A1            A2              A3            B0          B1            B2              B3
+		// (stiffness, multiplier, compressionLimit, stretchLimit, stiffness, multiplier, compressionLimit, stretchLimit)
+		//  float*[0], float*[1], etc..
+		stiffness = _mm256_broadcast_ps(&stiffnessEtc);
+		stretchLimit = _mm256_permute_ps(stiffness, 0xff);		//(A3, A3, A3, A3, B3, B3, B3, B3)
+		compressionLimit = _mm256_permute_ps(stiffness, 0xaa);	//(A2, A2, A2, A2, B2, B2, B2, B2)
+		multiplier = _mm256_permute_ps(stiffness, 0x55);		//(A1, A1, A1, A1, B1, B1, B1, B1)
+		stiffness = _mm256_permute_ps(stiffness, 0x00);			//(A0, A0, A0, A0, B0, B0, B0, B0)
+	}
+	else
+	{
+		stiffness = _mm256_broadcast_ss((const float*)&stiffnessEtc); // (float*[0], float*[0], ..., float*[0]);
+	}
+
+	bool useStiffnessPerConstraint = stIt!=nullptr;
+
+	for (; rIt < rEnd; rIt += 8, iIt += 16, stIt += 8)
+	{
+		float* p0i = posIt + iIt[0] * 4;
+		float* p4i = posIt + iIt[8] * 4;
+		float* p0j = posIt + iIt[1] * 4;
+		float* p4j = posIt + iIt[9] * 4;
+		float* p1i = posIt + iIt[2] * 4;
+		float* p5i = posIt + iIt[10] * 4;
+		float* p1j = posIt + iIt[3] * 4;
+		float* p5j = posIt + iIt[11] * 4;
+
+		__m128 v0i = _mm_load_ps(p0i);
+		__m128 v4i = _mm_load_ps(p4i);
+		__m128 v0j = _mm_load_ps(p0j);
+		__m128 v4j = _mm_load_ps(p4j);
+		__m128 v1i = _mm_load_ps(p1i);
+		__m128 v5i = _mm_load_ps(p5i);
+		__m128 v1j = _mm_load_ps(p1j);
+		__m128 v5j = _mm_load_ps(p5j);
+
+		__m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1);
+		__m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1);
+		__m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1);
+		__m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1);
+
+		__m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j);
+		__m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j);
+
+		float* p2i = posIt + iIt[4] * 4;
+		float* p6i = posIt + iIt[12] * 4;
+		float* p2j = posIt + iIt[5] * 4;
+		float* p6j = posIt + iIt[13] * 4;
+		float* p3i = posIt + iIt[6] * 4;
+		float* p7i = posIt + iIt[14] * 4;
+		float* p3j = posIt + iIt[7] * 4;
+		float* p7j = posIt + iIt[15] * 4;
+
+		__m128 v2i = _mm_load_ps(p2i);
+		__m128 v6i = _mm_load_ps(p6i);
+		__m128 v2j = _mm_load_ps(p2j);
+		__m128 v6j = _mm_load_ps(p6j);
+		__m128 v3i = _mm_load_ps(p3i);
+		__m128 v7i = _mm_load_ps(p7i);
+		__m128 v3j = _mm_load_ps(p3j);
+		__m128 v7j = _mm_load_ps(p7j);
+
+		__m256 v26i = _mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1);
+		__m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1);
+		__m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1);
+		__m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1);
+
+		__m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j);
+		__m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j);
+
+		__m256 a = _mm256_unpacklo_ps(h04ij, h26ij);
+		__m256 b = _mm256_unpackhi_ps(h04ij, h26ij);
+		__m256 c = _mm256_unpacklo_ps(h15ij, h37ij);
+		__m256 d = _mm256_unpackhi_ps(h15ij, h37ij);
+
+		__m256 hxij = _mm256_unpacklo_ps(a, c);
+		__m256 hyij = _mm256_unpackhi_ps(a, c);
+		__m256 hzij = _mm256_unpacklo_ps(b, d);
+		__m256 vwij = _mm256_unpackhi_ps(b, d);
+
+		__m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon)));
+
+		__m256 rij = _mm256_load_ps(rIt);
+		__m256 stij = useStiffnessPerConstraint?_mm256_sub_ps(sOne, exp2<avx>(_mm256_mul_ps(_mm256_load_ps(stIt),_mm256_broadcast_ps(&stiffnessExponent)))):stiffness;
+		__m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ);
+		__m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask);
+
+		if (useMultiplier)
+		{
+			erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij);
+		}
+
+		__m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stij, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij))));
+
+		// replace these two instructions with _mm_maskstore_ps below?
+		__m256 exlo = _mm256_and_ps(sMaskXY, exij);
+		__m256 exhi = _mm256_andnot_ps(sMaskXY, exij);
+
+		__m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0));
+		__m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i);
+		__m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j);
+
+		_mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0));
+		_mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0));
+		_mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1));
+		_mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1));
+
+		__m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5));
+		__m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i);
+		__m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j);
+
+		_mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0));
+		_mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0));
+		_mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1));
+		_mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1));
+
+		__m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a));
+		__m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i);
+		__m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j);
+
+		_mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0));
+		_mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0));
+		_mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1));
+		_mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1));
+
+		__m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f));
+		__m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i);
+		__m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j);
+
+		_mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0));
+		_mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0));
+		_mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1));
+		_mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1));
+	}
+
+	_mm256_zeroupper();
+}
+
+
+template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict,
+                                         const uint16_t* __restrict, const __m128&, const __m128&);
+
+template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict,
+                                         const uint16_t* __restrict, const __m128&, const __m128&);
+
+template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict,
+                                         const uint16_t* __restrict, const __m128&, const __m128&);
+
+template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict,
+                                         const uint16_t* __restrict, const __m128&, const __m128&);
+
+
+} // namespace avx