diff options
Diffstat (limited to 'NvCloth/src/avx/SwSolveConstraints.cpp')
| -rw-r--r-- | NvCloth/src/avx/SwSolveConstraints.cpp | 340 |
1 files changed, 340 insertions, 0 deletions
diff --git a/NvCloth/src/avx/SwSolveConstraints.cpp b/NvCloth/src/avx/SwSolveConstraints.cpp new file mode 100644 index 0000000..db250b7 --- /dev/null +++ b/NvCloth/src/avx/SwSolveConstraints.cpp @@ -0,0 +1,340 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> +#pragma warning(pop) + +#pragma warning(disable : 4127) // conditional expression is constant + +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; + +namespace avx +{ +__m128 sMaskYZW; +__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY; + +void initialize() +{ + sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0)); + sOne = _mm256_set1_ps(1.0f); + sEpsilon = _mm256_set1_ps(1.192092896e-07f); + sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f); + sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0)); +} + +template <uint32_t> +__m256 fmadd_ps(__m256 a, __m256 b, __m256 c) +{ + return _mm256_add_ps(_mm256_mul_ps(a, b), c); +} +template <uint32_t> +__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c) +{ + return _mm256_sub_ps(c, _mm256_mul_ps(a, b)); +} +#if _MSC_VER >= 1700 +template <> +__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c) +{ + return _mm256_fmadd_ps(a, b, c); +} +template <> +__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c) +{ + return _mm256_fnmadd_ps(a, b, c); +} +#endif + +template <uint32_t> +__m256 exp2(const __m256& v) +{ + // http://www.netlib.org/cephes/ + + + __m256 x = _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(-127.4999f), v), _mm256_set1_ps(127.4999f)); + + // separate into integer and fractional part + + __m256 fx = _mm256_add_ps(x,_mm256_set1_ps(0.5f)); + __m128 fx0 = _mm256_extractf128_ps(fx,0); + __m128 fx1 = _mm256_extractf128_ps(fx,1); + + __m128i ix0 = _mm_sub_epi32(_mm_cvttps_epi32(fx0), _mm_srli_epi32(_mm_castps_si128(fx0), 31)); + __m128i ix1 = _mm_sub_epi32(_mm_cvttps_epi32(fx1), _mm_srli_epi32(_mm_castps_si128(fx1), 31)); + __m256i ix = _mm256_loadu2_m128i(&ix0,&ix1); + fx = _mm256_sub_ps(x,_mm256_cvtepi32_ps(ix)); + + // exp2(fx) ~ 1 + 2 * P(fx) / (Q(fx) - P(fx)) + + __m256 fx2 = _mm256_mul_ps(fx,fx); + + __m256 px = _mm256_mul_ps(fx, + _mm256_add_ps(_mm256_add_ps( + _mm256_set1_ps(1.51390680115615096133e+3f), + _mm256_mul_ps(fx2, _mm256_set1_ps(2.02020656693165307700e+1f))), + _mm256_mul_ps(fx2, _mm256_set1_ps(2.30933477057345225087e-2f)) + ) + ); + + + + __m256 qx = _mm256_add_ps( + _mm256_set1_ps(4.36821166879210612817e+3f), + _mm256_mul_ps( + fx2, + _mm256_add_ps(_mm256_set1_ps(2.33184211722314911771e+2f), fx2) + ) + ); + + __m256 exp2fx = _mm256_mul_ps(px,_mm256_rcp_ps(_mm256_sub_ps(qx, px))); + exp2fx = _mm256_add_ps(_mm256_add_ps(sOne, exp2fx), exp2fx); + + // exp2(ix) + + __m128 exp2ix0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix0, _mm_set1_epi32(0x7f)), 23)); + __m128 exp2ix1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix1, _mm_set1_epi32(0x7f)), 23)); + __m256 exp2ix = _mm256_loadu2_m128((float*)&exp2ix0,(float*)&exp2ix1); + + return _mm256_mul_ps(exp2fx, exp2ix); +} +#if _MSC_VER >= 1700 +//AVX2 +template <> +__m256 exp2<2>(const __m256& v) +{ + // http://www.netlib.org/cephes/ + + + __m256 x = _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(-127.4999f), v), _mm256_set1_ps(127.4999f)); + + // separate into integer and fractional part + + __m256 fx = _mm256_add_ps(x,_mm256_set1_ps(0.5f)); + __m256i ix = _mm256_sub_epi32(_mm256_cvttps_epi32(fx), _mm256_srli_epi32(_mm256_castps_si256(fx), 31)); + fx = _mm256_sub_ps(x,_mm256_cvtepi32_ps(ix)); + + // exp2(fx) ~ 1 + 2 * P(fx) / (Q(fx) - P(fx)) + + __m256 fx2 = _mm256_mul_ps(fx,fx); + + __m256 px = _mm256_mul_ps(fx, + _mm256_add_ps(_mm256_add_ps( + _mm256_set1_ps(1.51390680115615096133e+3f), + _mm256_mul_ps(fx2, _mm256_set1_ps(2.02020656693165307700e+1f))), + _mm256_mul_ps(fx2, _mm256_set1_ps(2.30933477057345225087e-2f)) + ) + ); + + + + __m256 qx = _mm256_add_ps( + _mm256_set1_ps(4.36821166879210612817e+3f), + _mm256_mul_ps( + fx2, + _mm256_add_ps(_mm256_set1_ps(2.33184211722314911771e+2f), fx2) + ) + ); + + __m256 exp2fx = _mm256_mul_ps(px,_mm256_rcp_ps(_mm256_sub_ps(qx, px))); + exp2fx = _mm256_add_ps(_mm256_add_ps(sOne, exp2fx), exp2fx); + + // exp2(ix) + + __m256 exp2ix = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ix, _mm256_set1_epi32(0x7f)), 23)); + + return _mm256_mul_ps(exp2fx, exp2ix); +} +#endif + +// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster +template <bool useMultiplier, uint32_t avx> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict stIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, const __m128& stiffnessEtc, const __m128& stiffnessExponent) +{ + __m256 stiffness, stretchLimit, compressionLimit, multiplier; + + if (useMultiplier) + { + // A0 A1 A2 A3 B0 B1 B2 B3 + // (stiffness, multiplier, compressionLimit, stretchLimit, stiffness, multiplier, compressionLimit, stretchLimit) + // float*[0], float*[1], etc.. + stiffness = _mm256_broadcast_ps(&stiffnessEtc); + stretchLimit = _mm256_permute_ps(stiffness, 0xff); //(A3, A3, A3, A3, B3, B3, B3, B3) + compressionLimit = _mm256_permute_ps(stiffness, 0xaa); //(A2, A2, A2, A2, B2, B2, B2, B2) + multiplier = _mm256_permute_ps(stiffness, 0x55); //(A1, A1, A1, A1, B1, B1, B1, B1) + stiffness = _mm256_permute_ps(stiffness, 0x00); //(A0, A0, A0, A0, B0, B0, B0, B0) + } + else + { + stiffness = _mm256_broadcast_ss((const float*)&stiffnessEtc); // (float*[0], float*[0], ..., float*[0]); + } + + bool useStiffnessPerConstraint = stIt!=nullptr; + + for (; rIt < rEnd; rIt += 8, iIt += 16, stIt += 8) + { + float* p0i = posIt + iIt[0] * 4; + float* p4i = posIt + iIt[8] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p4j = posIt + iIt[9] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p5i = posIt + iIt[10] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p5j = posIt + iIt[11] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v4i = _mm_load_ps(p4i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v4j = _mm_load_ps(p4j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v5i = _mm_load_ps(p5i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v5j = _mm_load_ps(p5j); + + __m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1); + __m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1); + __m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1); + __m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1); + + __m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j); + __m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j); + + float* p2i = posIt + iIt[4] * 4; + float* p6i = posIt + iIt[12] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p6j = posIt + iIt[13] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p7i = posIt + iIt[14] * 4; + float* p3j = posIt + iIt[7] * 4; + float* p7j = posIt + iIt[15] * 4; + + __m128 v2i = _mm_load_ps(p2i); + __m128 v6i = _mm_load_ps(p6i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v6j = _mm_load_ps(p6j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v7i = _mm_load_ps(p7i); + __m128 v3j = _mm_load_ps(p3j); + __m128 v7j = _mm_load_ps(p7j); + + __m256 v26i = _mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1); + __m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1); + __m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1); + __m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1); + + __m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j); + __m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j); + + __m256 a = _mm256_unpacklo_ps(h04ij, h26ij); + __m256 b = _mm256_unpackhi_ps(h04ij, h26ij); + __m256 c = _mm256_unpacklo_ps(h15ij, h37ij); + __m256 d = _mm256_unpackhi_ps(h15ij, h37ij); + + __m256 hxij = _mm256_unpacklo_ps(a, c); + __m256 hyij = _mm256_unpackhi_ps(a, c); + __m256 hzij = _mm256_unpacklo_ps(b, d); + __m256 vwij = _mm256_unpackhi_ps(b, d); + + __m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon))); + + __m256 rij = _mm256_load_ps(rIt); + __m256 stij = useStiffnessPerConstraint?_mm256_sub_ps(sOne, exp2<avx>(_mm256_mul_ps(_mm256_load_ps(stIt),_mm256_broadcast_ps(&stiffnessExponent)))):stiffness; + __m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ); + __m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask); + + if (useMultiplier) + { + erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij); + } + + __m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stij, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij)))); + + // replace these two instructions with _mm_maskstore_ps below? + __m256 exlo = _mm256_and_ps(sMaskXY, exij); + __m256 exhi = _mm256_andnot_ps(sMaskXY, exij); + + __m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0)); + __m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i); + __m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j); + + _mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0)); + _mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0)); + _mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1)); + _mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1)); + + __m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5)); + __m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i); + __m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j); + + _mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0)); + _mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0)); + _mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1)); + _mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1)); + + __m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a)); + __m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i); + __m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j); + + _mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0)); + _mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0)); + _mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1)); + _mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1)); + + __m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f)); + __m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i); + __m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j); + + _mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0)); + _mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0)); + _mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1)); + _mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1)); + } + + _mm256_zeroupper(); +} + + +template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict, + const uint16_t* __restrict, const __m128&, const __m128&); + +template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict, + const uint16_t* __restrict, const __m128&, const __m128&); + +template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict, + const uint16_t* __restrict, const __m128&, const __m128&); + +template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict, const float* __restrict, + const uint16_t* __restrict, const __m128&, const __m128&); + + +} // namespace avx |