// This code contains NVIDIA Confidential Information and is disclosed to you
// under a form of NVIDIA software license agreement provided separately to you.
//
// Notice
// NVIDIA Corporation and its licensors retain all intellectual property and
// proprietary rights in and to this software and related documentation and
// any modifications thereto. Any use, reproduction, disclosure, or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA Corporation is strictly prohibited.
//
// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
//
// Information and code furnished is believed to be accurate and reliable.
// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
// information or for any infringement of patents or other rights of third parties that may
// result from its use. No license is granted by implication or otherwise under any patent
// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
// This code supersedes and replaces all information previously supplied.
// NVIDIA Corporation products are not authorized for use as critical
// components in life support devices or systems without express written approval of
// NVIDIA Corporation.
//
// Copyright (c) 2008-2020 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma warning(push)
#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
#include <immintrin.h> // AVX/AVX2 intrinsics
#pragma warning(pop)

#pragma warning(disable : 4127) // conditional expression is constant

typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;

namespace avx
{
__m128 sMaskYZW;
__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY;

void initialize()
{
    sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0));
    sOne = _mm256_set1_ps(1.0f);
    sEpsilon = _mm256_set1_ps(1.192092896e-07f); // FLT_EPSILON
    sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f);
    sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0));
}

// The integer template parameter selects the code path: 1 = plain AVX
// (multiply + add), 2 = AVX2 with fused multiply-add (specialized below).
template <uint32_t>
__m256 fmadd_ps(__m256 a, __m256 b, __m256 c)
{
    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}

template <uint32_t>
__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c)
{
    return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
}

#if _MSC_VER >= 1700
template <>
__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c)
{
    return _mm256_fmadd_ps(a, b, c);
}
template <>
__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c)
{
    return _mm256_fnmadd_ps(a, b, c);
}
#endif

template <uint32_t>
__m256 exp2(const __m256& v)
{
    // http://www.netlib.org/cephes/

    __m256 x = _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(-127.4999f), v), _mm256_set1_ps(127.4999f));

    // separate into integer and fractional part
    __m256 fx = _mm256_add_ps(x, _mm256_set1_ps(0.5f));
    __m128 fx0 = _mm256_extractf128_ps(fx, 0);
    __m128 fx1 = _mm256_extractf128_ps(fx, 1);
    __m128i ix0 = _mm_sub_epi32(_mm_cvttps_epi32(fx0), _mm_srli_epi32(_mm_castps_si128(fx0), 31));
    __m128i ix1 = _mm_sub_epi32(_mm_cvttps_epi32(fx1), _mm_srli_epi32(_mm_castps_si128(fx1), 31));
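    // _mm_cvttps_epi32 truncates toward zero; subtracting the shifted-down sign bit
    // makes this effectively floor(x + 0.5), i.e. round-to-nearest, so fx = x - ix
    // below lands in [-0.5, 0.5] where the rational approximation is valid. Note
    // that _mm256_loadu2_m128i takes (hiaddr, loaddr), so ix1 is passed first to
    // fill the upper 128-bit half.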
    __m256i ix = _mm256_loadu2_m128i(&ix1, &ix0);
    fx = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ix));

    // exp2(fx) ~ 1 + 2 * P(fx) / (Q(fx) - P(fx))
    __m256 fx2 = _mm256_mul_ps(fx, fx);
    // P(fx) = fx * (P2 + P1*fx^2 + P0*fx^4), coefficients from cephes exp2
    __m256 px = _mm256_mul_ps(fx, _mm256_add_ps(_mm256_add_ps(_mm256_set1_ps(1.51390680115615096133e+3f),
                                                              _mm256_mul_ps(fx2, _mm256_set1_ps(2.02020656693165307700e+1f))),
                                                _mm256_mul_ps(_mm256_mul_ps(fx2, fx2), _mm256_set1_ps(2.30933477057345225087e-2f))));
    // Q(fx) = Q1 + Q0*fx^2 + fx^4
    __m256 qx = _mm256_add_ps(_mm256_set1_ps(4.36821166879210612817e+3f),
                              _mm256_mul_ps(fx2, _mm256_add_ps(_mm256_set1_ps(2.33184211722314911771e+2f), fx2)));
    __m256 exp2fx = _mm256_mul_ps(px, _mm256_rcp_ps(_mm256_sub_ps(qx, px)));
    exp2fx = _mm256_add_ps(_mm256_add_ps(sOne, exp2fx), exp2fx);

    // exp2(ix): build the float 2^ix directly from the exponent bits
    __m128 exp2ix0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix0, _mm_set1_epi32(0x7f)), 23));
    __m128 exp2ix1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix1, _mm_set1_epi32(0x7f)), 23));
    __m256 exp2ix = _mm256_loadu2_m128((float*)&exp2ix1, (float*)&exp2ix0);

    return _mm256_mul_ps(exp2fx, exp2ix);
}

#if _MSC_VER >= 1700 // AVX2
template <>
__m256 exp2<2>(const __m256& v)
{
    // http://www.netlib.org/cephes/

    __m256 x = _mm256_min_ps(_mm256_max_ps(_mm256_set1_ps(-127.4999f), v), _mm256_set1_ps(127.4999f));

    // separate into integer and fractional part
    __m256 fx = _mm256_add_ps(x, _mm256_set1_ps(0.5f));
    __m256i ix = _mm256_sub_epi32(_mm256_cvttps_epi32(fx), _mm256_srli_epi32(_mm256_castps_si256(fx), 31));
    fx = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ix));

    // exp2(fx) ~ 1 + 2 * P(fx) / (Q(fx) - P(fx))
    __m256 fx2 = _mm256_mul_ps(fx, fx);
    __m256 px = _mm256_mul_ps(fx, _mm256_add_ps(_mm256_add_ps(_mm256_set1_ps(1.51390680115615096133e+3f),
                                                              _mm256_mul_ps(fx2, _mm256_set1_ps(2.02020656693165307700e+1f))),
                                                _mm256_mul_ps(_mm256_mul_ps(fx2, fx2), _mm256_set1_ps(2.30933477057345225087e-2f))));
    __m256 qx = _mm256_add_ps(_mm256_set1_ps(4.36821166879210612817e+3f),
                              _mm256_mul_ps(fx2, _mm256_add_ps(_mm256_set1_ps(2.33184211722314911771e+2f), fx2)));
    __m256 exp2fx = _mm256_mul_ps(px, _mm256_rcp_ps(_mm256_sub_ps(qx, px)));
    exp2fx = _mm256_add_ps(_mm256_add_ps(sOne, exp2fx), exp2fx);

    // exp2(ix)
    __m256 exp2ix = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ix, _mm256_set1_epi32(0x7f)), 23));

    return _mm256_mul_ps(exp2fx, exp2ix);
}
#endif

// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster
template <bool useMultiplier, uint32_t avxLevel>
void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict stIt,
                      const float* __restrict rEnd, const uint16_t* __restrict iIt,
                      const __m128& stiffnessEtc, const __m128& stiffnessExponent)
{
    __m256 stiffness, stretchLimit, compressionLimit, multiplier;
    if (useMultiplier)
    {
        // A0 A1 A2 A3 B0 B1 B2 B3
        // (stiffness, multiplier, compressionLimit, stretchLimit,
        //  stiffness, multiplier, compressionLimit, stretchLimit)
        // float*[0], float*[1], etc..
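        // _mm256_permute_ps below applies the same 2-bit element selector to both
        // 128-bit halves, so 0xff replicates element 3 of each half (stretchLimit),
        // 0xaa element 2, 0x55 element 1, and 0x00 element 0.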
        stiffness = _mm256_broadcast_ps(&stiffnessEtc);
        stretchLimit = _mm256_permute_ps(stiffness, 0xff);     // (A3, A3, A3, A3, B3, B3, B3, B3)
        compressionLimit = _mm256_permute_ps(stiffness, 0xaa); // (A2, A2, A2, A2, B2, B2, B2, B2)
        multiplier = _mm256_permute_ps(stiffness, 0x55);       // (A1, A1, A1, A1, B1, B1, B1, B1)
        stiffness = _mm256_permute_ps(stiffness, 0x00);        // (A0, A0, A0, A0, B0, B0, B0, B0)
    }
    else
    {
        stiffness = _mm256_broadcast_ss((const float*)&stiffnessEtc); // (float*[0], float*[0], ..., float*[0])
    }

    bool useStiffnessPerConstraint = stIt != nullptr;

    // Eight distance constraints per iteration; particles are stored as xyzw
    // (position + inverse mass in w). stIt is advanced every iteration but only
    // dereferenced when useStiffnessPerConstraint is set.
    for (; rIt < rEnd; rIt += 8, iIt += 16, stIt += 8)
    {
        float* p0i = posIt + iIt[0] * 4;
        float* p4i = posIt + iIt[8] * 4;
        float* p0j = posIt + iIt[1] * 4;
        float* p4j = posIt + iIt[9] * 4;
        float* p1i = posIt + iIt[2] * 4;
        float* p5i = posIt + iIt[10] * 4;
        float* p1j = posIt + iIt[3] * 4;
        float* p5j = posIt + iIt[11] * 4;

        __m128 v0i = _mm_load_ps(p0i);
        __m128 v4i = _mm_load_ps(p4i);
        __m128 v0j = _mm_load_ps(p0j);
        __m128 v4j = _mm_load_ps(p4j);
        __m128 v1i = _mm_load_ps(p1i);
        __m128 v5i = _mm_load_ps(p5i);
        __m128 v1j = _mm_load_ps(p1j);
        __m128 v5j = _mm_load_ps(p5j);

        __m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1);
        __m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1);
        __m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1);
        __m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1);

        // h = vj - vi for xyz, and vi.w + vj.w (summed inverse masses) in w
        __m256 h04ij = fmadd_ps<avxLevel>(sMinusOneXYZOneW, v04i, v04j);
        __m256 h15ij = fmadd_ps<avxLevel>(sMinusOneXYZOneW, v15i, v15j);

        float* p2i = posIt + iIt[4] * 4;
        float* p6i = posIt + iIt[12] * 4;
        float* p2j = posIt + iIt[5] * 4;
        float* p6j = posIt + iIt[13] * 4;
        float* p3i = posIt + iIt[6] * 4;
        float* p7i = posIt + iIt[14] * 4;
        float* p3j = posIt + iIt[7] * 4;
        float* p7j = posIt + iIt[15] * 4;

        __m128 v2i = _mm_load_ps(p2i);
        __m128 v6i = _mm_load_ps(p6i);
        __m128 v2j = _mm_load_ps(p2j);
        __m128 v6j = _mm_load_ps(p6j);
        __m128 v3i = _mm_load_ps(p3i);
        __m128 v7i = _mm_load_ps(p7i);
        __m128 v3j = _mm_load_ps(p3j);
        __m128 v7j = _mm_load_ps(p7j);

        __m256 v26i = _mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1);
        __m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1);
        __m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1);
        __m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1);

        __m256 h26ij = fmadd_ps<avxLevel>(sMinusOneXYZOneW, v26i, v26j);
        __m256 h37ij = fmadd_ps<avxLevel>(sMinusOneXYZOneW, v37i, v37j);

        // 4x4 transpose per 128-bit lane: hx/hy/hz gather per-constraint edge
        // components, vwij the summed inverse masses
        __m256 a = _mm256_unpacklo_ps(h04ij, h26ij);
        __m256 b = _mm256_unpackhi_ps(h04ij, h26ij);
        __m256 c = _mm256_unpacklo_ps(h15ij, h37ij);
        __m256 d = _mm256_unpackhi_ps(h15ij, h37ij);
        __m256 hxij = _mm256_unpacklo_ps(a, c);
        __m256 hyij = _mm256_unpackhi_ps(a, c);
        __m256 hzij = _mm256_unpacklo_ps(b, d);
        __m256 vwij = _mm256_unpackhi_ps(b, d);

        // squared edge lengths (epsilon avoids rsqrt(0) on degenerate edges)
        __m256 e2ij = fmadd_ps<avxLevel>(hxij, hxij, fmadd_ps<avxLevel>(hyij, hyij, fmadd_ps<avxLevel>(hzij, hzij, sEpsilon)));
        __m256 rij = _mm256_load_ps(rIt);
        __m256 stij = useStiffnessPerConstraint
                          ? _mm256_sub_ps(sOne, exp2<avxLevel>(_mm256_mul_ps(_mm256_load_ps(stIt), _mm256_broadcast_ps(&stiffnessExponent))))
                          : stiffness;

        // relative length error 1 - r/|e|, masked to constraints with r > epsilon
        __m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ);
        __m256 erij = _mm256_and_ps(fnmadd_ps<avxLevel>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask);

        if (useMultiplier)
        {
            erij = fnmadd_ps<avxLevel>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij);
        }

        __m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stij, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij))));

        // replace these two instructions with _mm_maskstore_ps below?
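        // exij holds one correction scale per constraint. The and/andnot split below
        // leaves the factors for constraints 0,1 (and 4,5) in exlo and those for
        // constraints 2,3 (and 6,7) in exhi, with zeros elsewhere; each permute then
        // splats one factor across x/y/z while routing a zero into w, so the inverse
        // mass stored in w survives the position update unchanged.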
        __m256 exlo = _mm256_and_ps(sMaskXY, exij);
        __m256 exhi = _mm256_andnot_ps(sMaskXY, exij);

        __m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0));
        __m256 u04i = fmadd_ps<avxLevel>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i);
        __m256 u04j = fnmadd_ps<avxLevel>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j);
        _mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0));
        _mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0));
        _mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1));
        _mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1));

        __m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5));
        __m256 u15i = fmadd_ps<avxLevel>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i);
        __m256 u15j = fnmadd_ps<avxLevel>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j);
        _mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0));
        _mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0));
        _mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1));
        _mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1));

        __m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a));
        __m256 u26i = fmadd_ps<avxLevel>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i);
        __m256 u26j = fnmadd_ps<avxLevel>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j);
        _mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0));
        _mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0));
        _mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1));
        _mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1));

        __m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f));
        __m256 u37i = fmadd_ps<avxLevel>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i);
        __m256 u37j = fnmadd_ps<avxLevel>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j);
        _mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0));
        _mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0));
        _mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1));
        _mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1));
    }

    _mm256_zeroupper();
}

template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const float* __restrict, const uint16_t* __restrict, const __m128&, const __m128&);
template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const float* __restrict, const uint16_t* __restrict, const __m128&, const __m128&);
template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const float* __restrict, const uint16_t* __restrict, const __m128&, const __m128&);
template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const float* __restrict, const uint16_t* __restrict, const __m128&, const __m128&);

} // namespace avx
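// Hypothetical usage sketch (caller names below are assumed; they are not part of
// this translation unit). The dispatching solver is expected to call initialize()
// once, then pick the instantiation matching the detected CPU features:
//
//     avx::initialize();
//     if (cpuSupportsAvx2Fma)   // assumed runtime feature probe
//         avx::solveConstraints<true, 2>(particles, restLengths, perConstraintStiffness,
//                                        restLengths + 8 * numBatches, indices,
//                                        stiffnessEtc, stiffnessExponent);
//     else
//         avx::solveConstraints<true, 1>(particles, restLengths, perConstraintStiffness,
//                                        restLengths + 8 * numBatches, indices,
//                                        stiffnessEtc, stiffnessExponent);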