path: root/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp
author     git perforce import user <a@b>  2016-10-25 12:29:14 -0600
committer  Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>  2016-10-25 18:56:37 -0500
commit     3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree       fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp')
-rw-r--r--  PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp  932
1 file changed, 932 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp b/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp
new file mode 100644
index 00000000..b242aaba
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/avx/SwSolveConstraints.cpp
@@ -0,0 +1,932 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS." NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#include <intrin.h>
+#pragma warning(pop)
+
+#pragma warning(disable : 4127) // conditional expression is constant
+
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+
+namespace avx
+{
+__m128 sMaskYZW;
+__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY;
+
+void initialize()
+{
+ sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0)); // selects the y, z, w lanes
+ sOne = _mm256_set1_ps(1.0f);
+ sEpsilon = _mm256_set1_ps(1.192092896e-07f); // FLT_EPSILON
+ sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f); // negates xyz, keeps w
+ sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0)); // selects the x, y lanes
+}
+
+template <uint32_t>
+__m256 fmadd_ps(__m256 a, __m256 b, __m256 c)
+{
+ return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+}
+template <uint32_t>
+__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c)
+{
+ return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
+}
+#if _MSC_VER >= 1700
+template <>
+__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c)
+{
+ return _mm256_fmadd_ps(a, b, c);
+}
+template <>
+__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c)
+{
+ return _mm256_fnmadd_ps(a, b, c);
+}
+#endif
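+
+// Added commentary (not in the original source): the <avx> template argument
+// is a compile-time dispatch between instruction sets; the _MSC_VER >= 1700
+// guard corresponds to Visual Studio 2012, the first MSVC toolchain with
+// FMA3 support. A hypothetical caller selects the variant like this:
+//   __m256 r1 = fmadd_ps<1>(a, b, c); // AVX: mul + add, two roundings
+//   __m256 r2 = fmadd_ps<2>(a, b, c); // AVX2: single fused multiply-add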
+
+// Roughly the same performance as SSE2 intrinsics; the asm version below is about 10% faster.
+template <bool useMultiplier, uint32_t avx>
+void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
+ const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness, stretchLimit, compressionLimit, multiplier;
+
+ if(useMultiplier)
+ {
+ stiffness = _mm256_broadcast_ps(&stiffnessRef);
+ stretchLimit = _mm256_permute_ps(stiffness, 0xff);
+ compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
+ multiplier = _mm256_permute_ps(stiffness, 0x55);
+ stiffness = _mm256_permute_ps(stiffness, 0x00);
+ }
+ else
+ {
+ stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
+ }
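+
+ // Added commentary: _mm256_permute_ps with immediates 0x00/0x55/0xaa/0xff
+ // broadcasts the x/y/z/w lane of each 128-bit half, unpacking the four
+ // solver parameters packed into stiffnessRef into full-width vectors.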
+
+ for(; rIt < rEnd; rIt += 8, iIt += 16)
+ {
+ float* p0i = posIt + iIt[0] * 4;
+ float* p4i = posIt + iIt[8] * 4;
+ float* p0j = posIt + iIt[1] * 4;
+ float* p4j = posIt + iIt[9] * 4;
+ float* p1i = posIt + iIt[2] * 4;
+ float* p5i = posIt + iIt[10] * 4;
+ float* p1j = posIt + iIt[3] * 4;
+ float* p5j = posIt + iIt[11] * 4;
+
+ __m128 v0i = _mm_load_ps(p0i);
+ __m128 v4i = _mm_load_ps(p4i);
+ __m128 v0j = _mm_load_ps(p0j);
+ __m128 v4j = _mm_load_ps(p4j);
+ __m128 v1i = _mm_load_ps(p1i);
+ __m128 v5i = _mm_load_ps(p5i);
+ __m128 v1j = _mm_load_ps(p1j);
+ __m128 v5j = _mm_load_ps(p5j);
+
+ __m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1);
+ __m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1);
+ __m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1);
+ __m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1);
+
+ __m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j);
+ __m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j);
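+
+ // Added commentary: multiplying by sMinusOneXYZOneW folds two results into
+ // one vector: h.xyz = pj.xyz - pi.xyz (the constraint edge), while
+ // h.w = pi.w + pj.w (the two inverse masses summed from the w lanes).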
+
+ float* p2i = posIt + iIt[4] * 4;
+ float* p6i = posIt + iIt[12] * 4;
+ float* p2j = posIt + iIt[5] * 4;
+ float* p6j = posIt + iIt[13] * 4;
+ float* p3i = posIt + iIt[6] * 4;
+ float* p7i = posIt + iIt[14] * 4;
+ float* p3j = posIt + iIt[7] * 4;
+ float* p7j = posIt + iIt[15] * 4;
+
+ __m128 v2i = _mm_load_ps(p2i);
+ __m128 v6i = _mm_load_ps(p6i);
+ __m128 v2j = _mm_load_ps(p2j);
+ __m128 v6j = _mm_load_ps(p6j);
+ __m128 v3i = _mm_load_ps(p3i);
+ __m128 v7i = _mm_load_ps(p7i);
+ __m128 v3j = _mm_load_ps(p3j);
+ __m128 v7j = _mm_load_ps(p7j);
+
+ __m256 v26i = _mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1);
+ __m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1);
+ __m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1);
+ __m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1);
+
+ __m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j);
+ __m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j);
+
+ __m256 a = _mm256_unpacklo_ps(h04ij, h26ij);
+ __m256 b = _mm256_unpackhi_ps(h04ij, h26ij);
+ __m256 c = _mm256_unpacklo_ps(h15ij, h37ij);
+ __m256 d = _mm256_unpackhi_ps(h15ij, h37ij);
+
+ __m256 hxij = _mm256_unpacklo_ps(a, c);
+ __m256 hyij = _mm256_unpackhi_ps(a, c);
+ __m256 hzij = _mm256_unpacklo_ps(b, d);
+ __m256 vwij = _mm256_unpackhi_ps(b, d);
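+
+ // Added commentary: the unpacklo/unpackhi passes above perform a 4x4
+ // transpose within each 128-bit lane, turning eight AoS edge vectors into
+ // SoA form: hxij/hyij/hzij hold the per-constraint x/y/z components and
+ // vwij the summed inverse masses.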
+
+ __m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon)));
+
+ __m256 rij = _mm256_load_ps(rIt);
+ __m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ);
+ __m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask);
+
+ if(useMultiplier)
+ {
+ erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij);
+ }
+
+ __m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stiffness, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij))));
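+
+ // Added commentary: erij approximates 1 - restLength/|h| (forced to zero by
+ // the rij > epsilon mask), and exij scales it by stiffness over the summed
+ // inverse masses; below, each particle receives a share of the correction
+ // proportional to its own inverse mass (the 0xff permute of its w lane).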
+
+ // replace these two instructions with _mm_maskstore_ps below?
+ __m256 exlo = _mm256_and_ps(sMaskXY, exij);
+ __m256 exhi = _mm256_andnot_ps(sMaskXY, exij);
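+
+ // Untested sketch of the maskstore idea from the comment above (assumption,
+ // not shipped code): an integer lane mask such as
+ //   const __m128i maskXY = _mm_setr_epi32(~0, ~0, 0, 0);
+ //   _mm_maskstore_ps(dst, maskXY, value); // writes only the x and y lanes
+ // could avoid the and/andnot split, at the cost of masked-store latency.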
+
+ __m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0));
+ __m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i);
+ __m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j);
+
+ _mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0));
+ _mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0));
+ _mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1));
+ _mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1));
+
+ __m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5));
+ __m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i);
+ __m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j);
+
+ _mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0));
+ _mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0));
+ _mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1));
+ _mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1));
+
+ __m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a));
+ __m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i);
+ __m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j);
+
+ _mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0));
+ _mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0));
+ _mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1));
+ _mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1));
+
+ __m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f));
+ __m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i);
+ __m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j);
+
+ _mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0));
+ _mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0));
+ _mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1));
+ _mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1));
+ }
+
+ _mm256_zeroupper();
+}
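+
+// Hypothetical call-site sketch (added commentary; the surrounding names are
+// assumed, not taken from this file): after avx::initialize(), one solver
+// pass over a batch of constraints might look like
+//   avx::solveConstraints<true, 2>(positions, rBegin, rEnd, indices, limits);
+// where positions holds 16-byte-aligned float4 particles (xyz plus inverse
+// mass in w), each loop step consumes 8 rest lengths and 16 particle indices,
+// and limits packs stiffness, multiplier and the compression/stretch limits
+// into one __m128.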
+
+#ifdef _M_IX86
+
+// clang-format off
+
+/* Full template specializations of the functions above, hand-written in inline assembly */
+
+// AVX without useMultiplier
+template <>
+void solveConstraints<false, 1>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
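+
+ // Added commentary: 32-bit x86 exposes only eight ymm registers, so the asm
+ // body spills the loaded position vectors (vtmp), the edge vectors (htmp)
+ // and the particles' byte offsets (ptmp, index << 4) to scratch memory so
+ // the store pass can revisit them without recomputing addresses.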
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp ], ymm0 // h04ij
+ vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
+ vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
+ vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
+ vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+
+// AVX with useMultiplier
+template <>
+void solveConstraints<true, 1>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
+ __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
+ __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
+ __m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
+ stiffness = _mm256_permute_ps(stiffness, 0x00);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp ], ymm0 // h04ij
+ vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
+ vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
+ vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
+ vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmovaps ymm0, stretchLimit // multiplier block
+ vmovaps ymm1, compressionLimit
+ vmovaps ymm2, multiplier
+ vminps ymm0, ymm0, ymm5
+ vmaxps ymm1, ymm1, ymm0
+ vmulps ymm2, ymm2, ymm1
+ vsubps ymm5, ymm5, ymm2
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+
+#if _MSC_VER >= 1700
+// AVX2 without useMultiplier
+template <>
+void solveConstraints<false, 2>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp ], ymm2 // h04ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vfmadd213ps ymm4, ymm4, ymm7 // e2ij
+ vfmadd213ps ymm2, ymm2, ymm4
+ vfmadd213ps ymm0, ymm0, ymm2
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
+ vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+
+// AVX2 with useMultiplier
+template <>
+void solveConstraints<true, 2>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
+ __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
+ __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
+ __m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
+ stiffness = _mm256_permute_ps(stiffness, 0x00);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp ], ymm2 // h04ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vfmadd213ps ymm4, ymm4, ymm7 // e2ij
+ vfmadd213ps ymm2, ymm2, ymm4
+ vfmadd213ps ymm0, ymm0, ymm2
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
+ vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmovaps ymm0, stretchLimit // multiplier block
+ vmovaps ymm1, compressionLimit
+ vmovaps ymm2, multiplier
+ vminps ymm0, ymm0, ymm5
+ vmaxps ymm1, ymm1, ymm0
+ vfnmadd231ps ymm5, ymm1, ymm2
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+#endif // _MSC_VER >= 1700
+
+// clang-format on
+
+#else // _M_IX86
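+
+// Added commentary: MSVC does not support inline assembly when targeting x64,
+// so non-x86 builds explicitly instantiate the intrinsics-based templates
+// above instead of supplying asm specializations.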
+
+template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+#endif // _M_IX86
+
+} // namespace avx