diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /PhysX_3.4/Source/LowLevelCloth/src/sse2 | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src/sse2')
| -rw-r--r-- | PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h | 92 | ||||
| -rw-r--r-- | PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h | 392 |
2 files changed, 484 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h new file mode 100644 index 00000000..85e33c3c --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwCollisionHelpers.h @@ -0,0 +1,92 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#if PX_GCC_FAMILY +#include <xmmintrin.h> // _BitScanForward +#else +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> // _BitScanForward +#pragma warning(pop) +#endif + +namespace physx +{ +namespace cloth +{ + +uint32_t findBitSet(uint32_t mask) +{ +#if defined(_MSC_VER) + unsigned long result; + _BitScanForward(&result, unsigned long(mask)); + return result; +#else + return __builtin_ffs(mask) - 1; +#endif +} + +Simd4i intFloor(const Simd4f& v) +{ + Simd4i i = _mm_cvttps_epi32(v); + return _mm_sub_epi32(i, _mm_srli_epi32(simd4i(v), 31)); +} + +Simd4i horizontalOr(const Simd4i& mask) +{ + Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y + return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z +} + +Gather<Simd4i>::Gather(const Simd4i& index) +{ + mSelectQ = _mm_srai_epi32(index << 29, 31); + mSelectD = _mm_srai_epi32(index << 30, 31); + mSelectW = _mm_srai_epi32(index << 31, 31); + mOutOfRange = (index ^ sIntSignBit) > sSignedMask; +} + +Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const +{ + // more efficient with _mm_shuffle_epi8 (SSSE3) + Simd4i lo = ptr[0], hi = ptr[1]; + Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo)); + Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo)); + Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi)); + Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi)); + Simd4i m0123 = select(mSelectD, m23, m01); + Simd4i m4567 = select(mSelectD, m67, m45); + return select(mSelectQ, m4567, m0123) & ~mOutOfRange; +} + +} // namespace cloth +} // namespace physx diff --git a/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h new file mode 100644 index 00000000..cb141be5 --- /dev/null +++ b/PhysX_3.4/Source/LowLevelCloth/src/sse2/SwSolveConstraints.h @@ -0,0 +1,392 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. +// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2016 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +template <bool useMultiplier> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + + __m128 stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + } + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + float* p0i = posIt + iIt[0] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p2i = posIt + iIt[4] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p3j = posIt + iIt[7] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v2i = _mm_load_ps(p2i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v3j = _mm_load_ps(p3j); + + __m128 h0ij = _mm_add_ps(v0j, _mm_mul_ps(v0i, sMinusOneXYZOneW)); + __m128 h1ij = _mm_add_ps(v1j, _mm_mul_ps(v1i, sMinusOneXYZOneW)); + __m128 h2ij = _mm_add_ps(v2j, _mm_mul_ps(v2i, sMinusOneXYZOneW)); + __m128 h3ij = _mm_add_ps(v3j, _mm_mul_ps(v3i, sMinusOneXYZOneW)); + + __m128 a = _mm_unpacklo_ps(h0ij, h2ij); + __m128 b = _mm_unpackhi_ps(h0ij, h2ij); + __m128 c = _mm_unpacklo_ps(h1ij, h3ij); + __m128 d = _mm_unpackhi_ps(h1ij, h3ij); + + __m128 hxij = _mm_unpacklo_ps(a, c); + __m128 hyij = _mm_unpackhi_ps(a, c); + __m128 hzij = _mm_unpacklo_ps(b, d); + __m128 vwij = _mm_unpackhi_ps(b, d); + + __m128 rij = _mm_load_ps(rIt); + __m128 e2ij = _mm_add_ps(gSimd4fEpsilon, _mm_add_ps(_mm_mul_ps(hxij, hxij), + _mm_add_ps(_mm_mul_ps(hyij, hyij), _mm_mul_ps(hzij, hzij)))); + __m128 mask = _mm_cmpnle_ps(rij, gSimd4fEpsilon); + __m128 erij = _mm_and_ps(_mm_sub_ps(sOne, _mm_mul_ps(rij, _mm_rsqrt_ps(e2ij))), mask); + + if(useMultiplier) + { + erij = _mm_sub_ps(erij, _mm_mul_ps(multiplier, _mm_max_ps(compressionLimit, _mm_min_ps(erij, stretchLimit)))); + } + __m128 exij = _mm_mul_ps(erij, _mm_mul_ps(stiffness, _mm_rcp_ps(_mm_add_ps(gSimd4fEpsilon, vwij)))); + + __m128 exlo = _mm_and_ps(sMaskXY, exij); + __m128 exhi = _mm_andnot_ps(sMaskXY, exij); + + __m128 f0ij = _mm_mul_ps(h0ij, _mm_shuffle_ps(exlo, exlo, 0xc0)); + __m128 f1ij = _mm_mul_ps(h1ij, _mm_shuffle_ps(exlo, exlo, 0xd5)); + __m128 f2ij = _mm_mul_ps(h2ij, _mm_shuffle_ps(exhi, exhi, 0x2a)); + __m128 f3ij = _mm_mul_ps(h3ij, _mm_shuffle_ps(exhi, exhi, 0x3f)); + + __m128 u0i = _mm_add_ps(v0i, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0i, v0i, 0xff))); + __m128 u0j = _mm_sub_ps(v0j, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0j, v0j, 0xff))); + __m128 u1i = _mm_add_ps(v1i, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1i, v1i, 0xff))); + __m128 u1j = _mm_sub_ps(v1j, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1j, v1j, 0xff))); + __m128 u2i = _mm_add_ps(v2i, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2i, v2i, 0xff))); + __m128 u2j = _mm_sub_ps(v2j, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2j, v2j, 0xff))); + __m128 u3i = _mm_add_ps(v3i, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3i, v3i, 0xff))); + __m128 u3j = _mm_sub_ps(v3j, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3j, v3j, 0xff))); + + _mm_store_ps(p0i, u0i); + _mm_store_ps(p0j, u0j); + _mm_store_ps(p1i, u1i); + _mm_store_ps(p1j, u1j); + _mm_store_ps(p2i, u2i); + _mm_store_ps(p2j, u2j); + _mm_store_ps(p3i, u3i); + _mm_store_ps(p3j, u3j); + } +} + +#if PX_X86 + +// clang-format:disable + +// asm blocks in static condition blocks don't get removed, specialize +template <> +void solveConstraints<false>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + __m128 sEpsilon = gSimd4fEpsilon; + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + __m128 htmp[4]; + float* ptmp[8]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */ + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */ + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */ + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */ + + movaps xmm7, sMinusOneXYZOneW + mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */ + mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */ + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */ + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */ + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */ + + mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */ + mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */ + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + + unpcklps xmm0, xmm2 /* a */ + unpckhps xmm4, xmm2 /* b */ + unpcklps xmm1, xmm3 /* c */ + unpckhps xmm5, xmm3 /* d */ + + movaps xmm2, xmm0 + movaps xmm6, xmm4 + + unpcklps xmm0, xmm1 /* hxij */ + unpckhps xmm2, xmm1 /* hyij */ + unpcklps xmm4, xmm5 /* hzij */ + unpckhps xmm6, xmm5 /* vwij */ + + movaps xmm7, sEpsilon + movaps xmm5, sOne + movaps xmm3, stiffness + movaps xmm1, XMMWORD PTR [edx] /* rij */ + + mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */ + mulps xmm2, xmm2 __asm addps xmm0, xmm2 + mulps xmm4, xmm4 __asm addps xmm0, xmm4 + + rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */ + cmpnleps xmm1, xmm7 /* mask */ + subps xmm5, xmm0 __asm andps xmm5, xmm1 + addps xmm6, xmm7 __asm rcpps xmm6, xmm6 + + mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */ + + movaps xmm7, sMaskXY + andps xmm7, xmm6 /* exlo */ + xorps xmm6, xmm7 /* exhi */ + + movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */ + movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */ + movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */ + movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */ + + pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */ + pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */ + pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */ + pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */ + + mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +template <> +void solveConstraints<true>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + __m128 sEpsilon = gSimd4fEpsilon; + __m128 stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + __m128 compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + __m128 multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + __m128 htmp[4]; + float* ptmp[8]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */ + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */ + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */ + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */ + + movaps xmm7, sMinusOneXYZOneW + mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */ + mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */ + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */ + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */ + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */ + + mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */ + mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */ + + movaps xmm4, xmm0 + movaps xmm5, xmm1 + + unpcklps xmm0, xmm2 /* a */ + unpckhps xmm4, xmm2 /* b */ + unpcklps xmm1, xmm3 /* c */ + unpckhps xmm5, xmm3 /* d */ + + movaps xmm2, xmm0 + movaps xmm6, xmm4 + + unpcklps xmm0, xmm1 /* hxij */ + unpckhps xmm2, xmm1 /* hyij */ + unpcklps xmm4, xmm5 /* hzij */ + unpckhps xmm6, xmm5 /* vwij */ + + movaps xmm7, sEpsilon + movaps xmm5, sOne + movaps xmm3, stiffness + movaps xmm1, XMMWORD PTR [edx] /* rij */ + + mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */ + mulps xmm2, xmm2 __asm addps xmm0, xmm2 + mulps xmm4, xmm4 __asm addps xmm0, xmm4 + + rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */ + cmpnleps xmm1, xmm7 /* mask */ + subps xmm5, xmm0 __asm andps xmm5, xmm1 + addps xmm6, xmm7 __asm rcpps xmm6, xmm6 + + movaps xmm0, stretchLimit /* multiplier block */ + movaps xmm1, compressionLimit + movaps xmm2, multiplier + minps xmm0, xmm5 + maxps xmm1, xmm0 + mulps xmm2, xmm1 + subps xmm5, xmm2 + + mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */ + + movaps xmm7, sMaskXY + andps xmm7, xmm6 /* exlo */ + xorps xmm6, xmm7 /* exhi */ + + movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */ + movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */ + movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */ + movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */ + + pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */ + pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */ + pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */ + pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */ + + mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +// clang-format:enable + +#endif |