| author | Sheikh Dawood Abdul Ajees <[email protected]> | 2017-05-12 17:45:18 -0500 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <[email protected]> | 2017-05-12 17:45:18 -0500 |
| commit | 7f12de60542edc8f1c6683e6b4cdce8570e51456 (patch) | |
| tree | 0b5d533bae189ea286257b5ab78b635fafb19aa0 | /PhysX_3.4/Source/LowLevelCloth/src/neon |
| parent | PhysX 3.4, APEX 1.4 patch release @22017166 (diff) | |
| download | physx-3.4-7f12de60542edc8f1c6683e6b4cdce8570e51456.tar.xz, physx-3.4-7f12de60542edc8f1c6683e6b4cdce8570e51456.zip | |
PhysX 3.4, APEX 1.4 patch release @22121272
Diffstat (limited to 'PhysX_3.4/Source/LowLevelCloth/src/neon')
| -rw-r--r-- | PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4f.h | 585 |
|---|---|---|
| -rw-r--r-- | PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4i.h | 303 |
| -rw-r--r-- | PhysX_3.4/Source/LowLevelCloth/src/neon/SimdTypes.h | 71 |
3 files changed, 959 insertions, 0 deletions
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4f.h b/PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4f.h
new file mode 100644
index 00000000..550b45c6
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4f.h
@@ -0,0 +1,585 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+NV_SIMD_NAMESPACE_BEGIN
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4fZeroFactory::operator Simd4f() const
+{
+    return vdupq_n_u32(0);
+}
+
+Simd4fOneFactory::operator Simd4f() const
+{
+    return vdupq_n_f32(1.0f);
+}
+
+Simd4fScalarFactory::operator Simd4f() const
+{
+    return vdupq_n_f32(reinterpret_cast<const float32_t&>(value));
+}
+
+Simd4fTupleFactory::operator Simd4f() const
+{
+    return reinterpret_cast<const Simd4f&>(tuple);
+}
+
+Simd4fLoadFactory::operator Simd4f() const
+{
+    return vld1q_f32(static_cast<const float32_t*>(ptr));
+}
+
+Simd4fLoad3Factory::operator Simd4f() const
+{
+#if 0
+    float32x2_t xy = vld1_f32(ptr);
+    float32x2_t zz = vld1_dup_f32(ptr+2);
+    return vcombine_f32(xy, zz);
+#else
+    float fltArray[] = { ptr[0], ptr[1], ptr[2], 0.0 };
+    return vld1q_f32(static_cast<const float32_t*>(fltArray));
+#endif
+}
+
+Simd4fLoad3SetWFactory::operator Simd4f() const
+{
+#if 0
+    float32x2_t xy = vld1_f32(ptr);
+    float32x2_t zz = vld1_dup_f32(ptr+2);
+    return vcombine_f32(xy, zz);
+#else
+    float fltArray[] = { ptr[0], ptr[1], ptr[2], w };
+    return vld1q_f32(static_cast<const float32_t*>(fltArray));
+#endif
+}
+
+Simd4fAlignedLoadFactory::operator Simd4f() const
+{
+    return vld1q_f32(static_cast<const float32_t*>(ptr));
+}
+
+Simd4fOffsetLoadFactory::operator Simd4f() const
+{
+    return vld1q_f32(reinterpret_cast<const float32_t*>(reinterpret_cast<const char*>(ptr) + offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression templates
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline ComplementExpr<Simd4f>::operator Simd4f() const
+{
+    return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+template <>
+inline Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+template <>
+inline Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+ProductExpr::operator Simd4f() const
+{
+    return vmulq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator+(const ProductExpr& p, const Simd4f& v)
+{
+    return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const Simd4f& v, const ProductExpr& p)
+{
+    return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
+{
+    // cast calls operator Simd4f() which evaluates the other ProductExpr
+    return vmlaq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+Simd4f operator-(const Simd4f& v, const ProductExpr& p)
+{
+    return vmlsq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
+{
+    // cast calls operator Simd4f() which evaluates the other ProductExpr
+    return vmlsq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+    return vceqq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcltq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcleq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcgtq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcgeq_f32(v0.f4, v1.f4);
+}
+
+ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+    return ComplementExpr<Simd4f>(v);
+}
+
+Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+    return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+    return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+    return veorq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator<<(const Simd4f& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4f operator>>(const Simd4f& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
+{
+    return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
+{
+    return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4f operator+(const Simd4f& v)
+{
+    return v;
+}
+
+Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+    return vaddq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator-(const Simd4f& v)
+{
+    return vnegq_f32(v.f4);
+}
+
+Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+    return vsubq_f32(v0.f4, v1.f4);
+}
+
+ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
+{
+    return ProductExpr(v0, v1);
+}
+
+Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+    return v0 * vrecpeq_f32(v1.f4); // reciprocal estimate
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4f simd4f(const Simd4i& v)
+{
+    return v.u4;
+}
+
+Simd4f convert(const Simd4i& v)
+{
+    return vcvtq_f32_s32(v.i4);
+}
+
+float (&array(Simd4f& v))[4]
+{
+    return reinterpret_cast<float(&)[4]>(v);
+}
+
+const float (&array(const Simd4f& v))[4]
+{
+    return reinterpret_cast<const float(&)[4]>(v);
+}
+
+void store(float* ptr, Simd4f const& v)
+{
+    return vst1q_f32(reinterpret_cast<float32_t*>(ptr), v.f4);
+}
+
+void storeAligned(float* ptr, Simd4f const& v)
+{
+    return vst1q_f32(reinterpret_cast<float32_t*>(ptr), v.f4);
+}
+
+void store3(float* dst, const Simd4f& v)
+{
+    const float* __restrict src = array(v);
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+}
+
+void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+    return storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+template <size_t i>
+Simd4f splat(Simd4f const& v)
+{
+    return vdupq_n_f32(array(v)[i]);
+}
+
+Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+    return vbslq_f32(mask.u4, v0.f4, v1.f4);
+}
+
+Simd4f abs(const Simd4f& v)
+{
+    return vabsq_f32(v.f4);
+}
+
+Simd4f floor(const Simd4f& v)
+{
+    int32x4_t i = vcvtq_s32_f32(v.f4);
+    int32x4_t s = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(i), v.f4));
+    return vcvtq_f32_s32(vsubq_s32(i, vshrq_n_s32(s, 31)));
+}
+
+#if !defined max
+Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+    return vmaxq_f32(v0.f4, v1.f4);
+}
+#endif
+
+#if !defined min
+Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+    return vminq_f32(v0.f4, v1.f4);
+}
+#endif
+
+Simd4f recip(const Simd4f& v)
+{
+    return recip<0>(v);
+}
+
+template <int n>
+Simd4f recip(const Simd4f& v)
+{
+    Simd4f r = vrecpeq_f32(v.f4);
+    // n+1 newton iterations because initial approximation is crude
+    for(int i = 0; i <= n; ++i)
+        r = vrecpsq_f32(v.f4, r.f4) * r;
+    return r;
+}
+
+Simd4f sqrt(const Simd4f& v)
+{
+    return (v > gSimd4fZero) & (v * rsqrt(v));
+}
+
+Simd4f rsqrt(const Simd4f& v)
+{
+    return rsqrt<0>(v);
+}
+
+template <int n>
+Simd4f rsqrt(const Simd4f& v)
+{
+    Simd4f r = vrsqrteq_f32(v.f4);
+    // n+1 newton iterations because initial approximation is crude
+    for(int i = 0; i <= n; ++i)
+        r = vrsqrtsq_f32(vmulq_f32(v.f4, r.f4), r.f4) * r;
+    return r;
+}
+
+Simd4f exp2(const Simd4f& v)
+{
+    // http://www.netlib.org/cephes/
+
+    Simd4f limit = simd4f(127.4999f);
+    Simd4f x = min(max(-limit, v), limit);
+
+    // separate into integer and fractional part
+
+    Simd4f fx = x + simd4f(0.5f);
+    Simd4i ix = vsubq_s32(vcvtq_s32_f32(fx.f4), vreinterpretq_s32_u32(vshrq_n_u32(fx.u4, 31)));
+    fx = x - vcvtq_f32_s32(ix.i4);
+
+    // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
+
+    Simd4f fx2 = fx * fx;
+
+    Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
+                      fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
+    Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);
+
+    Simd4f exp2fx = px * recip(qx - px);
+    exp2fx = gSimd4fOne + exp2fx + exp2fx;
+
+    // exp2(ix)
+
+    Simd4f exp2ix = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ix.i4, vdupq_n_s32(0x7f)), 23));
+
+    return exp2fx * exp2ix;
+}
+
+Simd4f log2(const Simd4f& v)
+{
+    Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
+    const float* ptr = array(v);
+    return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale;
+}
+
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+    Simd4f tmp = v0 * v1;
+    return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+    float32x2_t x0_y0 = vget_low_f32(v0.f4);
+    float32x2_t z0_w0 = vget_high_f32(v0.f4);
+    float32x2_t x1_y1 = vget_low_f32(v1.f4);
+    float32x2_t z1_w1 = vget_high_f32(v1.f4);
+
+    float32x2_t y1_z1 = vext_f32(x1_y1, z1_w1, 1);
+    float32x2_t y0_z0 = vext_f32(x0_y0, z0_w0, 1);
+
+    float32x2_t z0x1_w0y1 = vmul_f32(z0_w0, x1_y1);
+    float32x2_t x0y1_y0z1 = vmul_f32(x0_y0, y1_z1);
+
+    float32x2_t y2_w2 = vmls_f32(z0x1_w0y1, x0_y0, z1_w1);
+    float32x2_t z2_x2 = vmls_f32(x0y1_y0z1, y0_z0, x1_y1);
+    float32x2_t x2_y2 = vext_f32(z2_x2, y2_w2, 1);
+
+    return vcombine_f32(x2_y2, z2_x2);
+}
+
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+#if NV_SIMD_INLINE_ASSEMBLER
+    asm volatile("vzip.f32 %q0, %q2 \n\t"
+                 "vzip.f32 %q1, %q3 \n\t"
+                 "vzip.f32 %q0, %q1 \n\t"
+                 "vzip.f32 %q2, %q3 \n\t"
+                 : "+w"(x.f4), "+w"(y.f4), "+w"(z.f4), "+w"(w.f4));
+#else
+    float32x4x2_t v0v1 = vzipq_f32(x.f4, z.f4);
+    float32x4x2_t v2v3 = vzipq_f32(y.f4, w.f4);
+    float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]);
+    float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]);
+
+    x = zip0.val[0];
+    y = zip0.val[1];
+    z = zip1.val[0];
+    w = zip1.val[1];
+#endif
+}
+
+void zip(Simd4f& v0, Simd4f& v1)
+{
+#if NV_SIMD_INLINE_ASSEMBLER
+    asm volatile("vzip.f32 %q0, %q1 \n\t" : "+w"(v0.f4), "+w"(v1.f4));
+#else
+    float32x4x2_t uzp = vzipq_f32(v0.f4, v1.f4);
+    v0 = uzp.val[0];
+    v1 = uzp.val[1];
+#endif
+}
+
+void unzip(Simd4f& v0, Simd4f& v1)
+{
+#if NV_SIMD_INLINE_ASSEMBLER
+    asm volatile("vuzp.f32 %q0, %q1 \n\t" : "+w"(v0.f4), "+w"(v1.f4));
+#else
+    float32x4x2_t uzp = vuzpq_f32(v0.f4, v1.f4);
+    v0 = uzp.val[0];
+    v1 = uzp.val[1];
+#endif
+}
+
+Simd4f swaphilo(const Simd4f& v)
+{
+    return vcombine_f32(vget_high_f32(v.f4), vget_low_f32(v.f4));
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return allTrue(v0 == v1);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return allTrue(outMask = v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return anyTrue(v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return anyTrue(outMask = v0 == v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+    return allTrue(v0 > v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return allTrue(outMask = v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+    return anyTrue(v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return anyTrue(outMask = v0 > v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return allTrue(v0 >= v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return allTrue(outMask = v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return anyTrue(v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return anyTrue(outMask = v0 >= v1);
+}
+
+int allTrue(const Simd4f& v)
+{
+#if NV_SIMD_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vand.u32 d0, d0, d1 \n\t"
+                 "vpmin.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.f4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+int anyTrue(const Simd4f& v)
+{
+#if NV_SIMD_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vorr.u32 d0, d0, d1 \n\t"
+                 "vpmax.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.f4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
+
+NV_SIMD_NAMESPACE_END
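Reviewer's note (not part of the commit): a minimal sketch of how the Simd4f implementations above are typically exercised. The `simd4f(...)` factories and the `gSimd4fZero`/`gSimd4fOne` globals are the ones this header itself references; they are declared in the shared, platform-independent SIMD front-end outside this diff, which is assumed here.

```cpp
// Sketch only: assumes the shared front-end (factories, globals) is included.
Simd4f a = simd4f(1.0f, 2.0f, 3.0f, 0.0f); // tuple factory
Simd4f b = simd4f(4.0f);                   // scalar broadcast (vdupq_n_f32)
Simd4f c = a * b + gSimd4fOne;             // ProductExpr fuses into a single vmlaq_f32
Simd4f n = c * rsqrt<1>(dot3(c, c));       // normalize, one extra Newton-Raphson step
float out[4];
store(out, n);                             // vst1q_f32
```

The expression template is the point of interest for review: `a * b` alone only builds a `ProductExpr`; the multiply is emitted when the expression is added to or subtracted from a `Simd4f` (fused into `vmlaq_f32`/`vmlsq_f32`) or when it is converted back to `Simd4f`.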
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4i.h b/PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4i.h
new file mode 100644
index 00000000..7cf4ec49
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/Simd4i.h
@@ -0,0 +1,303 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+NV_SIMD_NAMESPACE_BEGIN
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4iZeroFactory::operator Simd4i() const
+{
+    return vdupq_n_s32(0);
+}
+
+Simd4iScalarFactory::operator Simd4i() const
+{
+    return vdupq_n_s32(value);
+}
+
+Simd4iTupleFactory::operator Simd4i() const
+{
+    return reinterpret_cast<const Simd4i&>(tuple);
+}
+
+Simd4iLoadFactory::operator Simd4i() const
+{
+    return vld1q_s32(ptr);
+}
+
+Simd4iAlignedLoadFactory::operator Simd4i() const
+{
+    return vld1q_s32(ptr);
+}
+
+Simd4iOffsetLoadFactory::operator Simd4i() const
+{
+    return vld1q_s32(reinterpret_cast<const int*>(reinterpret_cast<const char*>(ptr) + offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline ComplementExpr<Simd4i>::operator Simd4i() const
+{
+    return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+template <>
+inline Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+template <>
+inline Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4i operator==(const Simd4i& v0, const Simd4i& v1)
+{
+    return vceqq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator<(const Simd4i& v0, const Simd4i& v1)
+{
+    return vcltq_s32(v0.i4, v1.i4);
+}
+
+Simd4i operator>(const Simd4i& v0, const Simd4i& v1)
+{
+    return vcgtq_s32(v0.i4, v1.i4);
+}
+
+ComplementExpr<Simd4i> operator~(const Simd4i& v)
+{
+    return ComplementExpr<Simd4i>(v);
+}
+
+Simd4i operator&(const Simd4i& v0, const Simd4i& v1)
+{
+    return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator|(const Simd4i& v0, const Simd4i& v1)
+{
+    return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator^(const Simd4i& v0, const Simd4i& v1)
+{
+    return veorq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator<<(const Simd4i& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4i operator>>(const Simd4i& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+Simd4i operator<<(const Simd4i& v, const Simd4i& shift)
+{
+    return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4i operator>>(const Simd4i& v, const Simd4i& shift)
+{
+    return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4i operator+(const Simd4i& v)
+{
+    return v;
+}
+
+Simd4i operator+(const Simd4i& v0, const Simd4i& v1)
+{
+    return vaddq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator-(const Simd4i& v)
+{
+    return vnegq_s32(v.i4);
+}
+
+Simd4i operator-(const Simd4i& v0, const Simd4i& v1)
+{
+    return vsubq_u32(v0.u4, v1.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4i simd4i(const Simd4f& v)
+{
+    return v.u4;
+}
+
+Simd4i truncate(const Simd4f& v)
+{
+    return vcvtq_s32_f32(v.f4);
+}
+
+int (&array(Simd4i& v))[4]
+{
+    return reinterpret_cast<int(&)[4]>(v);
+}
+
+const int (&array(const Simd4i& v))[4]
+{
+    return reinterpret_cast<const int(&)[4]>(v);
+}
+
+void store(int* ptr, const Simd4i& v)
+{
+    return vst1q_s32(ptr, v.i4);
+}
+
+void storeAligned(int* ptr, const Simd4i& v)
+{
+    vst1q_s32(ptr, v.i4);
+}
+
+void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+    return storeAligned(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+template <size_t i>
+Simd4i splat(Simd4i const& v)
+{
+    return vdupq_n_s32(array(v)[i]);
+}
+
+Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1)
+{
+    return vbslq_u32(mask.u4, v0.u4, v1.u4);
+}
+
+int allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+    return allTrue(operator==(v0, v1));
+}
+
+int allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return allTrue(outMask = operator==(v0, v1));
+}
+
+int anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+    return anyTrue(operator==(v0, v1));
+}
+
+int anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return anyTrue(outMask = operator==(v0, v1));
+}
+
+int allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+    return allTrue(operator>(v0, v1));
+}
+
+int allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return allTrue(outMask = operator>(v0, v1));
+}
+
+int anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+    return anyTrue(operator>(v0, v1));
+}
+
+int anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return anyTrue(outMask = operator>(v0, v1));
+}
+
+int allTrue(const Simd4i& v)
+{
+#if NV_SIMD_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vand.u32 d0, d0, d1 \n\t"
+                 "vpmin.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.u4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+int anyTrue(const Simd4i& v)
+{
+#if NV_SIMD_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vorr.u32 d0, d0, d1 \n\t"
+                 "vpmax.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.u4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
+
+NV_SIMD_NAMESPACE_END
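Reviewer's note (not part of the commit): comparisons in both headers return all-ones/all-zero lane masks, which feed `select` and the `allTrue`/`anyTrue` reductions. A small sketch under the same front-end assumption as above (`gSimd4fZero` and the `simd4f` factory come from the shared front-end; `simd4i` and `truncate` are defined in this diff):

```cpp
// Sketch only: per-lane masks, blending, and float/int views.
Simd4f x = simd4f(0.5f, -1.0f, 2.0f, 3.0f);
Simd4f mask = x > gSimd4fZero;                 // vcgtq_f32: 0xffffffff or 0 per lane
Simd4f clamped = select(mask, x, gSimd4fZero); // vbslq_f32 keeps the positive lanes
if(anyGreater(x, gSimd4fZero))
{
    Simd4i bits = simd4i(clamped); // reinterpret the float bits, no conversion
    Simd4i ints = truncate(x);     // vcvtq_s32_f32, rounds toward zero
    int out[4];
    store(out, ints);
    (void)bits;
}
```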
diff --git a/PhysX_3.4/Source/LowLevelCloth/src/neon/SimdTypes.h b/PhysX_3.4/Source/LowLevelCloth/src/neon/SimdTypes.h
new file mode 100644
index 00000000..a1d6820e
--- /dev/null
+++ b/PhysX_3.4/Source/LowLevelCloth/src/neon/SimdTypes.h
@@ -0,0 +1,71 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2017 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <arm_neon.h>
+
+NV_SIMD_NAMESPACE_BEGIN
+
+union Simd4f
+{
+    Simd4f()
+    {
+    }
+    Simd4f(const float32x4_t& v) : f4(v)
+    {
+    }
+#ifndef _M_ARM // all *32x4_t map to the same type
+    Simd4f(const uint32x4_t& v) : u4(v)
+    {
+    }
+#endif
+    float32x4_t f4;
+    uint32x4_t u4;
+    int32x4_t i4;
+};
+
+union Simd4i
+{
+    Simd4i()
+    {
+    }
+    Simd4i(const uint32x4_t& v) : u4(v)
+    {
+    }
+#ifndef _M_ARM // all *32x4_t map to the same type
+    Simd4i(const int32x4_t& v) : i4(v)
+    {
+    }
+#endif
+    uint32x4_t u4;
+    int32x4_t i4;
+};
+
+NV_SIMD_NAMESPACE_END
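Reviewer's note (not part of the commit): the unions in SimdTypes.h exist so that NEON intrinsics returning different element types can be mixed without explicit `vreinterpretq` casts; the extra constructors are compiled out under MSVC's `_M_ARM`, where all the `*32x4_t` types alias the same type. A sketch of the intended type punning, again assuming the shared front-end for `gSimd4fZero`:

```cpp
// Sketch only: float and mask views of the same register-sized value.
Simd4f v = vdupq_n_f32(2.0f);     // binds to the float32x4_t constructor
Simd4f m = vceqq_f32(v.f4, v.f4); // comparison result binds to the u4 view
                                  // (constructor guarded out under _M_ARM)
Simd4f r = select(m, v, gSimd4fZero);
float lanes[4];
store(lanes, r);                  // every lane holds 2.0f
```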