| Field | Value | Date / Path |
|---|---|---|
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd | /APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz, physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon')
7 files changed, 987 insertions, 0 deletions
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp
new file mode 100644
index 00000000..01f1fb50
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwCollision.cpp"
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp
new file mode 100644
index 00000000..d272bb6d
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwSelfCollision.cpp"
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
new file mode 100644
index 00000000..068c900a
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwSolverKernel.cpp"
+
+#include <cpu-features.h>
+
+namespace nvidia
+{
+namespace cloth
+{
+bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator,
+                      IterationStateFactory& factory, PxProfileZone* profileZone)
+{
+    return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() &&
+           (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true);
+}
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
new file mode 100644
index 00000000..0c0b884c
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline Simd4fFactory<const float&>::operator Simd4f() const
+{
+    return vdupq_n_f32(reinterpret_cast<const float32_t&>(v));
+}
+
+inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
+{
+    return reinterpret_cast<const Simd4f&>(v);
+}
+
+template <int i>
+inline Simd4fFactory<detail::IntType<i> >::operator Simd4f() const
+{
+    return vdupq_n_u32(i);
+}
+
+template <>
+inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
+{
+    return vdupq_n_f32(1.0f);
+}
+
+template <>
+inline Simd4fFactory<const float*>::operator Simd4f() const
+{
+    return vld1q_f32((const float32_t*)v);
+}
+
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
+{
+    return vld1q_f32((const float32_t*)v.ptr);
+}
+
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
+{
+    return vld1q_f32(reinterpret_cast<const float32_t*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression templates
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline ComplementExpr<Simd4f>::operator Simd4f() const
+{
+    return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+ProductExpr::operator Simd4f() const
+{
+    return vmulq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator+(const ProductExpr& p, const Simd4f& v)
+{
+    return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const Simd4f& v, const ProductExpr& p)
+{
+    return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
+{
+    // cast calls operator Simd4f() which evaluates the other ProductExpr
+    return vmlaq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+Simd4f operator-(const Simd4f& v, const ProductExpr& p)
+{
+    return vmlsq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
+{
+    // cast calls operator Simd4f() which evaluates the other ProductExpr
+    return vmlsq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+    return vceqq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcltq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcleq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcgtq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+    return vcgeq_f32(v0.f4, v1.f4);
+}
+
+ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+    return ComplementExpr<Simd4f>(v);
+}
+
+Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+    return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+    return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+    return veorq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator<<(const Simd4f& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4f operator>>(const Simd4f& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
+{
+    return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
+{
+    return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4f operator+(const Simd4f& v)
+{
+    return v;
+}
+
+Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+    return vaddq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator-(const Simd4f& v)
+{
+    return vnegq_f32(v.f4);
+}
+
+Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+    return vsubq_f32(v0.f4, v1.f4);
+}
+
+ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
+{
+    return ProductExpr(v0, v1);
+}
+
+Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+    return v0 * vrecpeq_f32(v1.f4); // reciprocal estimate
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4f simd4f(const Simd4i& v)
+{
+    return v.u4;
+}
+
+float (&array(Simd4f& v))[4]
+{
+    return (float(&)[4])v;
+}
+
+const float (&array(const Simd4f& v))[4]
+{
+    return (const float(&)[4])v;
+}
+
+void store(float* ptr, Simd4f const& v)
+{
+    return vst1q_f32((float32_t*)ptr, v.f4);
+}
+
+void storeAligned(float* ptr, Simd4f const& v)
+{
+    return vst1q_f32((float32_t*)ptr, v.f4);
+}
+
+void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+    return storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+template <size_t i>
+Simd4f splat(Simd4f const& v)
+{
+    return vdupq_n_f32(array(v)[i]);
+}
+
+Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+    return vbslq_f32(mask.u4, v0.f4, v1.f4);
+}
+
+Simd4f abs(const Simd4f& v)
+{
+    return vabsq_f32(v.f4);
+}
+
+Simd4f floor(const Simd4f& v)
+{
+    int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
+    return vcvtq_f32_s32(vsubq_s32(vcvtq_s32_f32(v.f4), neg));
+}
+
+Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+    return vmaxq_f32(v0.f4, v1.f4);
+}
+
+Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+    return vminq_f32(v0.f4, v1.f4);
+}
+
+Simd4f recip(const Simd4f& v)
+{
+    return recipT<0>(v);
+}
+
+template <int n>
+Simd4f recipT(const Simd4f& v)
+{
+    Simd4f recipV = vrecpeq_f32(v.f4);
+    // n+1 newton iterations because initial approximation is crude
+    for(int i = 0; i <= n; ++i)
+        recipV = vrecpsq_f32(v.f4, recipV.f4) * recipV;
+    return recipV;
+}
+
+Simd4f sqrt(const Simd4f& v)
+{
+    return v * rsqrt(v);
+}
+
+Simd4f rsqrt(const Simd4f& v)
+{
+    return rsqrtT<0>(v);
+}
+
+template <int n>
+Simd4f rsqrtT(const Simd4f& v)
+{
+    Simd4f rsqrtV = vrsqrteq_f32(v.f4);
+    // n+1 newton iterations because initial approximation is crude
+    for(int i = 0; i <= n; ++i)
+        rsqrtV = vrsqrtsq_f32(vmulq_f32(v.f4, rsqrtV.f4), rsqrtV.f4) * rsqrtV;
+    return rsqrtV;
+}
+
+Simd4f exp2(const Simd4f& v)
+{
+    // http://www.netlib.org/cephes/
+
+    Simd4f limit = simd4f(127.4999f);
+    Simd4f x = min(max(-limit, v), limit);
+
+    // separate into integer and fractional part
+
+    Simd4f fx = x + simd4f(0.5f);
+    Simd4i ix = vsubq_s32(vcvtq_s32_f32(fx.f4), vreinterpretq_s32_u32(vshrq_n_u32(fx.u4, 31)));
+    fx = x - vcvtq_f32_s32(ix.i4);
+
+    // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
+
+    Simd4f fx2 = fx * fx;
+
+    Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
+                      fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
+    Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);
+
+    Simd4f exp2fx = px * recip(qx - px);
+    exp2fx = simd4f(_1) + exp2fx + exp2fx;
+
+    // exp2(ix)
+
+    Simd4f exp2ix = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ix.i4, vdupq_n_s32(0x7f)), 23));
+
+    return exp2fx * exp2ix;
+}
+
+Simd4f log2(const Simd4f& v)
+{
+    Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
+    const float* ptr = array(v);
+    return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale;
+}
+
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+    Simd4f tmp = v0 * v1;
+    return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+    float32x2_t x0_y0 = vget_low_f32(v0.f4);
+    float32x2_t z0_w0 = vget_high_f32(v0.f4);
+    float32x2_t x1_y1 = vget_low_f32(v1.f4);
+    float32x2_t z1_w1 = vget_high_f32(v1.f4);
+
+    float32x2_t y1_z1 = vext_f32(x1_y1, z1_w1, 1);
+    float32x2_t y0_z0 = vext_f32(x0_y0, z0_w0, 1);
+
+    float32x2_t z0x1_w0y1 = vmul_f32(z0_w0, x1_y1);
+    float32x2_t x0y1_y0z1 = vmul_f32(x0_y0, y1_z1);
+
+    float32x2_t y2_w2 = vmls_f32(z0x1_w0y1, x0_y0, z1_w1);
+    float32x2_t z2_x2 = vmls_f32(x0y1_y0z1, y0_z0, x1_y1);
+    float32x2_t x2_y2 = vext_f32(z2_x2, y2_w2, 1);
+
+    return vcombine_f32(x2_y2, z2_x2);
+}
+
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+#if NVMATH_INLINE_ASSEMBLER
+    asm volatile("vzip.f32 %q0, %q2 \n\t"
+                 "vzip.f32 %q1, %q3 \n\t"
+                 "vzip.f32 %q0, %q1 \n\t"
+                 "vzip.f32 %q2, %q3 \n\t"
+                 : "+w"(x.f4), "+w"(y.f4), "+w"(z.f4), "+w"(w.f4));
+#else
+    float32x4x2_t v0v1 = vzipq_f32(x.f4, z.f4);
+    float32x4x2_t v2v3 = vzipq_f32(y.f4, w.f4);
+    float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]);
+    float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]);

+    x = zip0.val[0];
+    y = zip0.val[1];
+    z = zip1.val[0];
+    w = zip1.val[1];
+#endif
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return allTrue(v0 == v1);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return allTrue(outMask = v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return anyTrue(v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return anyTrue(outMask = v0 == v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+    return allTrue(v0 > v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return allTrue(outMask = v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+    return anyTrue(v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return anyTrue(outMask = v0 > v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return allTrue(v0 >= v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return allTrue(outMask = v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+    return anyTrue(v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+    return anyTrue(outMask = v0 >= v1);
+}
+
+int allTrue(const Simd4f& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vand.u32 d0, d0, d1 \n\t"
+                 "vpmin.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.f4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+int anyTrue(const Simd4f& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vorr.u32 d0, d0, d1 \n\t"
+                 "vpmax.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.f4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
new file mode 100644
index 00000000..7a566256
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline Simd4iFactory<const int&>::operator Simd4i() const
+{
+    return vdupq_n_s32(v);
+}
+
+inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
+{
+    return reinterpret_cast<const Simd4i&>(v);
+}
+
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
+{
+    return vdupq_n_u32(i);
+}
+
+template <>
+inline Simd4iFactory<const int*>::operator Simd4i() const
+{
+    return vld1q_s32(v);
+}
+
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
+{
+    return vld1q_s32(v.ptr);
+}
+
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
+{
+    return vld1q_s32(reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline ComplementExpr<Simd4i>::operator Simd4i() const
+{
+    return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement)
+{
+    return vbicq_u32(v.u4, complement.v.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1)
+{
+    return vceqq_u32(v0.u4, v1.u4);
+}
+
+Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1)
+{
+    return vcltq_s32(v0.i4, v1.i4);
+}
+
+Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1)
+{
+    return vcgtq_s32(v0.i4, v1.i4);
+}
+
+ComplementExpr<Simd4i> operator~(const Simd4i& v)
+{
+    return ComplementExpr<Simd4i>(v);
+}
+
+Simd4i operator&(const Simd4i& v0, const Simd4i& v1)
+{
+    return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator|(const Simd4i& v0, const Simd4i& v1)
+{
+    return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator^(const Simd4i& v0, const Simd4i& v1)
+{
+    return veorq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator<<(const Simd4i& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4i operator>>(const Simd4i& v, int shift)
+{
+    return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+Simd4i operator<<(const Simd4i& v, const Simd4i& shift)
+{
+    return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4i operator>>(const Simd4i& v, const Simd4i& shift)
+{
+    return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1)
+{
+    return vaddq_u32(v0.u4, v1.u4);
+}
+
+Simd4i simdi::operator-(const Simd4i& v)
+{
+    return vnegq_s32(v.i4);
+}
+
+Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1)
+{
+    return vsubq_u32(v0.u4, v1.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4i simd4i(const Simd4f& v)
+{
+    return v.u4;
+}
+
+int (&simdi::array(Simd4i& v))[4]
+{
+    return (int(&)[4])v;
+}
+
+const int (&simdi::array(const Simd4i& v))[4]
+{
+    return (const int(&)[4])v;
+}
+
+void store(int* ptr, const Simd4i& v)
+{
+    return vst1q_s32(ptr, v.i4);
+}
+
+void storeAligned(int* ptr, const Simd4i& v)
+{
+    vst1q_s32(ptr, v.i4);
+}
+
+void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+    return storeAligned(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+template <size_t i>
+Simd4i splat(Simd4i const& v)
+{
+    return vdupq_n_s32(simdi::array(v)[i]);
+}
+
+Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1)
+{
+    return vbslq_u32(mask.u4, v0.u4, v1.u4);
+}
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+    return allTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return allTrue(outMask = simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+    return anyTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return anyTrue(outMask = simdi::operator==(v0, v1));
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+    return allTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return allTrue(outMask = simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+    return anyTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+    return anyTrue(outMask = simdi::operator>(v0, v1));
+}
+
+int allTrue(const Simd4i& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vand.u32 d0, d0, d1 \n\t"
+                 "vpmin.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.u4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+int anyTrue(const Simd4i& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+    int result;
+    asm volatile("vmovq q0, %q1 \n\t"
+                 "vorr.u32 d0, d0, d1 \n\t"
+                 "vpmax.u32 d0, d0, d0 \n\t"
+                 "vcmp.f32 s0, #0 \n\t"
+                 "fmrx %0, fpscr"
+                 : "=r"(result)
+                 : "w"(v.u4)
+                 : "q0");
+    return result >> 28 & 0x1;
+#else
+    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+    uint16x4_t lo = vmovn_u32(v.u4);
+    uint16x8_t combined = vcombine_u16(lo, hi);
+    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+    return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
new file mode 100644
index 00000000..542fac08
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <arm_neon.h>
+
+union Simd4f
+{
+    Simd4f()
+    {
+    }
+    Simd4f(const float32x4_t& v) : f4(v)
+    {
+    }
+#ifndef _M_ARM // all *32x4_t map to the same type
+    Simd4f(const uint32x4_t& v) : u4(v)
+    {
+    }
+#endif
+    float32x4_t f4;
+    uint32x4_t u4;
+    int32x4_t i4;
+};
+
+union Simd4i
+{
+    Simd4i()
+    {
+    }
+    Simd4i(const uint32x4_t& v) : u4(v)
+    {
+    }
+#ifndef _M_ARM // all *32x4_t map to the same type
+    Simd4i(const int32x4_t& v) : i4(v)
+    {
+    }
+#endif
+    uint32x4_t u4;
+    int32x4_t i4;
+};
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
new file mode 100644
index 00000000..b67f96aa
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifdef _M_ARM
+#include <arm_neon.h>
+#endif
+
+namespace nvidia
+{
+namespace cloth
+{
+
+uint32_t findBitSet(uint32_t mask)
+{
+#ifdef _M_ARM
+    __n64 t = { mask };
+    return 31 - (vclz_u32(t)).n64_u32[0];
+#else
+    return 31 - __builtin_clz(mask);
+#endif
+}
+
+Simd4i intFloor(const Simd4f& v)
+{
+    int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
+    return vsubq_s32(vcvtq_s32_f32(v.f4), neg);
+}
+
+Simd4i horizontalOr(Simd4i mask)
+{
+    using namespace simdi;
+    uint32x2_t hi = vget_high_u32(mask.u4);
+    uint32x2_t lo = vget_low_u32(mask.u4);
+    uint32x2_t tmp = vorr_u32(lo, hi);
+    uint32x2_t rev = vrev64_u32(tmp);
+    uint32x2_t res = vorr_u32(tmp, rev);
+    return vcombine_u32(res, res);
+}
+
+Gather<Simd4i>::Gather(const Simd4i& index)
+{
+#ifdef __arm64__
+    using namespace simdi;
+    PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
+    uint8x16_t lohiIndex = reinterpret_cast<const uint8x16_t&>(index);
+    byteIndex.val[0] = vtbl1q_u8(lohiIndex, byteIndex.val[0]);
+    byteIndex.val[1] = vtbl1q_u8(lohiIndex, byteIndex.val[1]);
+    mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
+    mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
+#else
+    using namespace simdi;
+    PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
+    uint8x8x2_t lohiIndex = reinterpret_cast<const uint8x8x2_t&>(index);
+    byteIndex.val[0] = vtbl2_u8(lohiIndex, byteIndex.val[0]);
+    byteIndex.val[1] = vtbl2_u8(lohiIndex, byteIndex.val[1]);
+    mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
+    mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
+#endif
+}
+
+Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
+{
+#ifdef __arm64__
+    PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
+    const uint8x16x2_t* table = reinterpret_cast<const uint8x16x2_t*>(ptr);
+    result.val[0] = vtbl2q_u8(*table, result.val[0]);
+    result.val[1] = vtbl2q_u8(*table, result.val[1]);
+    return reinterpret_cast<const Simd4i&>(result);
+#else
+    PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
+    const uint8x8x4_t* table = reinterpret_cast<const uint8x8x4_t*>(ptr);
+    result.val[0] = vtbl4_u8(*table, result.val[0]);
+    result.val[1] = vtbl4_u8(*table, result.val[1]);
+    return reinterpret_cast<const Simd4i&>(result);
+#endif
+}
+
+} // namespace cloth
+} // namespace physx
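The expression templates in Simd4f.h defer multiplication so that an expression such as `a * b + c` is emitted as a single NEON multiply-accumulate (vmlaq_f32), as the in-code comments note. The sketch below is illustrative only and not part of this commit: it shows the equivalent plain-intrinsics pattern the wrapper is designed to produce, with made-up variable names and values, and it assumes a toolchain that defines __ARM_NEON__ (matching the #error guards above).

```cpp
// Standalone illustration (not from the diff): fused multiply-add with NEON
// intrinsics, mirroring what ProductExpr + Simd4f compiles down to.
#include <arm_neon.h>
#include <cstdio>

int main()
{
    const float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };   // hypothetical inputs
    const float b[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
    const float c[4] = { 10.0f, 10.0f, 10.0f, 10.0f };

    float32x4_t va = vld1q_f32(a); // same load pattern as the Simd4f factories
    float32x4_t vb = vld1q_f32(b);
    float32x4_t vc = vld1q_f32(c);

    // a * b + c fused into one multiply-accumulate: vmlaq_f32(acc, x, y) = acc + x*y
    float32x4_t r = vmlaq_f32(vc, va, vb);

    float out[4];
    vst1q_f32(out, r);             // same store pattern as store() in Simd4f.h
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
    return 0;
}
```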