summaryrefslogtreecommitdiff
path: root/src/simd
diff options
context:
space:
mode:
authorJason Maskell <[email protected]>2016-05-09 10:39:54 +0200
committerJason Maskell <[email protected]>2016-05-09 10:39:54 +0200
commit79b3462799c28af8ba586349bd671b1b56e72353 (patch)
tree3b06e36c390254c0dc7f3733a0d32af213d87293 /src/simd
downloadwaveworks_archive-79b3462799c28af8ba586349bd671b1b56e72353.tar.xz
waveworks_archive-79b3462799c28af8ba586349bd671b1b56e72353.zip
Initial commit with PS4 and XBone stuff trimmed.
Diffstat (limited to 'src/simd')
-rw-r--r--src/simd/Simd4f.h517
-rw-r--r--src/simd/Simd4i.h387
-rw-r--r--src/simd/SimdTypes.h169
-rw-r--r--src/simd/neon/Simd4f.h553
-rw-r--r--src/simd/neon/Simd4i.h297
-rw-r--r--src/simd/neon/SimdTypes.h67
-rw-r--r--src/simd/ps3/Simd4f.h497
-rw-r--r--src/simd/ps3/Simd4i.h279
-rw-r--r--src/simd/ps3/SimdTypes.h64
-rw-r--r--src/simd/scalar/Simd4f.h462
-rw-r--r--src/simd/scalar/Simd4i.h209
-rw-r--r--src/simd/scalar/SimdTypes.h107
-rw-r--r--src/simd/sse2/Simd4f.h452
-rw-r--r--src/simd/sse2/Simd4i.h259
-rw-r--r--src/simd/sse2/SimdTypes.h86
-rw-r--r--src/simd/xbox360/Simd4f.h497
-rw-r--r--src/simd/xbox360/Simd4i.h206
-rw-r--r--src/simd/xbox360/SimdTypes.h35
18 files changed, 5143 insertions, 0 deletions
diff --git a/src/simd/Simd4f.h b/src/simd/Simd4f.h
new file mode 100644
index 0000000..9b352a6
--- /dev/null
+++ b/src/simd/Simd4f.h
@@ -0,0 +1,517 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "SimdTypes.h"
+
+#if NVMATH_FUSE_MULTIPLY_ADD
+
+/*! \brief Expression template to fuse multiply-adds.
+ * \relates Simd4f */
+struct ProductExpr
+{
+ inline ProductExpr(Simd4f const& v0_, Simd4f const& v1_) : v0(v0_), v1(v1_)
+ {
+ }
+ inline operator Simd4f() const;
+ const Simd4f v0, v1;
+
+ private:
+ ProductExpr& operator=(const ProductExpr&); // not implemented
+};
+
+inline Simd4f operator+(const ProductExpr&, const Simd4f&);
+inline Simd4f operator+(const Simd4f& v, const ProductExpr&);
+inline Simd4f operator+(const ProductExpr&, const ProductExpr&);
+inline Simd4f operator-(const Simd4f& v, const ProductExpr&);
+inline Simd4f operator-(const ProductExpr&, const ProductExpr&);
+
+#else // NVMATH_FUSE_MULTIPLY_ADD
+typedef Simd4f ProductExpr;
+#endif // NVMATH_FUSE_MULTIPLY_ADD
+
+template <typename T>
+struct Simd4fFactory
+{
+ Simd4fFactory(T v_) : v(v_)
+ {
+ }
+ inline operator Simd4f() const;
+ inline operator Scalar4f() const;
+ Simd4fFactory& operator=(const Simd4fFactory&); // not implemented
+ T v;
+};
+
+template <>
+struct Simd4fFactory<detail::FourTuple>
+{
+ Simd4fFactory(float x, float y, float z, float w)
+ {
+ v[0] = x, v[1] = y, v[2] = z, v[3] = w;
+ }
+ Simd4fFactory(const Simd4fFactory<const float&>& f)
+ {
+ v[3] = v[2] = v[1] = v[0] = f.v;
+ }
+ inline operator Simd4f() const;
+ inline operator Scalar4f() const;
+ Simd4fFactory& operator=(const Simd4fFactory&); // not implemented
+ NVMATH_ALIGN(16, float) v[4];
+};
+
+template <int i>
+struct Simd4fFactory<detail::IntType<i> >
+{
+ inline operator Simd4f() const;
+ inline operator Scalar4f() const;
+};
+
+// forward declaration
+template <typename>
+struct Simd4iFactory;
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SIMD
+inline Simd4f operator&(const ComplementExpr<Simd4f>&, const Simd4f&);
+inline Simd4f operator&(const Simd4f&, const ComplementExpr<Simd4f>&);
+#endif
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operators
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// note: operator?= missing because they don't have corresponding intrinsics.
+
+/*! \brief Test for equality of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline Simd4f operator==(const Simd4f& v0, const Simd4f& v1);
+
+// no operator!= because VMX128 does not support it, use ~operator== and handle QNaNs
+
+/*! \brief Less-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator<(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Less-or-equal-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator<=(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Greater-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator>(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Greater-or-equal-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator>=(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector bit-wise NOT operator
+* \return A vector holding the bit-negate of \a v.
+* \relates Simd4f */
+inline ComplementExpr<Simd4f> operator~(const Simd4f& v);
+
+/*! \brief Vector bit-wise AND operator
+* \return A vector holding the bit-wise AND of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator&(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector bit-wise OR operator
+* \return A vector holding the bit-wise OR of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator|(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector bit-wise XOR operator
+* \return A vector holding the bit-wise XOR of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator^(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v, each shifted left by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator<<(const Simd4f& v, int shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v, each shifted right by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator>>(const Simd4f& v, int shift);
+
+#if NVMATH_SHIFT_BY_VECTOR
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v, each shifted left by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator<<(const Simd4f& v, const Simd4f& shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v, each shifted right by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator>>(const Simd4f& v, const Simd4f& shift);
+#endif
+
+/*! \brief Unary vector addition operator.
+* \return A vector holding the component-wise copy of \a v.
+* \relates Simd4f */
+inline Simd4f operator+(const Simd4f& v);
+
+/*! \brief Vector addition operator
+* \return A vector holding the component-wise sum of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator+(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Unary vector negation operator.
+* \return A vector holding the component-wise negation of \a v.
+* \relates Simd4f */
+inline Simd4f operator-(const Simd4f& v);
+
+/*! \brief Vector subtraction operator.
+* \return A vector holding the component-wise difference of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator-(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector multiplication.
+* \return Element-wise product of \a v0 and \a v1.
+* \note For VMX, returns expression template to fuse multiply-add.
+* \relates Simd4f */
+inline ProductExpr operator*(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector division.
+* \return Element-wise division of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator/(const Simd4f& v0, const Simd4f& v1);
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// functions
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/*! \brief Load float value into all vector components.
+* \relates Simd4f */
+inline Simd4fFactory<const float&> simd4f(const float& s)
+{
+ return Simd4fFactory<const float&>(s);
+}
+
+/*! \brief Load 4 float values into vector.
+* \relates Simd4f */
+inline Simd4fFactory<detail::FourTuple> simd4f(float x, float y, float z, float w)
+{
+ return Simd4fFactory<detail::FourTuple>(x, y, z, w);
+}
+
+/*! \brief Create vector from literal.
+* \return Vector with all elements set to i.
+* \relates Simd4f */
+template <int i>
+inline Simd4fFactory<detail::IntType<i> > simd4f(detail::IntType<i> const&)
+{
+ return Simd4fFactory<detail::IntType<i> >();
+}
+
+/*! \brief Reinterpret Simd4i as Simd4f.
+* \return A copy of \a v, but reinterpreted as Simd4f.
+* \relates Simd4f */
+inline Simd4f simd4f(const Simd4i& v);
+
+/*! \brief Reinterpret Simd4iFactory as Simd4fFactory.
+* \relates Simd4f */
+template <typename T>
+inline Simd4fFactory<T> simd4f(const Simd4iFactory<T>& v)
+{
+ return reinterpret_cast<const Simd4fFactory<T>&>(v);
+}
+
+/*! \brief Convert Simd4i to Simd4f.
+* \relates Simd4f */
+inline Simd4f convert(const Simd4i& v);
+
+/*! \brief return reference to contiguous array of vector elements
+* \relates Simd4f */
+inline float (&array(Simd4f& v))[4];
+
+/*! \brief return constant reference to contiguous array of vector elements
+* \relates Simd4f */
+inline const float (&array(const Simd4f& v))[4];
+
+/*! \brief Create vector from float array.
+* \relates Simd4f */
+inline Simd4fFactory<const float*> load(const float* ptr)
+{
+ return ptr;
+}
+
+/*! \brief Create vector from aligned float array.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4f */
+inline Simd4fFactory<detail::AlignedPointer<float> > loadAligned(const float* ptr)
+{
+ return detail::AlignedPointer<float>(ptr);
+}
+
+/*! \brief Create vector from aligned float array.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4f */
+inline Simd4fFactory<detail::OffsetPointer<float> > loadAligned(const float* ptr, unsigned int offset)
+{
+ return detail::OffsetPointer<float>(ptr, offset);
+}
+
+/*! \brief Store vector \a v to float array \a ptr.
+* \relates Simd4f */
+inline void store(float* ptr, Simd4f const& v);
+
+/*! \brief Store vector \a v to aligned float array \a ptr.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4f */
+inline void storeAligned(float* ptr, Simd4f const& v);
+
+/*! \brief Store vector \a v to aligned float array \a ptr.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4f */
+inline void storeAligned(float* ptr, unsigned int offset, Simd4f const& v);
+
+/*! \brief replicate i-th component into all vector components.
+* \return Vector with all elements set to \a v[i].
+* \relates Simd4f */
+template <size_t i>
+inline Simd4f splat(Simd4f const& v);
+
+/*! \brief Select \a v0 or \a v1 based on \a mask.
+* \return mask ? v0 : v1
+* \relates Simd4f */
+inline Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1);
+
+/*! \brief Per element absolute value.
+* \return Vector with absolute values of \a v.
+* \relates Simd4f */
+inline Simd4f abs(const Simd4f& v);
+
+/*! \brief Per element floor value.
+* \note Result undefined for QNaN elements.
+* \note Translates to 6 instructions on SSE and NEON.
+* \relates Simd4f */
+inline Simd4f floor(const Simd4f& v);
+
+/*! \brief Per-component maximum of two vectors
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f max(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Per-component minimum of two vectors
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f min(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Return reciprocal estimate of a vector.
+* \return Vector of per-element reciprocal estimate.
+* \relates Simd4f */
+inline Simd4f recip(const Simd4f& v);
+
+/*! \brief Return reciprocal of a vector.
+* \return Vector of per-element reciprocal.
+* \note Performs \a n Newton-Raphson iterations on initial estimate.
+* \relates Simd4f */
+template <int n>
+inline Simd4f recip(const Simd4f& v);
+
+/*! \brief Return square root of a vector.
+* \return Vector of per-element square root.
+* \note The behavior is undefined for negative elements.
+* \relates Simd4f */
+inline Simd4f sqrt(const Simd4f& v);
+
+/*! \brief Return inverse square root estimate of a vector.
+* \return Vector of per-element inverse square root estimate.
+* \note The behavior is undefined for negative, zero, and infinity elements.
+* \relates Simd4f */
+inline Simd4f rsqrt(const Simd4f& v);
+
+/*! \brief Return inverse square root of a vector.
+* \return Vector of per-element inverse square root.
+* \note Performs \a n Newton-Raphson iterations on initial estimate.
+* \note The behavior is undefined for negative and infinity elements.
+* \relates Simd4f */
+template <int n>
+inline Simd4f rsqrt(const Simd4f& v);
+
+/*! \brief Return 2 raised to the power of v.
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f exp2(const Simd4f& v);
+
+#if NVMATH_SIMD
+namespace simdf
+{
+// The PSP2 compiler has trouble resolving calls to exp2 directly; forwarding through this wrapper works around it.
+inline Simd4f exp2(const Simd4f& v)
+{
+ return ::exp2(v);
+}
+}
+#endif
+
+/*! \brief Return logarithm of v to base 2.
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f log2(const Simd4f& v);
+
+/*! \brief Return dot product of two 3-vectors.
+* \note The result is replicated across all 4 components.
+* \relates Simd4f */
+inline Simd4f dot3(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Return cross product of two 3-vectors.
+* \note The 4th component is undefined.
+* \relates Simd4f */
+inline Simd4f cross3(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Transposes 4x4 matrix represented by \a x, \a y, \a z, and \a w.
+* \relates Simd4f */
+inline void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w);
+
+/*! \brief Interleave elements.
+* \a v0 becomes {x0, x1, y0, y1}, v1 becomes {z0, z1, w0, w1}.
+* \relates Simd4f */
+inline void zip(Simd4f& v0, Simd4f& v1);
+
+/*! \brief De-interleave elements.
+* \a v0 becomes {x0, z0, x1, z1}, v1 becomes {y0, w0, y1, w1}.
+* \relates Simd4f */
+inline void unzip(Simd4f& v0, Simd4f& v1);
+
+/*! \brief Swaps quad words.
+* Returns {z0, w0, x0, y0}
+* \relates Simd4f */
+inline Simd4f swaphilo(const Simd4f& v);
+
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int allEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int anyEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater than those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreater(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreater(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater than or equal to those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater than or equal to those of \a v1
+* \param outMask holds the result of \a v0 >= \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than or equal to those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than or equal to those of \a v1
+* \param outMask holds the result of \a v0 >= \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if all elements are true
+* \note Undefined if parameter is not result of a comparison.
+* \relates Simd4f */
+inline int allTrue(const Simd4f& v);
+
+/*! \brief returns non-zero if any element is true
+* \note Undefined if parameter is not result of a comparison.
+* \relates Simd4f */
+inline int anyTrue(const Simd4f& v);
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// platform specific includes
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SSE2
+#include "sse2/Simd4f.h"
+#elif NVMATH_VMX128
+#include "xbox360/Simd4f.h"
+#elif NVMATH_ALTIVEC
+#include "ps3/Simd4f.h"
+#elif NVMATH_NEON
+#include "neon/Simd4f.h"
+#endif
+
+#if NVMATH_SCALAR
+#include "scalar/Simd4f.h"
+#endif
diff --git a/src/simd/Simd4i.h b/src/simd/Simd4i.h
new file mode 100644
index 0000000..803c8e5
--- /dev/null
+++ b/src/simd/Simd4i.h
@@ -0,0 +1,387 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "SimdTypes.h"
+
+template <typename T>
+struct Simd4iFactory
+{
+ Simd4iFactory(T v_) : v(v_)
+ {
+ }
+ inline operator Simd4i() const;
+ inline operator Scalar4i() const;
+ Simd4iFactory& operator=(const Simd4iFactory&); // not implemented
+ T v;
+};
+
+template <>
+struct Simd4iFactory<detail::FourTuple>
+{
+ Simd4iFactory(int x, int y, int z, int w)
+ {
+ v[0] = x, v[1] = y, v[2] = z, v[3] = w;
+ }
+ Simd4iFactory(const Simd4iFactory<const int&>& f)
+ {
+ v[3] = v[2] = v[1] = v[0] = f.v;
+ }
+ inline operator Simd4i() const;
+ inline operator Scalar4i() const;
+ Simd4iFactory& operator=(const Simd4iFactory&); // not implemented
+ NVMATH_ALIGN(16, int) v[4];
+};
+
+template <int i>
+struct Simd4iFactory<detail::IntType<i> >
+{
+ inline operator Simd4i() const;
+ inline operator Scalar4i() const;
+};
+
+// forward declaration
+template <typename>
+struct Simd4fFactory;
+
+// map Simd4f/Scalar4f to Simd4i/Scalar4i
+template <typename>
+struct Simd4fToSimd4i;
+template <>
+struct Simd4fToSimd4i<Simd4f>
+{
+ typedef Simd4i Type;
+};
+template <>
+struct Simd4fToSimd4i<Scalar4f>
+{
+ typedef Scalar4i Type;
+};
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_DISTINCT_TYPES
+inline Simd4i operator&(const ComplementExpr<Simd4i>&, const Simd4i&);
+inline Simd4i operator&(const Simd4i&, const ComplementExpr<Simd4i>&);
+#endif
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operators
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_DISTINCT_TYPES
+
+/*! \brief Vector bit-wise NOT operator
+* \return A vector holding the bit-negate of \a v.
+* \relates Simd4i */
+inline ComplementExpr<Simd4i> operator~(const Simd4i& v);
+
+/*! \brief Vector bit-wise AND operator
+* \return A vector holding the bit-wise AND of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator&(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector bit-wise OR operator
+* \return A vector holding the bit-wise OR of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator|(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector bit-wise XOR operator
+* \return A vector holding the bit-wise XOR of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator^(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v, each shifted left by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator<<(const Simd4i& v, int shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v, each shifted right by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator>>(const Simd4i& v, int shift);
+
+#if NVMATH_SHIFT_BY_VECTOR
+
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v, each shifted left by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator<<(const Simd4i& v, const Simd4i& shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v, each shifted right by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator>>(const Simd4i& v, const Simd4i& shift);
+
+#endif // NVMATH_SHIFT_BY_VECTOR
+
+#endif // NVMATH_DISTINCT_TYPES
+
+namespace simdi // disambiguate for VMX
+{
+// note: operator?= missing because they don't have corresponding intrinsics.
+
+/*! \brief Test for equality of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \relates Simd4i */
+inline Simd4i operator==(const Simd4i& v0, const Simd4i& v1);
+
+// no !=, <=, >= because VMX128/SSE don't support it, use ~equal etc.
+
+/*! \brief Less-compare all elements of two *signed* vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \relates Simd4i */
+inline Simd4i operator<(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Greater-compare all elements of two *signed* vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \relates Simd4i */
+inline Simd4i operator>(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector addition operator
+* \return A vector holding the component-wise sum of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator+(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Unary vector negation operator.
+* \return A vector holding the component-wise negation of \a v.
+* \relates Simd4i */
+inline Simd4i operator-(const Simd4i& v);
+
+/*! \brief Vector subtraction operator.
+* \return A vector holding the component-wise difference of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator-(const Simd4i& v0, const Simd4i& v1);
+
+} // namespace simdi
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// functions
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/*! \brief Load int value into all vector components.
+* \relates Simd4i */
+inline Simd4iFactory<const int&> simd4i(const int& s)
+{
+ return Simd4iFactory<const int&>(s);
+}
+
+/*! \brief Load 4 int values into vector.
+* \relates Simd4i */
+inline Simd4iFactory<detail::FourTuple> simd4i(int x, int y, int z, int w)
+{
+ return Simd4iFactory<detail::FourTuple>(x, y, z, w);
+}
+
+/*! \brief Create vector from literal.
+* \return Vector with all elements set to \c i.
+* \relates Simd4i */
+template <int i>
+inline Simd4iFactory<detail::IntType<i> > simd4i(const detail::IntType<i>&)
+{
+ return Simd4iFactory<detail::IntType<i> >();
+}
+
+template <>
+inline Simd4iFactory<detail::IntType<1> > simd4i(const detail::IntType<1>&)
+{
+ return Simd4iFactory<detail::IntType<1> >();
+}
+
+template <>
+inline Simd4iFactory<detail::IntType<int(0x80000000)> > simd4i(const detail::IntType<int(0x80000000)>&)
+{
+ return Simd4iFactory<detail::IntType<int(0x80000000)> >();
+}
+
+template <>
+inline Simd4iFactory<detail::IntType<-1> > simd4i(const detail::IntType<-1>&)
+{
+ return Simd4iFactory<detail::IntType<-1> >();
+}
+
+/*! \brief Reinterpret Simd4f as Simd4i.
+* \return A copy of \a v, but reinterpreted as Simd4i.
+* \relates Simd4i */
+inline Simd4i simd4i(const Simd4f& v);
+
+/*! \brief Reinterpret Simd4fFactory as Simd4iFactory.
+* \relates Simd4i */
+template <typename T>
+inline Simd4iFactory<T> simd4i(const Simd4fFactory<T>& v)
+{
+ return reinterpret_cast<const Simd4iFactory<T>&>(v);
+}
+
+/*! \brief Truncate Simd4f to Simd4i.
+* \relates Simd4i */
+inline Simd4i truncate(const Simd4f& v);
+
+namespace simdi
+{
+
+/*! \brief return reference to contiguous array of vector elements
+* \relates Simd4i */
+inline int (&array(Simd4i& v))[4];
+
+/*! \brief return constant reference to contiguous array of vector elements
+* \relates Simd4i */
+inline const int (&array(const Simd4i& v))[4];
+
+} // namespace simdi
+
+/*! \brief Create vector from int array.
+* \relates Simd4i */
+inline Simd4iFactory<const int*> load(const int* ptr)
+{
+ return ptr;
+}
+
+/*! \brief Create vector from aligned int array.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4i */
+inline Simd4iFactory<detail::AlignedPointer<int> > loadAligned(const int* ptr)
+{
+ return detail::AlignedPointer<int>(ptr);
+}
+
+/*! \brief Create vector from aligned int array.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4i */
+inline Simd4iFactory<detail::OffsetPointer<int> > loadAligned(const int* ptr, unsigned int offset)
+{
+ return detail::OffsetPointer<int>(ptr, offset);
+}
+
+/*! \brief Store vector \a v to int array \a ptr.
+* \relates Simd4i */
+inline void store(int* ptr, const Simd4i& v);
+
+/*! \brief Store vector \a v to aligned int array \a ptr.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4i */
+inline void storeAligned(int* ptr, const Simd4i& v);
+
+/*! \brief Store vector \a v to aligned int array \a ptr.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4i */
+inline void storeAligned(int* ptr, unsigned int offset, const Simd4i& v);
+
+#if NVMATH_DISTINCT_TYPES
+
+/*! \brief replicate i-th component into all vector components.
+* \return Vector with all elements set to \a v[i].
+* \relates Simd4i */
+template <size_t i>
+inline Simd4i splat(const Simd4i& v);
+
+/*! \brief Select \a v0 or \a v1 based on \a mask.
+* \return mask ? v0 : v1
+* \relates Simd4i */
+inline Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1);
+
+#endif // NVMATH_DISTINCT_TYPES
+
+namespace simdi // disambiguate for VMX
+{
+
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \relates Simd4i */
+inline int allEqual(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \relates Simd4i */
+inline int allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \relates Simd4i */
+inline int anyEqual(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \relates Simd4i */
+inline int anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+
+/*! \brief returns non-zero if all *signed* elements of \a v0 are greater than those of \a v1
+* \relates Simd4i */
+inline int allGreater(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if all *signed* elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \relates Simd4i */
+inline int allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \relates Simd4i */
+inline int anyGreater(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \relates Simd4i */
+inline int anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+
+} // namespace simdi
+
+#if NVMATH_DISTINCT_TYPES
+
+/*! \brief returns non-zero if all elements are true
+* \note undefined if parameter is not result of a comparison.
+* \relates Simd4i */
+inline int allTrue(const Simd4i& v);
+
+/*! \brief returns non-zero if any element is true
+* \note undefined if parameter is not result of a comparison.
+* \relates Simd4i */
+inline int anyTrue(const Simd4i& v);
+
+#endif // NVMATH_DISTINCT_TYPES
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// platform specific includes
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SSE2
+#include "sse2/Simd4i.h"
+#elif NVMATH_VMX128
+#include "xbox360/Simd4i.h"
+#elif NVMATH_ALTIVEC
+#include "ps3/Simd4i.h"
+#elif NVMATH_NEON
+#include "neon/Simd4i.h"
+#endif
+
+#if NVMATH_SCALAR
+#include "scalar/Simd4i.h"
+#endif
diff --git a/src/simd/SimdTypes.h b/src/simd/SimdTypes.h
new file mode 100644
index 0000000..225400c
--- /dev/null
+++ b/src/simd/SimdTypes.h
@@ -0,0 +1,169 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <cmath>
+
+// ps4 compiler defines _M_X64 without value
+#if ((defined _M_IX86) || (defined _M_X64) || (defined __i386__) || (defined __x86_64__)) && !defined(__ANDROID__)
+#define NVMATH_SSE2 1
+#else
+#define NVMATH_SSE2 0
+#endif
+// NOTE(review): using `defined` inside a macro replacement list is undefined
+// behavior per [cpp.cond]/10 when the macro is later tested with #if. Common
+// compilers accept it (clang warns under -Wexpansion-to-defined), but the
+// #if/#else pattern used for NVMATH_SSE2 above would be the portable form.
+// Same applies to NVMATH_INLINE_ASSEMBLER below.
+#define NVMATH_VMX128 (defined _M_PPC)
+#define NVMATH_ALTIVEC (defined __CELLOS_LV2__)
+#define NVMATH_NEON (defined _M_ARM || defined __ARM_NEON__)
+
+// which simd types are implemented (one or both are all valid options)
+#define NVMATH_SIMD (NVMATH_SSE2 || NVMATH_VMX128 || NVMATH_ALTIVEC || NVMATH_NEON)
+#define NVMATH_SCALAR !NVMATH_SIMD
+// #define NVMATH_SCALAR 1
+
+// compiler-specific alignment attribute for type/variable declarations
+#ifdef _MSC_VER
+#define NVMATH_ALIGN(alignment, decl) __declspec(align(alignment)) decl
+#else
+#define NVMATH_ALIGN(alignment, decl) decl __attribute__ ((aligned(alignment)))
+#endif
+
+// windows.h may have defined min/max as macros, which would break the
+// min/max functions declared by this library
+#ifdef min
+#undef min
+#endif
+#ifdef max
+#undef max
+#endif
+
+// use template expression to fuse multiply-adds into a single instruction
+#define NVMATH_FUSE_MULTIPLY_ADD (NVMATH_VMX128 || NVMATH_ALTIVEC || NVMATH_NEON)
+// support shift by vector operations
+#define NVMATH_SHIFT_BY_VECTOR (NVMATH_VMX128 || NVMATH_ALTIVEC || NVMATH_NEON)
+// Simd4f and Simd4i map to different types
+#define NVMATH_DISTINCT_TYPES (NVMATH_SSE2 || NVMATH_ALTIVEC || NVMATH_NEON)
+// support inline assembler
+#define NVMATH_INLINE_ASSEMBLER !((defined _M_ARM) || (defined SN_TARGET_PSP2) || (defined __arm64__))
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/*! \brief Expression template to fuse and-not. */
+// operator~ returns this wrapper instead of a materialized complement, so
+// (~a & b) and (a & ~b) can be lowered to a single and-not instruction.
+template <typename T>
+struct ComplementExpr
+{
+	inline ComplementExpr(T const& v_) : v(v_)
+	{
+	}
+	// evaluates ~v when the expression is consumed as a plain value
+	inline operator T() const;
+	const T v;
+
+  private:
+	ComplementExpr& operator=(const ComplementExpr&); // not implemented
+};
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// helper functions
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/*! \brief returns x*x for any type providing operator* */
+template <typename T>
+T sqr(const T& x)
+{
+	return x * x;
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// details
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+namespace detail
+{
+// factory tag: load from a pointer the caller guarantees to be aligned
+template <typename T>
+struct AlignedPointer
+{
+	AlignedPointer(const T* p) : ptr(p)
+	{
+	}
+	const T* ptr;
+};
+
+// factory tag: load from (char*)ptr + offset, offset given in bytes
+template <typename T>
+struct OffsetPointer
+{
+	OffsetPointer(const T* p, unsigned int off) : ptr(p), offset(off)
+	{
+	}
+	const T* ptr;
+	unsigned int offset;
+};
+
+// factory tag: construct a vector from four scalar components
+struct FourTuple
+{
+};
+
+// zero and one literals (also used for the _sign and _true tags below)
+template <int i>
+struct IntType
+{
+};
+}
+
+// Suppress unused-variable warnings for the literal tags below
+#if defined(__GNUC__) || defined(__SNC__)
+#define NVMATH_UNUSED __attribute__((unused))
+#else
+#define NVMATH_UNUSED
+#endif
+
+// tag values used to construct common constants: zero, one (1.0f for Simd4f),
+// sign bit (0x80000000) and all-bits-set ("true" comparison mask)
+static detail::IntType<0> _0 NVMATH_UNUSED;
+static detail::IntType<1> _1 NVMATH_UNUSED;
+static detail::IntType<int(0x80000000)> _sign NVMATH_UNUSED;
+static detail::IntType<-1> _true NVMATH_UNUSED;
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// platform specific includes
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SSE2
+#include "sse2/SimdTypes.h"
+#elif NVMATH_VMX128
+#include "xbox360/SimdTypes.h"
+#elif NVMATH_ALTIVEC
+#include "ps3/SimdTypes.h"
+#elif NVMATH_NEON
+#include "neon/SimdTypes.h"
+#else
+struct Simd4f;
+struct Simd4i;
+#endif
+
+#if NVMATH_SCALAR
+#include "scalar/SimdTypes.h"
+#else
+struct Scalar4f;
+struct Scalar4i;
+#endif
diff --git a/src/simd/neon/Simd4f.h b/src/simd/neon/Simd4f.h
new file mode 100644
index 0000000..a43fd32
--- /dev/null
+++ b/src/simd/neon/Simd4f.h
@@ -0,0 +1,553 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// broadcast a single float into all four lanes
+template <>
+inline Simd4fFactory<const float&>::operator Simd4f() const
+{
+	return vdupq_n_f32(reinterpret_cast<const float32_t&>(v));
+}
+
+// reinterpret four contiguous floats as a vector (relies on FourTuple storage layout)
+inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
+{
+	return reinterpret_cast<const Simd4f&>(v);
+}
+
+// integer literal tag: broadcast the raw bit pattern i (used by _0, _sign, _true)
+template <int i>
+inline Simd4fFactory<detail::IntType<i> >::operator Simd4f() const
+{
+	return vdupq_n_u32(i);
+}
+
+// _1 is special-cased to the float value 1.0f, not the bit pattern 0x1
+template <>
+inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
+{
+	return vdupq_n_f32(1.0f);
+}
+
+// load four floats from a (possibly unaligned) pointer
+template <>
+inline Simd4fFactory<const float*>::operator Simd4f() const
+{
+	return vld1q_f32((const float32_t*)v);
+}
+
+// load four floats from an aligned pointer (same vld1q instruction on NEON)
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
+{
+	return vld1q_f32((const float32_t*)v.ptr);
+}
+
+// load four floats from ptr advanced by a byte offset
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
+{
+	return vld1q_f32(reinterpret_cast<const float32_t*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression templates
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// materialize ~v: vbicq_u32(x, y) computes x & ~y, so bic(all-ones, v) == ~v
+template <>
+inline ComplementExpr<Simd4f>::operator Simd4f() const
+{
+	return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+// ~a & b fused into a single bic instruction
+Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+	return vbicq_u32(v.u4, complement.v.u4);
+}
+
+// a & ~b fused into a single bic instruction
+Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+	return vbicq_u32(v.u4, complement.v.u4);
+}
+
+// plain multiply, used when the product is not consumed by a fusing +/-
+ProductExpr::operator Simd4f() const
+{
+	return vmulq_f32(v0.f4, v1.f4);
+}
+
+// fused multiply-add: v + p.v0*p.v1 in one vmla
+Simd4f operator+(const ProductExpr& p, const Simd4f& v)
+{
+	return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const Simd4f& v, const ProductExpr& p)
+{
+	return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
+{
+	// cast calls operator Simd4f() which evaluates the other ProductExpr
+	return vmlaq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+// fused multiply-subtract: v - p.v0*p.v1 in one vmls
+Simd4f operator-(const Simd4f& v, const ProductExpr& p)
+{
+	return vmlsq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
+{
+	// cast calls operator Simd4f() which evaluates the other ProductExpr
+	return vmlsq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// comparisons return a per-lane mask: all-ones where true, zero where false
+Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+	return vceqq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+	return vcltq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+	return vcleq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+	return vcgtq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+	return vcgeq_f32(v0.f4, v1.f4);
+}
+
+// lazily evaluated complement; see ComplementExpr
+ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+	return ComplementExpr<Simd4f>(v);
+}
+
+// bitwise ops operate on the raw u32 view of the lanes
+Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+	return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+	return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+	return veorq_u32(v0.u4, v1.u4);
+}
+
+// logical shifts of the raw bits; vshlq with a negative count shifts right
+Simd4f operator<<(const Simd4f& v, int shift)
+{
+	return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4f operator>>(const Simd4f& v, int shift)
+{
+	return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+// per-lane shift counts taken from the i4 view of 'shift'
+Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
+{
+	return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
+{
+	return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4f operator+(const Simd4f& v)
+{
+	return v;
+}
+
+Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+	return vaddq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator-(const Simd4f& v)
+{
+	return vnegq_f32(v.f4);
+}
+
+Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+	return vsubq_f32(v0.f4, v1.f4);
+}
+
+// returns an expression template so a following +/- can fuse into vmla/vmls
+ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
+{
+	return ProductExpr(v0, v1);
+}
+
+Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+	// NOTE(review): vrecpeq_f32 is only a coarse reciprocal *estimate* and no
+	// Newton-Raphson refinement is applied here; use v0 * recip<n>(v1) when the
+	// quotient's precision matters.
+	return v0 * vrecpeq_f32(v1.f4); // reciprocal estimate
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// bitwise reinterpretation of an integer vector (no numeric conversion)
+Simd4f simd4f(const Simd4i& v)
+{
+	return v.u4;
+}
+
+// numeric int -> float conversion of each lane
+Simd4f convert(const Simd4i& v)
+{
+	return vcvtq_f32_s32(v.i4);
+}
+
+// expose the vector as a float[4] reference for per-lane access
+float (&array(Simd4f& v))[4]
+{
+	return (float(&)[4])v;
+}
+
+const float (&array(const Simd4f& v))[4]
+{
+	return (const float(&)[4])v;
+}
+
+// NEON vst1q has no alignment distinction, so store and storeAligned coincide
+void store(float* ptr, Simd4f const& v)
+{
+	return vst1q_f32((float32_t*)ptr, v.f4);
+}
+
+void storeAligned(float* ptr, Simd4f const& v)
+{
+	return vst1q_f32((float32_t*)ptr, v.f4);
+}
+
+// store at (char*)ptr + offset, offset given in bytes
+void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+	return storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+// broadcast lane i into all four lanes
+template <size_t i>
+Simd4f splat(Simd4f const& v)
+{
+	return vdupq_n_f32(array(v)[i]);
+}
+
+// per-lane blend: picks v0 where the mask bits are set, v1 elsewhere
+Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+	return vbslq_f32(mask.u4, v0.f4, v1.f4);
+}
+
+Simd4f abs(const Simd4f& v)
+{
+	return vabsq_f32(v.f4);
+}
+
+// round toward negative infinity: truncate toward zero, then subtract 1 from
+// lanes where the truncated value ended up greater than v (negative non-integers)
+Simd4f floor(const Simd4f& v)
+{
+	int32x4_t i = vcvtq_s32_f32(v.f4);
+	int32x4_t s = vreinterpretq_s32_u32(vcgtq_f32(vcvtq_f32_s32(i), v.f4));
+	// s is an all-ones lane mask; the unsigned shift by 31 turns it into 0/1.
+	// NOTE(review): s is int32x4_t but is passed to vshrq_n_u32, and the u32
+	// shift result feeds vsubq_s32 - strict-typing toolchains may require
+	// vreinterpretq casts here; confirm against the target compilers. Results
+	// are also only meaningful while v fits the int32 range.
+	return vcvtq_f32_s32(vsubq_s32(i, vshrq_n_u32(s, 31)));
+}
+
+// per-lane maximum
+Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+	return vmaxq_f32(v0.f4, v1.f4);
+}
+
+// per-lane minimum
+Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+	return vminq_f32(v0.f4, v1.f4);
+}
+
+// reciprocal with one refinement step (recip<0> performs 0+1 iterations)
+Simd4f recip(const Simd4f& v)
+{
+	return recip<0>(v);
+}
+
+// reciprocal estimate refined with n+1 Newton-Raphson steps
+// (vrecpsq_f32 computes the 2 - v*x correction factor)
+template <int n>
+Simd4f recip(const Simd4f& v)
+{
+	Simd4f recipV = vrecpeq_f32(v.f4);
+	// n+1 newton iterations because initial approximation is crude
+	for(int i = 0; i <= n; ++i)
+		recipV = vrecpsq_f32(v.f4, recipV.f4) * recipV;
+	return recipV;
+}
+
+// sqrt(v) = v * 1/sqrt(v); the select forces sqrt(0) == 0 instead of the
+// 0 * inf = NaN that the diverging rsqrt(0) estimate would produce
+Simd4f sqrt(const Simd4f& v)
+{
+	Simd4f r = v * rsqrt(v);
+	Simd4f zero = simd4f(0);
+	return select(vceqq_f32(zero.f4, v.f4), zero, r);
+}
+
+// reciprocal square root with one refinement step (rsqrt<0>)
+Simd4f rsqrt(const Simd4f& v)
+{
+	return rsqrt<0>(v);
+}
+
+// rsqrt estimate refined with n+1 Newton-Raphson steps
+// (vrsqrtsq_f32 computes the (3 - a*b)/2 correction factor)
+template <int n>
+Simd4f rsqrt(const Simd4f& v)
+{
+	Simd4f rsqrtV = vrsqrteq_f32(v.f4);
+	// n+1 newton iterations because initial approximation is crude
+	for(int i = 0; i <= n; ++i)
+		rsqrtV = vrsqrtsq_f32(vmulq_f32(v.f4, rsqrtV.f4), rsqrtV.f4) * rsqrtV;
+	return rsqrtV;
+}
+
+// 2^v via a Cephes-style rational approximation of the fractional part and
+// direct IEEE-754 exponent-field construction for the integer part
+Simd4f exp2(const Simd4f& v)
+{
+	// http://www.netlib.org/cephes/
+
+	// clamp so the biased exponent built below stays in range
+	Simd4f limit = simd4f(127.4999f);
+	Simd4f x = min(max(-limit, v), limit);
+
+	// separate into integer and fractional part
+
+	Simd4f fx = x + simd4f(0.5f);
+	Simd4i ix = vsubq_s32(vcvtq_s32_f32(fx.f4), vreinterpretq_s32_u32(vshrq_n_u32(fx.u4, 31)));
+	fx = x - vcvtq_f32_s32(ix.i4);
+
+	// exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
+
+	Simd4f fx2 = fx * fx;
+
+	Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
+	                  fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
+	Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);
+
+	Simd4f exp2fx = px * recip(qx - px);
+	exp2fx = simd4f(_1) + exp2fx + exp2fx;
+
+	// exp2(ix): place ix + 127 into the exponent bits (shift into bits 23..30)
+
+	Simd4f exp2ix = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ix.i4, vdupq_n_s32(0x7f)), 23));
+
+	return exp2fx * exp2ix;
+}
+
+// scalar fallback: per-lane natural log scaled by 1/ln(2); not vectorized
+Simd4f log2(const Simd4f& v)
+{
+	Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
+	const float* ptr = array(v);
+	return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale;
+}
+
+// 3-component dot product of the xyz lanes; the scalar result is broadcast
+// into all four lanes (the inputs' w lanes never contribute)
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+	Simd4f tmp = v0 * v1;
+	return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+// 3-component cross product built from 64-bit half-register shuffles;
+// the w lane of the result is a don't-care value (not zeroed)
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+	float32x2_t x0_y0 = vget_low_f32(v0.f4);
+	float32x2_t z0_w0 = vget_high_f32(v0.f4);
+	float32x2_t x1_y1 = vget_low_f32(v1.f4);
+	float32x2_t z1_w1 = vget_high_f32(v1.f4);
+
+	// vext rotates two half-registers: (x,y),(z,w) -> (y,z)
+	float32x2_t y1_z1 = vext_f32(x1_y1, z1_w1, 1);
+	float32x2_t y0_z0 = vext_f32(x0_y0, z0_w0, 1);
+
+	float32x2_t z0x1_w0y1 = vmul_f32(z0_w0, x1_y1);
+	float32x2_t x0y1_y0z1 = vmul_f32(x0_y0, y1_z1);
+
+	// multiply-subtract yields the y/z/x components of the cross product
+	float32x2_t y2_w2 = vmls_f32(z0x1_w0y1, x0_y0, z1_w1);
+	float32x2_t z2_x2 = vmls_f32(x0y1_y0z1, y0_z0, x1_y1);
+	float32x2_t x2_y2 = vext_f32(z2_x2, y2_w2, 1);
+
+	return vcombine_f32(x2_y2, z2_x2);
+}
+
+// in-place 4x4 transpose of the matrix whose rows are x, y, z, w
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+#if NVMATH_INLINE_ASSEMBLER
+	asm volatile("vzip.f32 %q0, %q2 \n\t"
+	             "vzip.f32 %q1, %q3 \n\t"
+	             "vzip.f32 %q0, %q1 \n\t"
+	             "vzip.f32 %q2, %q3 \n\t"
+	             : "+w"(x.f4), "+w"(y.f4), "+w"(z.f4), "+w"(w.f4));
+#else
+	float32x4x2_t v0v1 = vzipq_f32(x.f4, z.f4);
+	float32x4x2_t v2v3 = vzipq_f32(y.f4, w.f4);
+	float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]);
+	float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]);
+
+	x = zip0.val[0];
+	y = zip0.val[1];
+	z = zip1.val[0];
+	w = zip1.val[1];
+#endif
+}
+
+// in-place interleave: (a0,a1,a2,a3),(b0,b1,b2,b3) -> (a0,b0,a1,b1),(a2,b2,a3,b3)
+void zip(Simd4f& v0, Simd4f& v1)
+{
+#if NVMATH_INLINE_ASSEMBLER
+	asm volatile("vzip.f32 %q0, %q1 \n\t"
+	             : "+w"(v0.f4), "+w"(v1.f4));
+#else
+	float32x4x2_t uzp = vzipq_f32(v0.f4, v1.f4);
+	v0 = uzp.val[0];
+	v1 = uzp.val[1];
+#endif
+}
+
+// in-place de-interleave, the inverse of zip
+void unzip(Simd4f& v0, Simd4f& v1)
+{
+#if NVMATH_INLINE_ASSEMBLER
+	asm volatile("vuzp.f32 %q0, %q1 \n\t"
+	             : "+w"(v0.f4), "+w"(v1.f4));
+#else
+	float32x4x2_t uzp = vuzpq_f32(v0.f4, v1.f4);
+	v0 = uzp.val[0];
+	v1 = uzp.val[1];
+#endif
+}
+
+// swap the low and high 64-bit halves: (x,y,z,w) -> (z,w,x,y)
+Simd4f swaphilo(const Simd4f& v)
+{
+	return vcombine_f32(vget_high_f32(v.f4), vget_low_f32(v.f4));
+}
+
+// horizontal reductions built on the comparison operators; the outMask
+// variants additionally return the per-lane comparison mask
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return allTrue(v0 == v1);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	return allTrue(outMask = v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return anyTrue(v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	return anyTrue(outMask = v0 == v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+	return allTrue(v0 > v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	return allTrue(outMask = v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+	return anyTrue(v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	return anyTrue(outMask = v0 > v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return allTrue(v0 >= v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	return allTrue(outMask = v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return anyTrue(v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	return anyTrue(outMask = v0 >= v1);
+}
+
+// returns non-zero iff every 32-bit lane of the comparison mask is all-ones
+int allTrue(const Simd4f& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+	int result;
+	// and/pairwise-min reduce the four lanes into s0; 0xffffffff is a float NaN
+	// pattern, so the vcmp against 0 is unordered exactly when the reduction is
+	// all-ones - presumably bit 28 read from FPSCR is the V (unordered) flag;
+	// confirm against the ARM FPSCR documentation.
+	asm volatile("vmovq q0, %q1 \n\t"
+	             "vand.u32 d0, d0, d1 \n\t"
+	             "vpmin.u32 d0, d0, d0 \n\t"
+	             "vcmp.f32 s0, #0 \n\t"
+	             "fmrx %0, fpscr"
+	             : "=r"(result)
+	             : "w"(v.f4)
+	             : "q0");
+	return result >> 28 & 0x1;
+#else
+	// narrow each 32-bit lane to one byte, then test the packed 4 bytes at once
+	uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+	uint16x4_t lo = vmovn_u32(v.u4);
+	uint16x8_t combined = vcombine_u16(lo, hi);
+	uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+	return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+// returns non-zero iff at least one 32-bit lane of the comparison mask is set
+int anyTrue(const Simd4f& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+	int result;
+	// same FPSCR trick as allTrue, but with an or/pairwise-max reduction
+	asm volatile("vmovq q0, %q1 \n\t"
+	             "vorr.u32 d0, d0, d1 \n\t"
+	             "vpmax.u32 d0, d0, d0 \n\t"
+	             "vcmp.f32 s0, #0 \n\t"
+	             "fmrx %0, fpscr"
+	             : "=r"(result)
+	             : "w"(v.f4)
+	             : "q0");
+	return result >> 28 & 0x1;
+#else
+	// narrow each 32-bit lane to one byte, then test the packed 4 bytes at once
+	uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+	uint16x4_t lo = vmovn_u32(v.u4);
+	uint16x8_t combined = vcombine_u16(lo, hi);
+	uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+	return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
diff --git a/src/simd/neon/Simd4i.h b/src/simd/neon/Simd4i.h
new file mode 100644
index 0000000..56e113b
--- /dev/null
+++ b/src/simd/neon/Simd4i.h
@@ -0,0 +1,297 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// broadcast a single int into all four lanes
+template <>
+inline Simd4iFactory<const int&>::operator Simd4i() const
+{
+	return vdupq_n_s32(v);
+}
+
+// reinterpret four contiguous ints as a vector (relies on FourTuple storage layout)
+inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
+{
+	return reinterpret_cast<const Simd4i&>(v);
+}
+
+// integer literal tag: broadcast the raw bit pattern i (used by _0, _1, _sign, _true)
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
+{
+	return vdupq_n_u32(i);
+}
+
+// load four ints from a (possibly unaligned) pointer
+template <>
+inline Simd4iFactory<const int*>::operator Simd4i() const
+{
+	return vld1q_s32(v);
+}
+
+// load four ints from an aligned pointer (same vld1q instruction on NEON)
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
+{
+	return vld1q_s32(v.ptr);
+}
+
+// load four ints from ptr advanced by a byte offset
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
+{
+	return vld1q_s32(reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// materialize ~v: vbicq_u32(x, y) computes x & ~y, so bic(all-ones, v) == ~v
+template <>
+inline ComplementExpr<Simd4i>::operator Simd4i() const
+{
+	return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+// ~a & b fused into a single bic instruction
+Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v)
+{
+	return vbicq_u32(v.u4, complement.v.u4);
+}
+
+// a & ~b fused into a single bic instruction
+Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement)
+{
+	return vbicq_u32(v.u4, complement.v.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// comparisons return a per-lane mask: all-ones where true, zero where false
+Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1)
+{
+	return vceqq_u32(v0.u4, v1.u4);
+}
+
+// ordered comparisons are *signed* (see the declarations in Simd4i.h)
+Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1)
+{
+	return vcltq_s32(v0.i4, v1.i4);
+}
+
+Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1)
+{
+	return vcgtq_s32(v0.i4, v1.i4);
+}
+
+// lazily evaluated complement; see ComplementExpr
+ComplementExpr<Simd4i> operator~(const Simd4i& v)
+{
+	return ComplementExpr<Simd4i>(v);
+}
+
+Simd4i operator&(const Simd4i& v0, const Simd4i& v1)
+{
+	return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator|(const Simd4i& v0, const Simd4i& v1)
+{
+	return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator^(const Simd4i& v0, const Simd4i& v1)
+{
+	return veorq_u32(v0.u4, v1.u4);
+}
+
+// logical shifts; vshlq with a negative count shifts right
+Simd4i operator<<(const Simd4i& v, int shift)
+{
+	return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4i operator>>(const Simd4i& v, int shift)
+{
+	return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+// per-lane shift counts
+Simd4i operator<<(const Simd4i& v, const Simd4i& shift)
+{
+	return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4i operator>>(const Simd4i& v, const Simd4i& shift)
+{
+	return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1)
+{
+	// NOTE(review): vaddq_s32 expects int32x4_t operands but is passed the .u4
+	// view - identical bits, but strict-typing toolchains may require .i4 here
+	// (compare operator- below, which uses the matching u32 intrinsic).
+	return vaddq_s32(v0.u4, v1.u4);
+}
+
+Simd4i simdi::operator-(const Simd4i& v)
+{
+	return vnegq_s32(v.i4);
+}
+
+Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1)
+{
+	return vsubq_u32(v0.u4, v1.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// bitwise reinterpretation of a float vector (no numeric conversion)
+Simd4i simd4i(const Simd4f& v)
+{
+	return v.u4;
+}
+
+// numeric float -> int conversion, truncating toward zero
+Simd4i truncate(const Simd4f& v)
+{
+	return vcvtq_s32_f32(v.f4);
+}
+
+// expose the vector as an int[4] reference for per-lane access
+int (&simdi::array(Simd4i& v))[4]
+{
+	return (int(&)[4])v;
+}
+
+const int (&simdi::array(const Simd4i& v))[4]
+{
+	return (const int(&)[4])v;
+}
+
+// NEON vst1q has no alignment distinction, so store and storeAligned coincide
+void store(int* ptr, const Simd4i& v)
+{
+	return vst1q_s32(ptr, v.i4);
+}
+
+void storeAligned(int* ptr, const Simd4i& v)
+{
+	vst1q_s32(ptr, v.i4);
+}
+
+// store at (char*)ptr + offset, offset given in bytes
+void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+	return storeAligned(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+// broadcast lane i into all four lanes
+template <size_t i>
+Simd4i splat(Simd4i const& v)
+{
+	return vdupq_n_s32(simdi::array(v)[i]);
+}
+
+// per-lane blend: picks v0 where the mask bits are set, v1 elsewhere
+Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1)
+{
+	return vbslq_u32(mask.u4, v0.u4, v1.u4);
+}
+
+// horizontal reductions built on the simdi comparison operators; the outMask
+// variants additionally return the per-lane comparison mask
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+	return allTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	return allTrue(outMask = simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+	return anyTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	return anyTrue(outMask = simdi::operator==(v0, v1));
+}
+
+// greater-than reductions use the *signed* comparison
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+	return allTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	return allTrue(outMask = simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+	return anyTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	return anyTrue(outMask = simdi::operator>(v0, v1));
+}
+
+// returns non-zero iff every 32-bit lane of the comparison mask is all-ones
+int allTrue(const Simd4i& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+	int result;
+	// and/pairwise-min reduce the four lanes into s0; 0xffffffff is a float NaN
+	// pattern, so the vcmp against 0 is unordered exactly when the reduction is
+	// all-ones - presumably bit 28 read from FPSCR is the V (unordered) flag;
+	// confirm against the ARM FPSCR documentation.
+	asm volatile("vmovq q0, %q1 \n\t"
+	             "vand.u32 d0, d0, d1 \n\t"
+	             "vpmin.u32 d0, d0, d0 \n\t"
+	             "vcmp.f32 s0, #0 \n\t"
+	             "fmrx %0, fpscr"
+	             : "=r"(result)
+	             : "w"(v.u4)
+	             : "q0");
+	return result >> 28 & 0x1;
+#else
+	// narrow each 32-bit lane to one byte, then test the packed 4 bytes at once
+	uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+	uint16x4_t lo = vmovn_u32(v.u4);
+	uint16x8_t combined = vcombine_u16(lo, hi);
+	uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+	return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+// returns non-zero iff at least one 32-bit lane of the comparison mask is set
+int anyTrue(const Simd4i& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+	int result;
+	// same FPSCR trick as allTrue, but with an or/pairwise-max reduction
+	asm volatile("vmovq q0, %q1 \n\t"
+	             "vorr.u32 d0, d0, d1 \n\t"
+	             "vpmax.u32 d0, d0, d0 \n\t"
+	             "vcmp.f32 s0, #0 \n\t"
+	             "fmrx %0, fpscr"
+	             : "=r"(result)
+	             : "w"(v.u4)
+	             : "q0");
+	return result >> 28 & 0x1;
+#else
+	// narrow each 32-bit lane to one byte, then test the packed 4 bytes at once
+	uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+	uint16x4_t lo = vmovn_u32(v.u4);
+	uint16x8_t combined = vcombine_u16(lo, hi);
+	uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+	return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
diff --git a/src/simd/neon/SimdTypes.h b/src/simd/neon/SimdTypes.h
new file mode 100644
index 0000000..6f0d276
--- /dev/null
+++ b/src/simd/neon/SimdTypes.h
@@ -0,0 +1,67 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <arm_neon.h>
+
+// 128-bit NEON register with float, unsigned-int and signed-int views of the
+// same bits; used to pass comparison masks and values through one type
+union Simd4f
+{
+	// intentionally leaves the value uninitialized
+	Simd4f()
+	{
+	}
+	Simd4f(const float32x4_t& v) : f4(v)
+	{
+	}
+#ifndef __ARM_NEON__ // all *32x4_t map to the same type
+	// NOTE(review): this guard *omits* the uint32x4_t constructor when
+	// __ARM_NEON__ is defined (gcc/clang), where uint32x4_t is a distinct type;
+	// comparison intrinsics (e.g. vceqq_f32) return uint32x4_t, so confirm those
+	// toolchains accept the conversion without -flax-vector-conversions.
+	Simd4f(const uint32x4_t& v) : u4(v)
+	{
+	}
+#endif
+	float32x4_t f4;
+	uint32x4_t u4;
+	int32x4_t i4;
+};
+
+// 128-bit NEON register with unsigned-int and signed-int views of the same bits
+union Simd4i
+{
+	// intentionally leaves the value uninitialized
+	Simd4i()
+	{
+	}
+	Simd4i(const uint32x4_t& v) : u4(v)
+	{
+	}
+#ifndef __ARM_NEON__ // all *32x4_t map to the same type
+	// NOTE(review): this guard *omits* the int32x4_t constructor when
+	// __ARM_NEON__ is defined (gcc/clang), where int32x4_t is a distinct type;
+	// signed intrinsics (e.g. vaddq_s32, vnegq_s32) return int32x4_t, so confirm
+	// those toolchains accept the conversion without -flax-vector-conversions.
+	Simd4i(const int32x4_t& v) : i4(v)
+	{
+	}
+#endif
+	uint32x4_t u4;
+	int32x4_t i4;
+};
diff --git a/src/simd/ps3/Simd4f.h b/src/simd/ps3/Simd4f.h
new file mode 100644
index 0000000..ec6f00d
--- /dev/null
+++ b/src/simd/ps3/Simd4f.h
@@ -0,0 +1,497 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Broadcast a single float to all four lanes. vec_lvlx loads the element
+// into the left-aligned slot; vec_splat then replicates lane 0.
+template <>
+inline Simd4fFactory<const float&>::operator Simd4f() const
+{
+	return vec_splat(vec_lvlx(0, const_cast<float*>(&v)), 0);
+}
+
+// Reinterpret a four-float tuple as a vector register (no conversion).
+inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
+{
+	return (const vec_float4&)v;
+}
+
+// All-zero vector (bit pattern 0 is also 0.0f in every lane).
+template <>
+inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const
+{
+	return (vec_float4)vec_splat_s32(0);
+}
+
+// All lanes 1.0f.
+template <>
+inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
+{
+	return vec_splats(1.0f);
+}
+
+// Sign-bit mask 0x80000000 per lane. Shift amounts are taken modulo 32,
+// so vec_sl(0xffffffff, 0xffffffff) == 0xffffffff << 31 == 0x80000000.
+template <>
+inline Simd4fFactory<detail::IntType<0x80000000> >::operator Simd4f() const
+{
+	vec_uint4 mask = (vec_uint4)vec_splat_s32(-1);
+	return (vec_float4)vec_sl(mask, mask);
+}
+
+// All-ones mask (true mask for every lane).
+template <>
+inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Simd4f() const
+{
+	return (vec_float4)vec_splat_s32(-1);
+}
+
+// Unaligned load: combine the left and right partial loads around the
+// (possibly misaligned) address.
+template <>
+inline Simd4fFactory<const float*>::operator Simd4f() const
+{
+	return (vec_float4)vec_or(vec_lvlx(0, const_cast<float*>(v)), vec_lvrx(16, const_cast<float*>(v)));
+}
+
+// 16-byte aligned load.
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
+{
+	return vec_ld(0, const_cast<float*>(v.ptr));
+}
+
+// Aligned load at a byte offset from the base pointer.
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
+{
+	return vec_ld(v.offset, const_cast<float*>(v.ptr));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression templates
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Lazy bitwise complement: evaluating the expression on its own yields ~v,
+// but combining it with operator& below fuses into a single vec_andc.
+template <>
+inline ComplementExpr<Simd4f>::operator Simd4f() const
+{
+	return vec_nor(v.f4, v.f4);
+}
+
+// (~a) & b fused as vec_andc(b, a) — one instruction instead of two.
+Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+	return vec_andc(v.f4, complement.v.f4);
+}
+
+Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+	return vec_andc(v.f4, complement.v.f4);
+}
+
+// Lazy product: a plain multiply is madd with a zero addend; pairing the
+// expression with +/- below fuses multiply-add / multiply-subtract.
+ProductExpr::operator Simd4f() const
+{
+	return vec_madd(v0.f4, v1.f4, (vec_float4)vec_splat_s32(0));
+}
+
+// a*b + c fused into vec_madd.
+Simd4f operator+(const ProductExpr& p, const Simd4f& v)
+{
+	return vec_madd(p.v0.f4, p.v1.f4, v.f4);
+}
+
+Simd4f operator+(const Simd4f& v, const ProductExpr& p)
+{
+	return vec_madd(p.v0.f4, p.v1.f4, v.f4);
+}
+
+Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
+{
+	// cast calls operator Simd4f() which evaluates the other ProductExpr
+	return vec_madd(p1.v0.f4, p1.v1.f4, static_cast<Simd4f>(p0).f4);
+}
+
+// c - a*b fused into vec_nmsub (negative multiply-subtract).
+Simd4f operator-(const Simd4f& v, const ProductExpr& p)
+{
+	return vec_nmsub(p.v0.f4, p.v1.f4, v.f4);
+}
+
+Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
+{
+	// cast calls operator Simd4f() which evaluates the other ProductExpr
+	return vec_nmsub(p1.v0.f4, p1.v1.f4, static_cast<Simd4f>(p0).f4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Comparison operators return a per-lane mask: all ones (0xffffffff) where
+// the predicate holds, all zeros otherwise, reinterpreted as floats.
+Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+	return (vec_float4)vec_cmpeq(v0.f4, v1.f4);
+}
+
+Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+	return (vec_float4)vec_cmplt(v0.f4, v1.f4);
+}
+
+Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+	return (vec_float4)vec_cmple(v0.f4, v1.f4);
+}
+
+Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+	return (vec_float4)vec_cmpgt(v0.f4, v1.f4);
+}
+
+Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+	return (vec_float4)vec_cmpge(v0.f4, v1.f4);
+}
+
+// Returns a lazy expression so ~a & b can fuse into vec_andc (see above).
+ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+	return ComplementExpr<Simd4f>(v);
+}
+
+// Bitwise operators act on the raw bit pattern of the float lanes
+// (used for mask manipulation).
+Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_and(v0.f4, v1.f4);
+}
+
+Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_or(v0.f4, v1.f4);
+}
+
+Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_xor(v0.f4, v1.f4);
+}
+
+// Shift the 32-bit lane bit patterns by a scalar count (count is loaded
+// and splatted into a vector first).
+Simd4f operator<<(const Simd4f& v, int shift)
+{
+	return (vec_float4)vec_sl((vec_uint4)v.f4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0));
+}
+
+Simd4f operator>>(const Simd4f& v, int shift)
+{
+	return (vec_float4)vec_sr((vec_uint4)v.f4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0));
+}
+
+// Per-lane shift counts.
+Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
+{
+	return (vec_float4)vec_sl((vec_uint4)v.f4, (vec_uint4)shift.f4);
+}
+
+Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
+{
+	return (vec_float4)vec_sr((vec_uint4)v.f4, (vec_uint4)shift.f4);
+}
+
+Simd4f operator+(const Simd4f& v)
+{
+	return v;
+}
+
+Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_add(v0.f4, v1.f4);
+}
+
+// Negation by flipping the sign bit: vec_sl(0xffffffff, 0xffffffff) builds
+// the 0x80000000 mask (shift counts are mod 32, so this shifts by 31).
+Simd4f operator-(const Simd4f& v)
+{
+	vec_uint4 mask = (vec_uint4)vec_splat_s32(-1);
+	return vec_xor(v.f4, (vec_float4)vec_sl(mask, mask));
+}
+
+Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_sub(v0.f4, v1.f4);
+}
+
+// Multiplication is lazy (ProductExpr) so adjacent +/- can fuse into madd/nmsub.
+ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
+{
+	return ProductExpr(v0, v1);
+}
+
+// NOTE: approximate division — uses the hardware reciprocal estimate
+// without Newton refinement, so results have limited precision.
+Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+	return v0 * vec_re(v1.f4); // reciprocal estimate
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Bit-level reinterpretation of an integer vector as floats (no conversion).
+Simd4f simd4f(const Simd4i& v)
+{
+	return (vec_float4)v.u4;
+}
+
+// Numeric conversion int -> float per lane (scale exponent 0).
+Simd4f convert(const Simd4i& v)
+{
+	return vec_ctf(v.i4, 0);
+}
+
+// View the vector as an array of 4 floats for scalar element access.
+float (&array(Simd4f& v))[4]
+{
+	return (float(&)[4])v;
+}
+
+const float (&array(const Simd4f& v))[4]
+{
+	return (const float(&)[4])v;
+}
+
+// Unaligned store: left and right partial stores cover a potentially
+// misaligned destination.
+void store(float* ptr, Simd4f const& v)
+{
+	vec_stvlx(v.f4, 0, ptr);
+	vec_stvrx(v.f4, 16, ptr);
+}
+
+// 16-byte aligned store.
+void storeAligned(float* ptr, Simd4f const& v)
+{
+	vec_stvlx(v.f4, 0, ptr);
+}
+
+// Aligned store at a byte offset from the base pointer.
+void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+	vec_stvlx(v.f4, offset, ptr);
+}
+
+// Broadcast lane i (compile-time constant) to all four lanes.
+template <size_t i>
+Simd4f splat(Simd4f const& v)
+{
+	return vec_splat(v.f4, i);
+}
+
+// Per-lane select: lanes where mask bits are set come from v0, else v1.
+Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+	return vec_sel(v1.f4, v0.f4, (vec_uint4)mask.f4);
+}
+
+// Absolute value by clearing the sign bit; vec_sl(mask, mask) builds the
+// 0x80000000 sign mask (shift counts are taken modulo 32 -> shift by 31).
+Simd4f abs(const Simd4f& v)
+{
+	vec_uint4 mask = (vec_uint4)vec_splat_s32(-1);
+	return (vec_float4)vec_andc((vec_uint4)v.f4, vec_sl(mask, mask));
+}
+
+Simd4f floor(const Simd4f& v)
+{
+	return vec_floor(v.f4);
+}
+
+Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_max(v0.f4, v1.f4);
+}
+
+Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_min(v0.f4, v1.f4);
+}
+
+// Hardware reciprocal estimate (low precision).
+Simd4f recip(const Simd4f& v)
+{
+	return vec_re(v.f4);
+}
+
+// Reciprocal refined with n Newton-Raphson iterations:
+// x' = x * (2 - v*x), roughly doubling precision per iteration.
+template <int n>
+Simd4f recip(const Simd4f& v)
+{
+	Simd4f two = simd4f(2.0f);
+	Simd4f recipV = recip(v);
+	for(int i = 0; i < n; ++i)
+		recipV = recipV * (two - v * recipV);
+	return recipV;
+}
+
+// sqrt(v) computed as v * 1/sqrt(v) from the rsqrt estimate.
+// NOTE(review): for v == 0 this is 0 * inf — presumably NaN; confirm callers
+// never pass zero, or handle it explicitly.
+Simd4f sqrt(const Simd4f& v)
+{
+	return v * vec_rsqrte(v.f4);
+}
+
+// Hardware reciprocal-square-root estimate (low precision).
+Simd4f rsqrt(const Simd4f& v)
+{
+	return vec_rsqrte(v.f4);
+}
+
+// rsqrt refined with n Newton-Raphson iterations:
+// x' = x * (1.5 - v/2 * x^2).
+template <int n>
+Simd4f rsqrt(const Simd4f& v)
+{
+	Simd4f halfV = v * simd4f(0.5f);
+	Simd4f threeHalf = simd4f(1.5f);
+	Simd4f rsqrtV = rsqrt(v);
+	for(int i = 0; i < n; ++i)
+		rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV);
+	return rsqrtV;
+}
+
+Simd4f exp2(const Simd4f& v)
+{
+	// vec_expte approximation only valid for domain [-127, 127]
+	Simd4f limit = simd4f(127.0f);
+	Simd4f x = min(max(v, -limit), limit);
+
+	return vec_expte(x.f4);
+}
+
+// Base-2 logarithm estimate.
+Simd4f log2(const Simd4f& v)
+{
+	return vec_loge(v.f4);
+}
+
+// 3-component dot product, broadcast into the first three lanes
+// (the w lane is ignored in the inputs but contributes to lane sums of
+// the splatted result only via lanes 0..2, so the result is x*x'+y*y'+z*z').
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+	Simd4f tmp = v0 * v1;
+	return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+// 3-component cross product; the permute rotates (x,y,z) -> (y,z,x) so that
+// cross = rotate(v0*rotate(v1) - rotate(v0)*v1).
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+	// w z y x -> w x z y
+	uint32_t data[] __attribute__((aligned(16))) = { 0x04050607, 0x08090a0b, 0x00010203, 0x0c0d0e0f };
+	vec_uchar16 perm = vec_ld(0, (unsigned char*)data);
+
+	Simd4f t0 = vec_perm(v0.f4, v0.f4, perm);
+	Simd4f t1 = vec_perm(v1.f4, v1.f4, perm);
+	Simd4f tmp = v0 * t1 - t0 * v1;
+	return vec_perm(tmp.f4, tmp.f4, perm);
+}
+
+// 4x4 matrix transpose of the row vectors x, y, z, w (in place), built from
+// two rounds of merge-high/merge-low interleaves.
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+	Simd4f v0 = vec_mergel(x.f4, z.f4);
+	Simd4f v1 = vec_mergeh(x.f4, z.f4);
+	Simd4f v2 = vec_mergel(y.f4, w.f4);
+	Simd4f v3 = vec_mergeh(y.f4, w.f4);
+	x = vec_mergeh(v1.f4, v3.f4);
+	y = vec_mergel(v1.f4, v3.f4);
+	z = vec_mergeh(v0.f4, v2.f4);
+	w = vec_mergel(v0.f4, v2.f4);
+}
+
+// Interleave two vectors in place (inverse of unzip).
+void zip(Simd4f& v0, Simd4f& v1)
+{
+	Simd4f t0 = v0;
+	v0 = vec_mergel(v0, v1);
+	v1 = vec_mergeh(t0, v1);
+}
+
+// De-interleave two vectors in place (inverse of zip).
+void unzip(Simd4f& v0, Simd4f& v1)
+{
+	Simd4f t0 = vec_mergel(v0, v1); // v0.x, v1.x, v0.y, v1.y
+	Simd4f t1 = vec_mergeh(v0, v1); // v0.z, v1.z, v0.w, v1.w
+	v0 = vec_mergel(t0, t1); // v0.x, v0.z, v1.x, v1.z
+	v1 = vec_mergeh(t0, t1); // v0.y, v0.w, v1.y, v1.w
+}
+
+// Swap the low and high 64-bit halves of the vector: (x, y, z, w) -> (z, w, x, y).
+Simd4f swaphilo(const Simd4f& v)
+{
+	uint32_t data[] __attribute__((aligned(16))) = { 0x08090a0b, 0x0c0d0e0f, 0x00010203, 0x04050607 };
+	vec_uchar16 perm = vec_ld(0, (unsigned char*)data);
+
+	// Fix: the parameter is named 'v'; the original body referenced a
+	// nonexistent 'v0', which cannot compile.
+	return vec_perm(v.f4, v.f4, perm);
+}
+
+// Scalar reductions over lane comparisons. Each predicate has an overload
+// that also returns the per-lane comparison mask.
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_all_eq(v0.f4, v1.f4);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	int r = allEqual(v0, v1);
+	outMask = v0 == v1;
+	return r;
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_any_eq(v0.f4, v1.f4);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	int r = anyEqual(v0, v1);
+	outMask = v0 == v1;
+	return r;
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_all_gt(v0.f4, v1.f4);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	int r = allGreater(v0, v1);
+	outMask = v0 > v1;
+	return r;
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_any_gt(v0.f4, v1.f4);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	int r = anyGreater(v0, v1);
+	outMask = v0 > v1;
+	return r;
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_all_ge(v0.f4, v1.f4);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	int r = allGreaterEqual(v0, v1);
+	outMask = v0 >= v1;
+	return r;
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+	return vec_any_ge(v0.f4, v1.f4);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+	int r = anyGreaterEqual(v0, v1);
+	outMask = v0 >= v1;
+	return r;
+}
+
+// allTrue/anyTrue expect v to be a comparison mask (lanes 0xffffffff or 0).
+// A true lane reinterpreted as float is a negative NaN, for which >= 0 is
+// false; a false lane (0.0f) satisfies >= 0. Hence all-true == none >= 0.
+int allTrue(const Simd4f& v)
+{
+	return !vec_any_ge(v.f4, (vec_float4)vec_splat_s32(0));
+}
+
+int anyTrue(const Simd4f& v)
+{
+	return !vec_all_ge(v.f4, (vec_float4)vec_splat_s32(0));
+}
diff --git a/src/simd/ps3/Simd4i.h b/src/simd/ps3/Simd4i.h
new file mode 100644
index 0000000..aaae344
--- /dev/null
+++ b/src/simd/ps3/Simd4i.h
@@ -0,0 +1,279 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Broadcast a single int to all four lanes (load then splat lane 0).
+template <>
+inline Simd4iFactory<const int&>::operator Simd4i() const
+{
+	return (vec_uint4)vec_splat(vec_lvlx(0, (int*)&v), 0);
+}
+
+// Reinterpret a four-int tuple as a vector register.
+inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
+{
+	return (const vec_uint4&)v;
+}
+
+// Splat a small compile-time constant; vec_splat_s32 only supports the
+// 5-bit signed immediate range [-16, 15].
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
+{
+	return (vec_uint4)vec_splat_s32(i);
+}
+
+// Sign-bit mask 0x80000000 per lane: shift counts are modulo 32, so
+// vec_sl(0xffffffff, 0xffffffff) == 0xffffffff << 31 == 0x80000000.
+template <>
+inline Simd4iFactory<detail::IntType<0x80000000> >::operator Simd4i() const
+{
+	vec_uint4 mask = (vec_uint4)vec_splat_s32(-1);
+	return vec_sl(mask, mask);
+}
+
+// Unaligned load from a potentially misaligned int pointer.
+template <>
+inline Simd4iFactory<const int*>::operator Simd4i() const
+{
+	return (vec_uint4)vec_or(vec_lvlx(0, const_cast<int*>(v)), vec_lvrx(16, const_cast<int*>(v)));
+}
+
+// 16-byte aligned load.
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
+{
+	return (vec_uint4)vec_ld(0, const_cast<int*>(v.ptr));
+}
+
+// Aligned load at a byte offset from the base pointer.
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
+{
+	return (vec_uint4)vec_ld(v.offset, const_cast<int*>(v.ptr));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Lazy complement for integer vectors; standalone evaluation yields ~v.
+template <>
+inline ComplementExpr<Simd4i>::operator Simd4i() const
+{
+	return vec_nor(v.u4, v.u4);
+}
+
+// (~a) & b fused into a single vec_andc.
+Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v)
+{
+	return vec_andc(v.u4, complement.v.u4);
+}
+
+Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement)
+{
+	return vec_andc(v.u4, complement.v.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Comparisons return all-ones/all-zeros lane masks. The ordered comparisons
+// reinterpret lanes as signed ints.
+Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1)
+{
+	return (vec_uint4)vec_cmpeq(v0.u4, v1.u4);
+}
+
+Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1)
+{
+	return (vec_uint4)vec_cmplt((vec_int4)v0.u4, (vec_int4)v1.u4);
+}
+
+Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1)
+{
+	return (vec_uint4)vec_cmpgt((vec_int4)v0.u4, (vec_int4)v1.u4);
+}
+
+// Returns a lazy expression so ~a & b can fuse into vec_andc (see above).
+ComplementExpr<Simd4i> operator~(const Simd4i& v)
+{
+	return ComplementExpr<Simd4i>(v);
+}
+
+Simd4i operator&(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_and(v0.u4, v1.u4);
+}
+
+Simd4i operator|(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_or(v0.u4, v1.u4);
+}
+
+Simd4i operator^(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_xor(v0.u4, v1.u4);
+}
+
+// Scalar shift count: loaded and splatted into a vector first.
+Simd4i operator<<(const Simd4i& v, int shift)
+{
+	return vec_sl(v.u4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0));
+}
+
+// Logical (unsigned) right shift.
+Simd4i operator>>(const Simd4i& v, int shift)
+{
+	return vec_sr(v.u4, vec_splat((vec_uint4)vec_lvlx(0, &shift), 0));
+}
+
+// Per-lane shift counts.
+Simd4i operator<<(const Simd4i& v, const Simd4i& shift)
+{
+	return vec_sl(v.u4, shift.u4);
+}
+
+Simd4i operator>>(const Simd4i& v, const Simd4i& shift)
+{
+	return vec_sr(v.u4, shift.u4);
+}
+
+// Modular (wrapping) 32-bit lane arithmetic.
+Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_add(v0.u4, v1.u4);
+}
+
+// Negation as 0 - v (two's complement).
+Simd4i simdi::operator-(const Simd4i& v)
+{
+	return vec_sub((vec_uint4)vec_splat_s32(0), v.u4);
+}
+
+Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_sub(v0.u4, v1.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Bit-level reinterpretation of a float vector as ints (no conversion).
+Simd4i simd4i(const Simd4f& v)
+{
+	return (vec_uint4)v.f4;
+}
+
+// Numeric conversion float -> int with truncation toward zero.
+Simd4i truncate(const Simd4f& v)
+{
+	return vec_cts(v.f4, 0);
+}
+
+// View the vector as an array of 4 ints for scalar element access.
+int (&simdi::array(Simd4i& v))[4]
+{
+	return (int(&)[4])v;
+}
+
+const int (&simdi::array(const Simd4i& v))[4]
+{
+	return (const int(&)[4])v;
+}
+
+// Unaligned store via left/right partial stores.
+void store(int* ptr, const Simd4i& v)
+{
+	vec_stvlx((vec_int4)v.u4, 0, ptr);
+	vec_stvrx((vec_int4)v.u4, 16, ptr);
+}
+
+// 16-byte aligned store.
+void storeAligned(int* ptr, const Simd4i& v)
+{
+	vec_stvlx((vec_int4)v.u4, 0, ptr);
+}
+
+// Aligned store at a byte offset from the base pointer.
+void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+	vec_stvlx((vec_int4)v.u4, offset, ptr);
+}
+
+// Broadcast lane i (compile-time constant) to all four lanes.
+template <size_t i>
+Simd4i splat(Simd4i const& v)
+{
+	return vec_splat(v.u4, i);
+}
+
+// Per-lane select: lanes where mask bits are set come from v0, else v1.
+Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1)
+{
+	return vec_sel(v1.u4, v0.u4, mask.u4);
+}
+
+// Scalar reductions over lane comparisons; note allGreater/anyGreater use
+// UNSIGNED lane comparison here (vec_all_gt on vec_uint4), unlike
+// simdi::operator> which compares as signed.
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_all_eq(v0.u4, v1.u4);
+}
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	int r = simdi::allEqual(v0, v1);
+	outMask = simdi::operator==(v0, v1);
+	return r;
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_any_eq(v0.u4, v1.u4);
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	int r = simdi::anyEqual(v0, v1);
+	outMask = simdi::operator==(v0, v1);
+	return r;
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_all_gt(v0.u4, v1.u4);
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	int r = simdi::allGreater(v0, v1);
+	outMask = simdi::operator>(v0, v1);
+	return r;
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+	return vec_any_gt(v0.u4, v1.u4);
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+	int r = simdi::anyGreater(v0, v1);
+	outMask = simdi::operator>(v0, v1);
+	return r;
+}
+
+// allTrue/anyTrue expect comparison masks: a true lane (0xffffffff) is -1
+// when read as signed, so "all lanes < 0" means all lanes are true.
+int allTrue(const Simd4i& v)
+{
+	return vec_all_lt((vec_int4)v.u4, vec_splat_s32(0));
+}
+
+int anyTrue(const Simd4i& v)
+{
+	return vec_any_lt((vec_int4)v.u4, vec_splat_s32(0));
+}
diff --git a/src/simd/ps3/SimdTypes.h b/src/simd/ps3/SimdTypes.h
new file mode 100644
index 0000000..fee9277
--- /dev/null
+++ b/src/simd/ps3/SimdTypes.h
@@ -0,0 +1,64 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <vec_types.h>
+
+#ifdef __PPU__
+#include <altivec.h>
+#define NVMATH_VECRETURN __attribute__((vecreturn))
+#else
+#include <vmx2spu.h>
+#define NVMATH_VECRETURN
+#endif
+
+// PS3 4-lane float vector wrapper around the VMX/AltiVec vec_float4 type.
+// NVMATH_VECRETURN lets PPU code return the struct in a vector register.
+struct Simd4f
+{
+	// Intentionally uninitialized for performance.
+	Simd4f()
+	{
+	}
+	Simd4f(const vec_float4& v) : f4(v)
+	{
+	}
+
+	vec_float4 f4;
+} NVMATH_VECRETURN;
+
+// PS3 4-lane integer vector wrapper around the VMX/AltiVec vec_uint4 type.
+struct Simd4i
+{
+	// Intentionally uninitialized for performance.
+	Simd4i()
+	{
+	}
+	Simd4i(const vec_uint4& v) : u4(v)
+	{
+	}
+
+	vec_uint4 u4;
+} NVMATH_VECRETURN;
diff --git a/src/simd/scalar/Simd4f.h b/src/simd/scalar/Simd4f.h
new file mode 100644
index 0000000..d59b55f
--- /dev/null
+++ b/src/simd/scalar/Simd4f.h
@@ -0,0 +1,462 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Scalar (non-SIMD) fallback factories: each constructs a Scalar4f directly
+// from its four lane values.
+template <>
+inline Simd4fFactory<const float&>::operator Scalar4f() const
+{
+	return Scalar4f(v, v, v, v);
+}
+
+// Reinterpret a four-float tuple as a Scalar4f (no copy per element).
+inline Simd4fFactory<detail::FourTuple>::operator Scalar4f() const
+{
+	return reinterpret_cast<const Scalar4f&>(v);
+}
+
+// Splat a compile-time integer constant converted to float.
+template <int i>
+inline Simd4fFactory<detail::IntType<i> >::operator Scalar4f() const
+{
+	float s = i;
+	return Scalar4f(s, s, s, s);
+}
+
+// Sign-bit mask per lane, built from the raw 0x80000000 bit pattern
+// (int constructor presumably fills the lane's bit pattern — see Scalar4f).
+template <>
+inline Simd4fFactory<detail::IntType<0x80000000u> >::operator Scalar4f() const
+{
+	int32_t i = 0x80000000u;
+	return Scalar4f(i, i, i, i);
+}
+
+// All-ones (true) mask per lane.
+template <>
+inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Scalar4f() const
+{
+	int32_t i = 0xffffffff;
+	return Scalar4f(i, i, i, i);
+}
+
+// Load four floats from an (unaligned) pointer.
+template <>
+inline Simd4fFactory<const float*>::operator Scalar4f() const
+{
+	return Scalar4f(v[0], v[1], v[2], v[3]);
+}
+
+// Aligned load (alignment is irrelevant in the scalar fallback).
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Scalar4f() const
+{
+	return Scalar4f(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]);
+}
+
+// Load at a byte offset from the base pointer.
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Scalar4f() const
+{
+	const float* ptr = reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset);
+	return Scalar4f(ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Scalar fallback of the lazy complement expression: operates on the
+// integer (u4) view of each lane.
+template <>
+inline ComplementExpr<Scalar4f>::operator Scalar4f() const
+{
+	return Scalar4f(~v.u4[0], ~v.u4[1], ~v.u4[2], ~v.u4[3]);
+}
+
+// (~a) & b computed lane by lane on the bit patterns.
+inline Scalar4f operator&(const ComplementExpr<Scalar4f>& complement, const Scalar4f& v)
+{
+	return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2],
+	                v.u4[3] & ~complement.v.u4[3]);
+}
+
+inline Scalar4f operator&(const Scalar4f& v, const ComplementExpr<Scalar4f>& complement)
+{
+	return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2],
+	                v.u4[3] & ~complement.v.u4[3]);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Comparison operators build a lane mask from each bool result.
+// NOTE(review): correct masking (all-ones for true) depends on the unseen
+// Scalar4f bool/int constructor producing 0xffffffff, not 1 — verify in
+// scalar/SimdTypes.h; select() below relies on full bit masks.
+inline Scalar4f operator==(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] == v1.f4[0], v0.f4[1] == v1.f4[1], v0.f4[2] == v1.f4[2], v0.f4[3] == v1.f4[3]);
+}
+
+inline Scalar4f operator<(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] < v1.f4[0], v0.f4[1] < v1.f4[1], v0.f4[2] < v1.f4[2], v0.f4[3] < v1.f4[3]);
+}
+
+inline Scalar4f operator<=(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] <= v1.f4[0], v0.f4[1] <= v1.f4[1], v0.f4[2] <= v1.f4[2], v0.f4[3] <= v1.f4[3]);
+}
+
+inline Scalar4f operator>(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] > v1.f4[0], v0.f4[1] > v1.f4[1], v0.f4[2] > v1.f4[2], v0.f4[3] > v1.f4[3]);
+}
+
+inline Scalar4f operator>=(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] >= v1.f4[0], v0.f4[1] >= v1.f4[1], v0.f4[2] >= v1.f4[2], v0.f4[3] >= v1.f4[3]);
+}
+
+// Lazy complement so ~a & b can fuse (see operator& above).
+inline ComplementExpr<Scalar4f> operator~(const Scalar4f& v)
+{
+	return ComplementExpr<Scalar4f>(v);
+}
+
+// Bitwise operators act on the integer view (u4) of each lane.
+inline Scalar4f operator&(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.u4[0] & v1.u4[0], v0.u4[1] & v1.u4[1], v0.u4[2] & v1.u4[2], v0.u4[3] & v1.u4[3]);
+}
+
+inline Scalar4f operator|(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.u4[0] | v1.u4[0], v0.u4[1] | v1.u4[1], v0.u4[2] | v1.u4[2], v0.u4[3] | v1.u4[3]);
+}
+
+inline Scalar4f operator^(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.u4[0] ^ v1.u4[0], v0.u4[1] ^ v1.u4[1], v0.u4[2] ^ v1.u4[2], v0.u4[3] ^ v1.u4[3]);
+}
+
+inline Scalar4f operator<<(const Scalar4f& v, int shift)
+{
+	return Scalar4f(v.u4[0] << shift, v.u4[1] << shift, v.u4[2] << shift, v.u4[3] << shift);
+}
+
+inline Scalar4f operator>>(const Scalar4f& v, int shift)
+{
+	return Scalar4f(v.u4[0] >> shift, v.u4[1] >> shift, v.u4[2] >> shift, v.u4[3] >> shift);
+}
+
+// Arithmetic operators act on the float view (f4) of each lane.
+inline Scalar4f operator+(const Scalar4f& v)
+{
+	return v;
+}
+
+inline Scalar4f operator+(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] + v1.f4[0], v0.f4[1] + v1.f4[1], v0.f4[2] + v1.f4[2], v0.f4[3] + v1.f4[3]);
+}
+
+inline Scalar4f operator-(const Scalar4f& v)
+{
+	return Scalar4f(-v.f4[0], -v.f4[1], -v.f4[2], -v.f4[3]);
+}
+
+inline Scalar4f operator-(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] - v1.f4[0], v0.f4[1] - v1.f4[1], v0.f4[2] - v1.f4[2], v0.f4[3] - v1.f4[3]);
+}
+
+inline Scalar4f operator*(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] * v1.f4[0], v0.f4[1] * v1.f4[1], v0.f4[2] * v1.f4[2], v0.f4[3] * v1.f4[3]);
+}
+
+// Exact division (unlike the SIMD backends, which use reciprocal estimates).
+inline Scalar4f operator/(const Scalar4f& v0, const Scalar4f& v1)
+{
+	return Scalar4f(v0.f4[0] / v1.f4[0], v0.f4[1] / v1.f4[1], v0.f4[2] / v1.f4[2], v0.f4[3] / v1.f4[3]);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Bit-level reinterpretation of an integer vector as floats: Scalar4f and
+// Scalar4i share the same storage, so the value passes through unchanged.
+inline Scalar4f simd4f(const Scalar4i& v)
+{
+	return v;
+}
+
+// Numeric conversion int -> float per lane.
+inline Scalar4f convert(const Scalar4i& v)
+{
+	return Scalar4f(float(v.i4[0]), float(v.i4[1]), float(v.i4[2]), float(v.i4[3]));
+}
+
+// Direct access to the four float lanes as an array.
+inline float (&array(Scalar4f& v))[4]
+{
+	return v.f4;
+}
+
+inline const float (&array(const Scalar4f& v))[4]
+{
+	return v.f4;
+}
+
+// Store four lanes; alignment does not matter in the scalar fallback.
+inline void store(float* ptr, const Scalar4f& v)
+{
+	ptr[0] = v.f4[0];
+	ptr[1] = v.f4[1];
+	ptr[2] = v.f4[2];
+	ptr[3] = v.f4[3];
+}
+
+inline void storeAligned(float* ptr, const Scalar4f& v)
+{
+	store(ptr, v);
+}
+
+// Store at a byte offset from the base pointer.
+inline void storeAligned(float* ptr, unsigned int offset, const Scalar4f& v)
+{
+	storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+// Broadcast lane i (compile-time constant) to all four lanes.
+template <size_t i>
+inline Scalar4f splat(const Scalar4f& v)
+{
+	return Scalar4f(v.f4[i], v.f4[i], v.f4[i], v.f4[i]);
+}
+
+// Branch-free per-lane select via bit masks: requires mask lanes to be
+// all-ones or all-zeros. ((v0 ^ v1) & mask) ^ v1 yields v0 where mask is
+// set and v1 elsewhere.
+inline Scalar4f select(const Scalar4f& mask, const Scalar4f& v0, const Scalar4f& v1)
+{
+	return ((v0 ^ v1) & mask) ^ v1;
+}
+
+inline Scalar4f abs(const Scalar4f& v)
+{
+ return Scalar4f(::fabsf(v.f4[0]), ::fabsf(v.f4[1]), ::fabsf(v.f4[2]), ::fabsf(v.f4[3]));
+}
+
+inline Scalar4f floor(const Scalar4f& v)
+{
+ return Scalar4f(::floorf(v.f4[0]), ::floorf(v.f4[1]), ::floorf(v.f4[2]), ::floorf(v.f4[3]));
+}
+
+inline Scalar4f max(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(std::max(v0.f4[0], v1.f4[0]), std::max(v0.f4[1], v1.f4[1]), std::max(v0.f4[2], v1.f4[2]),
+ std::max(v0.f4[3], v1.f4[3]));
+}
+
+inline Scalar4f min(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(std::min(v0.f4[0], v1.f4[0]), std::min(v0.f4[1], v1.f4[1]), std::min(v0.f4[2], v1.f4[2]),
+ std::min(v0.f4[3], v1.f4[3]));
+}
+
+inline Scalar4f recip(const Scalar4f& v)
+{
+ return Scalar4f(1 / v.f4[0], 1 / v.f4[1], 1 / v.f4[2], 1 / v.f4[3]);
+}
+
+// Refinement-iteration overload; the scalar result is already exact, so just forward.
+template <int n>
+inline Scalar4f recip(const Scalar4f& v)
+{
+ return recip(v);
+}
+
+// Lane-wise square root.
+inline Scalar4f sqrt(const Scalar4f& v)
+{
+ return Scalar4f(::sqrtf(v.f4[0]), ::sqrtf(v.f4[1]), ::sqrtf(v.f4[2]), ::sqrtf(v.f4[3]));
+}
+
+// Lane-wise reciprocal square root.
+inline Scalar4f rsqrt(const Scalar4f& v)
+{
+ return recip(sqrt(v));
+}
+
+// Refinement-iteration overload; exact already, so just forward.
+template <int n>
+inline Scalar4f rsqrt(const Scalar4f& v)
+{
+ return rsqrt(v);
+}
+
+// Lane-wise 2^x, computed as exp(x * ln 2).
+inline Scalar4f exp2(const Scalar4f& v)
+{
+ float scale = 0.69314718055994531f; // ::logf(2.0f);
+ return Scalar4f(::expf(v.f4[0] * scale), ::expf(v.f4[1] * scale), ::expf(v.f4[2] * scale), ::expf(v.f4[3] * scale));
+}
+
+namespace simdf
+{
+// Workaround: the PSP2 compiler fails to resolve the global exp2 overload
+// directly; forwarding through this namespace-qualified wrapper works.
+inline Scalar4f exp2(const Scalar4f& v)
+{
+ return ::exp2(v);
+}
+}
+
+// Lane-wise log2(x), computed as ln(x) * (1/ln 2).
+inline Scalar4f log2(const Scalar4f& v)
+{
+ float scale = 1.44269504088896341f; // 1/ln(2)
+ return Scalar4f(::logf(v.f4[0]) * scale, ::logf(v.f4[1]) * scale, ::logf(v.f4[2]) * scale, ::logf(v.f4[3]) * scale);
+}
+
+// 3-component dot product, broadcast to all four lanes; the w lanes are ignored.
+inline Scalar4f dot3(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return simd4f(v0.f4[0] * v1.f4[0] + v0.f4[1] * v1.f4[1] + v0.f4[2] * v1.f4[2]);
+}
+
+// 3-component cross product; the w lane of the result is set to 0.
+inline Scalar4f cross3(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return simd4f(v0.f4[1] * v1.f4[2] - v0.f4[2] * v1.f4[1], v0.f4[2] * v1.f4[0] - v0.f4[0] * v1.f4[2],
+ v0.f4[0] * v1.f4[1] - v0.f4[1] * v1.f4[0], 0.0f);
+}
+
+// In-place 4x4 transpose of the four row vectors x, y, z, w.
+inline void transpose(Scalar4f& x, Scalar4f& y, Scalar4f& z, Scalar4f& w)
+{
+ // stash the upper-triangle values that get overwritten before they are read
+ float x1 = x.f4[1], x2 = x.f4[2], x3 = x.f4[3];
+ float y2 = y.f4[2], y3 = y.f4[3], z3 = z.f4[3];
+
+ x.f4[1] = y.f4[0];
+ x.f4[2] = z.f4[0];
+ x.f4[3] = w.f4[0];
+ y.f4[0] = x1;
+ y.f4[2] = z.f4[1];
+ y.f4[3] = w.f4[1];
+ z.f4[0] = x2;
+ z.f4[1] = y2;
+ z.f4[3] = w.f4[2];
+ w.f4[0] = x3;
+ w.f4[1] = y3;
+ w.f4[2] = z3;
+}
+
+// Interleave lanes: (a0 a1 a2 a3),(b0 b1 b2 b3) -> (a0 b0 a1 b1),(a2 b2 a3 b3).
+inline void zip(Scalar4f& v0, Scalar4f& v1)
+{
+ float z0 = v0.f4[2];
+ v0.f4[2] = v0.f4[1];
+ v0.f4[1] = v1.f4[0];
+ v1.f4[0] = z0;
+
+ float z1 = v1.f4[2];
+ v1.f4[2] = v0.f4[3];
+ v0.f4[3] = v1.f4[1];
+ v1.f4[1] = z1;
+}
+
+// Inverse of zip: even lanes of the pair go to v0, odd lanes to v1.
+inline void unzip(Scalar4f& v0, Scalar4f& v1)
+{
+ float z0 = v0.f4[2];
+ v0.f4[2] = v1.f4[0];
+ v1.f4[0] = v0.f4[1];
+ v0.f4[1] = z0;
+
+ float z1 = v1.f4[2];
+ v1.f4[2] = v1.f4[1];
+ v1.f4[1] = v0.f4[3];
+ v0.f4[3] = z1;
+}
+
+// Swap the low (xy) and high (zw) halves of the vector.
+inline Scalar4f swaphilo(const Scalar4f& v)
+{
+ return Scalar4f(v.f4[2], v.f4[3], v.f4[0], v.f4[1]);
+}
+
+// Comparison reductions. The plain overloads return a boolean int; the overloads
+// taking outMask additionally produce a per-lane all-ones/all-zeros mask.
+inline int allEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] == v1.f4[0] && v0.f4[1] == v1.f4[1] && v0.f4[2] == v1.f4[2] && v0.f4[3] == v1.f4[3];
+}
+
+inline int allEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] == v1.f4[0] || v0.f4[1] == v1.f4[1] || v0.f4[2] == v1.f4[2] || v0.f4[3] == v1.f4[3];
+}
+
+inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+inline int allGreater(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] > v1.f4[0] && v0.f4[1] > v1.f4[1] && v0.f4[2] > v1.f4[2] && v0.f4[3] > v1.f4[3];
+}
+
+inline int allGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] > v1.f4[0] || v0.f4[1] > v1.f4[1] || v0.f4[2] > v1.f4[2] || v0.f4[3] > v1.f4[3];
+}
+
+inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] >= v1.f4[0] && v0.f4[1] >= v1.f4[1] && v0.f4[2] >= v1.f4[2] && v0.f4[3] >= v1.f4[3];
+}
+
+inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] >= v1.f4[0] || v0.f4[1] >= v1.f4[1] || v0.f4[2] >= v1.f4[2] || v0.f4[3] >= v1.f4[3];
+}
+
+inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+// Expects canonical masks (each lane all-ones or all-zeros): the AND of the
+// lane words is nonzero only when every lane is set.
+inline int allTrue(const Scalar4f& v)
+{
+ return v.u4[0] & v.u4[1] & v.u4[2] & v.u4[3];
+}
+
+// Nonzero when any lane has bits set.
+inline int anyTrue(const Scalar4f& v)
+{
+ return v.u4[0] | v.u4[1] | v.u4[2] | v.u4[3];
+}
diff --git a/src/simd/scalar/Simd4i.h b/src/simd/scalar/Simd4i.h
new file mode 100644
index 0000000..dd64682
--- /dev/null
+++ b/src/simd/scalar/Simd4i.h
@@ -0,0 +1,209 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Splat a runtime int to all four lanes.
+template <>
+inline Simd4iFactory<const int&>::operator Scalar4i() const
+{
+ return Scalar4i(v, v, v, v);
+}
+
+// Reinterpret a prepared 4-tuple as a vector.
+inline Simd4iFactory<detail::FourTuple>::operator Scalar4i() const
+{
+ return reinterpret_cast<const Scalar4i&>(v);
+}
+
+// Splat a compile-time integer constant.
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Scalar4i() const
+{
+ return Scalar4i(i, i, i, i);
+}
+
+// Load 4 ints from an (unaligned) pointer.
+template <>
+inline Simd4iFactory<const int*>::operator Scalar4i() const
+{
+ return Scalar4i(v[0], v[1], v[2], v[3]);
+}
+
+// Load 4 ints from an aligned pointer (no alignment requirement in the scalar back-end).
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Scalar4i() const
+{
+ return Scalar4i(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]);
+}
+
+// Load 4 ints from a base pointer plus byte offset.
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Scalar4i() const
+{
+ const int* ptr = reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset);
+ return Scalar4i(ptr[0], ptr[1], ptr[2], ptr[3]);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+namespace simdi
+{
+
+// Lane-wise comparisons. The bool arguments select the Scalar4i bool
+// constructor, so each result lane is an all-ones/all-zeros mask.
+inline Scalar4i operator==(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return Scalar4i(v0.i4[0] == v1.i4[0], v0.i4[1] == v1.i4[1], v0.i4[2] == v1.i4[2], v0.i4[3] == v1.i4[3]);
+}
+
+inline Scalar4i operator<(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return Scalar4i(v0.i4[0] < v1.i4[0], v0.i4[1] < v1.i4[1], v0.i4[2] < v1.i4[2], v0.i4[3] < v1.i4[3]);
+}
+
+inline Scalar4i operator>(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return Scalar4i(v0.i4[0] > v1.i4[0], v0.i4[1] > v1.i4[1], v0.i4[2] > v1.i4[2], v0.i4[3] > v1.i4[3]);
+}
+
+// Lane-wise integer arithmetic.
+inline Scalar4i operator+(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return Scalar4i(v0.i4[0] + v1.i4[0], v0.i4[1] + v1.i4[1], v0.i4[2] + v1.i4[2], v0.i4[3] + v1.i4[3]);
+}
+
+inline Scalar4i operator-(const Scalar4i& v)
+{
+ return Scalar4i(-v.i4[0], -v.i4[1], -v.i4[2], -v.i4[3]);
+}
+
+inline Scalar4i operator-(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return Scalar4i(v0.i4[0] - v1.i4[0], v0.i4[1] - v1.i4[1], v0.i4[2] - v1.i4[2], v0.i4[3] - v1.i4[3]);
+}
+
+} // namespace simdi
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Reinterpret a float vector as an integer vector (no conversion; in the
+// scalar back-end Scalar4i is a typedef of Scalar4f, so this is the identity).
+inline Scalar4i simd4i(const Scalar4f& v)
+{
+ return v;
+}
+
+// Numeric float -> int conversion, truncating toward zero.
+inline Scalar4i truncate(const Scalar4f& v)
+{
+ return Scalar4i(int(v.f4[0]), int(v.f4[1]), int(v.f4[2]), int(v.f4[3]));
+}
+
+namespace simdi
+{
+
+// View the vector as an array of 4 ints (reference into v's storage).
+inline int (&array(Scalar4i& v))[4]
+{
+ return v.i4;
+}
+
+inline const int (&array(const Scalar4i& v))[4]
+{
+ return v.i4;
+}
+
+} // namespace simdi
+
+// Store 4 ints; no alignment requirement in the scalar back-end.
+inline void store(int* ptr, const Scalar4i& v)
+{
+ ptr[0] = v.i4[0];
+ ptr[1] = v.i4[1];
+ ptr[2] = v.i4[2];
+ ptr[3] = v.i4[3];
+}
+
+inline void storeAligned(int* ptr, const Scalar4i& v)
+{
+ store(ptr, v);
+}
+
+// Aligned store at a byte offset from ptr.
+inline void storeAligned(int* ptr, unsigned int offset, const Scalar4i& v)
+{
+ store(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+namespace simdi
+{
+
+// Integer comparison reductions. Overloads taking outMask also produce a
+// per-lane all-ones/all-zeros mask (via the Scalar4i bool constructor).
+inline int allEqual(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] == v1.i4[0] && v0.i4[1] == v1.i4[1] && v0.i4[2] == v1.i4[2] && v0.i4[3] == v1.i4[3];
+}
+
+inline int allEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3];
+ outMask = Scalar4i(b0, b1, b2, b3); // fixed: was Scalar4f, which only compiled because Scalar4i aliases it here
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] == v1.i4[0] || v0.i4[1] == v1.i4[1] || v0.i4[2] == v1.i4[2] || v0.i4[3] == v1.i4[3];
+}
+
+inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3];
+ outMask = Scalar4i(b0, b1, b2, b3); // fixed: was Scalar4f
+ return b0 || b1 || b2 || b3;
+}
+
+// Signed greater-than reductions.
+inline int allGreater(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] > v1.i4[0] && v0.i4[1] > v1.i4[1] && v0.i4[2] > v1.i4[2] && v0.i4[3] > v1.i4[3];
+}
+
+inline int allGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3];
+ outMask = Scalar4i(b0, b1, b2, b3); // fixed: was Scalar4f
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] > v1.i4[0] || v0.i4[1] > v1.i4[1] || v0.i4[2] > v1.i4[2] || v0.i4[3] > v1.i4[3];
+}
+
+inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3];
+ outMask = Scalar4i(b0, b1, b2, b3); // fixed: was Scalar4f
+ return b0 || b1 || b2 || b3;
+}
+
+} // namespace simdi
diff --git a/src/simd/scalar/SimdTypes.h b/src/simd/scalar/SimdTypes.h
new file mode 100644
index 0000000..d6b3e6b
--- /dev/null
+++ b/src/simd/scalar/SimdTypes.h
@@ -0,0 +1,107 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifdef PX_WIIU
+#pragma ghs nowarning 193 // warning #193-D: zero used for undefined preprocessing identifier
+#endif
+
+#ifdef _MSC_VER
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+#endif
+
+#include <algorithm>
+
+#ifdef PX_WIIU
+#pragma ghs endnowarning
+#endif
+
+// Scalar fallback for a 4-lane SIMD register: four 32-bit lanes viewable as
+// float, signed int, or unsigned int through the same storage, mirroring how
+// hardware SIMD registers are reinterpreted.
+union Scalar4f
+{
+ // Lanes intentionally left uninitialized, like a hardware register.
+ Scalar4f()
+ {
+ }
+
+ Scalar4f(float x, float y, float z, float w)
+ {
+ f4[0] = x;
+ f4[1] = y;
+ f4[2] = z;
+ f4[3] = w;
+ }
+
+ Scalar4f(int32_t x, int32_t y, int32_t z, int32_t w)
+ {
+ i4[0] = x;
+ i4[1] = y;
+ i4[2] = z;
+ i4[3] = w;
+ }
+
+ Scalar4f(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
+ {
+ u4[0] = x;
+ u4[1] = y;
+ u4[2] = z;
+ u4[3] = w;
+ }
+
+ // Build a per-lane boolean mask: true -> 0xffffffff, false -> 0x0.
+ // uint32_t(b) - 1 wraps to all-ones for false; the complement flips it.
+ Scalar4f(bool x, bool y, bool z, bool w)
+ {
+ u4[0] = ~(uint32_t(x) - 1);
+ u4[1] = ~(uint32_t(y) - 1);
+ u4[2] = ~(uint32_t(z) - 1);
+ u4[3] = ~(uint32_t(w) - 1);
+ }
+
+ // Copy through the integer view so exact lane bit patterns are preserved
+ // (lanes may hold comparison masks that are not valid floats; NOTE(review):
+ // presumably this avoids FP moves mangling such patterns - confirm).
+ Scalar4f(const Scalar4f& other)
+ {
+ u4[0] = other.u4[0];
+ u4[1] = other.u4[1];
+ u4[2] = other.u4[2];
+ u4[3] = other.u4[3];
+ }
+
+ Scalar4f& operator=(const Scalar4f& other)
+ {
+ u4[0] = other.u4[0];
+ u4[1] = other.u4[1];
+ u4[2] = other.u4[2];
+ u4[3] = other.u4[3];
+ return *this;
+ }
+
+ float f4[4];
+ int32_t i4[4];
+ uint32_t u4[4];
+};
+
+// The scalar back-end uses one storage type for both float and int vectors.
+typedef Scalar4f Scalar4i;
diff --git a/src/simd/sse2/Simd4f.h b/src/simd/sse2/Simd4f.h
new file mode 100644
index 0000000..983e16e
--- /dev/null
+++ b/src/simd/sse2/Simd4f.h
@@ -0,0 +1,452 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Splat a runtime float to all four lanes.
+template <>
+inline Simd4fFactory<const float&>::operator Simd4f() const
+{
+ return _mm_set1_ps(v);
+}
+
+// Reinterpret a prepared 4-tuple as a vector.
+inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
+{
+ return reinterpret_cast<const Simd4f&>(v);
+}
+
+// Constant zero via the cheap xor idiom.
+template <>
+inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const
+{
+ return _mm_setzero_ps();
+}
+
+// Constant one.
+template <>
+inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
+{
+ return _mm_set1_ps(1.0f);
+}
+
+// Sign-bit mask in every lane.
+template <>
+inline Simd4fFactory<detail::IntType<int(0x80000000)> >::operator Simd4f() const
+{
+ return _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+}
+
+// All-ones mask in every lane.
+template <>
+inline Simd4fFactory<detail::IntType<int(0xffffffff)> >::operator Simd4f() const
+{
+ return _mm_castsi128_ps(_mm_set1_epi32(-1));
+}
+
+// Unaligned load.
+template <>
+inline Simd4fFactory<const float*>::operator Simd4f() const
+{
+ return _mm_loadu_ps(v);
+}
+
+// Aligned load; pointer must be 16-byte aligned.
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
+{
+ return _mm_load_ps(v.ptr);
+}
+
+// Aligned load from base pointer plus byte offset.
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
+{
+ return _mm_load_ps(reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Materialize a lazy complement: ~v as a full vector.
+template <>
+inline ComplementExpr<Simd4f>::operator Simd4f() const
+{
+ return _mm_andnot_ps(v, _mm_castsi128_ps(_mm_set1_epi32(-1)));
+}
+
+// Fused forms: ~a & b collapses into a single andnot instruction.
+// 'inline' added: free functions defined in a header must be inline to avoid
+// multiple-definition (ODR) errors when included from several translation
+// units, matching the scalar back-end.
+inline Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+ return _mm_andnot_ps(complement.v, v);
+}
+
+inline Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+ return _mm_andnot_ps(complement.v, v);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Lane-wise comparisons; each result lane is all-ones (true) or all-zeros
+// (false). All functions marked 'inline' so this header can be included from
+// multiple translation units without ODR violations (matches the scalar back-end).
+inline Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmpeq_ps(v0, v1);
+}
+
+inline Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmplt_ps(v0, v1);
+}
+
+inline Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmple_ps(v0, v1);
+}
+
+inline Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmpgt_ps(v0, v1);
+}
+
+inline Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmpge_ps(v0, v1);
+}
+
+// Lazy complement; pairs with operator& above to fuse into one andnot.
+inline ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+ return ComplementExpr<Simd4f>(v);
+}
+
+inline Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_and_ps(v0, v1);
+}
+
+inline Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_or_ps(v0, v1);
+}
+
+inline Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_xor_ps(v0, v1);
+}
+
+// Shifts operate on the 32-bit integer bit pattern of each lane.
+inline Simd4f operator<<(const Simd4f& v, int shift)
+{
+ return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(v), shift));
+}
+
+// Note: logical (unsigned) right shift, not arithmetic.
+inline Simd4f operator>>(const Simd4f& v, int shift)
+{
+ return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(v), shift));
+}
+
+inline Simd4f operator+(const Simd4f& v)
+{
+ return v;
+}
+
+inline Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_add_ps(v0, v1);
+}
+
+inline Simd4f operator-(const Simd4f& v)
+{
+ return _mm_sub_ps(_mm_setzero_ps(), v);
+}
+
+inline Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_sub_ps(v0, v1);
+}
+
+inline Simd4f operator*(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_mul_ps(v0, v1);
+}
+
+inline Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_div_ps(v0, v1);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Reinterpret integer lanes as float bit patterns (no numeric conversion).
+// 'inline' added to the non-template free functions for header-safe linkage (ODR).
+inline Simd4f simd4f(const Simd4i& v)
+{
+ return _mm_castsi128_ps(v);
+}
+
+// Numeric int -> float conversion of each lane.
+inline Simd4f convert(const Simd4i& v)
+{
+ return _mm_cvtepi32_ps(v);
+}
+
+// View the vector as an array of 4 floats (reference into v's storage).
+inline float (&array(Simd4f& v))[4]
+{
+ return reinterpret_cast<float(&)[4]>(v);
+}
+
+inline const float (&array(const Simd4f& v))[4]
+{
+ return reinterpret_cast<const float(&)[4]>(v);
+}
+
+// Unaligned 4-float store.
+inline void store(float* ptr, Simd4f const& v)
+{
+ _mm_storeu_ps(ptr, v);
+}
+
+// Aligned store; ptr must be 16-byte aligned.
+inline void storeAligned(float* ptr, Simd4f const& v)
+{
+ _mm_store_ps(ptr, v);
+}
+
+// Aligned store at a byte offset; ptr + offset must be 16-byte aligned.
+inline void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+ _mm_store_ps(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+// Broadcast lane i to all four lanes.
+template <size_t i>
+inline Simd4f splat(Simd4f const& v)
+{
+ return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
+}
+
+// Bitwise select: mask bits set -> v0, clear -> v1.
+inline Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+ return _mm_xor_ps(v1, _mm_and_ps(mask, _mm_xor_ps(v1, v0)));
+}
+
+// Clear the sign bit of each lane.
+inline Simd4f abs(const Simd4f& v)
+{
+ return _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), v);
+}
+
+// Round each lane toward negative infinity.
+inline Simd4f floor(const Simd4f& v)
+{
+ // SSE 4.1: return _mm_floor_ps(v);
+ // Truncate toward zero, then subtract 1 from lanes where truncation rounded
+ // up (cmpgt yields all-ones there; srli by 31 turns that into integer 1).
+ Simd4i i = _mm_cvttps_epi32(v);
+ Simd4i s = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(i), v));
+ return _mm_cvtepi32_ps(_mm_sub_epi32(i, _mm_srli_epi32(s, 31)));
+}
+
+// Lane-wise maximum. 'inline' added for header-safe linkage (ODR).
+inline Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_max_ps(v0, v1);
+}
+
+// Lane-wise minimum.
+inline Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_min_ps(v0, v1);
+}
+
+// Fast reciprocal approximation (hardware rcp, ~12 bits of precision).
+inline Simd4f recip(const Simd4f& v)
+{
+ return _mm_rcp_ps(v);
+}
+
+// Reciprocal refined by n Newton-Raphson iterations: r' = r * (2 - v * r).
+template <int n>
+inline Simd4f recip(const Simd4f& v)
+{
+ Simd4f two = simd4f(2.0f);
+ Simd4f recipV = recip(v);
+ for(int i = 0; i < n; ++i)
+ recipV = recipV * (two - v * recipV);
+ return recipV;
+}
+
+// Lane-wise square root (full precision).
+inline Simd4f sqrt(const Simd4f& v)
+{
+ return _mm_sqrt_ps(v);
+}
+
+// Fast reciprocal square root approximation (hardware rsqrt, ~12 bits).
+inline Simd4f rsqrt(const Simd4f& v)
+{
+ return _mm_rsqrt_ps(v);
+}
+
+// rsqrt refined by n Newton-Raphson iterations: r' = r * (1.5 - v/2 * r * r).
+template <int n>
+inline Simd4f rsqrt(const Simd4f& v)
+{
+ Simd4f halfV = v * simd4f(0.5f);
+ Simd4f threeHalf = simd4f(1.5f);
+ Simd4f rsqrtV = rsqrt(v);
+ for(int i = 0; i < n; ++i)
+ rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV);
+ return rsqrtV;
+}
+
+// Lane-wise 2^x via a rational polynomial approximation (Cephes).
+// 'inline' added for header-safe linkage (ODR); arithmetic unchanged.
+inline Simd4f exp2(const Simd4f& v)
+{
+ // http://www.netlib.org/cephes/
+
+ // clamp so the final exponent construction below stays in float range
+ Simd4f limit = simd4f(127.4999f);
+ Simd4f x = min(max(-limit, v), limit);
+
+ // separate into integer and fractional part
+
+ Simd4f fx = x + simd4f(0.5f);
+ Simd4i ix = _mm_sub_epi32(_mm_cvttps_epi32(fx), _mm_srli_epi32(_mm_castps_si128(fx), 31));
+ fx = x - Simd4f(_mm_cvtepi32_ps(ix));
+
+ // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
+
+ Simd4f fx2 = fx * fx;
+
+ Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
+ fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
+ Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);
+
+ Simd4f exp2fx = px * recip(qx - px);
+ exp2fx = simd4f(_1) + exp2fx + exp2fx;
+
+ // exp2(ix): build the float 2^ix directly in the IEEE exponent bits
+
+ Simd4f exp2ix = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix, _mm_set1_epi32(0x7f)), 23));
+
+ return exp2fx * exp2ix;
+}
+
+// Lane-wise log2(x) = ln(x) * (1/ln 2), computed with scalar logf per lane.
+inline Simd4f log2(const Simd4f& v)
+{
+ // todo: fast approximate implementation like exp2
+ Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
+ const float* ptr = array(v);
+ return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale;
+}
+
+// 3-component dot product, broadcast to all four lanes; w lanes are ignored.
+// 'inline' added for header-safe linkage (ODR).
+inline Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+ Simd4f tmp = v0 * v1;
+ return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+// 3-component cross product; the result's w lane is v0.w*v1.w - v0.w*v1.w
+// (0 for finite inputs).
+inline Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+ Simd4f t0 = _mm_shuffle_ps(v0, v0, 0xc9); // w z y x -> w x z y
+ Simd4f t1 = _mm_shuffle_ps(v1, v1, 0xc9);
+ Simd4f tmp = v0 * t1 - t0 * v1;
+ return _mm_shuffle_ps(tmp, tmp, 0xc9);
+}
+
+// In-place 4x4 transpose of the four row vectors.
+inline void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+ _MM_TRANSPOSE4_PS(x, y, z, w);
+}
+
+// Interleave lanes: (a0 a1 a2 a3),(b0 b1 b2 b3) -> (a0 b0 a1 b1),(a2 b2 a3 b3).
+inline void zip(Simd4f& v0, Simd4f& v1)
+{
+ Simd4f t0 = v0;
+ v0 = _mm_unpacklo_ps(v0, v1);
+ v1 = _mm_unpackhi_ps(t0, v1);
+}
+
+// Inverse of zip: even lanes of the pair into v0, odd lanes into v1.
+inline void unzip(Simd4f& v0, Simd4f& v1)
+{
+ Simd4f t0 = v0;
+ v0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2, 0, 2, 0));
+ v1 = _mm_shuffle_ps(t0, v1, _MM_SHUFFLE(3, 1, 3, 1));
+}
+
+// Swap the low (xy) and high (zw) halves.
+inline Simd4f swaphilo(const Simd4f& v)
+{
+ return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2));
+}
+
+// Comparison reductions built on allTrue/anyTrue (declared in the generic
+// Simd4f.h header). 'inline' added for header-safe linkage (ODR).
+inline int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 == v1);
+}
+
+inline int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 == v1);
+}
+
+inline int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 == v1);
+}
+
+inline int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 == v1);
+}
+
+inline int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 > v1);
+}
+
+inline int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 > v1);
+}
+
+inline int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 > v1);
+}
+
+inline int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 > v1);
+}
+
+inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 >= v1);
+}
+
+inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 >= v1);
+}
+
+inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 >= v1);
+}
+
+inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 >= v1);
+}
+
+// movemask gathers the sign bit of each lane; 0xf means all four lanes set.
+inline int allTrue(const Simd4f& v)
+{
+ return _mm_movemask_ps(v) == 0xf;
+}
+
+// Nonzero when any lane's sign bit is set.
+inline int anyTrue(const Simd4f& v)
+{
+ return _mm_movemask_ps(v);
+}
diff --git a/src/simd/sse2/Simd4i.h b/src/simd/sse2/Simd4i.h
new file mode 100644
index 0000000..1843bfc
--- /dev/null
+++ b/src/simd/sse2/Simd4i.h
@@ -0,0 +1,259 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Splat a runtime int to all four lanes.
+template <>
+inline Simd4iFactory<const int&>::operator Simd4i() const
+{
+ return _mm_set1_epi32(v);
+}
+
+// Reinterpret a prepared 4-tuple as a vector.
+inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
+{
+ return reinterpret_cast<const Simd4i&>(v);
+}
+
+// Splat a compile-time constant.
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
+{
+ return _mm_set1_epi32(i);
+}
+
+// Zero gets the cheaper xor-based idiom.
+template <>
+inline Simd4iFactory<detail::IntType<0> >::operator Simd4i() const
+{
+ return _mm_setzero_si128();
+}
+
+// Unaligned load.
+template <>
+inline Simd4iFactory<const int*>::operator Simd4i() const
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(v));
+}
+
+// Aligned load; pointer must be 16-byte aligned.
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
+{
+ return _mm_load_si128(reinterpret_cast<const __m128i*>(v.ptr));
+}
+
+// Aligned load from base pointer plus byte offset.
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
+{
+ return _mm_load_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Materialize a lazy complement: ~v as a full vector.
+template <>
+inline ComplementExpr<Simd4i>::operator Simd4i() const
+{
+ // -1 is the all-ones pattern; avoids narrowing the unsigned literal
+ // 0xffffffff into int and matches the Simd4f variant of this operator.
+ return _mm_andnot_si128(v, _mm_set1_epi32(-1));
+}
+
+// Fused forms: ~a & b collapses into a single andnot instruction.
+// 'inline' added for header-safe linkage (ODR), matching the scalar back-end.
+inline Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v)
+{
+ return _mm_andnot_si128(complement.v, v);
+}
+
+inline Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement)
+{
+ return _mm_andnot_si128(complement.v, v);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Lane-wise integer comparisons; result lanes are all-ones/all-zeros masks.
+// 'inline' added for header-safe linkage (ODR).
+inline Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_cmpeq_epi32(v0, v1);
+}
+
+// Signed lane-wise less-than.
+inline Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_cmplt_epi32(v0, v1);
+}
+
+// Signed lane-wise greater-than.
+inline Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_cmpgt_epi32(v0, v1);
+}
+
+// Lazy complement; fuses with operator& into a single andnot.
+inline ComplementExpr<Simd4i> operator~(const Simd4i& v)
+{
+ return ComplementExpr<Simd4i>(v);
+}
+
+inline Simd4i operator&(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_and_si128(v0, v1);
+}
+
+inline Simd4i operator|(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_or_si128(v0, v1);
+}
+
+inline Simd4i operator^(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_xor_si128(v0, v1);
+}
+
+inline Simd4i operator<<(const Simd4i& v, int shift)
+{
+ return _mm_slli_epi32(v, shift);
+}
+
+// Note: logical (unsigned) right shift, matching the Simd4f overload.
+inline Simd4i operator>>(const Simd4i& v, int shift)
+{
+ return _mm_srli_epi32(v, shift);
+}
+
+inline Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_add_epi32(v0, v1);
+}
+
+inline Simd4i simdi::operator-(const Simd4i& v)
+{
+ return _mm_sub_epi32(_mm_setzero_si128(), v);
+}
+
+inline Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_sub_epi32(v0, v1);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Reinterpret float bit patterns as integers (no numeric conversion).
+// 'inline' added to non-template free functions for header-safe linkage (ODR).
+inline Simd4i simd4i(const Simd4f& v)
+{
+ return _mm_castps_si128(v);
+}
+
+// Numeric float -> int conversion, truncating toward zero.
+inline Simd4i truncate(const Simd4f& v)
+{
+ return _mm_cvttps_epi32(v);
+}
+
+// View the vector as an array of 4 ints (reference into v's storage).
+inline int (&simdi::array(Simd4i& v))[4]
+{
+ return reinterpret_cast<int(&)[4]>(v);
+}
+
+inline const int (&simdi::array(const Simd4i& v))[4]
+{
+ return reinterpret_cast<const int(&)[4]>(v);
+}
+
+// Unaligned store.
+inline void store(int* ptr, const Simd4i& v)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), v);
+}
+
+// Aligned store; ptr must be 16-byte aligned.
+inline void storeAligned(int* ptr, const Simd4i& v)
+{
+ _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v);
+}
+
+// Aligned store at a byte offset; ptr + offset must be 16-byte aligned.
+inline void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+ _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+// Broadcast lane i to all four lanes.
+template <size_t i>
+inline Simd4i splat(const Simd4i& v)
+{
+ return _mm_shuffle_epi32(v, _MM_SHUFFLE(i, i, i, i));
+}
+
+// Bitwise select: mask bits set -> v0, clear -> v1.
+inline Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_xor_si128(v1, _mm_and_si128(mask, _mm_xor_si128(v1, v0)));
+}
+
+// Reductions over lane masks produced by the simdi comparison operators.
+// 'inline' added for header-safe linkage (ODR).
+inline int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ return allTrue(simdi::operator==(v0, v1));
+}
+
+inline int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return allTrue(outMask = simdi::operator==(v0, v1));
+}
+
+inline int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ return anyTrue(simdi::operator==(v0, v1));
+}
+
+inline int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return anyTrue(outMask = simdi::operator==(v0, v1));
+}
+
+inline int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ return allTrue(simdi::operator>(v0, v1));
+}
+
+inline int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return allTrue(outMask = simdi::operator>(v0, v1));
+}
+
+inline int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ return anyTrue(simdi::operator>(v0, v1));
+}
+
+inline int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return anyTrue(outMask = simdi::operator>(v0, v1));
+}
+
+// movemask gathers the sign bit of each lane; 0xf means all four lanes set.
+// 'inline' added for header-safe linkage (ODR).
+inline int allTrue(const Simd4i& v)
+{
+ return _mm_movemask_ps(_mm_castsi128_ps(v)) == 0xf;
+}
+
+// Nonzero when any lane's sign bit is set.
+inline int anyTrue(const Simd4i& v)
+{
+ return _mm_movemask_ps(_mm_castsi128_ps(v));
+}
diff --git a/src/simd/sse2/SimdTypes.h b/src/simd/sse2/SimdTypes.h
new file mode 100644
index 0000000..0c4a80a
--- /dev/null
+++ b/src/simd/sse2/SimdTypes.h
@@ -0,0 +1,86 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// SSE + SSE2 (don't include intrin.h!)
+#include <emmintrin.h>
+
+// On MSVC (non-Android) the raw SSE builtin types are used directly as
+// the SIMD vocabulary types. Other compilers get the thin wrapper
+// structs below instead.
+// NOTE(review): presumably the wrappers exist because gcc/clang define
+// built-in arithmetic on __m128/__m128i which would clash with this
+// library's operator overloads — confirm.
+#if (defined(_MSC_VER)) && (!defined(__ANDROID__))
+
+typedef __m128 Simd4f;
+typedef __m128i Simd4i;
+
+#else
+
+// Zero-overhead wrapper around __m128. Implicitly converts to and from
+// the builtin so intrinsic calls work unchanged.
+struct Simd4f
+{
+ // Deliberately leaves the register uninitialized (no cost on default
+ // construction, matching the raw __m128 behavior).
+ Simd4f()
+ {
+ }
+ Simd4f(__m128 x) : m128(x)
+ {
+ }
+
+ operator __m128&()
+ {
+ return m128;
+ }
+ operator const __m128&() const
+ {
+ return m128;
+ }
+
+ private:
+ __m128 m128;
+};
+
+// Zero-overhead wrapper around __m128i; same conventions as Simd4f.
+struct Simd4i
+{
+ // Deliberately leaves the register uninitialized.
+ Simd4i()
+ {
+ }
+ Simd4i(__m128i x) : m128i(x)
+ {
+ }
+
+ operator __m128i&()
+ {
+ return m128i;
+ }
+ operator const __m128i&() const
+ {
+ return m128i;
+ }
+
+ private:
+ __m128i m128i;
+};
+
+#endif
diff --git a/src/simd/xbox360/Simd4f.h b/src/simd/xbox360/Simd4f.h
new file mode 100644
index 0000000..5f63856
--- /dev/null
+++ b/src/simd/xbox360/Simd4f.h
@@ -0,0 +1,497 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Scalar broadcast: load the float and splat lane 0 into all lanes.
+template <>
+inline Simd4fFactory<const float&>::operator Simd4f() const
+{
+ return __vspltw(__lvlx(&v, 0), 0);
+}
+
+// Four explicit components: the tuple already has vector layout, so
+// just reinterpret it.
+inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
+{
+ return reinterpret_cast<const Simd4f&>(v);
+}
+
+// All-zero vector (0.0f in every lane).
+template <>
+inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const
+{
+ return __vspltisw(0);
+}
+
+// All-ones float vector. NOTE(review): relies on the D3DCOLOR unpack of
+// a zero input producing 1.0f in every lane — confirm against the
+// VMX128 __vupkd3d semantics.
+template <>
+inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
+{
+ return __vupkd3d(__vspltisw(0), VPACK_D3DCOLOR);
+}
+
+// Sign-bit mask: splat 0xffffffff, then shift each lane left by
+// (0xffffffff & 31) == 31, leaving only bit 31 set per lane.
+template <>
+inline Simd4fFactory<detail::IntType<0x80000000> >::operator Simd4f() const
+{
+ Simd4f mask = __vspltisw(-1);
+ return __vslw(mask, mask);
+}
+
+// All bits set in every lane.
+template <>
+inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Simd4f() const
+{
+ return __vspltisw(-1);
+}
+
+// Unaligned load: combine the left and right partial loads that
+// together cover the 16 bytes at v.
+template <>
+inline Simd4fFactory<const float*>::operator Simd4f() const
+{
+ return __vor(__lvlx(v, 0), __lvrx(v, 16));
+}
+
+// Aligned load (ptr must be 16-byte aligned).
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
+{
+ return __lvx(v.ptr, 0);
+}
+
+// Aligned load from ptr + byte offset; the sum must be 16-byte aligned.
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
+{
+ return __lvx(v.ptr, int(v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression templates
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Materialize ~v when no fusing operator consumed the expression:
+// vnor(v, v) == ~(v | v) == ~v.
+template <>
+ComplementExpr<Simd4f>::operator Simd4f() const
+{
+ return __vnor(v, v);
+}
+
+// (~a) & b fused into a single and-with-complement instruction.
+Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+ return __vandc(v, complement.v);
+}
+
+// a & (~b), same vandc fusion (vandc computes first & ~second).
+Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+ return __vandc(v, complement.v);
+}
+
+// Materialize a product that no +/- operator fused.
+ProductExpr::operator Simd4f() const
+{
+ return __vmulfp(v0, v1);
+}
+
+// (a*b) + c fused into a single multiply-add.
+Simd4f operator+(const ProductExpr& p, const Simd4f& v)
+{
+ return __vmaddfp(p.v0, p.v1, v);
+}
+
+// c + (a*b), same vmaddfp fusion.
+Simd4f operator+(const Simd4f& v, const ProductExpr& p)
+{
+ return __vmaddfp(p.v0, p.v1, v);
+}
+
+// (a*b) + (c*d): p0 is materialized (vmulfp via its conversion
+// operator) and p1 is fused on top of it.
+Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
+{
+ return __vmaddfp(p1.v0, p1.v1, p0);
+}
+
+// c - (a*b) fused via negative-multiply-subtract: vnmsubfp(a,b,c)
+// computes -(a*b - c) == c - a*b.
+Simd4f operator-(const Simd4f& v, const ProductExpr& p)
+{
+ return __vnmsubfp(p.v0, p.v1, v);
+}
+
+// (a*b) - (c*d): p0 materialized, p1 subtracted via vnmsubfp.
+Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
+{
+ return __vnmsubfp(p1.v0, p1.v1, p0);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Lanewise comparisons return a mask vector: all-ones in lanes where
+// the predicate holds, all-zeros elsewhere.
+Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vcmpeqfp(v0, v1);
+}
+
+// v0 < v1 expressed as v1 > v0 (VMX only has gt/ge compares).
+Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vcmpgtfp(v1, v0);
+}
+
+// v0 <= v1 expressed as v1 >= v0.
+Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vcmpgefp(v1, v0);
+}
+
+Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vcmpgtfp(v0, v1);
+}
+
+Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vcmpgefp(v0, v1);
+}
+
+// Lazy complement: returns an expression object so that ~a & b can be
+// fused into a single vandc by the operators above.
+ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+ return ComplementExpr<Simd4f>(v);
+}
+
+Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vand(v0, v1);
+}
+
+Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vor(v0, v1);
+}
+
+Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vxor(v0, v1);
+}
+
+// Bit shifts treat each lane as a 32-bit integer; the scalar shift
+// count is splatted to all lanes first.
+Simd4f operator<<(const Simd4f& v, int shift)
+{
+ return __vslw(v, __vspltw(__lvlx(&shift, 0), 0));
+}
+
+Simd4f operator>>(const Simd4f& v, int shift)
+{
+ return __vsrw(v, __vspltw(__lvlx(&shift, 0), 0));
+}
+
+// Per-lane shift counts taken from the corresponding lane of 'shift'.
+Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
+{
+ return __vslw(v, shift);
+}
+
+Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
+{
+ return __vsrw(v, shift);
+}
+
+// Unary plus: identity.
+Simd4f operator+(const Simd4f& v)
+{
+ return v;
+}
+
+Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vaddfp(v0, v1);
+}
+
+// Negation by flipping each lane's sign bit with the sign mask.
+Simd4f operator-(const Simd4f& v)
+{
+ return __vxor(v, simd4f(_sign));
+}
+
+Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vsubfp(v0, v1);
+}
+
+// Returns a lazy product so that a*b + c can fuse into vmaddfp.
+ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
+{
+ return ProductExpr(v0, v1);
+}
+
+// Approximate division: multiplies by the hardware reciprocal
+// *estimate* — not IEEE-accurate. Use recip<n>() callers if more
+// precision is required.
+Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vmulfp(v0, __vrefp(v1)); // reciprocal estimate
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Bitwise reinterpretation: on Xbox 360 both types are the same VMX128
+// register type, so this is a no-op.
+Simd4f simd4f(const Simd4i& v)
+{
+ return v;
+}
+
+// Numeric conversion of signed 32-bit integer lanes to float (scale
+// exponent 0).
+Simd4f convert(const Simd4i& v)
+{
+ return __vcfsx(v, 0);
+}
+
+// Mutable view of the four float lanes as a plain array.
+float (&array(Simd4f& v))[4]
+{
+ return v.vector4_f32;
+}
+
+// Read-only view of the four float lanes as a plain array.
+const float (&array(const Simd4f& v))[4]
+{
+ return v.vector4_f32;
+}
+
+// Unaligned store: the left/right partial stores together cover the
+// 16 bytes at ptr.
+void store(float* ptr, Simd4f const& v)
+{
+ __stvlx(v, ptr, 0);
+ __stvrx(v, ptr, 16);
+}
+
+// Aligned store; ptr must be 16-byte aligned (a single left-store then
+// writes the full vector).
+void storeAligned(float* ptr, Simd4f const& v)
+{
+ __stvlx(v, ptr, 0);
+}
+
+// Aligned store at ptr + byte offset; the sum must be 16-byte aligned.
+void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+ __stvlx(v, ptr, int(offset));
+}
+
+// Broadcasts lane i into all four lanes; i must be in [0, 3].
+template <size_t i>
+Simd4f splat(Simd4f const& v)
+{
+ return __vspltw(v, i);
+}
+
+// Bitwise blend: picks v0 bits where mask bits are set, v1 elsewhere.
+Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+ return __vsel(v1, v0, mask);
+}
+
+// Absolute value by clearing each lane's sign bit (and-with-complement
+// of the sign mask).
+Simd4f abs(const Simd4f& v)
+{
+ return __vandc(v, simd4f(_sign));
+}
+
+// Round each lane toward negative infinity.
+Simd4f floor(const Simd4f& v)
+{
+ return __vrfim(v);
+}
+
+Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vmaxfp(v0, v1);
+}
+
+Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vminfp(v0, v1);
+}
+
+// Hardware reciprocal estimate (low precision; see recip<n> below for
+// refined results).
+Simd4f recip(const Simd4f& v)
+{
+ return __vrefp(v);
+}
+
+// Reciprocal refined with n Newton-Raphson iterations:
+// r' = r * (2 - v*r). Each iteration roughly doubles the precision.
+template <int n>
+Simd4f recip(const Simd4f& v)
+{
+ Simd4f two = simd4f(2.0f);
+ Simd4f recipV = recip(v);
+ for(int i = 0; i < n; ++i)
+ recipV = recipV * (two - v * recipV);
+ return recipV;
+}
+
+// Square root computed as v * rsqrt-estimate(v).
+// NOTE(review): for v == 0 this evaluates 0 * inf == NaN rather than 0
+// — confirm callers never pass zero lanes.
+Simd4f sqrt(const Simd4f& v)
+{
+ return __vmulfp(v, __vrsqrtefp(v));
+}
+
+// Hardware reciprocal square root estimate (low precision).
+Simd4f rsqrt(const Simd4f& v)
+{
+ return __vrsqrtefp(v);
+}
+
+// Rsqrt refined with n Newton-Raphson iterations:
+// r' = r * (1.5 - v/2 * r * r).
+template <int n>
+Simd4f rsqrt(const Simd4f& v)
+{
+ Simd4f halfV = v * simd4f(0.5f);
+ Simd4f threeHalf = simd4f(1.5f);
+ Simd4f rsqrtV = rsqrt(v);
+ for(int i = 0; i < n; ++i)
+ rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV);
+ return rsqrtV;
+}
+
+// 2^x per lane — hardware *estimate*, not a precise exp2.
+Simd4f exp2(const Simd4f& v)
+{
+ return __vexptefp(v);
+}
+
+// log2(x) per lane — hardware *estimate*, not a precise log2.
+Simd4f log2(const Simd4f& v)
+{
+ return __vlogefp(v);
+}
+
+// 3-component dot product (w lanes ignored) via the fused msum3
+// instruction. NOTE(review): presumably the result is replicated across
+// all lanes, as __vmsum3fp documents — confirm callers rely on that.
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+ return __vmsum3fp(v0, v1);
+}
+
+// 3-component cross product. Uses the rotate permute 0x63 (x y z w ->
+// y z x w) and one nmsub so only a single final permute is needed:
+// cross = rotate(v0*rotate(v1) - rotate(v0)*v1).
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+ Simd4f t0 = __vpermwi(v0, 0x63); // x y z w -> y z x w
+ Simd4f t1 = __vpermwi(v1, 0x63);
+ Simd4f tmp = __vnmsubfp(t0, v1, __vmulfp(v0, t1));
+ return __vpermwi(tmp, 0x63);
+}
+
+// In-place 4x4 transpose of the matrix whose rows are x, y, z, w,
+// built from merge-high/merge-low interleaves.
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+ Simd4f v0 = __vmrglw(x, z);
+ Simd4f v1 = __vmrghw(x, z);
+ Simd4f v2 = __vmrglw(y, w);
+ Simd4f v3 = __vmrghw(y, w);
+ x = __vmrghw(v1, v3);
+ y = __vmrglw(v1, v3);
+ z = __vmrghw(v0, v2);
+ w = __vmrglw(v0, v2);
+}
+
+// In-place interleave of the lanes of v0 and v1 (inverse of unzip).
+void zip(Simd4f& v0, Simd4f& v1)
+{
+ Simd4f t0 = v0;
+ v0 = __vmrglw(v0, v1);
+ v1 = __vmrghw(t0, v1);
+}
+
+// In-place de-interleave of the lanes of v0 and v1 (inverse of zip).
+void unzip(Simd4f& v0, Simd4f& v1)
+{
+ Simd4f t0 = __vmrglw(v0, v1); // v0.x, v1.x, v0.y, v1.y
+ Simd4f t1 = __vmrghw(v0, v1); // v0.z, v1.z, v0.w, v1.w
+ v0 = __vmrglw(t0, t1); // v0.x, v0.z, v1.x, v1.z
+ v1 = __vmrghw(t0, t1); // v0.y, v0.w, v1.y, v1.w
+}
+
+// Swaps the two 64-bit halves of the vector.
+Simd4f swaphilo(const Simd4f& v)
+{
+ return __vpermwi(v, 0xa1); // x y z w -> z w x y
+}
+
+// The record-form compares (__vcmp*R) also set condition bits in
+// 'control': 0x80 == predicate true in ALL lanes, 0x20 == predicate
+// false in ALL lanes. "any" is therefore expressed as NOT all-false.
+
+// Nonzero iff every lane of v0 equals the matching lane of v1.
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ unsigned int control;
+ __vcmpeqfpR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// As above, and also writes the per-lane equality mask to outMask.
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpeqfpR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// Nonzero iff at least one lane of v0 equals the matching lane of v1.
+int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ unsigned int control;
+ __vcmpeqfpR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// As above, and also writes the per-lane equality mask to outMask.
+int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpeqfpR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// Nonzero iff every lane of v0 is greater than the matching lane of v1.
+int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ unsigned int control;
+ __vcmpgtfpR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// As above, and also writes the per-lane mask to outMask.
+int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpgtfpR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// Nonzero iff at least one lane of v0 is greater than v1's.
+int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ unsigned int control;
+ __vcmpgtfpR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// As above, and also writes the per-lane mask to outMask.
+int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpgtfpR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// Nonzero iff every lane of v0 is greater than or equal to v1's.
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ unsigned int control;
+ __vcmpgefpR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// As above, and also writes the per-lane mask to outMask.
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpgefpR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// Nonzero iff at least one lane of v0 is >= v1's.
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ unsigned int control;
+ __vcmpgefpR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// As above, and also writes the per-lane mask to outMask.
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpgefpR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// Truth tests for comparison masks. A "true" lane (0xffffffff) is a
+// negative NaN when read as float, so v >= 0 is FALSE for true lanes
+// and TRUE for false (0x00000000 == 0.0f) lanes. Hence:
+// all lanes true <=> compare all-false (bit 0x20). Only valid for
+// canonical all-ones/all-zeros masks.
+int allTrue(const Simd4f& v)
+{
+ unsigned int control;
+ __vcmpgefpR(v, simd4f(_0), &control);
+ return int(0x20 & control); // all false
+}
+
+// Any lane true <=> compare not all-true (bit 0x80 clear).
+int anyTrue(const Simd4f& v)
+{
+ unsigned int control;
+ __vcmpgefpR(v, simd4f(_0), &control);
+ return int(0x80 & ~control); // not all true
+}
diff --git a/src/simd/xbox360/Simd4i.h b/src/simd/xbox360/Simd4i.h
new file mode 100644
index 0000000..004c06b
--- /dev/null
+++ b/src/simd/xbox360/Simd4i.h
@@ -0,0 +1,206 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Scalar broadcast: load the int and splat lane 0 into all lanes.
+template <>
+inline Simd4iFactory<const int&>::operator Simd4i() const
+{
+ return __vspltw(__lvlx(&v, 0), 0);
+}
+
+// Four explicit components: reinterpret the tuple's vector layout.
+inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
+{
+ return reinterpret_cast<const Simd4i&>(v);
+}
+
+// Compile-time constant splat.
+// NOTE(review): __vspltisw takes a 5-bit signed immediate, so this is
+// only valid for i in [-16, 15]; larger constants need their own
+// specialization (as 0x80000000 has below) — confirm no other values
+// are instantiated.
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
+{
+ return __vspltisw(i);
+}
+
+// Sign-bit mask: splat 0xffffffff, then shift each lane left by
+// (0xffffffff & 31) == 31. (Simd4f and Simd4i are the same register
+// type on this platform, so the mixed types are harmless.)
+template <>
+inline Simd4iFactory<detail::IntType<0x80000000> >::operator Simd4i() const
+{
+ Simd4f mask = __vspltisw(-1);
+ return __vslw(mask, mask);
+}
+
+// Unaligned load: left/right partial loads together cover the 16 bytes.
+template <>
+inline Simd4iFactory<const int*>::operator Simd4i() const
+{
+ return __vor(__lvlx(v, 0), __lvrx(v, 16));
+}
+
+// Aligned load (ptr must be 16-byte aligned).
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
+{
+ return __lvx(v.ptr, 0);
+}
+
+// Aligned load from ptr + byte offset; the sum must be 16-byte aligned.
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
+{
+ return __lvx(v.ptr, int(v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Lanewise equality; returns an all-ones/all-zeros mask per lane.
+Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1)
+{
+ return __vcmpequw(v0, v1);
+}
+
+// Signed v0 < v1, expressed as v1 > v0 (VMX only has greater-than).
+Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1)
+{
+ return __vcmpgtsw(v0, v1);
+}
+
+// Signed lanewise greater-than.
+Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1)
+{
+ return __vcmpgtsw(v0, v1);
+}
+
+// Lanewise add; the "m" form wraps modulo 2^32 (no saturation).
+Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1)
+{
+ return __vadduwm(v0, v1);
+}
+
+// Unary negation: 0 - v per lane.
+Simd4i simdi::operator-(const Simd4i& v)
+{
+ return __vsubuwm(__vspltisw(0), v);
+}
+
+// Lanewise subtract, wrapping modulo 2^32.
+Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1)
+{
+ return __vsubuwm(v0, v1);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Bitwise reinterpretation: both types are the same VMX128 register
+// type on this platform, so this is a no-op.
+Simd4i simd4i(const Simd4f& v)
+{
+ return v;
+}
+
+// Round each lane toward zero.
+// NOTE(review): __vrfiz yields integral-valued *floats*, not
+// two's-complement integer lanes; if callers expect true integers (as
+// an int-convert would give), this should be __vctsxs — confirm
+// against the other platform implementations.
+Simd4i truncate(const Simd4f& v)
+{
+ return __vrfiz(v);
+}
+
+// Mutable view of the four integer lanes as a plain int array.
+int (&simdi::array(Simd4i& v))[4]
+{
+ return reinterpret_cast<int(&)[4]>(v.vector4_u32);
+}
+
+// Read-only view of the four integer lanes as a plain int array.
+const int (&simdi::array(const Simd4i& v))[4]
+{
+ return reinterpret_cast<const int(&)[4]>(v.vector4_u32);
+}
+
+// Unaligned store: left/right partial stores cover the 16 bytes at ptr.
+void store(int* ptr, const Simd4i& v)
+{
+ __stvlx(v, ptr, 0);
+ __stvrx(v, ptr, 16);
+}
+
+// Aligned store; ptr must be 16-byte aligned.
+void storeAligned(int* ptr, const Simd4i& v)
+{
+ __stvlx(v, ptr, 0);
+}
+
+// Aligned store at ptr + byte offset; the sum must be 16-byte aligned.
+void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+ __stvlx(v, ptr, int(offset));
+}
+
+// Record-form compares set condition bits in 'control': 0x80 == all
+// lanes true, 0x20 == all lanes false; "any" == NOT all-false.
+
+// Nonzero iff every lane of v0 equals the matching lane of v1.
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ unsigned int control;
+ __vcmpequwR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// As above, and also writes the per-lane equality mask to outMask.
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpequwR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// Nonzero iff at least one lane of v0 equals the matching lane of v1.
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ unsigned int control;
+ __vcmpequwR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// As above, and also writes the per-lane equality mask to outMask.
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpequwR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// Nonzero iff every lane of v0 is signed-greater than v1's.
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ unsigned int control;
+ __vcmpgtswR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// As above, and also writes the per-lane mask to outMask.
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpgtswR(v0, v1, &control);
+ return int(0x80 & control); // all true
+}
+
+// Nonzero iff at least one lane of v0 is signed-greater than v1's.
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ unsigned int control;
+ __vcmpgtswR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
+
+// As above, and also writes the per-lane mask to outMask.
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ unsigned int control;
+ outMask = __vcmpgtswR(v0, v1, &control);
+ return int(0x20 & ~control); // not all false
+}
diff --git a/src/simd/xbox360/SimdTypes.h b/src/simd/xbox360/SimdTypes.h
new file mode 100644
index 0000000..1dc28ba
--- /dev/null
+++ b/src/simd/xbox360/SimdTypes.h
@@ -0,0 +1,35 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2014 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <vectorintrinsics.h>
+
+// On Xbox 360 both SIMD vocabulary types map to the same VMX128
+// register type; the float-vs-integer lane interpretation is determined
+// solely by the intrinsics applied to a value.
+typedef __vector4 Simd4f;
+typedef __vector4 Simd4i;