diff options
| author | git perforce import user <a@b> | 2016-10-25 12:29:14 -0600 |
|---|---|---|
| committer | Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> | 2016-10-25 18:56:37 -0500 |
| commit | 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch) | |
| tree | fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/module/clothing/embedded/LowLevelCloth | |
| download | physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip | |
Initial commit:
PhysX 3.4.0 Update @ 21294896
APEX 1.4.0 Update @ 21275617
[CL 21300167]
Diffstat (limited to 'APEX_1.4/module/clothing/embedded/LowLevelCloth')
63 files changed, 14844 insertions, 0 deletions
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h new file mode 100644 index 00000000..6f24e51f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Range.h" +#include "PhaseConfig.h" + +struct ID3D11Buffer; + +namespace nvidia +{ +#if APEX_UE4 + namespace Cm + { + class Task; + } +#endif + +namespace cloth +{ + +class Factory; +class Fabric; +class Cloth; + +template <typename T> +struct MappedRange : public Range<T> +{ + MappedRange(T* first, T* last, const Cloth& cloth, void (Cloth::*lock)() const, void (Cloth::*unlock)() const) + : Range<T>(first, last), mCloth(cloth), mLock(lock), mUnlock(unlock) + { + } + + MappedRange(const MappedRange& other) + : Range<T>(other), mCloth(other.mCloth), mLock(other.mLock), mUnlock(other.mUnlock) + { + (mCloth.*mLock)(); + } + + ~MappedRange() + { + (mCloth.*mUnlock)(); + } + + private: + MappedRange& operator=(const MappedRange&); + + const Cloth& mCloth; + void (Cloth::*mLock)() const; + void (Cloth::*mUnlock)() const; +}; + +struct GpuParticles +{ + PxVec4* mCurrent; + PxVec4* mPrevious; + ID3D11Buffer* mBuffer; +}; + +// abstract cloth instance +class Cloth +{ + Cloth& operator=(const Cloth&); + + protected: + Cloth() + { + } + Cloth(const Cloth&) + { + } + + public: + virtual 
~Cloth() + { + } + + // same as factory.clone(*this) + virtual Cloth* clone(Factory& factory) const = 0; + + virtual Fabric& getFabric() const = 0; + virtual Factory& getFactory() const = 0; + + /* particle properties */ + + virtual uint32_t getNumParticles() const = 0; + virtual void lockParticles() const = 0; + virtual void unlockParticles() const = 0; + // return particle data for current and previous frame + // setting current invMass to zero locks particle. + virtual MappedRange<PxVec4> getCurrentParticles() = 0; + virtual MappedRange<const PxVec4> getCurrentParticles() const = 0; + virtual MappedRange<PxVec4> getPreviousParticles() = 0; + virtual MappedRange<const PxVec4> getPreviousParticles() const = 0; + virtual GpuParticles getGpuParticles() = 0; + + // set position of cloth after next call to simulate() + virtual void setTranslation(const PxVec3& trans) = 0; + virtual void setRotation(const PxQuat& rot) = 0; + + // get current position of cloth + virtual const PxVec3& getTranslation() const = 0; + virtual const PxQuat& getRotation() const = 0; + + // zero inertia derived from method calls above (once) + virtual void clearInertia() = 0; + + // adjust the position of the cloth without affecting the dynamics (to call after a world origin shift, for example) + virtual void teleport(const PxVec3& delta) = 0; + + /* solver parameters */ + + // return delta time used for previous iteration + virtual float getPreviousIterationDt() const = 0; + + // gravity in global coordinates + virtual void setGravity(const PxVec3&) = 0; + virtual PxVec3 getGravity() const = 0; + + // damping of local particle velocity (1/stiffnessFrequency) + // 0 (default): velocity is unaffected, 1: velocity is zero'ed + virtual void setDamping(const PxVec3&) = 0; + virtual PxVec3 getDamping() const = 0; + + // portion of local frame velocity applied to particles + // 0 (default): particles are unaffected + // same as damping: damp global particle velocity + virtual void setLinearDrag(const 
PxVec3&) = 0; + virtual PxVec3 getLinearDrag() const = 0; + virtual void setAngularDrag(const PxVec3&) = 0; + virtual PxVec3 getAngularDrag() const = 0; + + // portion of local frame accelerations applied to particles + // 0: particles are unaffected, 1 (default): physically correct + virtual void setLinearInertia(const PxVec3&) = 0; + virtual PxVec3 getLinearInertia() const = 0; + virtual void setAngularInertia(const PxVec3&) = 0; + virtual PxVec3 getAngularInertia() const = 0; + virtual void setCentrifugalInertia(const PxVec3&) = 0; + virtual PxVec3 getCentrifugalInertia() const = 0; + + // target solver iterations per second + virtual void setSolverFrequency(float) = 0; + virtual float getSolverFrequency() const = 0; + + // damp, drag, stiffness exponent per second + virtual void setStiffnessFrequency(float) = 0; + virtual float getStiffnessFrequency() const = 0; + + // filter width for averaging dt^2 factor of gravity and + // external acceleration, in numbers of iterations (default=30). 
+ virtual void setAcceleationFilterWidth(uint32_t) = 0; + virtual uint32_t getAccelerationFilterWidth() const = 0; + + // setup edge constraint solver iteration + virtual void setPhaseConfig(Range<const PhaseConfig> configs) = 0; + + /* collision parameters */ + + virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumSpheres() const = 0; + + virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumCapsules() const = 0; + + virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumPlanes() const = 0; + + virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last) = 0; + virtual uint32_t getNumConvexes() const = 0; + + virtual void setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last) = 0; + virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first) = 0; + virtual uint32_t getNumTriangles() const = 0; + + // check if we use ccd or not + virtual bool isContinuousCollisionEnabled() const = 0; + // set if we use ccd or not (disabled by default) + virtual void enableContinuousCollision(bool) = 0; + + // controls how quickly mass is increased during collisions + virtual float getCollisionMassScale() const = 0; + virtual void setCollisionMassScale(float) = 0; + + // friction + virtual void setFriction(float) = 0; + virtual float getFriction() const = 0; + + // set virtual particles for collision handling. + // each indices element consists of 3 particle + // indices and an index into the lerp weights array. 
+ virtual void setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) = 0; + virtual uint32_t getNumVirtualParticles() const = 0; + virtual uint32_t getNumVirtualParticleWeights() const = 0; + + /* tether constraint parameters */ + + virtual void setTetherConstraintScale(float scale) = 0; + virtual float getTetherConstraintScale() const = 0; + virtual void setTetherConstraintStiffness(float stiffness) = 0; + virtual float getTetherConstraintStiffness() const = 0; + + /* motion constraint parameters */ + + // return reference to motion constraints (position, radius) + // The entire range must be written after calling this function. + virtual Range<PxVec4> getMotionConstraints() = 0; + virtual void clearMotionConstraints() = 0; + virtual uint32_t getNumMotionConstraints() const = 0; + virtual void setMotionConstraintScaleBias(float scale, float bias) = 0; + virtual float getMotionConstraintScale() const = 0; + virtual float getMotionConstraintBias() const = 0; + virtual void setMotionConstraintStiffness(float stiffness) = 0; + virtual float getMotionConstraintStiffness() const = 0; + + /* separation constraint parameters */ + + // return reference to separation constraints (position, radius) + // The entire range must be written after calling this function. + virtual Range<PxVec4> getSeparationConstraints() = 0; + virtual void clearSeparationConstraints() = 0; + virtual uint32_t getNumSeparationConstraints() const = 0; + + /* clear interpolation */ + + // assign current to previous positions for + // collision spheres, motion, and separation constraints + virtual void clearInterpolation() = 0; + + /* particle acceleration parameters */ + + // return reference to particle accelerations (in local coordinates) + // The entire range must be written after calling this function. 
+ virtual Range<PxVec4> getParticleAccelerations() = 0; + virtual void clearParticleAccelerations() = 0; + virtual uint32_t getNumParticleAccelerations() const = 0; + + /* self collision */ + + virtual void setSelfCollisionDistance(float distance) = 0; + virtual float getSelfCollisionDistance() const = 0; + virtual void setSelfCollisionStiffness(float stiffness) = 0; + virtual float getSelfCollisionStiffness() const = 0; + + virtual void setSelfCollisionIndices(Range<const uint32_t>) = 0; + virtual uint32_t getNumSelfCollisionIndices() const = 0; + + /* rest positions */ + + // set rest particle positions used during self-collision + virtual void setRestPositions(Range<const PxVec4>) = 0; + virtual uint32_t getNumRestPositions() const = 0; + + /* bounding box */ + + // current particle position bounds in local space + virtual const PxVec3& getBoundingBoxCenter() const = 0; + virtual const PxVec3& getBoundingBoxScale() const = 0; + + /* sleeping (disabled by default) */ + + // max particle velocity (per axis) to pass sleep test + virtual void setSleepThreshold(float) = 0; + virtual float getSleepThreshold() const = 0; + // test sleep condition every nth millisecond + virtual void setSleepTestInterval(uint32_t) = 0; + virtual uint32_t getSleepTestInterval() const = 0; + // put cloth to sleep when n consecutive sleep tests pass + virtual void setSleepAfterCount(uint32_t) = 0; + virtual uint32_t getSleepAfterCount() const = 0; + virtual uint32_t getSleepPassCount() const = 0; + virtual bool isAsleep() const = 0; + virtual void putToSleep() = 0; + virtual void wakeUp() = 0; + + virtual void setHalfPrecisionOption(bool isAllowed) = 0; + virtual bool getHalfPrecisionOption() const = 0; + +#if APEX_UE4 + virtual void simulate(float dt) = 0; +#endif + + virtual void setUserData(void*) = 0; + virtual void* getUserData() const = 0; +}; + +// wrappers to prevent non-const overload from marking particles dirty +inline MappedRange<const PxVec4> readCurrentParticles(const Cloth& 
cloth) +{ + return cloth.getCurrentParticles(); +} +inline MappedRange<const PxVec4> readPreviousParticles(const Cloth& cloth) +{ + return cloth.getPreviousParticles(); +} + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h new file mode 100644 index 00000000..f271b397 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" +#include "PxAssert.h" +#include "Range.h" + +namespace nvidia +{ +namespace cloth +{ + +class Factory; + +// abstract cloth constraints and triangle indices +class Fabric +{ + protected: + Fabric(const Fabric&); + Fabric& operator=(const Fabric&); + + protected: + Fabric() : mRefCount(0) + { + } + + public: + virtual ~Fabric() + { + PX_ASSERT(!mRefCount); + } + + virtual Factory& getFactory() const = 0; + + virtual uint32_t getNumPhases() const = 0; + virtual uint32_t getNumRestvalues() const = 0; + + virtual uint32_t getNumSets() const = 0; + virtual uint32_t getNumIndices() const = 0; + + virtual uint32_t getNumParticles() const = 0; + + virtual uint32_t getNumTethers() const = 0; + + virtual void scaleRestvalues(float) = 0; + virtual void scaleTetherLengths(float) = 0; + + uint16_t getRefCount() const + { + return mRefCount; + } + void incRefCount() + { + ++mRefCount; + PX_ASSERT(mRefCount > 0); + } + void decRefCount() + { + PX_ASSERT(mRefCount > 0); + --mRefCount; + } + + protected: + uint16_t mRefCount; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h new file mode 100644 index 00000000..651b3b0c --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. 
All rights reserved. + +#pragma once + +#include "Types.h" +#include "Range.h" + +typedef struct CUstream_st* CUstream; + +namespace physx +{ + namespace profile + { + class PxProfileZone; + } + class PxTaskManager; +} + +namespace nvidia +{ +namespace cloth +{ + +class Fabric; +class Cloth; +class Solver; +class Character; + +/// abstract factory to create context-specific simulation components +/// such as cloth, solver, collision, etc. +class Factory +{ + public: + enum Platform + { + CPU, + CUDA, + DirectCompute + }; + + protected: + Factory(Platform platform) : mPlatform(platform) + { + } + Factory(const Factory&); + Factory& operator=(const Factory&); + + public: + static Factory* createFactory(Platform, void* = 0); + + virtual ~Factory() + { + } + + Platform getPlatform() const + { + return mPlatform; + } + + /** + Create fabric data used to setup cloth object. + @param numParticles number of particles, must be larger than any particle index + @param phases map from phase to set index + @param sets inclusive prefix sum of restvalue count per set + @param restvalues array of constraint rest values + @param indices array of particle index pair per constraint + */ + virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets, + Range<const float> restvalues, Range<const uint32_t> indices, + Range<const uint32_t> anchors, Range<const float> tetherLengths) = 0; + + /** + Create cloth object. + @param particles initial particle positions. + @param fabric edge distance constraint structure + */ + virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric) = 0; + + /** + Create cloth solver object. + @param profiler performance event receiver. + @param taskMgr PxTaskManager used for simulation. 
+ */ + virtual Solver* createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr) = 0; + + /** + Create a copy of a cloth instance + @param cloth the instance to be cloned, need not match the factory type + */ + virtual Cloth* clone(const Cloth& cloth) = 0; + + /** + Extract original data from a fabric object + @param fabric to extract from, must match factory type + @param phases pre-allocated memory range to write phases + @param sets pre-allocated memory range to write sets + @param restvalues pre-allocated memory range to write restvalues + @param indices pre-allocated memory range to write indices + */ + virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets, + Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors, + Range<float> tetherLengths) const = 0; + + /** + Extract current collision spheres and capsules from a cloth object + @param cloth the instance to extract from, must match factory type + @param spheres pre-allocated memory range to write spheres + @param capsules pre-allocated memory range to write capsules + @param planes pre-allocated memory range to write planes + @param convexes pre-allocated memory range to write convexes + @param triangles pre-allocated memory range to write triangles + */ + virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, + Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const = 0; + + /** + Extract current motion constraints from a cloth object + @param cloth the instance to extract from, must match factory type + @param destConstraints pre-allocated memory range to write constraints + */ + virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const = 0; + + /** + Extract current separation constraints from a cloth object + @param cloth the instance to extract from, must match factory type + @param destConstraints pre-allocated memory 
range to write constraints + */ + virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const = 0; + + /** + Extract current particle accelerations from a cloth object + @param cloth the instance to extract from, must match factory type + @param destAccelerations pre-allocated memory range to write accelerations + */ + virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const = 0; + + /** + Extract virtual particles from a cloth object + @param cloth the instance to extract from, must match factory type + @param destIndices pre-allocated memory range to write indices + @param destWeights pre-allocated memory range to write weights + */ + virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices, + Range<PxVec3> destWeights) const = 0; + + /** + Extract self collision indices from cloth object. + @param cloth the instance to extract from, must match factory type + @param destIndices pre-allocated memory range to write indices + */ + virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const = 0; + + /** + Extract particle rest positions from cloth object. + @param cloth the instance to extract from, must match factory type + @param destRestPositions pre-allocated memory range to write rest positions + */ + virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const = 0; + + protected: + const Platform mPlatform; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h new file mode 100644 index 00000000..4edf4802 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
+ * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" + +namespace nvidia +{ +namespace cloth +{ + +struct PhaseConfig +{ + PhaseConfig(uint16_t index = uint16_t(-1)); + + uint16_t mPhaseIndex; + uint16_t mPadding; + + // target convergence rate per iteration (1/solverFrequency) + float mStiffness; + + float mStiffnessMultiplier; + + float mCompressionLimit; + float mStretchLimit; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h new file mode 100644 index 00000000..7d48e195 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "PxAssert.h" +#include "Types.h" + +namespace nvidia +{ +namespace cloth +{ + +template <class T> +struct Range +{ + Range(); + + Range(T* first, T* last); + + template <typename S> + Range(const Range<S>& other); + + uint32_t size() const; + bool empty() const; + + void popFront(); + void popBack(); + + T* begin() const; + T* end() const; + + T& front() const; + T& back() const; + + T& operator[](uint32_t i) const; + + private: + T* mFirst; + T* mLast; // past last element +}; + +template <typename T> +Range<T>::Range() +: mFirst(0), mLast(0) +{ +} + +template <typename T> +Range<T>::Range(T* first, T* last) +: mFirst(first), mLast(last) +{ +} + +template <typename T> +template <typename S> +Range<T>::Range(const Range<S>& other) +: mFirst(other.begin()), mLast(other.end()) +{ +} + +template <typename T> +uint32_t Range<T>::size() const +{ + return uint32_t(mLast - mFirst); +} + +template <typename T> +bool Range<T>::empty() const +{ + return mFirst >= mLast; +} + +template <typename T> +void Range<T>::popFront() +{ + PX_ASSERT(mFirst < mLast); + ++mFirst; +} + +template <typename T> +void Range<T>::popBack() +{ + PX_ASSERT(mFirst < mLast); + --mLast; +} + +template <typename T> +T* Range<T>::begin() const +{ + return mFirst; +} + +template <typename T> +T* Range<T>::end() const +{ + return mLast; +} + +template <typename T> +T& Range<T>::front() const +{ + PX_ASSERT(mFirst < mLast); + return *mFirst; +} + +template <typename T> +T& Range<T>::back() const +{ + PX_ASSERT(mFirst < mLast); + return mLast[-1]; +} + +template <typename T> +T& Range<T>::operator[](uint32_t i) const +{ + PX_ASSERT(mFirst + i < mLast); + return mFirst[i]; +} + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h new file mode 100644 index 00000000..585aab63 --- /dev/null +++ 
b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" + +namespace physx +{ + class PxBaseTask; +} + +namespace nvidia +{ +namespace cloth +{ + +class Cloth; + +// called during inter-collision, user0 and user1 are the user data from each cloth +typedef bool (*InterCollisionFilter)(void* user0, void* user1); + +/// base class for solvers +class Solver +{ + protected: + Solver(const Solver&); + Solver& operator=(const Solver&); + + protected: + Solver() + { + } + + public: + virtual ~Solver() + { + } + + /// add cloth object, returns true if successful + virtual void addCloth(Cloth*) = 0; + + /// remove cloth object + virtual void removeCloth(Cloth*) = 0; + + /// simulate one time step + virtual PxBaseTask& simulate(float dt, PxBaseTask&) = 0; + + // inter-collision parameters + virtual void setInterCollisionDistance(float distance) = 0; + virtual float getInterCollisionDistance() const = 0; + virtual void setInterCollisionStiffness(float stiffness) = 0; + virtual float getInterCollisionStiffness() const = 0; + virtual void setInterCollisionNbIterations(uint32_t nbIterations) = 0; + virtual uint32_t getInterCollisionNbIterations() const = 0; + virtual void setInterCollisionFilter(InterCollisionFilter filter) = 0; + +// virtual uint32_t getNumSharedPositions( const Cloth* ) const = 0; + + /// returns true if an 
unrecoverable error has occurred + virtual bool hasError() const = 0; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h new file mode 100644 index 00000000..e80a3009 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifndef __CUDACC__ +#include "ApexUsingNamespace.h" +#include "Px.h" +#include "PxVec3.h" +#include "PxVec4.h" +#include "PxQuat.h" +#endif + +// Factory.cpp gets included in both PhysXGPU and LowLevelCloth projects +// CuFactory can only be created in PhysXGPU project +// DxFactory can only be created in PhysXGPU (win) or LowLevelCloth (xbox1) +#if defined(PX_PHYSX_GPU_EXPORTS) || PX_XBOXONE +#define ENABLE_CUFACTORY ((PX_WINDOWS_FAMILY && (PX_WINRT==0)) || PX_LINUX) + +//TEMPORARY DISABLE DXFACTORY +#define ENABLE_DXFACTORY 0 +//#define ENABLE_DXFACTORY ((PX_WINDOWS_FAMILY && (PX_WINRT==0)) || PX_XBOXONE) +#else +#define ENABLE_CUFACTORY 0 +#define ENABLE_DXFACTORY 0 +#endif + +#ifndef _MSC_VER +#include <stdint.h> +#else +// typedef standard integer types +typedef unsigned __int8 uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +#if _MSC_VER < 
1600 +#define nullptr NULL +#endif +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp new file mode 100644 index 00000000..c6c297ca --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "Allocator.h" +#include "PsAllocator.h" + +namespace nvidia +{ + +void* cloth::allocate(size_t n) +{ + return n ? nvidia::getAllocator().allocate(n, "", __FILE__, __LINE__) : 0; +} + +void cloth::deallocate(void* ptr) +{ + if(ptr) + nvidia::getAllocator().deallocate(ptr); +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h new file mode 100644 index 00000000..c0488b43 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" +#include "PsArray.h" +#include "PsAllocator.h" +#include "PsAlignedMalloc.h" + +namespace nvidia +{ +namespace cloth +{ + +void* allocate(size_t); +void deallocate(void*); + +/* templated typedefs for convenience */ + +template <typename T> +struct Vector +{ + typedef nvidia::Array<T, nvidia::NonTrackingAllocator> Type; +}; + +template <typename T, size_t alignment> +struct AlignedVector +{ + typedef nvidia::Array<T, nvidia::AlignedAllocator<alignment> > Type; +}; + +struct UserAllocated +{ + virtual ~UserAllocated() + { + } + static void* operator new(size_t n) + { + return allocate(n); + } + static void operator delete(void* ptr) + { + deallocate(ptr); + } +}; + +} // namespace cloth +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h new file mode 100644 index 00000000..e9da59aa --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "PxVec4.h" +#include "PxQuat.h" +#include "PxVec3.h" +#include "ApexUsingNamespace.h" + +namespace nvidia +{ + +namespace cloth +{ + +inline float (&array(PxVec3& v))[3] +{ + return reinterpret_cast<float(&)[3]>(v); +} +inline const float (&array(const PxVec3& v))[3] +{ + return reinterpret_cast<const float(&)[3]>(v); +} +inline float (&array(PxVec4& v))[4] +{ + return reinterpret_cast<float(&)[4]>(v); +} +inline const float (&array(const PxVec4& v))[4] +{ + return reinterpret_cast<const float(&)[4]>(v); +} +inline float (&array(PxQuat& q))[4] +{ + return reinterpret_cast<float(&)[4]>(q); +} +inline const float (&array(const PxQuat& q))[4] +{ + return reinterpret_cast<const float(&)[4]>(q); +} + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h new file mode 100644 index 00000000..339f6f12 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Simd4f.h" +#include <float.h> + +namespace nvidia +{ + +namespace cloth +{ + +template <typename Simd4f> +struct BoundingBox +{ + Simd4f mLower; + Simd4f mUpper; +}; + +template <typename Simd4f> +inline BoundingBox<Simd4f> loadBounds(const float* ptr) +{ + BoundingBox<Simd4f> result; + result.mLower = load(ptr); + result.mUpper = load(ptr + 3); + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> emptyBounds() +{ + BoundingBox<Simd4f> result; + + result.mLower = simd4f(FLT_MAX); + result.mUpper = -result.mLower; + + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* pIt, const Simd4f* pEnd) +{ + BoundingBox<Simd4f> result = bounds; + for(; pIt != pEnd; ++pIt) + { + result.mLower = min(result.mLower, *pIt); + result.mUpper = max(result.mUpper, *pIt); + } + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +{ + BoundingBox<Simd4f> result; + result.mLower = min(a.mLower, b.mLower); + result.mUpper = max(a.mUpper, b.mUpper); + return result; +} + +template <typename Simd4f> +inline BoundingBox<Simd4f> intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b) +{ + BoundingBox<Simd4f> result; + result.mLower = max(a.mLower, b.mLower); + result.mUpper = min(a.mUpper, b.mUpper); + return result; +} + +template <typename Simd4f> +inline bool isEmptyBounds(const BoundingBox<Simd4f>& a) +{ + return anyGreater(a.mLower, a.mUpper) != 0; +} +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h new file mode 100644 index 00000000..641fc70f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
#pragma once

#include "PsMathUtils.h"

namespace nvidia
{
namespace cloth
{

/* helper functions shared between SwCloth and CuCloth */

// Compute the particle bounding box and reset every shared cloth parameter to
// its default. Stiffness-like quantities are stored in log2 space, so "fully
// stiff" defaults to -FLT_MAX_EXP (safeExp2 maps that back to 0).
// NOTE(review): mIsAllowedHalfPrecisionSolver and mUserData are copied by
// copy() below but not initialized here — presumably set by the cloth
// constructor; confirm.
template <typename Cloth>
void initialize(Cloth& cloth, const PxVec4* pIt, const PxVec4* pEnd)
{
	// initialize particles bounding box
	PxVec4 lower(FLT_MAX), upper = -lower;
	for(; pIt != pEnd; ++pIt)
	{
		lower = lower.minimum(*pIt);
		upper = upper.maximum(*pIt);
	}
	PxVec4 center = (upper + lower) * 0.5f;
	PxVec4 extent = (upper - lower) * 0.5f;
	// only xyz of the PxVec4 center/extent are kept
	cloth.mParticleBoundsCenter = reinterpret_cast<const PxVec3&>(center);
	cloth.mParticleBoundsHalfExtent = reinterpret_cast<const PxVec3&>(extent);

	cloth.mGravity = PxVec3(0.0f);
	cloth.mLogDamping = PxVec3(0.0f);
	cloth.mLinearLogDrag = PxVec3(0.0f);
	cloth.mAngularLogDrag = PxVec3(0.0f);
	cloth.mLinearInertia = PxVec3(1.0f);
	cloth.mAngularInertia = PxVec3(1.0f);
	cloth.mCentrifugalInertia = PxVec3(1.0f);
	cloth.mSolverFrequency = 60.0f;
	cloth.mStiffnessFrequency = 10.0f;
	cloth.mTargetMotion = PxTransform(PxIdentity);
	cloth.mCurrentMotion = PxTransform(PxIdentity);
	cloth.mLinearVelocity = PxVec3(0.0f);
	cloth.mAngularVelocity = PxVec3(0.0f);
	cloth.mPrevIterDt = 0.0f;
	cloth.mIterDtAvg = MovingAverage(30);
	cloth.mTetherConstraintLogStiffness = float(-FLT_MAX_EXP);
	cloth.mTetherConstraintScale = 1.0f;
	cloth.mMotionConstraintScale = 1.0f;
	cloth.mMotionConstraintBias = 0.0f;
	cloth.mMotionConstraintLogStiffness = float(-FLT_MAX_EXP);
	cloth.mEnableContinuousCollision = false;
	cloth.mCollisionMassScale = 0.0f;
	cloth.mFriction = 0.0f;
	cloth.mSelfCollisionDistance = 0.0f;
	cloth.mSelfCollisionLogStiffness = float(-FLT_MAX_EXP);
	// uint32_t(-1) == "never": sleep testing disabled by default
	cloth.mSleepTestInterval = uint32_t(-1);
	cloth.mSleepAfterCount = uint32_t(-1);
	cloth.mSleepThreshold = 0.0f;
	cloth.mSleepPassCounter = 0;
	cloth.mSleepTestCounter = 0;
}

// Field-by-field copy of the shared cloth state; used when cloning a cloth
// (possibly across backends, e.g. Sw -> Cu), hence the two template parameters.
template <typename DstCloth, typename SrcCloth>
void copy(DstCloth& dstCloth, const SrcCloth& srcCloth)
{
	dstCloth.mParticleBoundsCenter = srcCloth.mParticleBoundsCenter;
	dstCloth.mParticleBoundsHalfExtent = srcCloth.mParticleBoundsHalfExtent;
	dstCloth.mGravity = srcCloth.mGravity;
	dstCloth.mLogDamping = srcCloth.mLogDamping;
	dstCloth.mLinearLogDrag = srcCloth.mLinearLogDrag;
	dstCloth.mAngularLogDrag = srcCloth.mAngularLogDrag;
	dstCloth.mLinearInertia = srcCloth.mLinearInertia;
	dstCloth.mAngularInertia = srcCloth.mAngularInertia;
	dstCloth.mCentrifugalInertia = srcCloth.mCentrifugalInertia;
	dstCloth.mSolverFrequency = srcCloth.mSolverFrequency;
	dstCloth.mStiffnessFrequency = srcCloth.mStiffnessFrequency;
	dstCloth.mTargetMotion = srcCloth.mTargetMotion;
	dstCloth.mCurrentMotion = srcCloth.mCurrentMotion;
	dstCloth.mLinearVelocity = srcCloth.mLinearVelocity;
	dstCloth.mAngularVelocity = srcCloth.mAngularVelocity;
	dstCloth.mPrevIterDt = srcCloth.mPrevIterDt;
	dstCloth.mIterDtAvg = srcCloth.mIterDtAvg;
	dstCloth.mTetherConstraintLogStiffness = srcCloth.mTetherConstraintLogStiffness;
	dstCloth.mTetherConstraintScale = srcCloth.mTetherConstraintScale;
	dstCloth.mMotionConstraintScale = srcCloth.mMotionConstraintScale;
	dstCloth.mMotionConstraintBias = srcCloth.mMotionConstraintBias;
	dstCloth.mMotionConstraintLogStiffness = srcCloth.mMotionConstraintLogStiffness;
	dstCloth.mEnableContinuousCollision = srcCloth.mEnableContinuousCollision;
	dstCloth.mCollisionMassScale = srcCloth.mCollisionMassScale;
	dstCloth.mFriction = srcCloth.mFriction;
	dstCloth.mSelfCollisionDistance = srcCloth.mSelfCollisionDistance;
	dstCloth.mSelfCollisionLogStiffness = srcCloth.mSelfCollisionLogStiffness;
	dstCloth.mSleepTestInterval = srcCloth.mSleepTestInterval;
	dstCloth.mSleepAfterCount = srcCloth.mSleepAfterCount;
	dstCloth.mSleepThreshold = srcCloth.mSleepThreshold;
	dstCloth.mSleepPassCounter = srcCloth.mSleepPassCounter;
	dstCloth.mSleepTestCounter = srcCloth.mSleepTestCounter;
	dstCloth.mIsAllowedHalfPrecisionSolver = srcCloth.mIsAllowedHalfPrecisionSolver;
	dstCloth.mUserData = srcCloth.mUserData;
}

} // namespace cloth
} // namespace nvidia
#pragma once

#include "Cloth.h"
#include "Fabric.h"
#include "Allocator.h"
#include "PsMathUtils.h"

namespace nvidia
{
namespace cloth
{

// SwCloth or CuCloth aggregate implementing the Cloth interface
// Member specializations are implemented in Sw/CuCloth.cpp
template <typename T>
class ClothImpl : public UserAllocated, public Cloth
{
	ClothImpl(const ClothImpl&); // copy-construction disallowed; use clone()

  public:
	ClothImpl& operator=(const ClothImpl&);

	typedef T ClothType;
	typedef typename ClothType::FactoryType FactoryType;
	typedef typename ClothType::FabricType FabricType;
	typedef typename ClothType::ContextLockType ContextLockType;

	ClothImpl(Factory&, Fabric&, Range<const PxVec4>);
	ClothImpl(Factory&, const ClothImpl&);

	virtual Cloth* clone(Factory& factory) const;

	virtual Fabric& getFabric() const;
	virtual Factory& getFactory() const;

	// particle access (lock/unlock bracket mapped ranges)
	virtual uint32_t getNumParticles() const;
	virtual void lockParticles() const;
	virtual void unlockParticles() const;
	virtual MappedRange<PxVec4> getCurrentParticles();
	virtual MappedRange<const PxVec4> getCurrentParticles() const;
	virtual MappedRange<PxVec4> getPreviousParticles();
	virtual MappedRange<const PxVec4> getPreviousParticles() const;
	virtual GpuParticles getGpuParticles();

	// frame (target transform interpolated toward over the next simulate)
	virtual void setTranslation(const PxVec3& trans);
	virtual void setRotation(const PxQuat& rot);

	virtual const PxVec3& getTranslation() const;
	virtual const PxQuat& getRotation() const;

	virtual void clearInertia();

	virtual void teleport(const PxVec3& delta);

	// solver parameters
	virtual float getPreviousIterationDt() const;
	virtual void setGravity(const PxVec3& gravity);
	virtual PxVec3 getGravity() const;
	virtual void setDamping(const PxVec3& damping);
	virtual PxVec3 getDamping() const;
	virtual void setLinearDrag(const PxVec3& drag);
	virtual PxVec3 getLinearDrag() const;
	virtual void setAngularDrag(const PxVec3& drag);
	virtual PxVec3 getAngularDrag() const;
	virtual void setLinearInertia(const PxVec3& inertia);
	virtual PxVec3 getLinearInertia() const;
	virtual void setAngularInertia(const PxVec3& inertia);
	virtual PxVec3 getAngularInertia() const;
	virtual void setCentrifugalInertia(const PxVec3& inertia);
	virtual PxVec3 getCentrifugalInertia() const;

	virtual void setSolverFrequency(float frequency);
	virtual float getSolverFrequency() const;

	virtual void setStiffnessFrequency(float frequency);
	virtual float getStiffnessFrequency() const;

	// NOTE(review): "Acceleation" is a typo for "Acceleration", but the name is
	// part of the public Cloth interface and cannot be changed here.
	virtual void setAcceleationFilterWidth(uint32_t);
	virtual uint32_t getAccelerationFilterWidth() const;

	virtual void setPhaseConfig(Range<const PhaseConfig> configs);

	// collision shapes (first/last select the sub-range being replaced)
	virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last);
	virtual uint32_t getNumSpheres() const;

	virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last);
	virtual uint32_t getNumCapsules() const;

	virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last);
	virtual uint32_t getNumPlanes() const;

	virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last);
	virtual uint32_t getNumConvexes() const;

	virtual void setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last);
	virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first);
	virtual uint32_t getNumTriangles() const;

	virtual bool isContinuousCollisionEnabled() const;
	virtual void enableContinuousCollision(bool);

	virtual float getCollisionMassScale() const;
	virtual void setCollisionMassScale(float);
	virtual void setFriction(float friction);
	virtual float getFriction() const;

	virtual void setVirtualParticles(Range<const uint32_t[4]>, Range<const PxVec3>);
	virtual uint32_t getNumVirtualParticles() const;
	virtual uint32_t getNumVirtualParticleWeights() const;

	// constraints
	virtual void setTetherConstraintScale(float scale);
	virtual float getTetherConstraintScale() const;
	virtual void setTetherConstraintStiffness(float stiffness);
	virtual float getTetherConstraintStiffness() const;

	virtual Range<PxVec4> getMotionConstraints();
	virtual void clearMotionConstraints();
	virtual uint32_t getNumMotionConstraints() const;
	virtual void setMotionConstraintScaleBias(float scale, float bias);
	virtual float getMotionConstraintScale() const;
	virtual float getMotionConstraintBias() const;
	virtual void setMotionConstraintStiffness(float stiffness);
	virtual float getMotionConstraintStiffness() const;

	virtual Range<PxVec4> getSeparationConstraints();
	virtual void clearSeparationConstraints();
	virtual uint32_t getNumSeparationConstraints() const;

	virtual void clearInterpolation();

	virtual Range<PxVec4> getParticleAccelerations();
	virtual void clearParticleAccelerations();
	virtual uint32_t getNumParticleAccelerations() const;

	virtual void setSelfCollisionDistance(float);
	virtual float getSelfCollisionDistance() const;
	virtual void setSelfCollisionStiffness(float);
	virtual float getSelfCollisionStiffness() const;

	virtual void setSelfCollisionIndices(Range<const uint32_t>);
	virtual uint32_t getNumSelfCollisionIndices() const;

	virtual void setRestPositions(Range<const PxVec4>);
	virtual uint32_t getNumRestPositions() const;

	virtual const PxVec3& getBoundingBoxCenter() const;
	virtual const PxVec3& getBoundingBoxScale() const;

	// sleeping
	virtual void setSleepThreshold(float);
	virtual float getSleepThreshold() const;
	virtual void setSleepTestInterval(uint32_t);
	virtual uint32_t getSleepTestInterval() const;
	virtual void setSleepAfterCount(uint32_t);
	virtual uint32_t getSleepAfterCount() const;
	virtual uint32_t getSleepPassCount() const;
	virtual bool isAsleep() const;
	virtual void putToSleep();
	virtual void wakeUp();

	virtual void setHalfPrecisionOption(bool isAllowed);
	virtual bool getHalfPrecisionOption() const;

#if APEX_UE4
	virtual void simulate(float dt);
#endif

	virtual void setUserData(void*);
	virtual void* getUserData() const;

	// helper function
	template <typename U>
	MappedRange<U> getMappedParticles(U* data) const;

	ClothType mCloth; // the backend-specific implementation being wrapped
};

class SwCloth;
typedef ClothImpl<SwCloth> SwClothImpl;

class CuCloth;
typedef ClothImpl<CuCloth> CuClothImpl;

class DxCloth;
typedef ClothImpl<DxCloth> DxClothImpl;

// Construct a cloth over 'particles' using the given fabric.
template <typename T>
ClothImpl<T>::ClothImpl(Factory& factory, Fabric& fabric, Range<const PxVec4> particles)
: mCloth(static_cast<FactoryType&>(factory), static_cast<FabricType&>(fabric), particles)
{
	// fabric and cloth need to be created by the same factory
	PX_ASSERT(&fabric.getFactory() == &factory);
}

// Clone constructor; 'factory' may differ from the source cloth's factory.
template <typename T>
ClothImpl<T>::ClothImpl(Factory& factory, const ClothImpl& impl)
: mCloth(static_cast<FactoryType&>(factory), impl.mCloth)
{
}

template <typename T>
inline Fabric& ClothImpl<T>::getFabric() const
{
	return mCloth.mFabric;
}

template <typename T>
inline Factory& ClothImpl<T>::getFactory() const
{
	return mCloth.mFactory;
}

// Set the target frame translation; no-op (no wakeUp) when unchanged.
template <typename T>
inline void ClothImpl<T>::setTranslation(const PxVec3& trans)
{
	PxVec3 t = reinterpret_cast<const PxVec3&>(trans); // no-op cast
	if(t == mCloth.mTargetMotion.p)
		return;

	mCloth.mTargetMotion.p = t;
	mCloth.wakeUp();
}

// Set the target frame rotation; no-op when unchanged (exact quat comparison).
template <typename T>
inline void ClothImpl<T>::setRotation(const PxQuat& q)
{
	if((q - mCloth.mTargetMotion.q).magnitudeSquared() == 0.0f)
		return;

	mCloth.mTargetMotion.q = q;
	mCloth.wakeUp();
}

template <typename T>
inline const PxVec3& ClothImpl<T>::getTranslation() const
{
	return mCloth.mTargetMotion.p;
}

template <typename T>
inline const PxQuat& ClothImpl<T>::getRotation() const
{
	return mCloth.mTargetMotion.q;
}

// Snap the current frame to the target and zero frame velocities, so the next
// simulate step generates no inertia from frame motion.
template <typename T>
inline void ClothImpl<T>::clearInertia()
{
	mCloth.mCurrentMotion = mCloth.mTargetMotion;
	mCloth.mLinearVelocity = PxVec3(0.0f);
	mCloth.mAngularVelocity = PxVec3(0.0f);

	mCloth.wakeUp();
}

// Fixed 4505:local function has been removed
void ClothImpl<T>::teleport(const PxVec3& delta) +{ + mCloth.mCurrentMotion.p += delta; + mCloth.mTargetMotion.p += delta; +} + +template <typename T> +inline float ClothImpl<T>::getPreviousIterationDt() const +{ + return mCloth.mPrevIterDt; +} + +template <typename T> +inline void ClothImpl<T>::setGravity(const PxVec3& gravity) +{ + PxVec3 value = gravity; + if(value == mCloth.mGravity) + return; + + mCloth.mGravity = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getGravity() const +{ + return mCloth.mGravity; +} + +inline float safeLog2(float x) +{ + return x ? physx::shdfnd::log2(x) : -FLT_MAX_EXP; +} + +inline PxVec3 safeLog2(const PxVec3& v) +{ + return PxVec3(safeLog2(v.x), safeLog2(v.y), safeLog2(v.z)); +} + +inline float safeExp2(float x) +{ + if(x <= -FLT_MAX_EXP) + return 0.0f; + else + return physx::shdfnd::exp2(x); +} + +inline PxVec3 safeExp2(const PxVec3& v) +{ + return PxVec3(safeExp2(v.x), safeExp2(v.y), safeExp2(v.z)); +} + +template <typename T> +inline void ClothImpl<T>::setDamping(const PxVec3& damping) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - damping); + if(value == mCloth.mLogDamping) + return; + + mCloth.mLogDamping = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getDamping() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mLogDamping); +} + +template <typename T> +inline void ClothImpl<T>::setLinearDrag(const PxVec3& drag) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - drag); + if(value == mCloth.mLinearLogDrag) + return; + + mCloth.mLinearLogDrag = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getLinearDrag() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mLinearLogDrag); +} + +template <typename T> +inline void ClothImpl<T>::setAngularDrag(const PxVec3& drag) +{ + PxVec3 value = safeLog2(PxVec3(1.f) - drag); + if(value == mCloth.mAngularLogDrag) + return; + + mCloth.mAngularLogDrag = value; + mCloth.wakeUp(); +} + +template 
<typename T> +inline PxVec3 ClothImpl<T>::getAngularDrag() const +{ + return PxVec3(1.f) - safeExp2(mCloth.mAngularLogDrag); +} + +template <typename T> +inline void ClothImpl<T>::setLinearInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == mCloth.mLinearInertia) + return; + + mCloth.mLinearInertia = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getLinearInertia() const +{ + return mCloth.mLinearInertia; +} + +template <typename T> +inline void ClothImpl<T>::setAngularInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == mCloth.mAngularInertia) + return; + + mCloth.mAngularInertia = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getAngularInertia() const +{ + return mCloth.mAngularInertia; +} + +template <typename T> +inline void ClothImpl<T>::setCentrifugalInertia(const PxVec3& inertia) +{ + PxVec3 value = inertia; + if(value == mCloth.mCentrifugalInertia) + return; + + mCloth.mCentrifugalInertia = value; + mCloth.wakeUp(); +} + +template <typename T> +inline PxVec3 ClothImpl<T>::getCentrifugalInertia() const +{ + return mCloth.mCentrifugalInertia; +} + +template <typename T> +inline void ClothImpl<T>::setSolverFrequency(float frequency) +{ + if(frequency == mCloth.mSolverFrequency) + return; + + mCloth.mSolverFrequency = frequency; + mCloth.mClothCostDirty = true; + mCloth.mIterDtAvg.reset(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSolverFrequency() const +{ + return mCloth.mSolverFrequency; +} + +template <typename T> +inline void ClothImpl<T>::setStiffnessFrequency(float frequency) +{ + if(frequency == mCloth.mStiffnessFrequency) + return; + + mCloth.mStiffnessFrequency = frequency; + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getStiffnessFrequency() const +{ + return mCloth.mStiffnessFrequency; +} + +template <typename T> +inline void 
// Resize the dt moving-average window. (Name typo preserved: public interface.)
template <typename T>
inline void ClothImpl<T>::setAcceleationFilterWidth(uint32_t n)
{
	mCloth.mIterDtAvg.resize(n);
}

template <typename T>
inline uint32_t ClothImpl<T>::getAccelerationFilterWidth() const
{
	return mCloth.mIterDtAvg.size();
}

// move a subarray
// memmove-style overlap-safe element move of [first, last) to 'result':
// copies backward when the destination is after the source.
template <typename Iter>
void move(Iter it, uint32_t first, uint32_t last, uint32_t result)
{
	if(result > first)
	{
		result += last - first;
		while(first < last)
			it[--result] = it[--last];
	}
	else
	{
		while(first < last)
			it[result++] = it[first++];
	}
}

// update capsule index
// Shifts 'index' by delta iff index >= first (note the side effect inside the
// short-circuit: index is modified only when the first test passes). Returns
// true when the shifted index lands below 'first', i.e. the referenced sphere
// was removed and the capsule must be dropped.
inline bool updateIndex(uint32_t& index, uint32_t first, int32_t delta)
{
	return index >= first && int32_t(index += delta) < int32_t(first);
}

// Replace the sphere sub-range [first, last) with 'spheres'. Keeps separate
// start/target arrays for interpolation, and fixes up (or drops) capsules whose
// sphere indices are invalidated by insertion/removal.
template <typename T>
inline void ClothImpl<T>::setSpheres(Range<const PxVec4> spheres, uint32_t first, uint32_t last)
{
	uint32_t oldSize = uint32_t(mCloth.mStartCollisionSpheres.size());
	uint32_t newSize = uint32_t(spheres.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32); // sphere count limit (fits collision mask width)
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

#if PX_DEBUG
	// sphere radius (w component) must be non-negative
	for(const PxVec4* it = spheres.begin(); it < spheres.end(); ++it)
		PX_ASSERT(it->w >= 0.0f);
#endif

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		// no previous spheres: plain assignment, no interpolation targets to fix
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionSpheres.assign(spheres.begin(), spheres.end());
		mCloth.notifyChanged();
	}
	else
	{
		// reserve under the context lock before taking mapped views
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionSpheres.capacity(), mCloth.mTargetCollisionSpheres.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionSpheres.reserve(newSize);
			mCloth.mTargetCollisionSpheres.reserve(PxMax(oldSize, newSize));
		}

		typename T::MappedVec4fVectorType start = mCloth.mStartCollisionSpheres;
		typename T::MappedVec4fVectorType target = mCloth.mTargetCollisionSpheres;

		// fill target from start
		for(uint32_t i = target.size(); i < oldSize; ++i)
			target.pushBack(start[i]);

		// resize to larger of oldSize and newSize
		start.resize(PxMax(oldSize, newSize), PxVec4(0.0f));
		target.resize(PxMax(oldSize, newSize), PxVec4(0.0f));

		if(int32_t delta = int32_t(newSize - oldSize))
		{
			// move past-range elements to new place
			// (unsigned wrap-around makes 'last + delta' correct for delta < 0)
			move(start.begin(), last, oldSize, last + delta);
			move(target.begin(), last, oldSize, last + delta);

			// fill new elements from spheres (loop is empty when delta < 0)
			for(uint32_t i = last; i < last + delta; ++i)
				start[i] = spheres[i - first];

			// adjust capsule indices
			typename T::MappedIndexVectorType indices = mCloth.mCapsuleIndices;
			Vector<IndexPair>::Type::Iterator cIt, cEnd = indices.end();
			for(cIt = indices.begin(); cIt != cEnd;)
			{
				bool removed = false;
				removed |= updateIndex(cIt->first, last + PxMin(0, delta), int32_t(delta));
				removed |= updateIndex(cIt->second, last + PxMin(0, delta), int32_t(delta));
				if(!removed)
					++cIt;
				else
				{
					// capsule references a removed sphere: drop it
					indices.replaceWithLast(cIt);
					cEnd = indices.end();
				}
			}

			start.resize(newSize);
			target.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with spheres
		for(uint32_t i = 0; i < spheres.size(); ++i)
			target[first + i] = spheres[i];
	}

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumSpheres() const
{
	return uint32_t(mCloth.mStartCollisionSpheres.size());
}

// Fixed 4505:local function has been removed
// Replace the capsule sub-range [first, last) with 'capsules' (pairs of sphere
// indices, so capsules.size() is twice the capsule count).
template <typename T>
inline void ClothImpl<T>::setCapsules(Range<const uint32_t> capsules, uint32_t first, uint32_t last)
{
	uint32_t oldSize = mCloth.mCapsuleIndices.size();
	uint32_t newSize = uint32_t(capsules.size() / 2) + oldSize - last + first;

	PX_ASSERT(newSize <= 32);
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	// view the flat index array as (first, second) pairs
	const IndexPair* srcIndices = reinterpret_cast<const IndexPair*>(capsules.begin());

	if(mCloth.mCapsuleIndices.capacity() < newSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mCapsuleIndices.reserve(newSize);
	}

	// resize to larger of oldSize and newSize
	mCloth.mCapsuleIndices.resize(PxMax(oldSize, newSize));

	typename T::MappedIndexVectorType dstIndices = mCloth.mCapsuleIndices;

	// unsigned delta: wrap-around keeps 'last + delta' correct when shrinking
	if(uint32_t delta = newSize - oldSize)
	{
		// move past-range elements to new place
		move(dstIndices.begin(), last, oldSize, last + delta);

		// fill new elements from capsules
		for(uint32_t i = last; i < last + delta; ++i)
			dstIndices[i] = srcIndices[i - first];

		dstIndices.resize(newSize);
		mCloth.notifyChanged();
	}

	// fill existing elements from capsules
	for(uint32_t i = first; i < last; ++i)
		dstIndices[i] = srcIndices[i - first];

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumCapsules() const
{
	return uint32_t(mCloth.mCapsuleIndices.size());
}

// Replace the plane sub-range [first, last) with 'planes', shifting the bit
// positions in the convex masks to track moved planes.
template <typename T>
inline void ClothImpl<T>::setPlanes(Range<const PxVec4> planes, uint32_t first, uint32_t last)
{
	uint32_t oldSize = uint32_t(mCloth.mStartCollisionPlanes.size());
	uint32_t newSize = uint32_t(planes.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32); // plane index must fit a 32-bit convex mask
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionPlanes.assign(planes.begin(), planes.end());
		mCloth.notifyChanged();
	}
	else
	{
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionPlanes.capacity(), mCloth.mTargetCollisionPlanes.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionPlanes.reserve(newSize);
			mCloth.mTargetCollisionPlanes.reserve(PxMax(oldSize, newSize));
		}

		// fill target from start
		for(uint32_t i = mCloth.mTargetCollisionPlanes.size(); i < oldSize; ++i)
			mCloth.mTargetCollisionPlanes.pushBack(mCloth.mStartCollisionPlanes[i]);

		// resize to larger of oldSize and newSize
		mCloth.mStartCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);
		mCloth.mTargetCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);

		if(int32_t delta = int32_t(newSize - oldSize))
		{
			// move past-range elements to new place
			move(mCloth.mStartCollisionPlanes.begin(), last, oldSize, last + delta);
			move(mCloth.mTargetCollisionPlanes.begin(), last, oldSize, last + delta);

			// fill new elements from planes
			for(uint32_t i = last; i < last + delta; ++i)
				mCloth.mStartCollisionPlanes[i] = planes[i - first];

			// adjust convex indices
			// mask covers the bit positions of planes below the edit point
			uint32_t mask = (uint32_t(1) << (last + PxMin(delta, 0))) - 1;
			Vector<uint32_t>::Type::Iterator cIt, cEnd = mCloth.mConvexMasks.end();
			for(cIt = mCloth.mConvexMasks.begin(); cIt != cEnd;)
			{
				// keep low bits; shift high bits by delta to follow moved planes
				uint32_t convex = (*cIt & mask);
				if(delta < 0)
					convex |= *cIt >> -delta & ~mask;
				else
					convex |= (*cIt & ~mask) << delta;
				if(convex)
					*cIt++ = convex;
				else
				{
					// convex lost all of its planes: drop it
					mCloth.mConvexMasks.replaceWithLast(cIt);
					cEnd = mCloth.mConvexMasks.end();
				}
			}

			mCloth.mStartCollisionPlanes.resize(newSize);
			mCloth.mTargetCollisionPlanes.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with planes
		for(uint32_t i = 0; i < planes.size(); ++i)
			mCloth.mTargetCollisionPlanes[first + i] = planes[i];
	}

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumPlanes() const
{
	return uint32_t(mCloth.mStartCollisionPlanes.size());
}

// Replace the convex sub-range [first, last); each convex is a bitmask over
// plane indices.
// NOTE(review): unlike setCapsules, there is no trailing "fill existing
// elements" loop, so replacing an equal-size range rewrites nothing — confirm
// whether that is intended.
template <typename T>
inline void ClothImpl<T>::setConvexes(Range<const uint32_t> convexes, uint32_t first, uint32_t last)
{
	uint32_t oldSize = mCloth.mConvexMasks.size();
	uint32_t newSize = uint32_t(convexes.size()) + oldSize - last + first;

	PX_ASSERT(newSize <= 32);
	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(mCloth.mConvexMasks.capacity() < newSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mConvexMasks.reserve(newSize);
	}

	// resize to larger of oldSize and newSize
	mCloth.mConvexMasks.resize(PxMax(oldSize, newSize));

	if(uint32_t delta = newSize - oldSize)
	{
		// move past-range elements to new place
		move(mCloth.mConvexMasks.begin(), last, oldSize, last + delta);

		// fill new elements from capsules
		for(uint32_t i = last; i < last + delta; ++i)
			mCloth.mConvexMasks[i] = convexes[i - first];

		mCloth.mConvexMasks.resize(newSize);
		mCloth.notifyChanged();
	}

	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumConvexes() const
{
	return uint32_t(mCloth.mConvexMasks.size());
}

// Replace the triangle sub-range [first, last); first/last are triangle
// counts, internally converted to vertex counts (x3).
template <typename T>
inline void ClothImpl<T>::setTriangles(Range<const PxVec3> triangles, uint32_t first, uint32_t last)
{
	// convert from triangle to vertex count
	first *= 3;
	last *= 3;

	triangles = mCloth.clampTriangleCount(triangles, last - first);
	PX_ASSERT(0 == triangles.size() % 3);

	uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
	uint32_t newSize = uint32_t(triangles.size()) + oldSize - last + first;

	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last <= oldSize);

	if(!oldSize && !newSize)
		return;

	if(!oldSize)
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionTriangles.assign(triangles.begin(), triangles.end());
		mCloth.notifyChanged();
	}
	else
	{
		if(PxMax(oldSize, newSize) >
		   PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
		{
			ContextLockType contextLock(mCloth.mFactory);
			mCloth.mStartCollisionTriangles.reserve(newSize);
			mCloth.mTargetCollisionTriangles.reserve(PxMax(oldSize, newSize));
		}

		// fill target from start
		for(uint32_t i = mCloth.mTargetCollisionTriangles.size(); i < oldSize; ++i)
			mCloth.mTargetCollisionTriangles.pushBack(mCloth.mStartCollisionTriangles[i]);

		// resize to larger of oldSize and newSize
		mCloth.mStartCollisionTriangles.resize(PxMax(oldSize, newSize));
		mCloth.mTargetCollisionTriangles.resize(PxMax(oldSize, newSize));

		if(uint32_t delta = newSize - oldSize)
		{
			// move past-range elements to new place
			move(mCloth.mStartCollisionTriangles.begin(), last, oldSize, last + delta);
			move(mCloth.mTargetCollisionTriangles.begin(), last, oldSize, last + delta);

			// fill new elements from triangles
			for(uint32_t i = last; i < last + delta; ++i)
				mCloth.mStartCollisionTriangles[i] = triangles[i - first];

			mCloth.mStartCollisionTriangles.resize(newSize);
			mCloth.mTargetCollisionTriangles.resize(newSize);

			mCloth.notifyChanged();
		}

		// fill target elements with triangles
		for(uint32_t i = 0; i < triangles.size(); ++i)
			mCloth.mTargetCollisionTriangles[first + i] = triangles[i];
	}

	mCloth.wakeUp();
}

// Replace the tail of the triangle arrays starting at triangle 'first' with
// explicit start and target sets (both must have the same size).
template <typename T>
inline void ClothImpl<T>::setTriangles(Range<const PxVec3> startTriangles, Range<const PxVec3> targetTriangles,
                                       uint32_t first)
{
	PX_ASSERT(startTriangles.size() == targetTriangles.size());

	// convert from triangle to vertex count
	first *= 3;

	uint32_t last = uint32_t(mCloth.mStartCollisionTriangles.size());

	startTriangles = mCloth.clampTriangleCount(startTriangles, last - first);
	targetTriangles = mCloth.clampTriangleCount(targetTriangles, last - first);

	uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
	uint32_t newSize = uint32_t(startTriangles.size()) + oldSize - last + first;

	PX_ASSERT(first <= oldSize);
	PX_ASSERT(last == oldSize); // this path only supports replacing the tail

	if(!oldSize && !newSize)
		return;

	if(newSize > PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
	{
		ContextLockType contextLock(mCloth.mFactory);
		mCloth.mStartCollisionTriangles.reserve(newSize);
		mCloth.mTargetCollisionTriangles.reserve(newSize);
	}

	// truncate to the retained prefix, then append the new tail
	uint32_t retainSize = oldSize - last + first;
	mCloth.mStartCollisionTriangles.resize(retainSize);
	mCloth.mTargetCollisionTriangles.resize(retainSize);

	for(uint32_t i = 0, n = startTriangles.size(); i < n; ++i)
	{
		mCloth.mStartCollisionTriangles.pushBack(startTriangles[i]);
		mCloth.mTargetCollisionTriangles.pushBack(targetTriangles[i]);
	}

	if(newSize - oldSize)
		mCloth.notifyChanged();

	mCloth.wakeUp();
}

// Triangle count (stored as flat vertices, three per triangle).
template <typename T>
inline uint32_t ClothImpl<T>::getNumTriangles() const
{
	return uint32_t(mCloth.mStartCollisionTriangles.size()) / 3;
}

template <typename T>
inline bool ClothImpl<T>::isContinuousCollisionEnabled() const
{
	return mCloth.mEnableContinuousCollision;
}

template <typename T>
inline void ClothImpl<T>::enableContinuousCollision(bool enable)
{
	if(enable == mCloth.mEnableContinuousCollision)
		return;

	mCloth.mEnableContinuousCollision = enable;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getCollisionMassScale() const
{
	return mCloth.mCollisionMassScale;
}

template <typename T>
inline void ClothImpl<T>::setCollisionMassScale(float scale)
{
	if(scale == mCloth.mCollisionMassScale)
		return;

	mCloth.mCollisionMassScale = scale;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

// Friction is set unconditionally (no equality early-out, no notifyChanged).
template <typename T>
inline void ClothImpl<T>::setFriction(float friction)
{
	mCloth.mFriction = friction;
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getFriction() const
{
	return mCloth.mFriction;
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumVirtualParticleWeights() const
{
	return uint32_t(mCloth.mVirtualParticleWeights.size());
}

template <typename T>
inline void ClothImpl<T>::setTetherConstraintScale(float scale)
{
	if(scale == mCloth.mTetherConstraintScale)
		return;

	mCloth.mTetherConstraintScale = scale;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getTetherConstraintScale() const
{
	return mCloth.mTetherConstraintScale;
}

// Stiffness is stored as log2(1 - stiffness); see safeLog2/safeExp2.
template <typename T>
inline void ClothImpl<T>::setTetherConstraintStiffness(float stiffness)
{
	float value = safeLog2(1 - stiffness);
	if(value == mCloth.mTetherConstraintLogStiffness)
		return;

	mCloth.mTetherConstraintLogStiffness = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getTetherConstraintStiffness() const
{
	return 1 - safeExp2(mCloth.mTetherConstraintLogStiffness);
}

// Writable per-particle motion constraint target buffer.
template <typename T>
inline Range<PxVec4> ClothImpl<T>::getMotionConstraints()
{
	mCloth.wakeUp();
	return mCloth.push(mCloth.mMotionConstraints);
}

template <typename T>
inline void ClothImpl<T>::clearMotionConstraints()
{
	mCloth.clear(mCloth.mMotionConstraints);
	mCloth.wakeUp();
}

template <typename T>
inline uint32_t ClothImpl<T>::getNumMotionConstraints() const
{
	return uint32_t(mCloth.mMotionConstraints.mStart.size());
}

template <typename T>
inline void ClothImpl<T>::setMotionConstraintScaleBias(float scale, float bias)
{
	if(scale == mCloth.mMotionConstraintScale && bias == mCloth.mMotionConstraintBias)
		return;

	mCloth.mMotionConstraintScale = scale;
	mCloth.mMotionConstraintBias = bias;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintScale() const
{
	return mCloth.mMotionConstraintScale;
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintBias() const
{
	return mCloth.mMotionConstraintBias;
}

// Stiffness stored as log2(1 - stiffness), like tether stiffness above.
template <typename T>
inline void ClothImpl<T>::setMotionConstraintStiffness(float stiffness)
{
	float value = safeLog2(1 - stiffness);
	if(value == mCloth.mMotionConstraintLogStiffness)
		return;

	mCloth.mMotionConstraintLogStiffness = value;
	mCloth.notifyChanged();
	mCloth.wakeUp();
}

template <typename T>
inline float ClothImpl<T>::getMotionConstraintStiffness() const
{
	return 1 - safeExp2(mCloth.mMotionConstraintLogStiffness);
}

// Writable per-particle separation constraint target buffer.
template <typename T>
inline Range<PxVec4> ClothImpl<T>::getSeparationConstraints()
{
	mCloth.wakeUp();
	return mCloth.push(mCloth.mSeparationConstraints);
}
<typename T> +inline void ClothImpl<T>::clearSeparationConstraints() +{ + mCloth.clear(mCloth.mSeparationConstraints); + mCloth.wakeUp(); +} + +template <typename T> +inline void ClothImpl<T>::clearInterpolation() +{ + if(!mCloth.mTargetCollisionSpheres.empty()) + { + nvidia::swap(mCloth.mStartCollisionSpheres, mCloth.mTargetCollisionSpheres); + mCloth.mTargetCollisionSpheres.resize(0); + } + mCloth.mMotionConstraints.pop(); + mCloth.mSeparationConstraints.pop(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumSeparationConstraints() const +{ + return uint32_t(mCloth.mSeparationConstraints.mStart.size()); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumParticleAccelerations() const +{ + return uint32_t(mCloth.mParticleAccelerations.size()); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumSelfCollisionIndices() const +{ + return uint32_t(mCloth.mSelfCollisionIndices.size()); +} + +// Fixed 4505:local function has been removed +template <typename T> +inline void ClothImpl<T>::setRestPositions(Range<const PxVec4> restPositions) +{ + PX_ASSERT(restPositions.empty() || restPositions.size() == getNumParticles()); + ContextLockType contextLock(mCloth.mFactory); + mCloth.mRestPositions.assign(restPositions.begin(), restPositions.end()); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getNumRestPositions() const +{ + return uint32_t(mCloth.mRestPositions.size()); +} + +template <typename T> +inline void ClothImpl<T>::setSelfCollisionDistance(float distance) +{ + if(distance == mCloth.mSelfCollisionDistance) + return; + + mCloth.mSelfCollisionDistance = distance; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSelfCollisionDistance() const +{ + return mCloth.mSelfCollisionDistance; +} + +template <typename T> +inline void ClothImpl<T>::setSelfCollisionStiffness(float stiffness) +{ + float value = safeLog2(1 - stiffness); 
+ if(value == mCloth.mSelfCollisionLogStiffness) + return; + + mCloth.mSelfCollisionLogStiffness = value; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSelfCollisionStiffness() const +{ + return 1 - safeExp2(mCloth.mSelfCollisionLogStiffness); +} + +template <typename T> +inline const PxVec3& ClothImpl<T>::getBoundingBoxCenter() const +{ + return mCloth.mParticleBoundsCenter; +} + +template <typename T> +inline const PxVec3& ClothImpl<T>::getBoundingBoxScale() const +{ + return mCloth.mParticleBoundsHalfExtent; +} + +template <typename T> +inline void ClothImpl<T>::setSleepThreshold(float threshold) +{ + if(threshold == mCloth.mSleepThreshold) + return; + + mCloth.mSleepThreshold = threshold; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline float ClothImpl<T>::getSleepThreshold() const +{ + return mCloth.mSleepThreshold; +} + +template <typename T> +inline void ClothImpl<T>::setSleepTestInterval(uint32_t interval) +{ + if(interval == mCloth.mSleepTestInterval) + return; + + mCloth.mSleepTestInterval = interval; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepTestInterval() const +{ + return mCloth.mSleepTestInterval; +} + +template <typename T> +inline void ClothImpl<T>::setSleepAfterCount(uint32_t afterCount) +{ + if(afterCount == mCloth.mSleepAfterCount) + return; + + mCloth.mSleepAfterCount = afterCount; + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepAfterCount() const +{ + return mCloth.mSleepAfterCount; +} + +template <typename T> +inline uint32_t ClothImpl<T>::getSleepPassCount() const +{ + return mCloth.mSleepPassCounter; +} + +template <typename T> +inline bool ClothImpl<T>::isAsleep() const +{ + return mCloth.isSleeping(); +} + +template <typename T> +inline void ClothImpl<T>::putToSleep() +{ + mCloth.mSleepPassCounter = 
mCloth.mSleepAfterCount; +} + +template <typename T> +inline void ClothImpl<T>::wakeUp() +{ + mCloth.wakeUp(); +} + + +template <typename T> +inline void ClothImpl<T>::setHalfPrecisionOption(bool isAllowed) +{ + mCloth.mIsAllowedHalfPrecisionSolver = isAllowed; +} + +template <typename T> +inline bool ClothImpl<T>::getHalfPrecisionOption() const +{ + return mCloth.mIsAllowedHalfPrecisionSolver; +} + +template <typename T> +inline void ClothImpl<T>::setUserData(void* data) +{ + mCloth.mUserData = data; +} + +template <typename T> +inline void* ClothImpl<T>::getUserData() const +{ + return mCloth.mUserData; +} + +template <typename T> +template <typename U> +inline MappedRange<U> ClothImpl<T>::getMappedParticles(U* data) const +{ + return MappedRange<U>(data, data + getNumParticles(), *this, &Cloth::lockParticles, &Cloth::unlockParticles); +} + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp new file mode 100644 index 00000000..6e49c85f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "SwFactory.h" + +#if ENABLE_CUFACTORY +#include "CuFactory.h" +#endif + +#if ENABLE_DXFACTORY +#include "windows/DxFactory.h" +//#include "PxGraphicsContextManager.h" +#pragma warning(disable : 4668 4917 4365 4061 4005) +#if PX_XBOXONE +#include <d3d11_x.h> +#else +#include <d3d11.h> +#endif +#endif + +namespace nvidia +{ +namespace cloth +{ +uint32_t getNextFabricId() +{ + static uint32_t sNextFabricId = 0; + return sNextFabricId++; +} +} +} + +using namespace nvidia; + +cloth::Factory* cloth::Factory::createFactory(Platform platform, void* contextManager) +{ + PX_UNUSED(contextManager); + + if(platform == Factory::CPU) + return new SwFactory; + +#if ENABLE_CUFACTORY + if(platform == Factory::CUDA) + return new CuFactory((PxCudaContextManager*)contextManager); +#endif + +#if ENABLE_DXFACTORY + if(platform == Factory::DirectCompute) + { + //physx::PxGraphicsContextManager* graphicsContextManager = (physx::PxGraphicsContextManager*)contextManager; + //if(graphicsContextManager->getDevice()->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0) + // return new DxFactory(graphicsContextManager); + } +#endif + + return 0; +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h new file mode 100644 index 00000000..89dd9090 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. 
All rights reserved. + +#pragma once + +#include "Types.h" + +namespace nvidia +{ +namespace cloth +{ + +struct IndexPair +{ + uint32_t first; + uint32_t second; +}; + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h new file mode 100644 index 00000000..527cf163 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Types.h" +#include "Array.h" +#include "PxTransform.h" +#include "PxMat44.h" +#include "PsMathUtils.h" +#include "Simd4f.h" +#include "Simd4i.h" + +namespace nvidia +{ + +/* function object to perform solver iterations on one cloth */ + +// todo: performance optimization: cache this object and test if velocity/iterDt has changed +// c'tor takes about 5% of the iteration time of a 20x20 cloth + +namespace cloth +{ + +/* helper functions */ + +inline PxVec3 log(const PxQuat& q) +{ + float theta = q.getImaginaryPart().magnitude(); + float scale = theta > PX_EPS_REAL ? PxAsin(theta) / theta : 1.0f; + scale = intrinsics::fsel(q.w, scale, -scale); + return PxVec3(q.x * scale, q.y * scale, q.z * scale); +} + +inline PxQuat exp(const PxVec3& v) +{ + float theta = v.magnitude(); + float scale = theta > PX_EPS_REAL ? 
PxSin(theta) / theta : 1.0f; + return PxQuat(v.x * scale, v.y * scale, v.z * scale, cos(theta)); +} + +template <typename Simd4f, uint32_t N> +inline void assign(Simd4f (&columns)[N], const PxMat44& matrix) +{ + for(uint32_t i = 0; i < N; ++i) + columns[i] = load(array(matrix[i])); +} + +template <typename Simd4f> +inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec) +{ + return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; +} + +template <typename Simd4f> +inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec) +{ + return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2]; +} + +template <typename> +struct IterationState; // forward declaration + +struct IterationStateFactory +{ + template <typename MyCloth> + IterationStateFactory(MyCloth& cloth, float frameDt); + + template <typename Simd4f, typename MyCloth> + IterationState<Simd4f> create(MyCloth const& cloth) const; + + template <typename Simd4f> + static Simd4f lengthSqr(Simd4f const& v) + { + return dot3(v, v); + } + + template <typename Simd4f> + static PxVec3 castToPxVec3(const Simd4f& v) + { + return *reinterpret_cast<const PxVec3*>(reinterpret_cast<const char*>(&v)); + } + + int mNumIterations; + float mInvNumIterations; + float mIterDt, mIterDtRatio, mIterDtAverage; + PxQuat mCurrentRotation; + PxVec3 mPrevLinearVelocity; + PxVec3 mPrevAngularVelocity; +}; + +/* solver iterations helper functor */ +template <typename Simd4f> +struct IterationState +{ + // call after each iteration + void update(); + + inline float getCurrentAlpha() const; + inline float getPreviousAlpha() const; + + public: + Simd4f mRotationMatrix[3]; + Simd4f mCurBias; // in local space + Simd4f mPrevBias; // in local space + + Simd4f mPrevMatrix[3]; + Simd4f mCurMatrix[3]; + Simd4f mDampScaleUpdate; + + // iteration counter + uint32_t mRemainingIterations; + + // reciprocal total number of 
iterations + float mInvNumIterations; + + // time step size per iteration + float mIterDt; + + bool mIsTurning; // if false, mPositionScale = mPrevMatrix[0] +}; + +} // namespace cloth + +template <typename Simd4f> +inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const +{ + return getPreviousAlpha() + mInvNumIterations; +} + +template <typename Simd4f> +inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const +{ + return 1.0f - mRemainingIterations * mInvNumIterations; +} + +template <typename MyCloth> +cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameDt) +{ + mNumIterations = PxMax(1, int(frameDt * cloth.mSolverFrequency + 0.5f)); + mInvNumIterations = 1.0f / mNumIterations; + mIterDt = frameDt * mInvNumIterations; + + mIterDtRatio = cloth.mPrevIterDt ? mIterDt / cloth.mPrevIterDt : 1.0f; + mIterDtAverage = cloth.mIterDtAvg.empty() ? mIterDt : cloth.mIterDtAvg.average(); + + mCurrentRotation = cloth.mCurrentMotion.q; + mPrevLinearVelocity = cloth.mLinearVelocity; + mPrevAngularVelocity = cloth.mAngularVelocity; + + // update cloth + float invFrameDt = 1.0f / frameDt; + cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p); + PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate(); + cloth.mAngularVelocity = log(dq) * invFrameDt; + + cloth.mPrevIterDt = mIterDt; + cloth.mIterDtAvg.push((uint32_t)mNumIterations, mIterDt); + cloth.mCurrentMotion = cloth.mTargetMotion; +} + +/* +momentum conservation: +m2*x2 - m1*x1 = m1*x1 - m0*x0 + g*dt2, m = r+t +r2*x2+t2 = 2(r1*x1+t1) - (r0*x0+t0) + g*dt2 +r2*x2 = r1*x1 + r1*x1 - r0*x0 - (t2-2t1+t0) + g*dt2 +substitue r1*x1 - r0*x0 = r1*(x1-x0) + (r1-r0)*x0 +and r1*x1 = r2*x1 - (r2-r1)*x1 + +x2 = x1 + r2'*g*dt2 + + r2'r1*(x1-x0) //< damp + + (r2'r1-r2'r0)*x0 - (1-r2'r1)*x1 - r2'*(t2-2t1+t0) //< inertia + + (1-r2'r1)x1 + t2-t1 //< drag (not momentum conserving) + +x2 = x0 + a0*x0 + a1*x1 + b with +a0 = (inertia-damp)*r2'r1 - 
inertia*r2'r0 - eye +a1 = (1-inertia-drag)*eye + (damp+inertia+drag)*r2'r1 +b = r2'*(g*dt2 - (inertia+drag)*(t2-t1) + inertia*(t1-t0)) + +Velocities are used to deal with multiple iterations and varying dt. Only b needs +to updated from one iteration to the next. Specifically, it is multiplied +by (r2'r1)^1/numIterations. a0 and a1 are unaffected by that multiplication. + +The centrifugal and coriolis forces of non-inertial (turning) reference frame are +not generally captured in these formulas. The 'inertia' term above contains radial +acceleration plus centrifugal and coriolis force for a single iteration. +For multiple iterations, or when the centrifugal forces are scaled differently +than angular inertia, we need to add explicit centrifugal and coriolis forces. +We only use them to correct the above formula because their discretization is +not accurate. + +Possible improvements: multiply coriolis and centrifugal matrix by curInvRotation +from the left. Do the alpha trick of linearInertia also for angularInertia, write +prevParticle after multiplying it with matrix. + +If you change anything in this function, make sure that ClothCustomFloating and +ClothInertia haven't regressed for any choice of solver frequency. 
+*/ + +template <typename Simd4f, typename MyCloth> +cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const +{ + IterationState<Simd4f> result; + + result.mRemainingIterations = (uint32_t)mNumIterations; + result.mInvNumIterations = mInvNumIterations; + result.mIterDt = mIterDt; + + Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity)); + Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity)); + + Simd4f iterDt = simd4f(mIterDt); + Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt; + + // gravity delta per iteration + Simd4f gravity = load(array(cloth.mGravity)) * (Simd4f)simd4f(sqr(mIterDtAverage)); + + // scale of local particle velocity per iteration + Simd4f dampScale = simdf::exp2(load(array(cloth.mLogDamping)) * dampExponent); + // adjust for the change in time step during the first iteration + Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio); + + // portion of negative frame velocity to transfer to particle + Simd4f linearDrag = + (simd4f(_1) - simdf::exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * iterDt * curLinearVelocity; + + // portion of frame acceleration to transfer to particle + Simd4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity); + + // for inertia, we want to violate newton physics to + // match velocity and position as given by the user, which means: + // vt = v0 + a*t and xt = x0 + v0*t + (!) a*t^2 + // this is achieved by applying a different portion to cur and prev + // position, compared to the normal +0.5 and -0.5 for '... 1/2 a*t^2'. + // specifically, the portion is alpha=(n+1)/2n and 1-alpha. 
+ + float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations; + Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha); + + // rotate to local space (use mRotationMatrix temporarily to hold matrix) + PxMat44 invRotation(mCurrentRotation.getConjugate()); + assign(result.mRotationMatrix, invRotation); + + Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); + + // Previously, we split the bias between previous and current position to + // get correct disretized position and velocity. However, this made a + // hanging cloth experience a downward velocity, which is problematic + // when scaled by the iterDt ratio and results in jitter under variable + // timesteps. Instead, we now apply the entire bias to current position + // and accept a less noticeable error for a free falling cloth. + + Simd4f bias = gravity - linearDrag; + result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ; + result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ; + + result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f; + + if(result.mIsTurning) + { + Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity))); + Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity))); + + // rotation for one iteration in local space + Simd4f curInvAngle = -iterDt * curAngularVelocity; + Simd4f prevInvAngle = -iterDt * prevAngularVelocity; + + PxQuat curInvRotation = exp(castToPxVec3(curInvAngle)); + PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle)); + + PxMat44 curMatrix(curInvRotation); + PxMat44 prevMatrix(prevInvRotation * curInvRotation); + + assign(result.mRotationMatrix, curMatrix); + + Simd4f angularDrag = simd4f(_1) - simdf::exp2(load(array(cloth.mAngularLogDrag)) * dampExponent); + Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia)); + Simd4f angularInertia = 
load(array(cloth.mAngularInertia)); + Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity; + + Simd4f epsilon = simd4f(sqrt(FLT_MIN)); // requirement: sqr(epsilon) > 0 + Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon; + Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon; + Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon; + Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon; + Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon; + + float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * dragLengthSqr)[0]; + float inertiaScale = + mInvNumIterations * array(rsqrt(accelerationLengthSqr * inertiaLengthSqr) * inertiaLengthSqr)[0]; + + // magic factor found by comparing to global space simulation: + // some centrifugal force is in inertia part, remainder is 2*(n-1)/n + // after scaling the inertia part, we get for centrifugal: + float centrifugalAlpha = (2 * mNumIterations - 1) * mInvNumIterations; + float centrifugalScale = + centrifugalAlpha * array(rsqrt(velocityLengthSqr * centrifugalLengthSqr) * centrifugalLengthSqr)[0] - + inertiaScale; + + // slightly better in ClothCustomFloating than curInvAngle alone + Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f); + const Simd4f data = lengthSqr(centrifugalVelocity); + float centrifugalSqrLength = array(data)[0] * centrifugalScale; + + Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale); + PxMat33 coriolisMatrix = physx::shdfnd::star(castToPxVec3(coriolisVelocity)); + + const float* dampScalePtr = array(firstDampScale); + const float* centrifugalPtr = array(centrifugalVelocity); + + for(unsigned int j = 0; j < 3; ++j) + { + float centrifugalJ = -centrifugalPtr[j] * centrifugalScale; + for(unsigned int i = 0; i < 3; ++i) + { + float damping = dampScalePtr[j]; + float coriolis = 
coriolisMatrix(i, j); + float centrifugal = centrifugalPtr[i] * centrifugalJ; + + prevMatrix(i, j) = centrifugal - coriolis + curMatrix(i, j) * (inertiaScale - damping) - + prevMatrix(i, j) * inertiaScale; + curMatrix(i, j) = centrifugal + coriolis + curMatrix(i, j) * (inertiaScale + damping + dragScale); + } + curMatrix(j, j) += centrifugalSqrLength - inertiaScale - dragScale; + prevMatrix(j, j) += centrifugalSqrLength; + } + + assign(result.mPrevMatrix, prevMatrix); + assign(result.mCurMatrix, curMatrix); + } + else + { + Simd4f minusOne = -(Simd4f)simd4f(_1); + result.mRotationMatrix[0] = minusOne; + result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne); + } + + // difference of damp scale between first and other iterations + result.mDampScaleUpdate = (dampScale - firstDampScale) & maskXYZ; + + return result; +} + +template <typename Simd4f> +void cloth::IterationState<Simd4f>::update() +{ + if(mIsTurning) + { + // only need to turn bias, matrix is unaffected (todo: verify) + mCurBias = transform(mRotationMatrix, mCurBias); + mPrevBias = transform(mRotationMatrix, mPrevBias); + } + + // remove time step ratio in damp scale after first iteration + for(uint32_t i = 0; i < 3; ++i) + { + mPrevMatrix[i] = mPrevMatrix[i] - mRotationMatrix[i] * mDampScaleUpdate; + mCurMatrix[i] = mCurMatrix[i] + mRotationMatrix[i] * mDampScaleUpdate; + } + mDampScaleUpdate = simd4f(_0); // only once + + --mRemainingIterations; +} + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h new file mode 100644 index 00000000..76eb7f4c --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Allocator.h"

namespace nvidia
{
namespace cloth
{

// Moving average over (at most) the last mSize pushed samples.
// Samples are stored run-length encoded: consecutive pushes of the same
// value share a single Element, so long runs of identical values are cheap.
struct MovingAverage
{
    struct Element
    {
        uint32_t mCount; // number of consecutive samples holding mValue
        float mValue;
    };

  public:
    // n is the window size (number of samples averaged over).
    MovingAverage(uint32_t n = 1) : mCount(0), mSize(n)
    {
    }

    // true if no samples have been pushed (or reset() was called)
    bool empty() const
    {
        return mData.empty();
    }

    // window size, not the number of samples currently stored
    uint32_t size() const
    {
        return mSize;
    }

    // change the window size; discards oldest samples if the window shrinks
    void resize(uint32_t n)
    {
        PX_ASSERT(n);
        mSize = n;
        trim();
    }

    // drop all samples
    void reset()
    {
        mData.resize(0);
        mCount = 0;
    }

    // add 'n' consecutive samples of the same 'value'
    void push(uint32_t n, float value)
    {
        n = PxMin(n, mSize);

        // merge with the last run if the value repeats, else start a new run
        if(mData.empty() || mData.back().mValue != value)
        {
            Element element = { n, value };
            mData.pushBack(element);
        }
        else
        {
            mData.back().mCount += n;
        }

        mCount += n;
        trim();
    }

    // weighted average of the stored samples; requires at least one sample
    float average() const
    {
        PX_ASSERT(!mData.empty());

        // plain sum over all runs (count-weighted)
        float sum = 0.0f;
        Vector<Element>::Type::ConstIterator it = mData.begin(), end = mData.end();
        for(; it != end; ++it)
            sum += it->mCount * it->mValue;

        // linear weight ramps at both ends for smoother average
        // n = ramp length (1/8 of the sample count); 'ramp' accumulates the
        // linearly down-weighted contribution of the n oldest and n newest
        // samples, walking run-by-run from both ends simultaneously.
        uint32_t n = mCount / 8;
        float ramp = 0.0f, temp = 0.0f;
        uint32_t countLo = (it = mData.begin())->mCount;
        uint32_t countHi = (--end)->mCount;
        for(uint32_t i = 0; i < n; ++i)
        {
            // advance to the next run once the current one is exhausted
            if(i == countLo)
                countLo += (++it)->mCount;
            if(i == countHi)
                countHi += (--end)->mCount;

            temp += it->mValue + end->mValue;
            ramp += temp;
        }

        // normalize: full weight (n+1) on the bulk minus the ramped portion
        uint32_t num = (mCount - n) * (n + 1);
        return (sum * (n + 1) - ramp) / num;
    }

  private:
    // remove oldest (front) values until mCount<=mSize
    void trim()
    {
        // walk runs from the front, accumulating counts until the remaining
        // tail fits the window; the boundary run's count is clipped in place.
        // note: 'it += k <= mCount' advances the iterator by the bool (0/1),
        // i.e. it stops advancing once the boundary run is reached.
        Vector<Element>::Type::Iterator it = mData.begin();
        for(uint32_t k = mSize; k < mCount; it += k <= mCount)
        {
            k += it->mCount;
            it->mCount = k - mCount;
        }

        // drop fully-consumed front runs
        if(it != mData.begin())
            mData.assign(it, mData.end());

        mCount = PxMin(mCount, mSize);
    }

    Vector<Element>::Type mData; // run-length encoded samples, oldest first

    uint32_t mCount; // total number of samples currently represented
    uint32_t mSize;  // window size
};
}
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp
new file mode 100644
index 00000000..310c43d6
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp
@@ -0,0 +1,60 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "PhaseConfig.h"
#include "ApexUsingNamespace.h"
#include "PsMathUtils.h"

namespace nvidia
{
namespace cloth
{
// forward declaration of the user-to-solver conversion defined below
PhaseConfig transform(const PhaseConfig&);
}
}

using namespace nvidia;

namespace
{
// log2 of x clamped to [0,1]; returns -FLT_MAX_EXP for x <= 0 so that a
// zero stiffness maps to the most negative representable exponent instead
// of producing -inf/NaN from log2(0).
float safeLog2(float x)
{
    float saturated = PxMax(0.0f, PxMin(x, 1.0f));
    return saturated ? physx::shdfnd::log2(saturated) : -FLT_MAX_EXP;
}
}

// default config: full stiffness, no compression/stretch limiting
cloth::PhaseConfig::PhaseConfig(uint16_t index)
: mPhaseIndex(index)
, mPadding(0xffff)
, mStiffness(1.0f)
, mStiffnessMultiplier(1.0f)
, mCompressionLimit(1.0f)
, mStretchLimit(1.0f)
{
}

// convert from user input to solver format
// (stiffness values are stored as log2(1-k) so the per-iteration stiffness
// can be computed with an exp2; limits are stored as 1 - 1/limit)
cloth::PhaseConfig cloth::transform(const PhaseConfig& config)
{
    PhaseConfig result(config.mPhaseIndex);

    result.mStiffness = safeLog2(1.0f - config.mStiffness);
    result.mStiffnessMultiplier = safeLog2(config.mStiffnessMultiplier);

    // negative for compression, positive for stretch
    result.mCompressionLimit = 1.f - 1.f / config.mCompressionLimit;
    result.mStretchLimit = 1.f - 1.f / config.mStretchLimit;

    return result;
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h
new file mode 100644
index 00000000..fe130156
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h
@@ -0,0 +1,153 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#pragma once

#include "Types.h"
#include "Simd4f.h"

namespace nvidia
{

namespace cloth
{

// acts as a poor mans random access iterator
// Yields start + (target - start) * alpha per element, i.e. a linear
// interpolation between two equally-sized point streams.
template <typename Simd4f, typename BaseIterator>
class LerpIterator
{

    LerpIterator& operator=(const LerpIterator&); // not implemented

  public:
    // alpha is fixed at construction and broadcast to all four lanes
    LerpIterator(BaseIterator start, BaseIterator target, float alpha)
    : mAlpha(simd4f(alpha)), mStart(start), mTarget(target)
    {
    }

    // return the interpolated point at a given index
    inline Simd4f operator[](size_t index) const
    {
        return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha;
    }

    inline Simd4f operator*() const
    {
        return (*this)[0];
    }

    // prefix increment only
    inline LerpIterator& operator++()
    {
        ++mStart;
        ++mTarget;
        return *this;
    }

  private:
    // interpolation parameter
    const Simd4f mAlpha;

    BaseIterator mStart;
    BaseIterator mTarget;
};

// Loads 4 floats at a time from a float stream whose elements are Stride
// floats apart; the pointer need not be 16-byte aligned (uses load()).
template <typename Simd4f, size_t Stride>
class UnalignedIterator
{

    UnalignedIterator& operator=(const UnalignedIterator&); // not implemented

  public:
    UnalignedIterator(const float* pointer) : mPointer(pointer)
    {
    }

    inline Simd4f operator[](size_t index) const
    {
        return load(mPointer + index * Stride);
    }

    inline Simd4f operator*() const
    {
        return (*this)[0];
    }

    // prefix increment only
    inline UnalignedIterator& operator++()
    {
        mPointer += Stride;
        return *this;
    }

  private:
    const float* mPointer;
};

// acts as an iterator but returns a constant
template <typename Simd4f>
class ConstantIterator
{
  public:
    ConstantIterator(const Simd4f& value) : mValue(value)
    {
    }

    inline Simd4f operator*() const
    {
        return mValue;
    }

    // increment is a no-op: every position yields the same value
    inline ConstantIterator& operator++()
    {
        return *this;
    }

  private:
    ConstantIterator& operator=(const ConstantIterator&);
    const Simd4f mValue;
};

// wraps an iterator with constant scale and bias
// Yields (*base) * scale + bias for each element of the wrapped iterator.
template <typename Simd4f, typename BaseIterator>
class ScaleBiasIterator
{
  public:
    ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias)
    : mScale(scale), mBias(bias), mBaseIterator(base)
    {
    }

    inline Simd4f operator*() const
    {
        return (*mBaseIterator) * mScale + mBias;
    }

    inline ScaleBiasIterator& operator++()
    {
        ++mBaseIterator;
        return *this;
    }

  private:
    ScaleBiasIterator& operator=(const ScaleBiasIterator&);

    const Simd4f mScale;
    const Simd4f mBias;

    BaseIterator mBaseIterator;
};

} // namespace cloth

} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h
new file mode 100644
index 00000000..8755a010
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h
@@ -0,0 +1,478 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "SimdTypes.h"

#if NVMATH_FUSE_MULTIPLY_ADD

/*! \brief Expression template to fuse multiply-adds.
+ * \relates Simd4f */ +struct ProductExpr +{ + inline ProductExpr(Simd4f const& v0_, Simd4f const& v1_) : v0(v0_), v1(v1_) + { + } + inline operator Simd4f() const; + const Simd4f v0, v1; + + private: + ProductExpr& operator=(const ProductExpr&); // not implemented +}; + +inline Simd4f operator+(const ProductExpr&, const Simd4f&); +inline Simd4f operator+(const Simd4f& v, const ProductExpr&); +inline Simd4f operator+(const ProductExpr&, const ProductExpr&); +inline Simd4f operator-(const Simd4f& v, const ProductExpr&); +inline Simd4f operator-(const ProductExpr&, const ProductExpr&); + +#else // NVMATH_FUSE_MULTIPLY_ADD +typedef Simd4f ProductExpr; +#endif // NVMATH_FUSE_MULTIPLY_ADD + +template <typename T> +struct Simd4fFactory +{ + Simd4fFactory(T v_) : v(v_) + { + } + inline operator Simd4f() const; + inline operator Scalar4f() const; + Simd4fFactory& operator=(const Simd4fFactory&); // not implemented + T v; +}; + +template <> +struct Simd4fFactory<detail::FourTuple> +{ + Simd4fFactory(float x, float y, float z, float w) + { + v[0] = x, v[1] = y, v[2] = z, v[3] = w; + } + Simd4fFactory(const Simd4fFactory<const float&>& f) + { + v[3] = v[2] = v[1] = v[0] = f.v; + } + inline operator Simd4f() const; + inline operator Scalar4f() const; + Simd4fFactory& operator=(const Simd4fFactory&); // not implemented + PX_ALIGN(16, float) v[4]; +}; + +template <int i> +struct Simd4fFactory<detail::IntType<i> > +{ + inline operator Simd4f() const; + inline operator Scalar4f() const; +}; + +// forward declaration +template <typename> +struct Simd4iFactory; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SIMD +inline Simd4f operator&(const ComplementExpr<Simd4f>&, const Simd4f&); +inline Simd4f operator&(const Simd4f&, const ComplementExpr<Simd4f>&); +#endif + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operators +// - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +// note: operator?= missing because they don't have corresponding intrinsics. + +/*! \brief Test for equality of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true. +* \relates Simd4f */ +inline Simd4f operator==(const Simd4f& v0, const Simd4f& v1); + +// no operator!= because VMX128 does not support it, use ~operator== and handle QNaNs + +/*! \brief Less-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator<(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Less-or-equal-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator<=(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Greater-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator>(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Greater-or-equal-compare all elements of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline Simd4f operator>=(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise NOT operator +* \return A vector holding the bit-negate of \a v. 
+* \relates Simd4f */ +inline ComplementExpr<Simd4f> operator~(const Simd4f& v); + +/*! \brief Vector bit-wise AND operator +* \return A vector holding the bit-wise AND of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator&(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise OR operator +* \return A vector holding the bit-wise OR of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator|(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector bit-wise XOR operator +* \return A vector holding the bit-wise XOR of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator^(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator<<(const Simd4f& v, int shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator>>(const Simd4f& v, int shift); + +#if NVMATH_SHIFT_BY_VECTOR +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator<<(const Simd4f& v, const Simd4f& shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4f */ +inline Simd4f operator>>(const Simd4f& v, const Simd4f& shift); +#endif + +/*! \brief Unary vector addition operator. +* \return A vector holding the component-wise copy of \a v. +* \relates Simd4f */ +inline Simd4f operator+(const Simd4f& v); + +/*! \brief Vector addition operator +* \return A vector holding the component-wise sum of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator+(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Unary vector negation operator. +* \return A vector holding the component-wise negation of \a v. 
+* \relates Simd4f */ +inline Simd4f operator-(const Simd4f& v); + +/*! \brief Vector subtraction operator. +* \return A vector holding the component-wise difference of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator-(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector multiplication. +* \return Element-wise product of \a v0 and \a v1. +* \note For VMX, returns expression template to fuse multiply-add. +* \relates Simd4f */ +inline ProductExpr operator*(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Vector division. +* \return Element-wise division of \a v0 and \a v1. +* \relates Simd4f */ +inline Simd4f operator/(const Simd4f& v0, const Simd4f& v1); + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Load float value into all vector components. +* \relates Simd4f */ +inline Simd4fFactory<const float&> simd4f(const float& s) +{ + return Simd4fFactory<const float&>(s); +} + +/*! \brief Load 4 float values into vector. +* \relates Simd4f */ +inline Simd4fFactory<detail::FourTuple> simd4f(float x, float y, float z, float w) +{ + return Simd4fFactory<detail::FourTuple>(x, y, z, w); +} + +/*! \brief Create vector from literal. +* \return Vector with all elements set to i. +* \relates Simd4f */ +template <int i> +inline Simd4fFactory<detail::IntType<i> > simd4f(detail::IntType<i> const&) +{ + return Simd4fFactory<detail::IntType<i> >(); +} + +/*! \brief Reinterpret Simd4i as Simd4f. +* \return A copy of \a v, but cast as Simd4f. +* \relates Simd4f */ +inline Simd4f simd4f(const Simd4i& v); + +/*! \brief Reinterpret Simd4iFactory as Simd4fFactory. +* \relates Simd4f */ +template <typename T> +inline Simd4fFactory<T> simd4f(const Simd4iFactory<T>& v) +{ + return reinterpret_cast<const Simd4fFactory<T>&>(v); +} + +/*! 
\brief return reference to contiguous array of vector elements +* \relates Simd4f */ +inline float (&array(Simd4f& v))[4]; + +/*! \brief return constant reference to contiguous array of vector elements +* \relates Simd4f */ +inline const float (&array(const Simd4f& v))[4]; + +/*! \brief Create vector from float array. +* \relates Simd4f */ +inline Simd4fFactory<const float*> load(const float* ptr) +{ + return ptr; +} + +/*! \brief Create vector from aligned float array. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4f */ +inline Simd4fFactory<detail::AlignedPointer<float> > loadAligned(const float* ptr) +{ + return detail::AlignedPointer<float>(ptr); +} + +/*! \brief Create vector from aligned float array. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4f */ +inline Simd4fFactory<detail::OffsetPointer<float> > loadAligned(const float* ptr, unsigned int offset) +{ + return detail::OffsetPointer<float>(ptr, offset); +} + +/*! \brief Store vector \a v to float array \a ptr. +* \relates Simd4f */ +inline void store(float* ptr, Simd4f const& v); + +/*! \brief Store vector \a v to aligned float array \a ptr. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4f */ +inline void storeAligned(float* ptr, Simd4f const& v); + +/*! \brief Store vector \a v to aligned float array \a ptr. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4f */ +inline void storeAligned(float* ptr, unsigned int offset, Simd4f const& v); + +/*! \brief replicate i-th component into all vector components. +* \return Vector with all elements set to \a v[i]. +* \relates Simd4f */ +template <size_t i> +inline Simd4f splat(Simd4f const& v); + +/*! \brief Select \a v0 or \a v1 based on \a mask. +* \return mask ? v0 : v1 +* \relates Simd4f */ +inline Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1); + +/*! 
\brief Per element absolute value. +* \return Vector with absolute values of \a v. +* \relates Simd4f */ +inline Simd4f abs(const Simd4f& v); + +/*! \brief Per element floor value. +* \note Result undefined for QNaN elements. +* \note On SSE and NEON, returns v-1 if v is negative integer value +* \relates Simd4f */ +inline Simd4f floor(const Simd4f& v); + +/*! \brief Per-component maximum of two vectors +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f max(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Per-component minimum of two vectors +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f min(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Return reciprocal estimate of a vector. +* \return Vector of per-element reciprocal estimate. +* \relates Simd4f */ +inline Simd4f recip(const Simd4f& v); + +/*! \brief Return reciprocal of a vector. +* \return Vector of per-element reciprocal. +* \note Performs \a n Newton-Raphson iterations on initial estimate. +* \relates Simd4f */ +template <int n> +inline Simd4f recipT(const Simd4f& v); + +/*! \brief Return square root of a vector. +* \return Vector of per-element square root. +* \note The behavior is undefined for negative elements. +* \relates Simd4f */ +inline Simd4f sqrt(const Simd4f& v); + +/*! \brief Return inverse square root estimate of a vector. +* \return Vector of per-element inverse square root estimate. +* \note The behavior is undefined for negative, zero, and infinity elements. +* \relates Simd4f */ +inline Simd4f rsqrt(const Simd4f& v); + +/*! \brief Return inverse square root of a vector. +* \return Vector of per-element inverse square root. +* \note Performs \a n Newton-Raphson iterations on initial estimate. +* \note The behavior is undefined for negative and infinity elements. +* \relates Simd4f */ +template <int n> +inline Simd4f rsqrtT(const Simd4f& v); + +/*! \brief Return 2 raised to the power of v. 
+* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f exp2(const Simd4f& v); + +#if NVMATH_SIMD +namespace simdf +{ +// PSP2 is confused resolving about exp2, forwarding works +inline Simd4f exp2(const Simd4f& v) +{ + return ::exp2(v); +} +} +#endif + +/*! \brief Return logarithm of v to base 2. +* \note Result undefined for QNaN elements. +* \relates Simd4f */ +inline Simd4f log2(const Simd4f& v); + +/*! \brief Return dot product of two 3-vectors. +* \note The result is replicated across all 4 components. +* \relates Simd4f */ +inline Simd4f dot3(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Return cross product of two 3-vectors. +* \note The 4th component is undefined. +* \relates Simd4f */ +inline Simd4f cross3(const Simd4f& v0, const Simd4f& v1); + +/*! \brief Transposes 4x4 matrix represented by \a x, \a y, \a z, and \a w. +* \relates Simd4f */ +inline void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. +* \relates Simd4f */ +inline int allEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. +* \relates Simd4f */ +inline int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. +* \relates Simd4f */ +inline int anyEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaPs aren't handled on SPU: comparing two QNaPs will return true. 
+* \relates Simd4f */ +inline int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreater(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 > \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if any elements of \a v0 and \a v1 are greater +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreater(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements of \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 > \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater or equal +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if all elements of \a v0 and \a v1 are greater or equal +* \param outMask holds the result of \a v0 >= \a v1. +* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false. +* \relates Simd4f */ +inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! 
\brief returns non-zero if any elements or \a v0 and \a v1 are greater or equal +* \note QNaPs aren't handled on SPU: comparisons against QNaPs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater or equal +* \param outMask holds the result of \a v0 == \a v1. +* \note QNaPs aren't handled on SPU: comparisons against QNaPs don't necessarily return false. +* \relates Simd4f */ +inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask); + +/*! \brief returns non-zero if all elements are true +* \note Undefined if parameter is not result of a comparison. +* \relates Simd4f */ +inline int allTrue(const Simd4f& v); + +/*! \brief returns non-zero if any element is true +* \note Undefined if parameter is not result of a comparison. +* \relates Simd4f */ +inline int anyTrue(const Simd4f& v); + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/Simd4f.h" +#elif NVMATH_NEON +#include "neon/Simd4f.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/Simd4f.h" +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h new file mode 100644 index 00000000..d237e1fa --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "SimdTypes.h" + +template <typename T> +struct Simd4iFactory +{ + Simd4iFactory(T v_) : v(v_) + { + } + inline operator Simd4i() const; + inline operator Scalar4i() const; + Simd4iFactory& operator=(const Simd4iFactory&); // not implemented + T v; +}; + +template <> +struct Simd4iFactory<detail::FourTuple> +{ + Simd4iFactory(int x, int y, int z, int w) + { + v[0] = x, v[1] = y, v[2] = z, v[3] = w; + } + Simd4iFactory(const Simd4iFactory<const int&>& f) + { + v[3] = v[2] = v[1] = v[0] = f.v; + } + inline operator Simd4i() const; + inline operator Scalar4i() const; + Simd4iFactory& operator=(const Simd4iFactory&); // not implemented + PX_ALIGN(16, int) v[4]; +}; + +template <int i> +struct Simd4iFactory<detail::IntType<i> > +{ + inline operator Simd4i() const; + inline operator Scalar4i() const; +}; + +// forward declaration +template <typename> +struct Simd4fFactory; + +// map Simd4f/Scalar4f to Simd4i/Scalar4i +template <typename> +struct Simd4fToSimd4i; +template <> +struct Simd4fToSimd4i<Simd4f> +{ + typedef Simd4i Type; +}; +template <> +struct Simd4fToSimd4i<Scalar4f> +{ + typedef Scalar4i Type; +}; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_DISTINCT_TYPES +inline Simd4i operator&(const ComplementExpr<Simd4i>&, const Simd4i&); +inline Simd4i operator&(const Simd4i&, const ComplementExpr<Simd4i>&); +#endif + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operators +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if 
NVMATH_DISTINCT_TYPES + +/*! \brief Vector bit-wise NOT operator +* \return A vector holding the bit-negate of \a v. +* \relates Simd4i */ +inline ComplementExpr<Simd4i> operator~(const Simd4i& v); + +/*! \brief Vector bit-wise AND operator +* \return A vector holding the bit-wise AND of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator&(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector bit-wise OR operator +* \return A vector holding the bit-wise OR of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator|(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector bit-wise XOR operator +* \return A vector holding the bit-wise XOR of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator^(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator<<(const Simd4i& v, int shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator>>(const Simd4i& v, int shift); + +#if NVMATH_SHIFT_BY_VECTOR + +/*! \brief Vector logical left shift. +* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator<<(const Simd4i& v, const Simd4i& shift); + +/*! \brief Vector logical right shift. +* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits. +* \relates Simd4i */ +inline Simd4i operator>>(const Simd4i& v, const Simd4i& shift); + +#endif // NVMATH_SHIFT_BY_VECTOR + +#endif // NVMATH_DISTINCT_TYPES + +namespace simdi // disambiguate for VMX +{ +// note: operator?= missing because they don't have corresponding intrinsics. + +/*! \brief Test for equality of two vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). 
+* \relates Simd4i */ +inline Simd4i operator==(const Simd4i& v0, const Simd4i& v1); + +// no !=, <=, >= because VMX128/SSE don't support it, use ~equal etc. + +/*! \brief Less-compare all elements of two *signed* vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \relates Simd4i */ +inline Simd4i operator<(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Greater-compare all elements of two *signed* vectors. +* \return Vector of per element result mask (all bits set for 'true', none set for 'false'). +* \relates Simd4i */ +inline Simd4i operator>(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Vector addition operator +* \return A vector holding the component-wise sum of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator+(const Simd4i& v0, const Simd4i& v1); + +/*! \brief Unary vector negation operator. +* \return A vector holding the component-wise negation of \a v. +* \relates Simd4i */ +inline Simd4i operator-(const Simd4i& v); + +/*! \brief Vector subtraction operator. +* \return A vector holding the component-wise difference of \a v0 and \a v1. +* \relates Simd4i */ +inline Simd4i operator-(const Simd4i& v0, const Simd4i& v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Load int value into all vector components. +* \relates Simd4i */ +inline Simd4iFactory<const int&> simd4i(const int& s) +{ + return Simd4iFactory<const int&>(s); +} + +/*! \brief Load 4 int values into vector. +* \relates Simd4i */ +inline Simd4iFactory<detail::FourTuple> simd4i(int x, int y, int z, int w) +{ + return Simd4iFactory<detail::FourTuple>(x, y, z, w); +} + +/*! \brief Create vector from literal. +* \return Vector with all elements set to \c i. 
+* \relates Simd4i */ +template <int i> +inline Simd4iFactory<detail::IntType<i> > simd4i(const detail::IntType<i>&) +{ + return Simd4iFactory<detail::IntType<i> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<1> > simd4i(const detail::IntType<1>&) +{ + return Simd4iFactory<detail::IntType<1> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<int(0x80000000)> > simd4i(const detail::IntType<int(0x80000000)>&) +{ + return Simd4iFactory<detail::IntType<int(0x80000000)> >(); +} + +template <> +inline Simd4iFactory<detail::IntType<-1> > simd4i(const detail::IntType<-1>&) +{ + return Simd4iFactory<detail::IntType<-1> >(); +} + +/*! \brief Reinterpret Simd4f as Simd4i. +* \return A copy of \a v, but cast as Simd4i. +* \relates Simd4i */ +inline Simd4i simd4i(const Simd4f& v); + +/*! \brief Reinterpret Simd4fFactory as Simd4iFactory. +* \relates Simd4i */ +template <typename T> +inline Simd4iFactory<T> simd4i(const Simd4fFactory<T>& v) +{ + return reinterpret_cast<const Simd4iFactory<T>&>(v); +} + +namespace simdi +{ + +/*! \brief return reference to contiguous array of vector elements +* \relates Simd4i */ +inline int (&array(Simd4i& v))[4]; + +/*! \brief return constant reference to contiguous array of vector elements +* \relates Simd4i */ +inline const int (&array(const Simd4i& v))[4]; + +} // namespace simdi + +/*! \brief Create vector from int array. +* \relates Simd4i */ +inline Simd4iFactory<const int*> load(const int* ptr) +{ + return ptr; +} + +/*! \brief Create vector from aligned int array. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4i */ +inline Simd4iFactory<detail::AlignedPointer<int> > loadAligned(const int* ptr) +{ + return detail::AlignedPointer<int>(ptr); +} + +/*! \brief Create vector from aligned float array. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. 
+* \relates Simd4i */ +inline Simd4iFactory<detail::OffsetPointer<int> > loadAligned(const int* ptr, unsigned int offset) +{ + return detail::OffsetPointer<int>(ptr, offset); +} + +/*! \brief Store vector \a v to int array \a ptr. +* \relates Simd4i */ +inline void store(int* ptr, const Simd4i& v); + +/*! \brief Store vector \a v to aligned int array \a ptr. +* \note \a ptr needs to be 16 byte aligned. +* \relates Simd4i */ +inline void storeAligned(int* ptr, const Simd4i& v); + +/*! \brief Store vector \a v to aligned int array \a ptr. +* \param offset pointer offset in bytes. +* \note \a ptr+offset needs to be 16 byte aligned. +* \relates Simd4i */ +inline void storeAligned(int* ptr, unsigned int offset, const Simd4i& v); + +#if NVMATH_DISTINCT_TYPES + +/*! \brief replicate i-th component into all vector components. +* \return Vector with all elements set to \a v[i]. +* \relates Simd4i */ +template <size_t i> +inline Simd4i splat(const Simd4i& v); + +/*! \brief Select \a v0 or \a v1 based on \a mask. +* \return mask ? v0 : v1 +* \relates Simd4i */ +inline Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1); + +#endif // NVMATH_DISTINCT_TYPES + +namespace simdi // disambiguate for VMX +{ +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \relates Simd4i */ +inline int allEqual(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if all elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \relates Simd4i */ +inline int anyEqual(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are equal +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! 
\brief returns non-zero if all *signed* elements or \a v0 and \a v1 are greater +* \relates Simd4i */ +inline int allGreater(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if all *signed* elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \relates Simd4i */ +inline int anyGreater(const Simd4i& v0, const Simd4i& v1); + +/*! \brief returns non-zero if any elements or \a v0 and \a v1 are greater +* \param outMask holds the result of \a v0 == \a v1. +* \relates Simd4i */ +inline int anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask); +} + +#if NVMATH_DISTINCT_TYPES + +/*! \brief returns non-zero if all elements are true +* \note undefined if parameter is not result of a comparison. +* \relates Simd4i */ +inline int allTrue(const Simd4i& v); + +/*! \brief returns non-zero if any element is true +* \note undefined if parameter is not result of a comparison. +* \relates Simd4i */ +inline int anyTrue(const Simd4i& v); + +#endif // NVMATH_DISTINCT_TYPES + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/Simd4i.h" +#elif NVMATH_NEON +#include "neon/Simd4i.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/Simd4i.h" +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h new file mode 100644 index 00000000..e44e876a --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h @@ -0,0 +1,150 @@ +// This code contains NVIDIA Confidential Information and is disclosed to you +// under a form of NVIDIA software license agreement provided separately to you. 
+// +// Notice +// NVIDIA Corporation and its licensors retain all intellectual property and +// proprietary rights in and to this software and related documentation and +// any modifications thereto. Any use, reproduction, disclosure, or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA Corporation is strictly prohibited. +// +// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES +// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO +// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT, +// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE. +// +// Information and code furnished is believed to be accurate and reliable. +// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such +// information or for any infringement of patents or other rights of third parties that may +// result from its use. No license is granted by implication or otherwise under any patent +// or patent rights of NVIDIA Corporation. Details are subject to change without notice. +// This code supersedes and replaces all information previously supplied. +// NVIDIA Corporation products are not authorized for use as critical +// components in life support devices or systems without express written approval of +// NVIDIA Corporation. +// +// Copyright (c) 2008-2015 NVIDIA Corporation. All rights reserved. +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include <cmath> + +// ps4 compiler defines _M_X64 without value +#if((defined _M_IX86) || (defined _M_X64) || (defined __i386__) || (defined __x86_64__)) +#define NVMATH_SSE2 1 +#else +#define NVMATH_SSE2 0 +#endif +#define NVMATH_NEON (defined _M_ARM || defined __ARM_NEON__) + +// which simd types are implemented (one or both are all valid options) +#define NVMATH_SIMD (NVMATH_SSE2 || NVMATH_NEON) +#define NVMATH_SCALAR !NVMATH_SIMD +// #define NVMATH_SCALAR 1 + +// use template expression to fuse multiply-adds into a single instruction +#define NVMATH_FUSE_MULTIPLY_ADD (NVMATH_NEON) +// support shift by vector operarations +#define NVMATH_SHIFT_BY_VECTOR (NVMATH_NEON) +// Simd4f and Simd4i map to different types +#define NVMATH_DISTINCT_TYPES (NVMATH_SSE2 || NVMATH_NEON) +// support inline assembler +#define NVMATH_INLINE_ASSEMBLER !((defined _M_ARM) || (defined SN_TARGET_PSP2) || (defined __arm64__)) + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +/*! \brief Expression template to fuse and-not. 
*/ +template <typename T> +struct ComplementExpr +{ + inline ComplementExpr(T const& v_) : v(v_) + { + } + inline operator T() const; + const T v; + + private: + ComplementExpr& operator=(const ComplementExpr&); // not implemented +}; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// helper functions +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <typename T> +T sqr(const T& x) +{ + return x * x; +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// details +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +namespace detail +{ +template <typename T> +struct AlignedPointer +{ + AlignedPointer(const T* p) : ptr(p) + { + } + const T* ptr; +}; + +template <typename T> +struct OffsetPointer +{ + OffsetPointer(const T* p, unsigned int off) : ptr(p), offset(off) + { + } + const T* ptr; + unsigned int offset; +}; + +struct FourTuple +{ +}; + +// zero and one literals +template <int i> +struct IntType +{ +}; +} + +// Supress warnings +#if defined(__GNUC__) || defined(__SNC__) +#define NVMATH_UNUSED __attribute__((unused)) +#else +#define NVMATH_UNUSED +#endif + +static detail::IntType<0> _0 NVMATH_UNUSED; +static detail::IntType<1> _1 NVMATH_UNUSED; +static detail::IntType<int(0x80000000)> _sign NVMATH_UNUSED; +static detail::IntType<-1> _true NVMATH_UNUSED; + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// platform specific includes +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +#if NVMATH_SSE2 +#include "sse2/SimdTypes.h" +#elif NVMATH_NEON +#include "neon/SimdTypes.h" +#else +struct Simd4f; +struct Simd4i; +#endif + +#if NVMATH_SCALAR +#include "scalar/SimdTypes.h" +#else +struct Scalar4f; +struct Scalar4i; +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h new file mode 100644 index 00000000..f8c6b2dc --- 
/dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include <PxAssert.h> + +#if PX_LINUX_FAMILY +#include <stdint.h> // intptr_t +#endif + +template <size_t align> +class StackAllocator +{ + typedef unsigned char byte; + + // todo: switch to offsets so size is consistent on x64 + // mSize is just for book keeping so could be 4 bytes + struct Header + { + Header* mPrev; + size_t mSize : 31; + size_t mFree : 1; + }; + + StackAllocator(const StackAllocator&); + StackAllocator& operator=(const StackAllocator&); + + public: + StackAllocator(void* buffer, size_t bufferSize) + : mBuffer(reinterpret_cast<byte*>(buffer)), mBufferSize(bufferSize), mFreeStart(mBuffer), mTop(0) + { + } + + ~StackAllocator() + { + PX_ASSERT(userBytes() == 0); + } + + void* allocate(size_t numBytes) + { + // this is non-standard + if(!numBytes) + return 0; + + uintptr_t unalignedStart = uintptr_t(mFreeStart) + sizeof(Header); + + byte* allocStart = reinterpret_cast<byte*>((unalignedStart + (align - 1)) & ~(align - 1)); + byte* allocEnd = allocStart + numBytes; + + // ensure there is space for the alloc + PX_ASSERT(allocEnd <= mBuffer + mBufferSize); + + Header* h = getHeader(allocStart); + h->mPrev = mTop; + h->mSize = numBytes; + h->mFree = false; + + mTop = h; + mFreeStart = allocEnd; + + return allocStart; + } + + void deallocate(void* p) + { + 
if(!p) + return; + + Header* h = getHeader(p); + h->mFree = true; + + // unwind the stack to the next live alloc + while(mTop && mTop->mFree) + { + mFreeStart = reinterpret_cast<byte*>(mTop); + mTop = mTop->mPrev; + } + } + + private: + // return the header for an allocation + inline Header* getHeader(void* p) const + { + PX_ASSERT((reinterpret_cast<uintptr_t>(p) & (align - 1)) == 0); + PX_ASSERT(reinterpret_cast<byte*>(p) >= mBuffer + sizeof(Header)); + PX_ASSERT(reinterpret_cast<byte*>(p) < mBuffer + mBufferSize); + + return reinterpret_cast<Header*>(p) - 1; + } + + public: + // total user-allocated bytes not including any overhead + size_t userBytes() const + { + size_t total = 0; + Header* iter = mTop; + while(iter) + { + total += iter->mSize; + iter = iter->mPrev; + } + + return total; + } + + // total user-allocated bytes + overhead + size_t totalUsedBytes() const + { + return mFreeStart - mBuffer; + } + + size_t remainingBytes() const + { + return mBufferSize - totalUsedBytes(); + } + + size_t wastedBytes() const + { + return totalUsedBytes() - userBytes(); + } + + private: + byte* const mBuffer; + const size_t mBufferSize; + + byte* mFreeStart; // start of free space + Header* mTop; // top allocation header +}; diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp new file mode 100644 index 00000000..2283a319 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "SwCloth.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "TripletScheduler.h" +#include "ClothBase.h" + +namespace nvidia +{ +namespace cloth +{ +PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp +} +} + +using namespace nvidia; +using namespace physx::shdfnd; +using namespace nvidia; + +cloth::SwCloth::SwCloth(SwFactory& factory, SwFabric& fabric, Range<const PxVec4> particles) +: mFactory(factory) +, mFabric(fabric) +, mNumVirtualParticles(0) +#if APEX_UE4 +, mSimulationTask(NULL) +#endif +, mUserData(0) +{ + PX_ASSERT(!particles.empty()); + + initialize(*this, particles.begin(), particles.end()); + +#if PX_WINDOWS_FAMILY + const uint32_t kSimdWidth = 8; // avx +#else + const uint32_t kSimdWidth = 4; // sse +#endif + + mCurParticles.reserve(particles.size() + kSimdWidth - 1); + mCurParticles.assign(reinterpret_cast<const PxVec4*>(particles.begin()), + reinterpret_cast<const PxVec4*>(particles.end())); + + // 7 dummy particles used in SIMD solver + mCurParticles.resize(particles.size() + kSimdWidth - 1, PxVec4(0.0f)); + mPrevParticles = mCurParticles; + + mCurParticles.resize(particles.size()); + mPrevParticles.resize(particles.size()); + + mFabric.incRefCount(); +} + +namespace +{ +// copy vector and make same capacity +void copyVector(cloth::Vec4fAlignedVector& dst, const cloth::Vec4fAlignedVector& src) +{ + dst.reserve(src.capacity()); + dst.assign(src.begin(), src.end()); + + // ensure valid dummy data + dst.resize(src.capacity(), PxVec4(0.0f)); + dst.resize(src.size()); +} +} + +// copy constructor, supports rebinding to a different factory +cloth::SwCloth::SwCloth(SwFactory& factory, const SwCloth& cloth) +: mFactory(factory) +, mFabric(cloth.mFabric) +, mClothCostDirty(true) +, mPhaseConfigs(cloth.mPhaseConfigs) +, mCapsuleIndices(cloth.mCapsuleIndices) +, 
mStartCollisionSpheres(cloth.mStartCollisionSpheres) +, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres) +, mStartCollisionPlanes(cloth.mStartCollisionPlanes) +, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes) +, mStartCollisionTriangles(cloth.mStartCollisionTriangles) +, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles) +, mVirtualParticleIndices(cloth.mVirtualParticleIndices) +, mVirtualParticleWeights(cloth.mVirtualParticleWeights) +, mNumVirtualParticles(cloth.mNumVirtualParticles) +, mSelfCollisionIndices(cloth.mSelfCollisionIndices) +, mRestPositions(cloth.mRestPositions) +#if APEX_UE4 +, mSimulationTask(NULL) +#endif +{ + copy(*this, cloth); + + // carry over capacity (using as dummy particles) + copyVector(mCurParticles, cloth.mCurParticles); + copyVector(mPrevParticles, cloth.mPrevParticles); + copyVector(mMotionConstraints.mStart, cloth.mMotionConstraints.mStart); + copyVector(mMotionConstraints.mTarget, cloth.mMotionConstraints.mTarget); + copyVector(mSeparationConstraints.mStart, cloth.mSeparationConstraints.mStart); + copyVector(mSeparationConstraints.mTarget, cloth.mSeparationConstraints.mTarget); + copyVector(mParticleAccelerations, cloth.mParticleAccelerations); + + mFabric.incRefCount(); +} + +cloth::SwCloth::~SwCloth() +{ + mFabric.decRefCount(); +} + +cloth::Range<PxVec4> cloth::SwCloth::push(SwConstraints& constraints) +{ + uint32_t n = mCurParticles.size(); + + if(!constraints.mTarget.capacity()) + constraints.mTarget.resize((n + 3) & ~3, PxVec4(0.0f)); // reserve multiple of 4 for SIMD + + constraints.mTarget.resizeUninitialized(n); + PxVec4* data = &constraints.mTarget.front(); + Range<PxVec4> result(data, data + constraints.mTarget.size()); + + if(constraints.mStart.empty()) // initialize start first + constraints.mStart.swap(constraints.mTarget); + + return result; +} + +void cloth::SwCloth::clear(SwConstraints& constraints) +{ + Vec4fAlignedVector().swap(constraints.mStart); + 
Vec4fAlignedVector().swap(constraints.mTarget); +} + +cloth::Range<const PxVec3> cloth::SwCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t) +{ + return range; +} + +#include "ClothImpl.h" + +namespace nvidia +{ +namespace cloth +{ + +template <> +Cloth* ClothImpl<SwCloth>::clone(Factory& factory) const +{ + return factory.clone(*this); +} + +template <> +uint32_t ClothImpl<SwCloth>::getNumParticles() const +{ + return mCloth.mCurParticles.size(); +} + +template <> +void ClothImpl<SwCloth>::lockParticles() const +{ +} + +template <> +void ClothImpl<SwCloth>::unlockParticles() const +{ +} + +template <> +MappedRange<PxVec4> ClothImpl<SwCloth>::getCurrentParticles() +{ + return getMappedParticles(&mCloth.mCurParticles.front()); +} + +template <> +MappedRange<const PxVec4> ClothImpl<SwCloth>::getCurrentParticles() const +{ + return getMappedParticles(&mCloth.mCurParticles.front()); +} + +template <> +MappedRange<PxVec4> ClothImpl<SwCloth>::getPreviousParticles() +{ + return getMappedParticles(&mCloth.mPrevParticles.front()); +} + +template <> +MappedRange<const PxVec4> ClothImpl<SwCloth>::getPreviousParticles() const +{ + return getMappedParticles(&mCloth.mPrevParticles.front()); +} + +template <> +GpuParticles ClothImpl<SwCloth>::getGpuParticles() +{ + GpuParticles result = { 0, 0, 0 }; + return result; +} + +template <> +void ClothImpl<SwCloth>::setPhaseConfig(Range<const PhaseConfig> configs) +{ + mCloth.mPhaseConfigs.resize(0); + + // transform phase config to use in solver + for(; !configs.empty(); configs.popFront()) + if(configs.front().mStiffness > 0.0f) + mCloth.mPhaseConfigs.pushBack(transform(configs.front())); + + mCloth.wakeUp(); +} + +template <> +void ClothImpl<SwCloth>::setSelfCollisionIndices(Range<const uint32_t> indices) +{ + ContextLockType lock(mCloth.mFactory); + mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end()); + mCloth.notifyChanged(); + mCloth.wakeUp(); +} + +template <> +uint32_t 
ClothImpl<SwCloth>::getNumVirtualParticles() const +{ + return uint32_t(mCloth.mNumVirtualParticles); +} + +template <> +Range<PxVec4> ClothImpl<SwCloth>::getParticleAccelerations() +{ + if(mCloth.mParticleAccelerations.empty()) + { + uint32_t n = mCloth.mCurParticles.size(); + mCloth.mParticleAccelerations.resize(n, PxVec4(0.0f)); + } + + mCloth.wakeUp(); + + PxVec4* data = &mCloth.mParticleAccelerations.front(); + return Range<PxVec4>(data, data + mCloth.mParticleAccelerations.size()); +} + +template <> +void ClothImpl<SwCloth>::clearParticleAccelerations() +{ + Vec4fAlignedVector().swap(mCloth.mParticleAccelerations); + mCloth.wakeUp(); +} + +template <> +void ClothImpl<SwCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) +{ + mCloth.mNumVirtualParticles = 0; + + // shuffle indices to form independent SIMD sets + uint16_t numParticles = uint16_t(mCloth.mCurParticles.size()); + TripletScheduler scheduler(indices); + scheduler.simd(numParticles, 4); + + // convert indices to byte offset + Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0); + Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin(); + Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end(); + TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast; + mCloth.mVirtualParticleIndices.resize(0); + mCloth.mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt)); + for(; sIt != sEnd; ++sIt) + { + uint32_t setSize = *sIt; + for(tLast = tIt + setSize; tIt != tLast; ++tIt, ++mCloth.mNumVirtualParticles) + mCloth.mVirtualParticleIndices.pushBack(Vec4us(*tIt)); + mCloth.mVirtualParticleIndices.resize((mCloth.mVirtualParticleIndices.size() + 3) & ~3, dummy); + } + Vector<Vec4us>::Type(mCloth.mVirtualParticleIndices.begin(), mCloth.mVirtualParticleIndices.end()) + .swap(mCloth.mVirtualParticleIndices); + + // precompute 1/dot(w,w) + 
Vec4fAlignedVector().swap(mCloth.mVirtualParticleWeights); + mCloth.mVirtualParticleWeights.reserve(weights.size()); + for(; !weights.empty(); weights.popFront()) + { + PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front()); + float scale = 1 / w.magnitudeSquared(); + mCloth.mVirtualParticleWeights.pushBack(PxVec4(w.x, w.y, w.z, scale)); + } + + mCloth.notifyChanged(); +} + +#if APEX_UE4 +template <> +void ClothImpl<SwCloth>::simulate(float dt) +{ + (*SwCloth::sSimulationFunction)(mCloth.mSimulationTask, dt); +} +#endif + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h new file mode 100644 index 00000000..3d0569af --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#pragma once

#include "Cloth.h"
#include "Range.h"
#include "MovingAverage.h"
#include "PhaseConfig.h"
#include "IndexPair.h"
#include "Vec4T.h"
#include "Array.h"
#include "PxTransform.h"

namespace nvidia
{

namespace cloth
{

class SwFabric;
class SwFactory;
#if APEX_UE4
class SwCloth;
#endif

// particle storage: PxVec4 (xyz = position, w = inverse mass), 16-byte aligned for SIMD
typedef AlignedVector<PxVec4, 16>::Type Vec4fAlignedVector;

// start/target buffer pair for interpolated (motion/separation) constraints
struct SwConstraints
{
	// promote target to start at the beginning of a new frame
	void pop()
	{
		if(!mTarget.empty())
		{
			mStart.swap(mTarget);
			mTarget.resize(0);
		}
	}

	Vec4fAlignedVector mStart;
	Vec4fAlignedVector mTarget;
};

// CPU cloth instance: all solver state for one piece of cloth. Members are
// public because SwClothData/SwSolverKernel snapshot them directly each frame.
class SwCloth
{
	SwCloth& operator=(const SwCloth&); // not implemented
	// CPU factory needs no real context locking; this is a no-op stand-in
	struct SwContextLock
	{
		SwContextLock(const SwFactory&)
		{
		}
	};

  public:
	typedef SwFactory FactoryType;
	typedef SwFabric FabricType;
	typedef SwContextLock ContextLockType;

	typedef Vec4fAlignedVector& MappedVec4fVectorType;
	typedef Vector<IndexPair>::Type& MappedIndexVectorType;

	SwCloth(SwFactory&, SwFabric&, Range<const PxVec4>);
	SwCloth(SwFactory&, const SwCloth&);
	~SwCloth(); // not virtual on purpose

  public:
	// asleep once enough consecutive sleep tests have passed
	bool isSleeping() const
	{
		return mSleepPassCounter >= mSleepAfterCount;
	}
	void wakeUp()
	{
		mSleepPassCounter = 0;
	}

	// no cached device data to invalidate on the CPU path
	void notifyChanged()
	{
	}

	void setParticleBounds(const float*);

	Range<PxVec4> push(SwConstraints&);
	static void clear(SwConstraints&);

	static Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t);

  public:
	SwFactory& mFactory;
	SwFabric& mFabric;

	bool mClothCostDirty;

	// current and previous-iteration particle positions
	Vec4fAlignedVector mCurParticles;
	Vec4fAlignedVector mPrevParticles;

	PxVec3 mParticleBoundsCenter;
	PxVec3 mParticleBoundsHalfExtent;

	PxVec3 mGravity;
	PxVec3 mLogDamping;
	PxVec3 mLinearLogDrag;
	PxVec3 mAngularLogDrag;
	PxVec3 mLinearInertia;
	PxVec3 mAngularInertia;
	PxVec3 mCentrifugalInertia;
	float mSolverFrequency;
	float mStiffnessFrequency;

	PxTransform mTargetMotion;
	PxTransform mCurrentMotion;
	PxVec3 mLinearVelocity;
	PxVec3 mAngularVelocity;

	float mPrevIterDt;
	MovingAverage mIterDtAvg;

	Vector<PhaseConfig>::Type mPhaseConfigs; // transformed!

	// tether constraints stuff
	float mTetherConstraintLogStiffness;
	float mTetherConstraintScale;

	// motion constraints stuff
	SwConstraints mMotionConstraints;
	float mMotionConstraintScale;
	float mMotionConstraintBias;
	float mMotionConstraintLogStiffness;

	// separation constraints stuff
	SwConstraints mSeparationConstraints;

	// particle acceleration stuff
	Vec4fAlignedVector mParticleAccelerations;

	// collision stuff
	Vector<IndexPair>::Type mCapsuleIndices;
	Vec4fAlignedVector mStartCollisionSpheres;
	Vec4fAlignedVector mTargetCollisionSpheres;
	Vector<uint32_t>::Type mConvexMasks;
	Vec4fAlignedVector mStartCollisionPlanes;
	Vec4fAlignedVector mTargetCollisionPlanes;
	Vector<PxVec3>::Type mStartCollisionTriangles;
	Vector<PxVec3>::Type mTargetCollisionTriangles;
	bool mEnableContinuousCollision;
	float mCollisionMassScale;
	float mFriction;

	// virtual particles
	Vector<Vec4us>::Type mVirtualParticleIndices;
	Vec4fAlignedVector mVirtualParticleWeights;
	uint32_t mNumVirtualParticles;

	// self collision
	float mSelfCollisionDistance;
	float mSelfCollisionLogStiffness;

	Vector<uint32_t>::Type mSelfCollisionIndices;

	Vec4fAlignedVector mRestPositions;

	// sleeping
	uint32_t mSleepTestInterval; // how often to test for movement
	uint32_t mSleepAfterCount;   // number of tests to pass before sleep
	float mSleepThreshold;       // max movement delta to pass test
	uint32_t mSleepPassCounter;  // how many tests passed
	uint32_t mSleepTestCounter;  // how many iterations since tested

	// unused for CPU simulation
	bool mIsAllowedHalfPrecisionSolver;

#if APEX_UE4
	void* mSimulationTask;
	static void (*const sSimulationFunction)(void*, float);
#endif

	void* mUserData;

} PX_ALIGN_SUFFIX(16);

} // namespace cloth

// bounds = lower[3], upper[3]
inline void cloth::SwCloth::setParticleBounds(const float* bounds)
{
	for(uint32_t i = 0; i < 3; ++i)
	{
		mParticleBoundsCenter[i] = (bounds[3 + i] + bounds[i]) * 0.5f;
		mParticleBoundsHalfExtent[i] = (bounds[3 + i] - bounds[i]) * 0.5f;
	}
}
} // namespace nvidia

// --- diff boundary: new file APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.cpp (index bc09612f) ---
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "SwClothData.h"
#include "SwCloth.h"
#include "SwFabric.h"
#include "Simd4f.h"
#include "PsUtilities.h"

using namespace nvidia;

// Snapshots raw pointers/values out of SwCloth and SwFabric for the solver
// kernels. The pointers alias the cloth's own arrays, so the cloth must not be
// mutated while this snapshot is live; scalar results flow back via reconcile().
cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric)
{
	mNumParticles = uint32_t(cloth.mCurParticles.size());
	mCurParticles = array(cloth.mCurParticles.front());
	mPrevParticles = array(cloth.mPrevParticles.front());

	// expand center/half-extent into lower[3], upper[3]
	const float* center = array(cloth.mParticleBoundsCenter);
	const float* extent = array(cloth.mParticleBoundsHalfExtent);
	for(uint32_t i = 0; i < 3; ++i)
	{
		mCurBounds[i] = center[i] - extent[i];
		mCurBounds[i + 3] = center[i] + extent[i];
	}

	// avoid reading uninitialized data into mCurBounds, even though it's never used.
	mPrevBounds[0] = 0.0f;

	mConfigBegin = cloth.mPhaseConfigs.empty() ? 0 : &cloth.mPhaseConfigs.front();
	mConfigEnd = mConfigBegin + cloth.mPhaseConfigs.size();

	mPhases = &fabric.mPhases.front();
	mNumPhases = uint32_t(fabric.mPhases.size());

	mSets = &fabric.mSets.front();
	mNumSets = uint32_t(fabric.mSets.size());

	mRestvalues = &fabric.mRestvalues.front();
	mNumRestvalues = uint32_t(fabric.mRestvalues.size());

	mIndices = &fabric.mIndices.front();
	mNumIndices = uint32_t(fabric.mIndices.size());

	// converts per-second log-stiffness into a per-iteration multiplier below
	float stiffnessExponent = cloth.mStiffnessFrequency * cloth.mPrevIterDt * 0.69314718055994531f; // logf(2.0f);

	mTethers = fabric.mTethers.begin();
	mNumTethers = uint32_t(fabric.mTethers.size());
	mTetherConstraintStiffness = 1.0f - exp(stiffnessExponent * cloth.mTetherConstraintLogStiffness);
	mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale;

	// null pointers signal "feature disabled" to the kernels
	mStartMotionConstraints = cloth.mMotionConstraints.mStart.size() ? array(cloth.mMotionConstraints.mStart.front()) : 0;
	mTargetMotionConstraints =
	    !cloth.mMotionConstraints.mTarget.empty() ? array(cloth.mMotionConstraints.mTarget.front()) : 0;
	mMotionConstraintStiffness = 1.0f - exp(stiffnessExponent * cloth.mMotionConstraintLogStiffness);

	mStartSeparationConstraints =
	    cloth.mSeparationConstraints.mStart.size() ? array(cloth.mSeparationConstraints.mStart.front()) : 0;
	mTargetSeparationConstraints =
	    !cloth.mSeparationConstraints.mTarget.empty() ? array(cloth.mSeparationConstraints.mTarget.front()) : 0;

	mParticleAccelerations = cloth.mParticleAccelerations.size() ? array(cloth.mParticleAccelerations.front()) : 0;

	// missing target shapes fall back to the start shapes (no interpolation)
	mStartCollisionSpheres = cloth.mStartCollisionSpheres.empty() ? 0 : array(cloth.mStartCollisionSpheres.front());
	mTargetCollisionSpheres =
	    cloth.mTargetCollisionSpheres.empty() ? mStartCollisionSpheres : array(cloth.mTargetCollisionSpheres.front());
	mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size());

	mCapsuleIndices = cloth.mCapsuleIndices.empty() ? 0 : &cloth.mCapsuleIndices.front();
	mNumCapsules = uint32_t(cloth.mCapsuleIndices.size());

	mStartCollisionPlanes = cloth.mStartCollisionPlanes.empty() ? 0 : array(cloth.mStartCollisionPlanes.front());
	mTargetCollisionPlanes =
	    cloth.mTargetCollisionPlanes.empty() ? mStartCollisionPlanes : array(cloth.mTargetCollisionPlanes.front());
	mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size());

	mConvexMasks = cloth.mConvexMasks.empty() ? 0 : &cloth.mConvexMasks.front();
	mNumConvexes = uint32_t(cloth.mConvexMasks.size());

	mStartCollisionTriangles = cloth.mStartCollisionTriangles.empty() ? 0 : array(cloth.mStartCollisionTriangles.front());
	mTargetCollisionTriangles = cloth.mTargetCollisionTriangles.empty() ? mStartCollisionTriangles
	                                                                    : array(cloth.mTargetCollisionTriangles.front());
	mNumTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;

	mVirtualParticlesBegin = cloth.mVirtualParticleIndices.empty() ? 0 : array(cloth.mVirtualParticleIndices.front());
	mVirtualParticlesEnd = mVirtualParticlesBegin + 4 * cloth.mVirtualParticleIndices.size();
	mVirtualParticleWeights = cloth.mVirtualParticleWeights.empty() ? 0 : array(cloth.mVirtualParticleWeights.front());
	mNumVirtualParticleWeights = uint32_t(cloth.mVirtualParticleWeights.size());

	mEnableContinuousCollision = cloth.mEnableContinuousCollision;
	mCollisionMassScale = cloth.mCollisionMassScale;
	mFrictionScale = cloth.mFriction;

	mSelfCollisionDistance = cloth.mSelfCollisionDistance;
	mSelfCollisionStiffness = 1.0f - exp(stiffnessExponent * cloth.mSelfCollisionLogStiffness);

	// no explicit index list means all particles self-collide
	mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin();
	mNumSelfCollisionIndices = mSelfCollisionIndices ? cloth.mSelfCollisionIndices.size() : mNumParticles;

	mRestPositions = cloth.mRestPositions.size() ? array(cloth.mRestPositions.front()) : 0;

	mSleepPassCounter = cloth.mSleepPassCounter;
	mSleepTestCounter = cloth.mSleepTestCounter;
}

// write back the values the solver updated during the frame
void cloth::SwClothData::reconcile(SwCloth& cloth) const
{
	cloth.setParticleBounds(mCurBounds);
	cloth.mSleepTestCounter = mSleepTestCounter;
	cloth.mSleepPassCounter = mSleepPassCounter;
}

void cloth::SwClothData::verify() const
{
	// checks need to be run after the constructor because they read the pointers set up there

	// every capsule endpoint must reference a valid sphere
	PX_ASSERT(!mNumCapsules ||
	          mNumSpheres > *nvidia::maxElement(&mCapsuleIndices->first, &(mCapsuleIndices + mNumCapsules)->first));

	// convex masks may only reference existing planes
	PX_ASSERT(!mNumConvexes || (1u << mNumPlanes) - 1 >= *nvidia::maxElement(mConvexMasks, mConvexMasks + mNumConvexes));
}

// --- diff boundary: new file APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.h (index 3aaa6a2b) ---
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Px.h"
#include "Types.h"

namespace nvidia
{
namespace cloth
{

class SwCloth;
class SwFabric;
struct PhaseConfig;
struct IndexPair;
struct SwTether;

// reference to cloth instance bulk data (POD)
// Filled once per frame from SwCloth/SwFabric (see SwClothData.cpp); all
// pointers alias arrays owned by the cloth/fabric — nothing here is owned.
struct SwClothData
{
	SwClothData(SwCloth&, const SwFabric&);
	void reconcile(SwCloth&) const;
	void verify() const;

	// particle data
	uint32_t mNumParticles;
	float* mCurParticles;
	float* mPrevParticles;

	float mCurBounds[6]; // lower[3], upper[3]
	float mPrevBounds[6];
	float mPadding; // write as simd

	// distance constraints
	const PhaseConfig* mConfigBegin;
	const PhaseConfig* mConfigEnd;

	const uint32_t* mPhases;
	uint32_t mNumPhases;

	const uint32_t* mSets;
	uint32_t mNumSets;

	const float* mRestvalues;
	uint32_t mNumRestvalues;

	const uint16_t* mIndices;
	uint32_t mNumIndices;

	const SwTether* mTethers;
	uint32_t mNumTethers;
	float mTetherConstraintStiffness;
	float mTetherConstraintScale;

	// motion constraint data (null pointers = feature disabled)
	const float* mStartMotionConstraints;
	const float* mTargetMotionConstraints;
	float mMotionConstraintStiffness;

	// separation constraint data
	const float* mStartSeparationConstraints;
	const float* mTargetSeparationConstraints;

	// particle acceleration data
	const float* mParticleAccelerations;

	// collision stuff (start/target pairs are interpolated over the frame)
	const float* mStartCollisionSpheres;
	const float* mTargetCollisionSpheres;
	uint32_t mNumSpheres;

	const IndexPair* mCapsuleIndices;
	uint32_t mNumCapsules;

	const float* mStartCollisionPlanes;
	const float* mTargetCollisionPlanes;
	uint32_t mNumPlanes;

	const uint32_t* mConvexMasks;
	uint32_t mNumConvexes;

	const float* mStartCollisionTriangles;
	const float* mTargetCollisionTriangles;
	uint32_t mNumTriangles;

	const uint16_t* mVirtualParticlesBegin;
	const uint16_t* mVirtualParticlesEnd;

	const float* mVirtualParticleWeights;
	uint32_t mNumVirtualParticleWeights;

	bool mEnableContinuousCollision;
	float mFrictionScale;
	float mCollisionMassScale;

	float mSelfCollisionDistance;
	float mSelfCollisionStiffness;

	uint32_t mNumSelfCollisionIndices;
	const uint32_t* mSelfCollisionIndices;

	float* mRestPositions;

	// sleep data
	uint32_t mSleepPassCounter;
	uint32_t mSleepTestCounter;

} PX_ALIGN_SUFFIX(16);
}
}

// --- diff boundary: new file APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.cpp (index 581d276b) ---
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "SwCollision.h"
#include "SwCloth.h"
#include "SwClothData.h"
#include "IterationState.h"
#include "BoundingBox.h"
#include "PointInterpolator.h"
#include "SwCollisionHelpers.h"
#include "PxAssert.h"
#include <string.h> // for memset

using namespace nvidia;

// the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision
template <typename Simd4f>
const Simd4f cloth::SwCollision<Simd4f>::sSkeletonWidth = simd4f(sqr(1 - 0.2f) - 1);

#if NVMATH_SSE2
const Simd4i cloth::Gather<Simd4i>::sIntSignBit = simd4i(_sign);
const Simd4i cloth::Gather<Simd4i>::sSignedMask = sIntSignBit | simd4i(0x7);
#elif NVMATH_NEON
const Simd4i cloth::Gather<Simd4i>::sPack = simd4i(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c);
const Simd4i cloth::Gather<Simd4i>::sOffset = simd4i(0x03020100);
const Simd4i cloth::Gather<Simd4i>::sShift = simd4i(detail::IntType<2>());
const Simd4i cloth::Gather<Simd4i>::sMask = simd4i(detail::IntType<7>());
#endif

namespace
{
typedef Simd4fFactory<detail::FourTuple> Simd4fConstant;

// lane-constant SIMD values shared by all collision kernels
const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON);
const Simd4fConstant sMax = simd4f(FLT_MAX);
const Simd4fConstant sMaskX = simd4f(simd4i(~0, 0, 0, 0));
const Simd4fConstant sMaskZ = simd4f(simd4i(0, 0, ~0, 0));
const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0));
const Simd4fConstant sZero = simd4f(0.0f);
const Simd4fConstant sOne = simd4f(1.0f);
const Simd4fConstant sNegOne = simd4f(-1.0f);
const Simd4fConstant sHalf = simd4f(0.5f);
const Simd4fConstant sOneXYZ = simd4f(1.0f, 1.0f, 1.0f, 0.0f);
const Simd4fConstant sGridLength = simd4f(8 - 1e-3f); // sGridSize
const Simd4fConstant sGridExpand = simd4f(1e-4f);
const Simd4fConstant sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);

#if PX_PROFILE || PX_DEBUG
// rounds the sum of all four lanes to an integer (profiling counters)
template <typename Simd4f>
uint32_t horizontalSum(const Simd4f& x)
{
	const float* p = array(x);
	return uint32_t(0.5f + p[0] + p[1] + p[2] + p[3]);
}
#endif

// 7 elements are written to ptr!
template <typename Simd4f>
void storeBounds(float* ptr, const cloth::BoundingBox<Simd4f>& bounds)
{
	store(ptr, bounds.mLower);
	store(ptr + 3, bounds.mUpper);
}
}

struct cloth::SphereData
{
	PxVec3 center;
	float radius;
};

// capsule represented as a truncated cone between two spheres
struct cloth::ConeData
{
	PxVec3 center;
	float radius; // cone radius at center
	PxVec3 axis;
	float slope; // tan(alpha)

	float sqrCosine; // cos^2(alpha)
	float halfLength;

	uint32_t firstMask;
	uint32_t bothMask;
};

// triangle with precomputed edge products for closest-point queries
struct cloth::TriangleData
{
	PxVec3 base;
	float edge0DotEdge1;

	PxVec3 edge0;
	float edge0SqrLength;

	PxVec3 edge1;
	float edge1SqrLength;

	PxVec3 normal;
	float padding;

	float det;
	float denom;

	float edge0InvSqrLength;
	float edge1InvSqrLength;
};

namespace nvidia
{
namespace cloth
{
// grow bbox to enclose every sphere (center +/- radius per axis)
template <typename Simd4f>
BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bbox, const SphereData* sIt, const SphereData* sEnd)
{
	BoundingBox<Simd4f> result = bbox;
	for(; sIt != sEnd; ++sIt)
	{
		Simd4f p = loadAligned(array(sIt->center));
		Simd4f r = splat<3>(p);
		result.mLower = min(result.mLower, p - r);
		result.mUpper = max(result.mUpper, p + r);
	}
	return result;
}
}
}

namespace
{
template <typename Simd4f, typename SrcIterator>
void generateSpheres(Simd4f* dIt, const SrcIterator& src, uint32_t count)
{
	// have to copy out iterator to ensure alignment is maintained
	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
		*dIt = max(sMinusFloatMaxXYZ, *sIt); // clamp radius to 0
}

// derive one ConeData per capsule from its two endpoint spheres
void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, const cloth::IndexPair* capsuleIndices,
                   uint32_t numCones)
{
	cloth::ConeData* cIt = dst;
	for(const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt)
	{
		PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]);
		PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]);

		PxVec4 center = (second + first) * 0.5f;
		PxVec4 axis = (second - first) * 0.5f;

		float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
		float sqrConeLength = sqrAxisLength - sqr(axis.w);

		float invAxisLength = 1 / sqrtf(sqrAxisLength);
		float invConeLength = 1 / sqrtf(sqrConeLength);

		// degenerate capsule (one sphere contains the other): disable the cone
		if(sqrConeLength <= 0.0f)
			invAxisLength = invConeLength = 0.0f;

		float axisLength = sqrAxisLength * invAxisLength;
		float slope = axis.w * invConeLength;

		cIt->center = PxVec3(center.x, center.y, center.z);
		cIt->radius = (axis.w + first.w) * invConeLength * axisLength;
		cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength;
		cIt->slope = slope;

		cIt->sqrCosine = 1.0f - sqr(axis.w * invAxisLength);
		cIt->halfLength = axisLength;

		// bit masks identifying this capsule's spheres (sphere index -> bit)
		uint32_t firstMask = 0x1u << iIt->first;
		cIt->firstMask = firstMask;
		cIt->bothMask = firstMask | 0x1u << iIt->second;
	}
}

template <typename Simd4f, typename SrcIterator>
void generatePlanes(Simd4f* dIt, const SrcIterator& src, uint32_t count)
{
	// have to copy out iterator to ensure alignment is maintained
	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
		*dIt = *sIt;
}

// consume 3 source points per triangle and precompute the TriangleData fields
template <typename Simd4f, typename SrcIterator>
void generateTriangles(cloth::TriangleData* dIt, const SrcIterator& src, uint32_t count)
{
	// have to copy out iterator to ensure alignment is maintained
	for(SrcIterator sIt = src; 0 < count--; ++dIt)
	{
		Simd4f p0 = *sIt;
		++sIt;
		Simd4f p1 = *sIt;
		++sIt;
		Simd4f p2 = *sIt;
		++sIt;

		Simd4f edge0 = p1 - p0;
		Simd4f edge1 = p2 - p0;
		Simd4f normal = cross3(edge0, edge1);

		Simd4f edge0SqrLength = dot3(edge0, edge0);
		Simd4f edge1SqrLength = dot3(edge1, edge1);
		Simd4f edge0DotEdge1 = dot3(edge0, edge1);
		Simd4f normalInvLength = rsqrt(dot3(normal, normal));

		Simd4f det = edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1;
		Simd4f denom = edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1;

		// there are definitely faster ways...
		Simd4f aux = select(sMaskX, det, denom);
		aux = select(sMaskZ, edge0SqrLength, aux);
		aux = select(sMaskW, edge1SqrLength, aux);

		// pack reciprocals of det/denom/edge lengths into the trailing floats
		storeAligned(&dIt->base.x, select(sMaskW, edge0DotEdge1, p0));
		storeAligned(&dIt->edge0.x, select(sMaskW, edge0SqrLength, edge0));
		storeAligned(&dIt->edge1.x, select(sMaskW, edge1SqrLength, edge1));
		storeAligned(&dIt->normal.x, normal * normalInvLength);
		storeAligned(&dIt->det, recipT<1>(aux));
	}
}

} // namespace

template <typename Simd4f>
cloth::SwCollision<Simd4f>::CollisionData::CollisionData() : mSpheres(0), mCones(0)
{
}

// Prepares per-frame collision state. When continuous collision or friction is
// on, the previous frame's sphere/cone shapes are also needed and generated here.
template <typename Simd4f>
cloth::SwCollision<Simd4f>::SwCollision(SwClothData& clothData, SwKernelAllocator& alloc, profile::PxProfileZone* profiler)
: mClothData(clothData), mAllocator(alloc), mProfiler(profiler)
{
	allocate(mCurData);

	if(mClothData.mEnableContinuousCollision || mClothData.mFrictionScale > 0.0f)
	{
		allocate(mPrevData);

		generateSpheres(reinterpret_cast<Simd4f*>(mPrevData.mSpheres),
		                reinterpret_cast<const Simd4f*>(clothData.mStartCollisionSpheres), clothData.mNumSpheres);

		generateCones(mPrevData.mCones, mPrevData.mSpheres, clothData.mCapsuleIndices, clothData.mNumCapsules);
	}
}

template <typename Simd4f>
cloth::SwCollision<Simd4f>::~SwCollision()
{
	deallocate(mCurData);
	deallocate(mPrevData);
}

// Runs one collision pass for the given solver iteration.
// NOTE(review): this function is truncated at the end of the visible chunk.
template <typename Simd4f>
void cloth::SwCollision<Simd4f>::operator()(const IterationState<Simd4f>& state)
{
	mNumCollisions = 0;

	collideConvexes(state);  // discrete convex collision, no friction
	collideTriangles(state); // discrete triangle collision, no friction

	computeBounds();

	if(!mClothData.mNumSpheres)
		return;

	bool lastIteration = state.mRemainingIterations == 1;

	const Simd4f* targetSpheres = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionSpheres);

	// generate sphere and cone collision data
	if(!lastIteration)
	{
		// 
interpolate spheres + LerpIterator<Simd4f, const Simd4f*> pIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionSpheres), + targetSpheres, state.getCurrentAlpha()); + generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), pIter, mClothData.mNumSpheres); + } + else + { + // otherwise use the target spheres directly + generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), targetSpheres, mClothData.mNumSpheres); + } + + // generate cones even if test below fails because + // continuous collision might need it in next iteration + generateCones(mCurData.mCones, mCurData.mSpheres, mClothData.mCapsuleIndices, mClothData.mNumCapsules); + + if(buildAcceleration()) + { + if(mClothData.mEnableContinuousCollision) + collideContinuousParticles(); + + mergeAcceleration((uint32_t*)mSphereGrid); + mergeAcceleration((uint32_t*)mConeGrid); + + if(!mClothData.mEnableContinuousCollision) + collideParticles(); + + collideVirtualParticles(); + } + + if(mPrevData.mSpheres) + nvidia::swap(mCurData, mPrevData); +} + +template <typename Simd4f> +size_t cloth::SwCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + size_t numTriangles = cloth.mStartCollisionTriangles.size(); + size_t numPlanes = cloth.mStartCollisionPlanes.size(); + + const size_t kTriangleDataSize = sizeof(TriangleData) * numTriangles; + const size_t kPlaneDataSize = sizeof(PxVec4) * numPlanes * 2; + + return PxMax(kTriangleDataSize, kPlaneDataSize); +} + +template <typename Simd4f> +size_t cloth::SwCollision<Simd4f>::estimatePersistentMemory(const SwCloth& cloth) +{ + size_t numCapsules = cloth.mCapsuleIndices.size(); + size_t numSpheres = cloth.mStartCollisionSpheres.size(); + + size_t sphereDataSize = sizeof(SphereData) * numSpheres * 2; + size_t coneDataSize = sizeof(ConeData) * numCapsules * 2; + + return sphereDataSize + coneDataSize; +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::allocate(CollisionData& data) +{ + data.mSpheres = 
static_cast<SphereData*>(mAllocator.allocate(sizeof(SphereData) * mClothData.mNumSpheres)); + + data.mCones = static_cast<ConeData*>(mAllocator.allocate(sizeof(ConeData) * mClothData.mNumCapsules)); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::deallocate(const CollisionData& data) +{ + mAllocator.deallocate(data.mSpheres); + mAllocator.deallocate(data.mCones); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::computeBounds() +{ +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::computeBounds", mProfiler); +#endif + + Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + Simd4f floatMaxXYZ = -(Simd4f)sMinusFloatMaxXYZ; + + Simd4f lower = simd4f(FLT_MAX), upper = -lower; + for(; curIt < curEnd; ++curIt, ++prevIt) + { + Simd4f current = *curIt; + lower = min(lower, current); + upper = max(upper, current); + // if(current.w > 0) current.w = previous.w + *curIt = select(current > floatMaxXYZ, *prevIt, current); + } + + BoundingBox<Simd4f> curBounds; + curBounds.mLower = lower; + curBounds.mUpper = upper; + + // don't change this order, storeBounds writes 7 floats + BoundingBox<Simd4f> prevBounds = loadBounds<Simd4f>(mClothData.mCurBounds); + storeBounds(mClothData.mCurBounds, curBounds); + storeBounds(mClothData.mPrevBounds, prevBounds); +} + +namespace +{ +template <typename Simd4i> +Simd4i andNotIsZero(const Simd4i& left, const Simd4i& right) +{ + return simdi::operator==(left & ~right, simd4i(_0)); +} +} + +// build per-axis mask arrays of spheres on the right/left of grid cell +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::buildSphereAcceleration(const SphereData* sIt) +{ + static const int maxIndex = sGridSize - 1; + + const SphereData* sEnd = sIt + mClothData.mNumSpheres; + for(uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1) + { + Simd4f sphere = 
loadAligned(array(sIt->center)); + Simd4f radius = splat<3>(sphere); + + Simd4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, sZero)); + Simd4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength)); + + const int* firstIdx = simdi::array(first); + const int* lastIdx = simdi::array(last); + + uint32_t* firstIt = (uint32_t*)mSphereGrid; + uint32_t* lastIt = firstIt + 3 * sGridSize; + + for(uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize) + { + for(int j = firstIdx[i]; j <= maxIndex; ++j) + firstIt[j] |= mask; + + for(int j = lastIdx[i]; j >= 0; --j) + lastIt[j] |= mask; + } + } +} + +// generate cone masks from sphere masks +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::buildConeAcceleration() +{ + const ConeData* coneIt = mCurData.mCones; + const ConeData* coneEnd = coneIt + mClothData.mNumCapsules; + for(uint32_t coneMask = 0x1; coneIt != coneEnd; ++coneIt, coneMask <<= 1) + { + if(coneIt->radius == 0.0f) + continue; + + uint32_t spheresMask = coneIt->bothMask; + + uint32_t* sphereIt = (uint32_t*)mSphereGrid; + uint32_t* sphereEnd = sphereIt + 6 * sGridSize; + uint32_t* gridIt = (uint32_t*)mConeGrid; + for(; sphereIt != sphereEnd; ++sphereIt, ++gridIt) + if(*sphereIt & spheresMask) + *gridIt |= coneMask; + } +} + +// convert right/left mask arrays into single overlap array +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::mergeAcceleration(uint32_t* firstIt) +{ + uint32_t* firstEnd = firstIt + 3 * sGridSize; + uint32_t* lastIt = firstEnd; + for(; firstIt != firstEnd; ++firstIt, ++lastIt) + *firstIt &= *lastIt; +} + +// build mask of spheres/cones touching a regular grid along each axis +template <typename Simd4f> +bool cloth::SwCollision<Simd4f>::buildAcceleration() +{ + // determine sphere bbox + BoundingBox<Simd4f> sphereBounds = + expandBounds(emptyBounds<Simd4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres); + BoundingBox<Simd4f> particleBounds = 
loadBounds<Simd4f>(mClothData.mCurBounds); + if(mClothData.mEnableContinuousCollision) + { + sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres); + particleBounds = expandBounds(particleBounds, loadBounds<Simd4f>(mClothData.mPrevBounds)); + } + + BoundingBox<Simd4f> bounds = intersectBounds(sphereBounds, particleBounds); + Simd4f edgeLength = (bounds.mUpper - bounds.mLower) & ~(Simd4f)sMaskW; + if(!allGreaterEqual(edgeLength, simd4f(_0))) + return false; + + // calculate an expanded bounds to account for numerical inaccuracy + const Simd4f expandedLower = bounds.mLower - abs(bounds.mLower) * sGridExpand; + const Simd4f expandedUpper = bounds.mUpper + abs(bounds.mUpper) * sGridExpand; + const Simd4f expandedEdgeLength = max(expandedUpper - expandedLower, sEpsilon); + + // make grid minimal thickness and strict upper bound of spheres + mGridScale = sGridLength * recipT<1>(expandedEdgeLength); + mGridBias = -expandedLower * mGridScale; + array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles() + + PX_ASSERT(allTrue(((bounds.mLower * mGridScale + mGridBias) >= simd4f(0.0f)) | sMaskW)); + PX_ASSERT(allTrue(((bounds.mUpper * mGridScale + mGridBias) < simd4f(8.0f)) | sMaskW)); + + memset(mSphereGrid, 0, sizeof(uint32_t) * 6 * (sGridSize)); + if(mClothData.mEnableContinuousCollision) + buildSphereAcceleration(mPrevData.mSpheres); + buildSphereAcceleration(mCurData.mSpheres); + + memset(mConeGrid, 0, sizeof(uint32_t) * 6 * (sGridSize)); + buildConeAcceleration(); + + return true; +} + +#ifdef _MSC_VER +#define FORCE_INLINE __forceinline +#else +#define FORCE_INLINE inline __attribute__((always_inline)) +#endif + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask:: +operator=(const ShapeMask& right) +{ + mCones = right.mCones; + mSpheres = right.mSpheres; + return *this; +} + +template <typename Simd4f> +FORCE_INLINE typename 
cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask:: +operator&=(const ShapeMask& right) +{ + mCones = mCones & right.mCones; + mSpheres = mSpheres & right.mSpheres; + return *this; +} + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask +cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f& position, const Simd4i* __restrict sphereGrid, + const Simd4i* __restrict coneGrid) +{ + Gather<Simd4i> gather(intFloor(position)); + + ShapeMask result; + result.mCones = gather(coneGrid); + result.mSpheres = gather(sphereGrid); + return result; +} + +// lookup acceleration structure and return mask of potential intersectors +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask +cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict positions) const +{ + Simd4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias); + Simd4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias); + Simd4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias); + + ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid); + result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2); + result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4); + + return result; +} + +// lookup acceleration structure and return mask of potential intersectors +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask +cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict prevPos, const Simd4f* __restrict curPos) const +{ + Simd4f scaleX = splat<0>(mGridScale); + Simd4f scaleY = splat<1>(mGridScale); + Simd4f scaleZ = splat<2>(mGridScale); + + Simd4f biasX = splat<0>(mGridBias); + Simd4f biasY = splat<1>(mGridBias); + Simd4f biasZ = splat<2>(mGridBias); + + Simd4f prevX = prevPos[0] * scaleX + biasX; + Simd4f prevY = prevPos[1] * scaleY + biasY; + Simd4f prevZ = prevPos[2] * scaleZ + biasZ; + + Simd4f curX = curPos[0] * 
scaleX + biasX; + Simd4f curY = curPos[1] * scaleY + biasY; + Simd4f curZ = curPos[2] * scaleZ + biasZ; + + Simd4f maxX = min(max(prevX, curX), sGridLength); + Simd4f maxY = min(max(prevY, curY), sGridLength); + Simd4f maxZ = min(max(prevZ, curZ), sGridLength); + + ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid); + result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2); + result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4); + + Simd4f zero = simd4f(_0); + Simd4f minX = max(min(prevX, curX), zero); + Simd4f minY = max(min(prevY, curY), zero); + Simd4f minZ = max(min(prevZ, curZ), zero); + + result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6); + result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8); + result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10); + + return result; +} + +template <typename Simd4f> +struct cloth::SwCollision<Simd4f>::ImpulseAccumulator +{ + ImpulseAccumulator() + : mDeltaX(simd4f(_0)) + , mDeltaY(mDeltaX) + , mDeltaZ(mDeltaX) + , mVelX(mDeltaX) + , mVelY(mDeltaX) + , mVelZ(mDeltaX) + , mNumCollisions(sEpsilon) + { + } + + void add(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask) + { + PX_ASSERT(allTrue((mask & x) == (mask & x))); + PX_ASSERT(allTrue((mask & y) == (mask & y))); + PX_ASSERT(allTrue((mask & z) == (mask & z))); + PX_ASSERT(allTrue((mask & scale) == (mask & scale))); + + Simd4f maskedScale = scale & mask; + mDeltaX = mDeltaX + x * maskedScale; + mDeltaY = mDeltaY + y * maskedScale; + mDeltaZ = mDeltaZ + z * maskedScale; + mNumCollisions = mNumCollisions + (simd4f(_1) & mask); + } + + void addVelocity(const Simd4f& vx, const Simd4f& vy, const Simd4f& vz, const Simd4f& mask) + { + PX_ASSERT(allTrue((mask & vx) == (mask & vx))); + PX_ASSERT(allTrue((mask & vy) == (mask & vy))); + PX_ASSERT(allTrue((mask & vz) == (mask & vz))); + + mVelX = mVelX + (vx & mask); + mVelY = mVelY + (vy & mask); + mVelZ = mVelZ + (vz & mask); + } + + void 
subtract(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask) + { + PX_ASSERT(allTrue((mask & x) == (mask & x))); + PX_ASSERT(allTrue((mask & y) == (mask & y))); + PX_ASSERT(allTrue((mask & z) == (mask & z))); + PX_ASSERT(allTrue((mask & scale) == (mask & scale))); + + Simd4f maskedScale = scale & mask; + mDeltaX = mDeltaX - x * maskedScale; + mDeltaY = mDeltaY - y * maskedScale; + mDeltaZ = mDeltaZ - z * maskedScale; + mNumCollisions = mNumCollisions + (simd4f(_1) & mask); + } + + Simd4f mDeltaX, mDeltaY, mDeltaZ; + Simd4f mVelX, mVelY, mVelZ; + Simd4f mNumCollisions; +}; + +template <typename Simd4f> +FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* positions, + ImpulseAccumulator& accum) const +{ + const float* __restrict spherePtr = array(mCurData.mSpheres->center); + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + Simd4i mask4 = horizontalOr(sphereMask); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData); + mask = mask & test; + + Simd4f sphere = loadAligned(spherePtr, offset); + + Simd4f deltaX = positions[0] - splat<0>(sphere); + Simd4f deltaY = positions[1] - splat<1>(sphere); + Simd4f deltaZ = positions[2] - splat<2>(sphere); + + Simd4f sqrDistance = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * splat<3>(sphere); + + Simd4f contactMask; + if(!anyGreater(simd4f(_0), negativeScale, contactMask)) + continue; + + accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask); + + if(frictionEnabled) + { + // load previous sphere pos + const float* __restrict prevSpherePtr = array(mPrevData.mSpheres->center); + + Simd4f prevSphere = loadAligned(prevSpherePtr, offset); + Simd4f velocity = sphere - prevSphere; + + accum.addVelocity(splat<0>(velocity), 
splat<1>(velocity), splat<2>(velocity), contactMask); + } + } +} + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i +cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict positions, ImpulseAccumulator& accum) const +{ + const float* __restrict centerPtr = array(mCurData.mCones->center); + const float* __restrict axisPtr = array(mCurData.mCones->axis); + const float* __restrict auxiliaryPtr = &mCurData.mCones->sqrCosine; + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + ShapeMask shapeMask = getShapeMask(positions); + Simd4i mask4 = horizontalOr(shapeMask.mCones); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t coneIndex = findBitSet(mask & ~test); + uint32_t offset = coneIndex * sizeof(ConeData); + mask = mask & test; + + Simd4i test4 = simdi::operator-(mask4, simd4i(_1)); + Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4)); + mask4 = mask4 & test4; + + Simd4f center = loadAligned(centerPtr, offset); + + Simd4f deltaX = positions[0] - splat<0>(center); + Simd4f deltaY = positions[1] - splat<1>(center); + Simd4f deltaZ = positions[2] - splat<2>(center); + + Simd4f axis = loadAligned(axisPtr, offset); + + Simd4f axisX = splat<0>(axis); + Simd4f axisY = splat<1>(axis); + Simd4f axisZ = splat<2>(axis); + Simd4f slope = splat<3>(axis); + + Simd4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ; + Simd4f radius = dot * slope + splat<3>(center); + + // set radius to zero if cone is culled + radius = max(radius, sZero) & ~culled; + + Simd4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot; + + Simd4i auxiliary = simd4i((Simd4f)loadAligned(auxiliaryPtr, offset)); + Simd4i bothMask = splat<3>(auxiliary); + + Simd4f contactMask; + if(!anyGreater(radius * radius, sqrDistance, contactMask)) + { + // cone only culled when spheres culled, ok to clear those too + shapeMask.mSpheres = shapeMask.mSpheres & 
~bothMask; + continue; + } + + // clamp to a small positive epsilon to avoid numerical error + // making sqrDistance negative when point lies on the cone axis + sqrDistance = max(sqrDistance, sEpsilon); + + Simd4f invDistance = rsqrt(sqrDistance); + Simd4f base = dot + slope * sqrDistance * invDistance; + + // force left/rightMask to false if not inside cone + base = base & contactMask; + + Simd4f halfLength = splat<1>(simd4f(auxiliary)); + Simd4i leftMask = simd4i(base < -halfLength); + Simd4i rightMask = simd4i(base > halfLength); + + // we use both mask because of the early out above. + Simd4i firstMask = splat<2>(auxiliary); + Simd4i secondMask = firstMask ^ bothMask; + shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask); + shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask); + + deltaX = deltaX - base * axisX; + deltaY = deltaY - base * axisY; + deltaZ = deltaZ - base * axisZ; + + Simd4f sqrCosine = splat<0>(simd4f(auxiliary)); + Simd4f scale = radius * invDistance * sqrCosine - sqrCosine; + + contactMask = contactMask & ~simd4f(leftMask | rightMask); + + if(!anyTrue(contactMask)) + continue; + + accum.add(deltaX, deltaY, deltaZ, scale, contactMask); + + if(frictionEnabled) + { + uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first; + uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second; + + float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres); + float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres); + + // todo: could pre-compute sphere velocities or it might be + // faster to compute cur/prev sphere positions directly + Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData)); + Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData)); + + Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData)); + Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData)); + + Simd4f v0 = s0p1 - s0p0; + Simd4f v1 = s1p1 - s1p0; + Simd4f vd = v1 - v0; + + // dot is in the range -1 to 1, 
scale and bias to 0 to 1 + dot = dot * sHalf + sHalf; + + // interpolate velocity at contact points + Simd4f vx = splat<0>(v0) + dot * splat<0>(vd); + Simd4f vy = splat<1>(v0) + dot * splat<1>(vd); + Simd4f vz = splat<2>(v0) + dot * splat<2>(vd); + + accum.addVelocity(vx, vy, vz, contactMask); + } + } + + return shapeMask.mSpheres; +} + +template <typename Simd4f> +FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* __restrict prevPos, + Simd4f* __restrict curPos, ImpulseAccumulator& accum) const +{ + const float* __restrict prevSpheres = array(mPrevData.mSpheres->center); + const float* __restrict curSpheres = array(mCurData.mSpheres->center); + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + Simd4i mask4 = horizontalOr(sphereMask); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData); + mask = mask & test; + + Simd4f prevSphere = loadAligned(prevSpheres, offset); + Simd4f prevX = prevPos[0] - splat<0>(prevSphere); + Simd4f prevY = prevPos[1] - splat<1>(prevSphere); + Simd4f prevZ = prevPos[2] - splat<2>(prevSphere); + Simd4f prevRadius = splat<3>(prevSphere); + + Simd4f curSphere = loadAligned(curSpheres, offset); + Simd4f curX = curPos[0] - splat<0>(curSphere); + Simd4f curY = curPos[1] - splat<1>(curSphere); + Simd4f curZ = curPos[2] - splat<2>(curSphere); + Simd4f curRadius = splat<3>(curSphere); + + Simd4f sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ; + + Simd4f dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius; + Simd4f dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius; + Simd4f dotCurCur = sqrDistance - curRadius * curRadius; + + Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; + Simd4f sqrtD = sqrt(discriminant); + Simd4f halfB = dotPrevCur - dotPrevPrev; + Simd4f minusA = 
dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside sphere + Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD); + Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD); + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth); + + // a is negative when one sphere is contained in the other, + // which is already handled by discrete collision. + collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon); + + if(!allEqual(collisionMask, simd4f(_0))) + { + Simd4f deltaX = prevX - curX; + Simd4f deltaY = prevY - curY; + Simd4f deltaZ = prevZ - curZ; + + Simd4f oneMinusToi = (simd4f(_1) - toi) & collisionMask; + + // reduce ccd impulse if (clamped) particle trajectory stays in sphere skin, + // i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) + Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > sEpsilon); + oneMinusToi = oneMinusToi * recip(sOne - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curPos[0] = splat<0>(curSphere) + curX; + curPos[1] = splat<1>(curSphere) + curY; + curPos[2] = splat<2>(curSphere) + curZ; + + sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ; + } + + Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * curRadius; + + Simd4f contactMask; + if(!anyGreater(simd4f(_0), negativeScale, contactMask)) + continue; + + accum.subtract(curX, curY, curZ, negativeScale, contactMask); + + if(frictionEnabled) + { + Simd4f velocity = curSphere - prevSphere; + accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask); + } + } +} + +template <typename Simd4f> +FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i 
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict prevPos, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) const +{ + const float* __restrict prevCenterPtr = array(mPrevData.mCones->center); + const float* __restrict prevAxisPtr = array(mPrevData.mCones->axis); + const float* __restrict prevAuxiliaryPtr = &mPrevData.mCones->sqrCosine; + + const float* __restrict curCenterPtr = array(mCurData.mCones->center); + const float* __restrict curAxisPtr = array(mCurData.mCones->axis); + const float* __restrict curAuxiliaryPtr = &mCurData.mCones->sqrCosine; + + bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + + ShapeMask shapeMask = getShapeMask(prevPos, curPos); + Simd4i mask4 = horizontalOr(shapeMask.mCones); + uint32_t mask = uint32_t(simdi::array(mask4)[0]); + while(mask) + { + uint32_t test = mask - 1; + uint32_t coneIndex = findBitSet(mask & ~test); + uint32_t offset = coneIndex * sizeof(ConeData); + mask = mask & test; + + Simd4i test4 = simdi::operator-(mask4, simd4i(_1)); + Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4)); + mask4 = mask4 & test4; + + Simd4f prevCenter = loadAligned(prevCenterPtr, offset); + Simd4f prevAxis = loadAligned(prevAxisPtr, offset); + Simd4f prevAxisX = splat<0>(prevAxis); + Simd4f prevAxisY = splat<1>(prevAxis); + Simd4f prevAxisZ = splat<2>(prevAxis); + Simd4f prevSlope = splat<3>(prevAxis); + + Simd4f prevX = prevPos[0] - splat<0>(prevCenter); + Simd4f prevY = prevPos[1] - splat<1>(prevCenter); + Simd4f prevZ = prevPos[2] - splat<2>(prevCenter); + Simd4f prevT = prevY * prevAxisZ - prevZ * prevAxisY; + Simd4f prevU = prevZ * prevAxisX - prevX * prevAxisZ; + Simd4f prevV = prevX * prevAxisY - prevY * prevAxisX; + Simd4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ; + Simd4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter); + + Simd4f curCenter = loadAligned(curCenterPtr, offset); + Simd4f curAxis = loadAligned(curAxisPtr, offset); + Simd4f curAxisX = 
splat<0>(curAxis); + Simd4f curAxisY = splat<1>(curAxis); + Simd4f curAxisZ = splat<2>(curAxis); + Simd4f curSlope = splat<3>(curAxis); + Simd4i curAuxiliary = simd4i((Simd4f)loadAligned(curAuxiliaryPtr, offset)); + + Simd4f curX = curPos[0] - splat<0>(curCenter); + Simd4f curY = curPos[1] - splat<1>(curCenter); + Simd4f curZ = curPos[2] - splat<2>(curCenter); + Simd4f curT = curY * curAxisZ - curZ * curAxisY; + Simd4f curU = curZ * curAxisX - curX * curAxisZ; + Simd4f curV = curX * curAxisY - curY * curAxisX; + Simd4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + Simd4f curRadius = curDot * curSlope + splat<3>(curCenter); + + Simd4f curSqrDistance = sEpsilon + curT * curT + curU * curU + curV * curV; + + // set radius to zero if cone is culled + prevRadius = max(prevRadius, simd4f(_0)) & ~culled; + curRadius = max(curRadius, simd4f(_0)) & ~culled; + + Simd4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius; + Simd4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius; + Simd4f dotCurCur = curSqrDistance - curRadius * curRadius; + + Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev; + Simd4f sqrtD = sqrt(discriminant); + Simd4f halfB = dotPrevCur - dotPrevPrev; + Simd4f minusA = dotPrevCur - dotCurCur + halfB; + + // time of impact or 0 if prevPos inside cone + Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD); + Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD); + + // skip continuous collision if the (un-clamped) particle + // trajectory only touches the outer skin of the cone. + Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius); + collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth); + + // a is negative when one cone is contained in the other, + // which is already handled by discrete collision. 
+ collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon); + + // test if any particle hits infinite cone (and 0<time of impact<1) + if(!allEqual(collisionMask, simd4f(_0))) + { + Simd4f deltaX = prevX - curX; + Simd4f deltaY = prevY - curY; + Simd4f deltaZ = prevZ - curZ; + + // interpolate delta at toi + Simd4f posX = prevX - deltaX * toi; + Simd4f posY = prevY - deltaY * toi; + Simd4f posZ = prevZ - deltaZ * toi; + + Simd4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary)); + Simd4i prevAuxiliary = simd4i((Simd4f)loadAligned(prevAuxiliaryPtr, offset)); + Simd4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary)); + + Simd4f oneMinusToi = simd4f(_1) - toi; + + // interpolate axis at toi + Simd4f axisX = splat<0>(curScaledAxis) - splat<0>(deltaScaledAxis) * oneMinusToi; + Simd4f axisY = splat<1>(curScaledAxis) - splat<1>(deltaScaledAxis) * oneMinusToi; + Simd4f axisZ = splat<2>(curScaledAxis) - splat<2>(deltaScaledAxis) * oneMinusToi; + Simd4f slope = (prevSlope * oneMinusToi + curSlope * toi); + + Simd4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ; + Simd4f invHalfLength = rsqrt(sqrHalfLength); + Simd4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength; + + Simd4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot; + Simd4f invDistance = rsqrt(sqrDistance) & (sqrDistance > simd4f(_0)); + + Simd4f base = dot + slope * sqrDistance * invDistance; + Simd4f scale = base * invHalfLength & collisionMask; + + Simd4f cullMask = (abs(scale) < simd4f(_1)) & collisionMask; + + // test if any impact position is in cone section + if(!allEqual(cullMask, simd4f(_0))) + { + deltaX = deltaX + splat<0>(deltaScaledAxis) * scale; + deltaY = deltaY + splat<1>(deltaScaledAxis) * scale; + deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale; + + oneMinusToi = oneMinusToi & cullMask; + + // reduce ccd impulse if (clamped) particle trajectory stays in cone skin, + // i.e. 
scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi) + // oneMinusToi = oneMinusToi * recip(sOne - sqrtD * recip(minusA * oneMinusToi)); + Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > sEpsilon); + oneMinusToi = oneMinusToi * recip(sOne - minusK); + + curX = curX + deltaX * oneMinusToi; + curY = curY + deltaY * oneMinusToi; + curZ = curZ + deltaZ * oneMinusToi; + + curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ; + curRadius = curDot * curSlope + splat<3>(curCenter); + curRadius = max(curRadius, simd4f(_0)) & ~culled; + curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot; + + curPos[0] = splat<0>(curCenter) + curX; + curPos[1] = splat<1>(curCenter) + curY; + curPos[2] = splat<2>(curCenter) + curZ; + } + } + + // curPos inside cone (discrete collision) + Simd4f contactMask; + int anyContact = anyGreater(curRadius * curRadius, curSqrDistance, contactMask); + + Simd4i bothMask = splat<3>(curAuxiliary); + + // instead of culling continuous collision for ~collisionMask, and discrete + // collision for ~contactMask, disable both if ~collisionMask & ~contactMask + Simd4i cullMask = bothMask & ~simd4i(collisionMask | contactMask); + shapeMask.mSpheres = shapeMask.mSpheres & ~cullMask; + + if(!anyContact) + continue; + + Simd4f invDistance = rsqrt(curSqrDistance) & (curSqrDistance > sZero); + Simd4f base = curDot + curSlope * curSqrDistance * invDistance; + + Simd4f halfLength = splat<1>(simd4f(curAuxiliary)); + Simd4i leftMask = simd4i(base < -halfLength); + Simd4i rightMask = simd4i(base > halfLength); + + // can only skip continuous sphere collision if post-ccd position + // is on code side *and* particle had cone-ccd collision. 
+ Simd4i firstMask = splat<2>(curAuxiliary); + Simd4i secondMask = firstMask ^ bothMask; + cullMask = (firstMask & ~leftMask) | (secondMask & ~rightMask); + shapeMask.mSpheres = shapeMask.mSpheres & ~(cullMask & simd4i(collisionMask)); + + Simd4f deltaX = curX - base * curAxisX; + Simd4f deltaY = curY - base * curAxisY; + Simd4f deltaZ = curZ - base * curAxisZ; + + Simd4f sqrCosine = splat<0>(simd4f(curAuxiliary)); + Simd4f scale = curRadius * invDistance * sqrCosine - sqrCosine; + + contactMask = contactMask & ~simd4f(leftMask | rightMask); + + if(!anyTrue(contactMask)) + continue; + + accum.add(deltaX, deltaY, deltaZ, scale, contactMask); + + if(frictionEnabled) + { + uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first; + uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second; + + float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres); + float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres); + + // todo: could pre-compute sphere velocities or it might be + // faster to compute cur/prev sphere positions directly + Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData)); + Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData)); + + Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData)); + Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData)); + + Simd4f v0 = s0p1 - s0p0; + Simd4f v1 = s1p1 - s1p0; + Simd4f vd = v1 - v0; + + // dot is in the range -1 to 1, scale and bias to 0 to 1 + curDot = curDot * sHalf + sHalf; + + // interpolate velocity at contact points + Simd4f vx = splat<0>(v0) + curDot * splat<0>(vd); + Simd4f vy = splat<1>(v0) + curDot * splat<1>(vd); + Simd4f vz = splat<2>(v0) + curDot * splat<2>(vd); + + accum.addVelocity(vx, vy, vz, contactMask); + } + } + + return shapeMask.mSpheres; +} + +namespace +{ + +template <typename Simd4f> +PX_INLINE void calculateFrictionImpulse(const Simd4f& deltaX, const Simd4f& deltaY, const Simd4f& deltaZ, + const Simd4f& velX, const Simd4f& velY, const 
Simd4f& velZ, + const Simd4f* curPos, const Simd4f* prevPos, const Simd4f& scale, + const Simd4f& coefficient, const Simd4f& mask, Simd4f* impulse) +{ + // calculate collision normal + Simd4f deltaSq = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + + Simd4f rcpDelta = rsqrt(deltaSq + sEpsilon); + + Simd4f nx = deltaX * rcpDelta; + Simd4f ny = deltaY * rcpDelta; + Simd4f nz = deltaZ * rcpDelta; + + // calculate relative velocity scaled by number of collisions + Simd4f rvx = curPos[0] - prevPos[0] - velX * scale; + Simd4f rvy = curPos[1] - prevPos[1] - velY * scale; + Simd4f rvz = curPos[2] - prevPos[2] - velZ * scale; + + // calculate magnitude of relative normal velocity + Simd4f rvn = rvx * nx + rvy * ny + rvz * nz; + + // calculate relative tangential velocity + Simd4f rvtx = rvx - rvn * nx; + Simd4f rvty = rvy - rvn * ny; + Simd4f rvtz = rvz - rvn * nz; + + // calculate magnitude of vt + Simd4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + sEpsilon); + + // magnitude of friction impulse (cannot be greater than -vt) + Simd4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, sNegOne) & mask; + + impulse[0] = rvtx * j; + impulse[1] = rvty * j; + impulse[2] = rvtz * j; +} + +} // anonymous namespace + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideParticles() +{ + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + Simd4f curPos[4]; + Simd4f prevPos[4]; + + float* __restrict prevIt = mClothData.mPrevParticles; + float* __restrict pIt = mClothData.mCurParticles; + float* __restrict pEnd = pIt + mClothData.mNumParticles * 4; + for(; pIt < pEnd; pIt += 16, prevIt += 16) + { + curPos[0] = loadAligned(pIt, 0); + curPos[1] = loadAligned(pIt, 16); + curPos[2] = loadAligned(pIt, 32); + curPos[3] = loadAligned(pIt, 
48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(curPos, accum); + collideSpheres(sphereMask, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + Simd4f frictionImpulse[3]; + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + if(massScalingEnabled) + { + // calculate the inverse mass scale based on the collision impulse magnitude + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f scale = recip(sOne + massScale * dSq); + + // scale invmass + curPos[3] = select(mask, curPos[3] * scale, curPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(pIt, 0, curPos[0]); + storeAligned(pIt, 16, curPos[1]); + storeAligned(pIt, 32, curPos[2]); + storeAligned(pIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += 
horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideVirtualParticles() +{ + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + Simd4f curPos[3]; + + const float* __restrict weights = mClothData.mVirtualParticleWeights; + float* __restrict particles = mClothData.mCurParticles; + float* __restrict prevParticles = mClothData.mPrevParticles; + + // move dummy particles outside of collision range + Simd4f* __restrict dummy = mClothData.mNumParticles + reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f invGridScale = recip(mGridScale) & (mGridScale > sEpsilon); + dummy[0] = dummy[1] = dummy[2] = invGridScale * mGridBias - invGridScale; + + const uint16_t* __restrict vpIt = mClothData.mVirtualParticlesBegin; + const uint16_t* __restrict vpEnd = mClothData.mVirtualParticlesEnd; + for(; vpIt != vpEnd; vpIt += 16) + { + // load 12 particles and 4 weights + Simd4f p0v0 = loadAligned(particles, vpIt[0] * sizeof(PxVec4)); + Simd4f p0v1 = loadAligned(particles, vpIt[1] * sizeof(PxVec4)); + Simd4f p0v2 = loadAligned(particles, vpIt[2] * sizeof(PxVec4)); + Simd4f w0 = loadAligned(weights, vpIt[3] * sizeof(PxVec4)); + + Simd4f p1v0 = loadAligned(particles, vpIt[4] * sizeof(PxVec4)); + Simd4f p1v1 = loadAligned(particles, vpIt[5] * sizeof(PxVec4)); + Simd4f p1v2 = loadAligned(particles, vpIt[6] * sizeof(PxVec4)); + Simd4f w1 = loadAligned(weights, vpIt[7] * sizeof(PxVec4)); + + Simd4f p2v0 = loadAligned(particles, vpIt[8] * sizeof(PxVec4)); + Simd4f p2v1 = loadAligned(particles, vpIt[9] * sizeof(PxVec4)); + Simd4f p2v2 = loadAligned(particles, vpIt[10] * sizeof(PxVec4)); + Simd4f w2 = loadAligned(weights, vpIt[11] * sizeof(PxVec4)); + + Simd4f p3v1 = loadAligned(particles, vpIt[13] 
* sizeof(PxVec4)); + Simd4f p3v0 = loadAligned(particles, vpIt[12] * sizeof(PxVec4)); + Simd4f p3v2 = loadAligned(particles, vpIt[14] * sizeof(PxVec4)); + Simd4f w3 = loadAligned(weights, vpIt[15] * sizeof(PxVec4)); + + // interpolate particles and transpose + Simd4f px = p0v0 * splat<0>(w0) + p0v1 * splat<1>(w0) + p0v2 * splat<2>(w0); + Simd4f py = p1v0 * splat<0>(w1) + p1v1 * splat<1>(w1) + p1v2 * splat<2>(w1); + Simd4f pz = p2v0 * splat<0>(w2) + p2v1 * splat<1>(w2) + p2v2 * splat<2>(w2); + Simd4f pw = p3v0 * splat<0>(w3) + p3v1 * splat<1>(w3) + p3v2 * splat<2>(w3); + transpose(px, py, pz, pw); + + curPos[0] = px; + curPos[1] = py; + curPos[2] = pz; + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(curPos, accum); + collideSpheres(sphereMask, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + // displacement and transpose back + Simd4f d0 = accum.mDeltaX * invNumCollisions; + Simd4f d1 = accum.mDeltaY * invNumCollisions; + Simd4f d2 = accum.mDeltaZ * invNumCollisions; + Simd4f d3 = sZero; + transpose(d0, d1, d2, d3); + + // scale weights by 1/dot(w,w) + Simd4f rw0 = w0 * splat<3>(w0); + Simd4f rw1 = w1 * splat<3>(w1); + Simd4f rw2 = w2 * splat<3>(w2); + Simd4f rw3 = w3 * splat<3>(w3); + + if(frictionEnabled) + { + Simd4f q0v0 = loadAligned(prevParticles, vpIt[0] * sizeof(PxVec4)); + Simd4f q0v1 = loadAligned(prevParticles, vpIt[1] * sizeof(PxVec4)); + Simd4f q0v2 = loadAligned(prevParticles, vpIt[2] * sizeof(PxVec4)); + + Simd4f q1v0 = loadAligned(prevParticles, vpIt[4] * sizeof(PxVec4)); + Simd4f q1v1 = loadAligned(prevParticles, vpIt[5] * sizeof(PxVec4)); + Simd4f q1v2 = loadAligned(prevParticles, vpIt[6] * sizeof(PxVec4)); + + Simd4f q2v0 = loadAligned(prevParticles, vpIt[8] * sizeof(PxVec4)); + Simd4f q2v1 = loadAligned(prevParticles, vpIt[9] * sizeof(PxVec4)); + Simd4f q2v2 = loadAligned(prevParticles, vpIt[10] * sizeof(PxVec4)); + 
+ Simd4f q3v0 = loadAligned(prevParticles, vpIt[12] * sizeof(PxVec4)); + Simd4f q3v1 = loadAligned(prevParticles, vpIt[13] * sizeof(PxVec4)); + Simd4f q3v2 = loadAligned(prevParticles, vpIt[14] * sizeof(PxVec4)); + + // calculate previous interpolated positions + Simd4f qx = q0v0 * splat<0>(w0) + q0v1 * splat<1>(w0) + q0v2 * splat<2>(w0); + Simd4f qy = q1v0 * splat<0>(w1) + q1v1 * splat<1>(w1) + q1v2 * splat<2>(w1); + Simd4f qz = q2v0 * splat<0>(w2) + q2v1 * splat<1>(w2) + q2v2 * splat<2>(w2); + Simd4f qw = q3v0 * splat<0>(w3) + q3v1 * splat<1>(w3) + q3v2 * splat<2>(w3); + transpose(qx, qy, qz, qw); + + Simd4f prevPos[3] = { qx, qy, qz }; + Simd4f frictionImpulse[4]; + frictionImpulse[3] = sZero; + + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]); + + q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]); + q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]); + q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]); + + q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]); + q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]); + q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]); + + q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]); + q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]); + q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]); + + q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]); + q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]); + q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]); + + // write back prev particles + storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0); + storeAligned(prevParticles, vpIt[1] * sizeof(PxVec4), q0v1); + storeAligned(prevParticles, vpIt[2] * sizeof(PxVec4), q0v2); + + storeAligned(prevParticles, vpIt[4] * sizeof(PxVec4), q1v0); + storeAligned(prevParticles, vpIt[5] * sizeof(PxVec4), q1v1); + 
storeAligned(prevParticles, vpIt[6] * sizeof(PxVec4), q1v2); + + storeAligned(prevParticles, vpIt[8] * sizeof(PxVec4), q2v0); + storeAligned(prevParticles, vpIt[9] * sizeof(PxVec4), q2v1); + storeAligned(prevParticles, vpIt[10] * sizeof(PxVec4), q2v2); + + storeAligned(prevParticles, vpIt[12] * sizeof(PxVec4), q3v0); + storeAligned(prevParticles, vpIt[13] * sizeof(PxVec4), q3v1); + storeAligned(prevParticles, vpIt[14] * sizeof(PxVec4), q3v2); + } + + if(massScalingEnabled) + { + // calculate the inverse mass scale based on the collision impulse + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f weightScale = recip(sOne + massScale * dSq); + + weightScale = weightScale - sOne; + Simd4f s0 = sOne + splat<0>(weightScale) * (w0 & splat<0>(mask)); + Simd4f s1 = sOne + splat<1>(weightScale) * (w1 & splat<1>(mask)); + Simd4f s2 = sOne + splat<2>(weightScale) * (w2 & splat<2>(mask)); + Simd4f s3 = sOne + splat<3>(weightScale) * (w3 & splat<3>(mask)); + + p0v0 = p0v0 * (sOneXYZ | (splat<0>(s0) & sMaskW)); + p0v1 = p0v1 * (sOneXYZ | (splat<1>(s0) & sMaskW)); + p0v2 = p0v2 * (sOneXYZ | (splat<2>(s0) & sMaskW)); + + p1v0 = p1v0 * (sOneXYZ | (splat<0>(s1) & sMaskW)); + p1v1 = p1v1 * (sOneXYZ | (splat<1>(s1) & sMaskW)); + p1v2 = p1v2 * (sOneXYZ | (splat<2>(s1) & sMaskW)); + + p2v0 = p2v0 * (sOneXYZ | (splat<0>(s2) & sMaskW)); + p2v1 = p2v1 * (sOneXYZ | (splat<1>(s2) & sMaskW)); + p2v2 = p2v2 * (sOneXYZ | (splat<2>(s2) & sMaskW)); + + p3v0 = p3v0 * (sOneXYZ | (splat<0>(s3) & sMaskW)); + p3v1 = p3v1 * (sOneXYZ | (splat<1>(s3) & sMaskW)); + p3v2 = p3v2 * (sOneXYZ | (splat<2>(s3) & sMaskW)); + } + + p0v0 = p0v0 + (splat<0>(rw0) * d0); + p0v1 = p0v1 + (splat<1>(rw0) * d0); + p0v2 = p0v2 + (splat<2>(rw0) * d0); + + p1v0 = p1v0 + (splat<0>(rw1) * d1); + p1v1 = p1v1 + (splat<1>(rw1) * d1); + p1v2 = p1v2 + (splat<2>(rw1) * d1); + + p2v0 = p2v0 + (splat<0>(rw2) * d2); + p2v1 = 
p2v1 + (splat<1>(rw2) * d2); + p2v2 = p2v2 + (splat<2>(rw2) * d2); + + p3v0 = p3v0 + (splat<0>(rw3) * d3); + p3v1 = p3v1 + (splat<1>(rw3) * d3); + p3v2 = p3v2 + (splat<2>(rw3) * d3); + + // write back particles + storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0); + storeAligned(particles, vpIt[1] * sizeof(PxVec4), p0v1); + storeAligned(particles, vpIt[2] * sizeof(PxVec4), p0v2); + + storeAligned(particles, vpIt[4] * sizeof(PxVec4), p1v0); + storeAligned(particles, vpIt[5] * sizeof(PxVec4), p1v1); + storeAligned(particles, vpIt[6] * sizeof(PxVec4), p1v2); + + storeAligned(particles, vpIt[8] * sizeof(PxVec4), p2v0); + storeAligned(particles, vpIt[9] * sizeof(PxVec4), p2v1); + storeAligned(particles, vpIt[10] * sizeof(PxVec4), p2v2); + + storeAligned(particles, vpIt[12] * sizeof(PxVec4), p3v0); + storeAligned(particles, vpIt[13] * sizeof(PxVec4), p3v1); + storeAligned(particles, vpIt[14] * sizeof(PxVec4), p3v2); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideContinuousParticles() +{ + Simd4f curPos[4]; + Simd4f prevPos[4]; + + const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f; + const Simd4f massScale = simd4f(mClothData.mCollisionMassScale); + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + float* __restrict prevIt = mClothData.mPrevParticles; + float* __restrict curIt = mClothData.mCurParticles; + float* __restrict curEnd = curIt + mClothData.mNumParticles * 4; + + for(; curIt < curEnd; curIt += 16, prevIt += 16) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + curPos[0] = loadAligned(curIt, 0); + curPos[1] = loadAligned(curIt, 16); + curPos[2] = 
loadAligned(curIt, 32); + curPos[3] = loadAligned(curIt, 48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + Simd4i sphereMask = collideCones(prevPos, curPos, accum); + collideSpheres(sphereMask, prevPos, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + Simd4f frictionImpulse[3]; + calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + if(massScalingEnabled) + { + // calculate the inverse mass scale based on the collision impulse magnitude + Simd4f dSq = invNumCollisions * invNumCollisions * + (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ); + + Simd4f weightScale = recip(sOne + massScale * dSq); + + // scale invmass + curPos[3] = select(mask, curPos[3] * weightScale, curPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(curIt, 0, curPos[0]); + storeAligned(curIt, 16, curPos[1]); + storeAligned(curIt, 32, curPos[2]); + storeAligned(curIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } +} + +template <typename Simd4f> +void 
cloth::SwCollision<Simd4f>::collideConvexes(const IterationState<Simd4f>& state) +{ + if(!mClothData.mNumConvexes) + return; + + // times 2 for plane equation result buffer + Simd4f* planes = static_cast<Simd4f*>(mAllocator.allocate(sizeof(Simd4f) * mClothData.mNumPlanes * 2)); + + const Simd4f* targetPlanes = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionPlanes); + + // generate plane collision data + if(state.mRemainingIterations != 1) + { + // interpolate planes + LerpIterator<Simd4f, const Simd4f*> planeIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionPlanes), + targetPlanes, state.getCurrentAlpha()); + + // todo: normalize plane equations + generatePlanes(planes, planeIter, mClothData.mNumPlanes); + } + else + { + // otherwise use the target planes directly + generatePlanes(planes, targetPlanes, mClothData.mNumPlanes); + } + + Simd4f curPos[4], prevPos[4]; + + const bool frictionEnabled = mClothData.mFrictionScale > 0.0f; + const Simd4f frictionScale = simd4f(mClothData.mFrictionScale); + + float* __restrict curIt = mClothData.mCurParticles; + float* __restrict curEnd = curIt + mClothData.mNumParticles * 4; + float* __restrict prevIt = mClothData.mPrevParticles; + for(; curIt < curEnd; curIt += 16, prevIt += 16) + { + curPos[0] = loadAligned(curIt, 0); + curPos[1] = loadAligned(curIt, 16); + curPos[2] = loadAligned(curIt, 32); + curPos[3] = loadAligned(curIt, 48); + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + + ImpulseAccumulator accum; + collideConvexes(planes, curPos, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + if(frictionEnabled) + { + prevPos[0] = loadAligned(prevIt, 0); + prevPos[1] = loadAligned(prevIt, 16); + prevPos[2] = loadAligned(prevIt, 32); + prevPos[3] = loadAligned(prevIt, 48); + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + + Simd4f frictionImpulse[3]; + 
calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ, + curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse); + + prevPos[0] = prevPos[0] - frictionImpulse[0]; + prevPos[1] = prevPos[1] - frictionImpulse[1]; + prevPos[2] = prevPos[2] - frictionImpulse[2]; + + transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]); + storeAligned(prevIt, 0, prevPos[0]); + storeAligned(prevIt, 16, prevPos[1]); + storeAligned(prevIt, 32, prevPos[2]); + storeAligned(prevIt, 48, prevPos[3]); + } + + curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions; + curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions; + curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions; + + transpose(curPos[0], curPos[1], curPos[2], curPos[3]); + storeAligned(curIt, 0, curPos[0]); + storeAligned(curIt, 16, curPos[1]); + storeAligned(curIt, 32, curPos[2]); + storeAligned(curIt, 48, curPos[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } + + mAllocator.deallocate(planes); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideConvexes(const Simd4f* __restrict planes, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) +{ + Simd4i result = simd4i(_0); + Simd4i mask4 = simd4i(_1); + + const Simd4f* __restrict pIt, *pEnd = planes + mClothData.mNumPlanes; + Simd4f* __restrict dIt = const_cast<Simd4f*>(pEnd); + for(pIt = planes; pIt != pEnd; ++pIt, ++dIt) + { + *dIt = splat<3>(*pIt) + curPos[2] * splat<2>(*pIt) + curPos[1] * splat<1>(*pIt) + curPos[0] * splat<0>(*pIt); + result = result | (mask4 & simd4i(*dIt < simd4f(_0))); + mask4 = mask4 << 1; // todo: shift by Simd4i on consoles + } + + if(simdi::allEqual(result, simd4i(_0))) + return; + + const uint32_t* __restrict cIt = mClothData.mConvexMasks; + const uint32_t* __restrict cEnd = cIt + mClothData.mNumConvexes; + for(; cIt != cEnd; ++cIt) + { + uint32_t mask = *cIt; + mask4 = simd4i(int(mask)); + 
if(!simdi::anyEqual(mask4 & result, mask4, mask4)) + continue; + + uint32_t test = mask - 1; + uint32_t planeIndex = findBitSet(mask & ~test); + Simd4f plane = planes[planeIndex]; + Simd4f planeX = splat<0>(plane); + Simd4f planeY = splat<1>(plane); + Simd4f planeZ = splat<2>(plane); + Simd4f planeD = pEnd[planeIndex]; + while(mask &= test) + { + test = mask - 1; + planeIndex = findBitSet(mask & ~test); + plane = planes[planeIndex]; + Simd4f dist = pEnd[planeIndex]; + Simd4f closer = dist > planeD; + planeX = select(closer, splat<0>(plane), planeX); + planeY = select(closer, splat<1>(plane), planeY); + planeZ = select(closer, splat<2>(plane), planeZ); + planeD = max(dist, planeD); + } + + accum.subtract(planeX, planeY, planeZ, planeD, simd4f(mask4)); + } +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideTriangles(const IterationState<Simd4f>& state) +{ + if(!mClothData.mNumTriangles) + return; + + TriangleData* triangles = + static_cast<TriangleData*>(mAllocator.allocate(sizeof(TriangleData) * mClothData.mNumTriangles)); + + UnalignedIterator<Simd4f, 3> targetTriangles(mClothData.mTargetCollisionTriangles); + + // generate triangle collision data + if(state.mRemainingIterations != 1) + { + // interpolate triangles + LerpIterator<Simd4f, UnalignedIterator<Simd4f, 3> > triangleIter(mClothData.mStartCollisionTriangles, + targetTriangles, state.getCurrentAlpha()); + + generateTriangles<Simd4f>(triangles, triangleIter, mClothData.mNumTriangles); + } + else + { + // otherwise use the target triangles directly + generateTriangles<Simd4f>(triangles, targetTriangles, mClothData.mNumTriangles); + } + + Simd4f positions[4]; + + float* __restrict pIt = mClothData.mCurParticles; + float* __restrict pEnd = pIt + mClothData.mNumParticles * 4; + for(; pIt < pEnd; pIt += 16) + { + positions[0] = loadAligned(pIt, 0); + positions[1] = loadAligned(pIt, 16); + positions[2] = loadAligned(pIt, 32); + positions[3] = loadAligned(pIt, 48); + transpose(positions[0], 
positions[1], positions[2], positions[3]); + + ImpulseAccumulator accum; + collideTriangles(triangles, positions, accum); + + Simd4f mask; + if(!anyGreater(accum.mNumCollisions, sEpsilon, mask)) + continue; + + Simd4f invNumCollisions = recip(accum.mNumCollisions); + + positions[0] = positions[0] + accum.mDeltaX * invNumCollisions; + positions[1] = positions[1] + accum.mDeltaY * invNumCollisions; + positions[2] = positions[2] + accum.mDeltaZ * invNumCollisions; + + transpose(positions[0], positions[1], positions[2], positions[3]); + storeAligned(pIt, 0, positions[0]); + storeAligned(pIt, 16, positions[1]); + storeAligned(pIt, 32, positions[2]); + storeAligned(pIt, 48, positions[3]); + +#if PX_PROFILE || PX_DEBUG + mNumCollisions += horizontalSum(accum.mNumCollisions); +#endif + } + + mAllocator.deallocate(triangles); +} + +template <typename Simd4f> +void cloth::SwCollision<Simd4f>::collideTriangles(const TriangleData* __restrict triangles, Simd4f* __restrict curPos, + ImpulseAccumulator& accum) +{ + Simd4f normalX, normalY, normalZ, normalD; + normalX = normalY = normalZ = normalD = simd4f(_0); + Simd4f minSqrLength = sMax; + + const TriangleData* __restrict tIt, *tEnd = triangles + mClothData.mNumTriangles; + for(tIt = triangles; tIt != tEnd; ++tIt) + { + Simd4f base = loadAligned(&tIt->base.x); + Simd4f edge0 = loadAligned(&tIt->edge0.x); + Simd4f edge1 = loadAligned(&tIt->edge1.x); + Simd4f normal = loadAligned(&tIt->normal.x); + Simd4f aux = loadAligned(&tIt->det); + + Simd4f dx = curPos[0] - splat<0>(base); + Simd4f dy = curPos[1] - splat<1>(base); + Simd4f dz = curPos[2] - splat<2>(base); + + Simd4f e0x = splat<0>(edge0); + Simd4f e0y = splat<1>(edge0); + Simd4f e0z = splat<2>(edge0); + + Simd4f e1x = splat<0>(edge1); + Simd4f e1y = splat<1>(edge1); + Simd4f e1z = splat<2>(edge1); + + Simd4f nx = splat<0>(normal); + Simd4f ny = splat<1>(normal); + Simd4f nz = splat<2>(normal); + + Simd4f deltaDotEdge0 = dx * e0x + dy * e0y + dz * e0z; + Simd4f deltaDotEdge1 
= dx * e1x + dy * e1y + dz * e1z; + Simd4f deltaDotNormal = dx * nx + dy * ny + dz * nz; + + Simd4f edge0DotEdge1 = splat<3>(base); + Simd4f edge0SqrLength = splat<3>(edge0); + Simd4f edge1SqrLength = splat<3>(edge1); + + Simd4f s = edge1SqrLength * deltaDotEdge0 - edge0DotEdge1 * deltaDotEdge1; + Simd4f t = edge0SqrLength * deltaDotEdge1 - edge0DotEdge1 * deltaDotEdge0; + + Simd4f sPositive = s > simd4f(_0); + Simd4f tPositive = t > simd4f(_0); + + Simd4f det = splat<0>(aux); + + s = select(tPositive, s * det, deltaDotEdge0 * splat<2>(aux)); + t = select(sPositive, t * det, deltaDotEdge1 * splat<3>(aux)); + + Simd4f clamp = simd4f(_1) < s + t; + Simd4f numerator = edge1SqrLength - edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1; + + s = select(clamp, numerator * splat<1>(aux), s); + + s = max(simd4f(_0), min(simd4f(_1), s)); + t = max(simd4f(_0), min(simd4f(_1) - s, t)); + + dx = dx - e0x * s - e1x * t; + dy = dy - e0y * s - e1y * t; + dz = dz - e0z * s - e1z * t; + + Simd4f sqrLength = dx * dx + dy * dy + dz * dz; + + // slightly increase distance for colliding triangles + Simd4f slack = (simd4f(_0) > deltaDotNormal) & simd4f(1e-4f); + sqrLength = sqrLength + sqrLength * slack; + + Simd4f mask = sqrLength < minSqrLength; + + normalX = select(mask, nx, normalX); + normalY = select(mask, ny, normalY); + normalZ = select(mask, nz, normalZ); + normalD = select(mask, deltaDotNormal, normalD); + + minSqrLength = min(sqrLength, minSqrLength); + } + + Simd4f mask; + if(!anyGreater(simd4f(_0), normalD, mask)) + return; + + accum.subtract(normalX, normalY, normalZ, normalD, mask); +} + +// explicit template instantiation +#if NVMATH_SIMD +template class cloth::SwCollision<Simd4f>; +#endif +#if NVMATH_SCALAR +template class cloth::SwCollision<Scalar4f>; +#endif +/* +namespace +{ + using namespace cloth; + + int test() + { + Simd4f vertices[] = { + simd4f(0.0f, 0.0f, 0.0f, 0.0f), + simd4f(0.1f, 0.0f, 0.0f, 0.0f), + simd4f(0.0f, 0.1f, 0.0f, 0.0f) + }; + TriangleData triangle; + 
generateTriangles<Simd4f>(&triangle, &*vertices, 1); + + char buffer[1000]; + SwKernelAllocator alloc(buffer, 1000); + + SwClothData* cloth = static_cast<SwClothData*>(malloc(sizeof(SwClothData))); + memset(cloth, 0, sizeof(SwClothData)); + cloth->mNumTriangles = 1; + + SwCollision<Simd4f> collision(*cloth, alloc); + SwCollision<Simd4f>::ImpulseAccumulator accum; + + Simd4f particles[4] = {}; + for(float y=-0.1f; y < 0.0f; y += 0.2f) + { + for(float x=-0.1f; x < 0.0f; x += 0.2f) + { + particles[0] = simd4f(x); + particles[1] = simd4f(y); + particles[2] = simd4f(-1.0f); + + collision.collideTriangles(&triangle, particles, accum); + } + } + + return 0; + } + + static int blah = test(); +} +*/ diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h new file mode 100644 index 00000000..bf5f3177 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+
+#pragma once
+
+#include "Types.h"
+#include "StackAllocator.h"
+#include "Simd4i.h"
+
+#if PX_PROFILE
+#include "PxProfileEventSender.h"
+#include "PxProfileZone.h"
+#else
+namespace physx
+{
+namespace profile
+{
+	class PxProfileZone;
+}
+}
+#endif
+
+namespace nvidia
+{
+namespace cloth
+{
+
+#if PX_PROFILE
+
+// RAII profiling scope: sends a start event on construction and the matching
+// stop event on destruction through the PxProfileZone event sender. The
+// object's own address is used as the event context id.
+struct ProfileZone
+{
+	// name: event name, resolved to an event id via the profiler.
+	// profiler: may be NULL, in which case every call is a no-op.
+	ProfileZone(const char* name, profile::PxProfileZone* profiler)
+	: mSender(profiler), mEventId(profiler ? profiler->getEventIdForName(name) : uint16_t(-1))
+	{
+		if(mSender)
+			mSender->startEvent(mEventId, (uint64_t)intptr_t(this));
+	}
+
+	~ProfileZone()
+	{
+		if(mSender)
+			mSender->stopEvent(mEventId, (uint64_t)intptr_t(this));
+	}
+
+	// attach an integer payload to this event (e.g. a collision count)
+	void setValue(int64_t value) const
+	{
+		if(mSender)
+			mSender->eventValue(mEventId, (uint64_t)intptr_t(this), value);
+	}
+
+	profile::PxProfileEventSender* mSender;
+	uint16_t mEventId;
+};
+
+#else // PX_PROFILE
+
+// no-op stand-in used when profiling support is compiled out
+struct ProfileZone
+{
+	ProfileZone(const char*, profile::PxProfileZone*)
+	{
+	}
+	void setValue(int64_t) const
+	{
+	}
+};
+
+#endif // PX_PROFILE
+
+class SwCloth;
+struct SwClothData;
+template <typename>
+struct IterationState;
+struct IndexPair;
+struct SphereData;
+struct ConeData;
+struct TriangleData;
+
+typedef StackAllocator<16> SwKernelAllocator;
+
+/**
+   Collision handler for SwSolver.
 */
+template <typename Simd4f>
+class SwCollision
+{
+	typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;
+
+  public:
+	// per-lane bitmasks selecting which cone/sphere shapes a particle
+	// still needs to be tested against
+	struct ShapeMask
+	{
+		Simd4i mCones;
+		Simd4i mSpheres;
+
+		ShapeMask& operator=(const ShapeMask&);
+		ShapeMask& operator&=(const ShapeMask&);
+	};
+
+	// sphere/cone arrays for one collision snapshot (used as a previous
+	// and a current copy, see mPrevData/mCurData below)
+	struct CollisionData
+	{
+		CollisionData();
+		SphereData* mSpheres;
+		ConeData* mCones;
+	};
+
+	struct ImpulseAccumulator;
+
+  public:
+	SwCollision(SwClothData& clothData, SwKernelAllocator& alloc, profile::PxProfileZone* profiler);
+	~SwCollision();
+
+	// run collision handling for one solver iteration
+	void operator()(const IterationState<Simd4f>& state);
+
+	static size_t estimateTemporaryMemory(const SwCloth& cloth);
+	static size_t estimatePersistentMemory(const SwCloth& cloth);
+
+  private:
+	SwCollision& operator=(const SwCollision&); // not implemented
+	void allocate(CollisionData&);
+	void deallocate(const CollisionData&);
+
+	void computeBounds();
+
+	void buildSphereAcceleration(const SphereData*);
+	void buildConeAcceleration();
+	static void mergeAcceleration(uint32_t*);
+	bool buildAcceleration();
+
+	static ShapeMask getShapeMask(const Simd4f&, const Simd4i*, const Simd4i*);
+	ShapeMask getShapeMask(const Simd4f*) const;
+	ShapeMask getShapeMask(const Simd4f*, const Simd4f*) const;
+
+	// discrete variants: take the current positions only
+	void collideSpheres(const Simd4i&, const Simd4f*, ImpulseAccumulator&) const;
+	Simd4i collideCones(const Simd4f*, ImpulseAccumulator&) const;
+
+	// continuous variants: take previous and current positions
+	// (called from collideContinuousParticles)
+	void collideSpheres(const Simd4i&, const Simd4f*, Simd4f*, ImpulseAccumulator&) const;
+	Simd4i collideCones(const Simd4f*, Simd4f*, ImpulseAccumulator&) const;
+
+	void collideParticles();
+	void collideVirtualParticles();
+	void collideContinuousParticles();
+
+	void collideConvexes(const IterationState<Simd4f>&);
+	void collideConvexes(const Simd4f*, Simd4f*, ImpulseAccumulator&);
+
+	void collideTriangles(const IterationState<Simd4f>&);
+	void collideTriangles(const TriangleData*, Simd4f*, ImpulseAccumulator&);
+
+  public:
+	// acceleration structure
+	static const uint32_t sGridSize = 8;
 	Simd4i mSphereGrid[6 * sGridSize / 4];
+	Simd4i mConeGrid[6 * sGridSize / 4];
+	Simd4f mGridScale, mGridBias;
+
+	CollisionData mPrevData;
+	CollisionData mCurData;
+
+	SwClothData& mClothData;
+	SwKernelAllocator& mAllocator;
+
+	// accumulated only in PX_PROFILE/PX_DEBUG builds (see SwCollision.cpp)
+	uint32_t mNumCollisions;
+
+	profile::PxProfileZone* mProfiler;
+
+	static const Simd4f sSkeletonWidth;
+};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h
new file mode 100644
index 00000000..5e098922
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Simd4i.h"
+
+// platform specific helpers
+
+namespace nvidia
+{
+namespace cloth
+{
+
+// index of the (lowest) set bit in mask
+inline uint32_t findBitSet(uint32_t mask);
+
+// intFloor(-1.0f) returns -2 on SSE and NEON!
+inline Simd4i intFloor(const Simd4f& v); + +inline Simd4i horizontalOr(Simd4i mask); + +template <typename> +struct Gather; + +#if NVMATH_SIMD +template <> +struct Gather<Simd4i> +{ + inline Gather(const Simd4i& index); + inline Simd4i operator()(const Simd4i*) const; + +#if NVMATH_SSE2 + Simd4i mSelectQ, mSelectD, mSelectW; + static const Simd4i sIntSignBit; + static const Simd4i sSignedMask; +#elif NVMATH_NEON + Simd4i mPermute; + static const Simd4i sPack; + static const Simd4i sOffset; + static const Simd4i sShift; + static const Simd4i sMask; +#endif + Simd4i mOutOfRange; +}; +#endif + +} // namespace cloth +} // namespace nvidia + +#if NVMATH_SSE2 +#include "sse2/SwCollisionHelpers.h" +#elif NVMATH_NEON +#include "neon/SwCollisionHelpers.h" +#endif + +#if NVMATH_SCALAR +#include "scalar/SwCollisionHelpers.h" +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp new file mode 100644 index 00000000..0d527dbf --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#include "PxAssert.h"
#include "SwFabric.h"
#include "SwFactory.h"
#include "PsSort.h"
#include "limits.h" // for USHRT_MAX

#include "PsUtilities.h"

using namespace nvidia;
using namespace physx::shdfnd;

cloth::SwTether::SwTether(uint16_t anchor, float length) : mAnchor(anchor), mLength(length)
{
}

// Builds the shared constraint topology for a fabric: copies phases/sets,
// pads the constraint arrays per set to a multiple of the SIMD width, and
// registers itself with the owning factory.
cloth::SwFabric::SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases,
                          Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices,
                          Range<const uint32_t> anchors, Range<const float> tetherLengths, uint32_t id)
: mFactory(factory), mNumParticles(numParticles), mTetherLengthScale(1.0f), mId(id)
{
    // should no longer be prefixed with 0
    PX_ASSERT(sets.front() != 0);

#if PX_WINDOWS_FAMILY
    const uint32_t kSimdWidth = 8; // avx
#else
    const uint32_t kSimdWidth = 4;
#endif

    // consistency check: sets are offsets into restvalues, two indices per constraint
    PX_ASSERT(sets.back() == restvalues.size());
    PX_ASSERT(restvalues.size() * 2 == indices.size());
    PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end()));
    // padding indices (mNumParticles + k) must still fit in uint16_t
    PX_ASSERT(mNumParticles + kSimdWidth - 1 <= USHRT_MAX);

    mPhases.assign(phases.begin(), phases.end());
    mSets.reserve(sets.size() + 1);
    mSets.pushBack(0); // prefix with 0

    mOriginalNumRestvalues = uint32_t(restvalues.size());

    // pad indices for SIMD
    const uint32_t* iBegin = indices.begin(), *iIt = iBegin;
    const float* rBegin = restvalues.begin(), *rIt = rBegin;
    const uint32_t* sIt, *sEnd = sets.end();
    for(sIt = sets.begin(); sIt != sEnd; ++sIt)
    {
        const float* rEnd = rBegin + *sIt;
        const uint32_t* iEnd = iBegin + *sIt * 2;
        uint32_t numConstraints = uint32_t(rEnd - rIt);

        for(; rIt != rEnd; ++rIt)
            mRestvalues.pushBack(*rIt);

        for(; iIt != iEnd; ++iIt)
            mIndices.pushBack(uint16_t(*iIt));

        // add dummy constraints until the set size is a multiple of kSimdWidth;
        // the loop condition masks with (kSimdWidth - 1), i.e. runs while not aligned.
        // Dummy rest value is -FLT_MAX and both indices point past the real particles.
        for(; numConstraints &= kSimdWidth - 1; ++numConstraints)
        {
            mRestvalues.pushBack(-FLT_MAX);
            uint32_t index = mNumParticles + numConstraints - 1;
            mIndices.pushBack(uint16_t(index));
            mIndices.pushBack(uint16_t(index));
        }

        mSets.pushBack(uint32_t(mRestvalues.size()));
    }

    // trim overallocations (swap with exactly-sized copies)
    RestvalueContainer(mRestvalues.begin(), mRestvalues.end()).swap(mRestvalues);
    Vector<uint16_t>::Type(mIndices.begin(), mIndices.end()).swap(mIndices);

    // tethers
    PX_ASSERT(anchors.size() == tetherLengths.size());

    // pad to allow for direct 16 byte (unaligned) loads
    mTethers.reserve(anchors.size() + 2);
    for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront())
        mTethers.pushBack(SwTether(uint16_t(anchors.front()), tetherLengths.front()));

    mFactory.mFabrics.pushBack(this);
}

// unregister from the owning factory
cloth::SwFabric::~SwFabric()
{
    Vector<SwFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this);
    PX_ASSERT(fIt != mFactory.mFabrics.end());
    mFactory.mFabrics.replaceWithLast(fIt);
}

cloth::Factory& cloth::SwFabric::getFactory() const
{
    return mFactory;
}

uint32_t cloth::SwFabric::getNumPhases() const
{
    return uint32_t(mPhases.size());
}

// counts exclude the SIMD padding added by the constructor
uint32_t cloth::SwFabric::getNumRestvalues() const
{
    return mOriginalNumRestvalues;
}

uint32_t cloth::SwFabric::getNumSets() const
{
    return uint32_t(mSets.size() - 1); // minus the 0 prefix
}

uint32_t cloth::SwFabric::getNumIndices() const
{
    return 2 * mOriginalNumRestvalues;
}

uint32_t cloth::SwFabric::getNumParticles() const
{
    return mNumParticles;
}

uint32_t cloth::SwFabric::getNumTethers() const
{
    return uint32_t(mTethers.size());
}

// scales rest lengths in place (including padding entries, which stay -inf-like)
void cloth::SwFabric::scaleRestvalues(float scale)
{
    RestvalueContainer::Iterator rIt, rEnd = mRestvalues.end();
    for(rIt = mRestvalues.begin(); rIt != rEnd; ++rIt)
        *rIt *= scale;
}

// tether lengths are scaled lazily via a single factor, not per element
void cloth::SwFabric::scaleTetherLengths(float scale)
{
    mTetherLengthScale *= scale;
}
// ======================================================================
// File: src/SwFabric.h
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Allocator.h"
#include "Fabric.h"
#include "Types.h"
#include "Range.h"
#include "PxVec4.h"

namespace nvidia
{

namespace cloth
{

class SwFactory;

// one tether constraint: anchor particle index and rest length
struct SwTether
{
    SwTether(uint16_t, float);
    uint16_t mAnchor;
    float mLength;
};

// CPU fabric: constraint topology shared between cloth instances
class SwFabric : public UserAllocated, public Fabric
{
  public:
#if PX_WINDOWS_FAMILY
    typedef AlignedVector<float, 32>::Type RestvalueContainer; // avx
#else
    typedef AlignedVector<float, 16>::Type RestvalueContainer;
#endif

    SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
             Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
             Range<const float> tetherLengths, uint32_t id);

    // NOTE(review): declared but presumably not defined (class holds a reference
    // member, so assignment cannot be implemented) — declaration suppresses the
    // implicit operator; confirm no definition exists elsewhere.
    SwFabric& operator=(const SwFabric&);

    virtual ~SwFabric();

    virtual Factory& getFactory() const;

    virtual uint32_t getNumPhases() const;
    virtual uint32_t getNumRestvalues() const;

    virtual uint32_t getNumSets() const;
    virtual uint32_t getNumIndices() const;

    virtual uint32_t getNumParticles() const;

    virtual uint32_t getNumTethers() const;

    virtual void scaleRestvalues(float);
    virtual void scaleTetherLengths(float);

  public:
    SwFactory& mFactory;

    uint32_t mNumParticles;

    Vector<uint32_t>::Type mPhases; // index of set to use
    Vector<uint32_t>::Type mSets;   // offset of first restvalue, with 0 prefix

    RestvalueContainer mRestvalues;  // rest values (edge length), SIMD-padded
    Vector<uint16_t>::Type mIndices; // particle index pairs

    Vector<SwTether>::Type mTethers;
    float mTetherLengthScale; // lazy scale factor applied on extraction

    uint32_t mId;

    uint32_t mOriginalNumRestvalues; // count before SIMD padding

} PX_ALIGN_SUFFIX(16);
}
}

// ======================================================================
// File: src/SwFactory.cpp
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#include "SwFactory.h" +#include "SwFabric.h" +#include "SwCloth.h" +#include "SwSolver.h" +#include "ClothImpl.h" +#include <string.h> // for memcpy +#include "PsIntrinsics.h" + +using namespace nvidia; +using namespace nvidia; + +namespace nvidia +{ +namespace cloth +{ +// defined in Factory.cpp +uint32_t getNextFabricId(); +} +} + +cloth::SwFactory::SwFactory() : Factory(CPU) +{ +} + +cloth::SwFactory::~SwFactory() +{ +} + +cloth::Fabric* cloth::SwFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases, + Range<const uint32_t> sets, Range<const float> restvalues, + Range<const uint32_t> indices, Range<const uint32_t> anchors, + Range<const float> tetherLengths) +{ + return new SwFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths, + getNextFabricId()); +} + +cloth::Cloth* cloth::SwFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric) +{ + return new SwClothImpl(*this, fabric, particles); +} + +cloth::Solver* cloth::SwFactory::createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr) +{ +#ifdef PX_PHYSX_GPU_EXPORTS + // SwSolver not defined in PhysXGpu project + PX_UNUSED(profiler); + PX_UNUSED(taskMgr); + return 0; +#else + return new SwSolver(profiler, taskMgr); +#endif +} + +cloth::Cloth* cloth::SwFactory::clone(const Cloth& cloth) +{ + if(cloth.getFactory().getPlatform() != Factory::CPU) + return cloth.clone(*this); // forward to CuCloth + + // copy construct + return new SwClothImpl(*this, static_cast<const SwClothImpl&>(cloth)); +} + +void cloth::SwFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets, + Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors, + Range<float> tetherLengths) const +{ + const SwFabric& swFabric = static_cast<const SwFabric&>(fabric); + + PX_ASSERT(phases.empty() || phases.size() == swFabric.getNumPhases()); + PX_ASSERT(restvalues.empty() || restvalues.size() == 
swFabric.getNumRestvalues()); + PX_ASSERT(sets.empty() || sets.size() == swFabric.getNumSets()); + PX_ASSERT(indices.empty() || indices.size() == swFabric.getNumIndices()); + PX_ASSERT(anchors.empty() || anchors.size() == swFabric.getNumTethers()); + PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == swFabric.getNumTethers()); + + for(uint32_t i = 0; !phases.empty(); ++i, phases.popFront()) + phases.front() = swFabric.mPhases[i]; + + const uint32_t* sEnd = swFabric.mSets.end(), *sIt; + const float* rBegin = swFabric.mRestvalues.begin(), *rIt = rBegin; + const uint16_t* iIt = swFabric.mIndices.begin(); + + uint32_t* sDst = sets.begin(); + float* rDst = restvalues.begin(); + uint32_t* iDst = indices.begin(); + + uint32_t numConstraints = 0; + for(sIt = swFabric.mSets.begin(); ++sIt != sEnd;) + { + const float* rEnd = rBegin + *sIt; + for(; rIt != rEnd; ++rIt) + { + uint16_t i0 = *iIt++; + uint16_t i1 = *iIt++; + + if(PxMax(i0, i1) >= swFabric.mNumParticles) + continue; + + if(!restvalues.empty()) + *rDst++ = *rIt; + + if(!indices.empty()) + { + *iDst++ = i0; + *iDst++ = i1; + } + + ++numConstraints; + } + + if(!sets.empty()) + *sDst++ = numConstraints; + } + + for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront()) + anchors.front() = swFabric.mTethers[i].mAnchor; + + for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront()) + tetherLengths.front() = swFabric.mTethers[i].mLength * swFabric.mTetherLengthScale; +} + +void cloth::SwFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules, + Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + PX_ASSERT(spheres.empty() || spheres.size() == swCloth.mStartCollisionSpheres.size()); + PX_ASSERT(capsules.empty() || capsules.size() == swCloth.mCapsuleIndices.size() * 2); + PX_ASSERT(planes.empty() || 
planes.size() == swCloth.mStartCollisionPlanes.size()); + PX_ASSERT(convexes.empty() || convexes.size() == swCloth.mConvexMasks.size()); + PX_ASSERT(triangles.empty() || triangles.size() == swCloth.mStartCollisionTriangles.size()); + + if(!swCloth.mStartCollisionSpheres.empty() && !spheres.empty()) + memcpy(spheres.begin(), &swCloth.mStartCollisionSpheres.front(), + swCloth.mStartCollisionSpheres.size() * sizeof(PxVec4)); + + if(!swCloth.mCapsuleIndices.empty() && !capsules.empty()) + memcpy(capsules.begin(), &swCloth.mCapsuleIndices.front(), swCloth.mCapsuleIndices.size() * sizeof(IndexPair)); + + if(!swCloth.mStartCollisionPlanes.empty() && !planes.empty()) + memcpy(planes.begin(), &swCloth.mStartCollisionPlanes.front(), + swCloth.mStartCollisionPlanes.size() * sizeof(PxVec4)); + + if(!swCloth.mConvexMasks.empty() && !convexes.empty()) + memcpy(convexes.begin(), &swCloth.mConvexMasks.front(), swCloth.mConvexMasks.size() * sizeof(uint32_t)); + + if(!swCloth.mStartCollisionTriangles.empty() && !triangles.empty()) + memcpy(triangles.begin(), &swCloth.mStartCollisionTriangles.front(), + swCloth.mStartCollisionTriangles.size() * sizeof(PxVec3)); +} + +void cloth::SwFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + Vec4fAlignedVector const& srcConstraints = !swCloth.mMotionConstraints.mTarget.empty() + ? 
swCloth.mMotionConstraints.mTarget + : swCloth.mMotionConstraints.mStart; + + if(!srcConstraints.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + Vec4fAlignedVector const& srcConstraints = !swCloth.mSeparationConstraints.mTarget.empty() + ? swCloth.mSeparationConstraints.mTarget + : swCloth.mSeparationConstraints.mStart; + + if(!srcConstraints.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destConstraints.size() == srcConstraints.size()); + + memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const +{ + PX_ASSERT(&cloth.getFactory() == this); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + if(!swCloth.mParticleAccelerations.empty()) + { + // make sure dest array is big enough + PX_ASSERT(destAccelerations.size() == swCloth.mParticleAccelerations.size()); + + memcpy(destAccelerations.begin(), &swCloth.mParticleAccelerations.front(), + swCloth.mParticleAccelerations.size() * sizeof(PxVec4)); + } +} + +void cloth::SwFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> indices, Range<PxVec3> weights) const +{ + PX_ASSERT(this == &cloth.getFactory()); + + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + + uint32_t numIndices = cloth.getNumVirtualParticles(); + uint32_t numWeights = cloth.getNumVirtualParticleWeights(); + + PX_ASSERT(indices.size() == numIndices || indices.empty()); + PX_ASSERT(weights.size() == 
numWeights || weights.empty()); + + if(weights.size() == numWeights) + { + PxVec3* wDestIt = reinterpret_cast<PxVec3*>(weights.begin()); + + // convert weights from vec4 to vec3 + cloth::Vec4fAlignedVector::ConstIterator wIt = swCloth.mVirtualParticleWeights.begin(); + cloth::Vec4fAlignedVector::ConstIterator wEnd = wIt + numWeights; + + for(; wIt != wEnd; ++wIt, ++wDestIt) + *wDestIt = PxVec3(wIt->x, wIt->y, wIt->z); + + PX_ASSERT(wDestIt == weights.end()); + } + if(indices.size() == numIndices) + { + // convert indices + Vec4u* iDestIt = reinterpret_cast<Vec4u*>(indices.begin()); + Vector<Vec4us>::Type::ConstIterator iIt = swCloth.mVirtualParticleIndices.begin(); + Vector<Vec4us>::Type::ConstIterator iEnd = swCloth.mVirtualParticleIndices.end(); + + uint32_t numParticles = uint32_t(swCloth.mCurParticles.size()); + + for(; iIt != iEnd; ++iIt) + { + // skip dummy indices + if(iIt->x < numParticles) + // byte offset to element index + *iDestIt++ = Vec4u(*iIt); + } + + PX_ASSERT(&array(*iDestIt) == indices.end()); + } +} + +void cloth::SwFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const +{ + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + PX_ASSERT(destIndices.size() == swCloth.mSelfCollisionIndices.size()); + intrinsics::memCopy(destIndices.begin(), swCloth.mSelfCollisionIndices.begin(), destIndices.size() * sizeof(uint32_t)); +} + +void cloth::SwFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const +{ + const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth; + PX_ASSERT(destRestPositions.size() == swCloth.mRestPositions.size()); + intrinsics::memCopy(destRestPositions.begin(), swCloth.mRestPositions.begin(), destRestPositions.size() * sizeof(PxVec4)); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h new file mode 100644 index 00000000..a078add0 --- 
// ======================================================================
// File: src/SwFactory.h
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include "Factory.h"
#include "Allocator.h"

namespace nvidia
{

namespace cloth
{

class SwFabric;
class SwCloth;
template <typename>
class ClothImpl;

// CPU implementation of the abstract cloth Factory interface
class SwFactory : public UserAllocated, public Factory
{
  public:
    typedef SwFabric FabricType;
    typedef ClothImpl<SwCloth> ImplType;

    SwFactory();
    virtual ~SwFactory();

    virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
                                 Range<const float> restvalues, Range<const uint32_t> indices,
                                 Range<const uint32_t> anchors, Range<const float> tetherLengths);

    virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);

    virtual Solver* createSolver(profile::PxProfileZone*, PxTaskManager*);

    virtual Cloth* clone(const Cloth& cloth);

    // extraction functions copy internal data back into caller-provided ranges;
    // see SwFactory.cpp for per-field semantics (empty range == skip field)
    virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
                                   Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
                                   Range<float> tetherLengths) const;

    virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
                                      Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;

    virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

    virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

    virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;

    virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
                                         Range<PxVec3> destWeights) const;

    virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;

    virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;

  public:
    // all live fabrics created by this factory (registered/unregistered by SwFabric)
    Vector<SwFabric*>::Type mFabrics;
};
}
}

// ======================================================================
// File: src/SwInterCollision.cpp
// ======================================================================
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#include "SwInterCollision.h" +#include "PsIntrinsics.h" +#include "SwCollision.h" //temp fix, needed by SwCollisionHelper implementations +#include "Simd4f.h" +#include "SwCollisionHelpers.h" +#include "BoundingBox.h" +#include "PsSort.h" +#include "PsIntrinsics.h" + +#pragma warning(disable:4127) + +using namespace nvidia; + +namespace +{ +typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + +const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON); +const Simd4fConstant sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +// returns sorted indices, output needs to be at least 2*(last-first)+1024 +void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out) +{ + uint32_t n = uint32_t(last - first); + + uint32_t* buffer = out + 2 * n; + uint32_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + + intrinsics::memZero(buffer, 1024 * sizeof(uint32_t)); + + // build 3 histograms in one pass + for(const uint32_t* __restrict it = first; it != last; ++it) + { + uint32_t key = *it; + ++histograms[0][0xff & key]; + ++histograms[1][0xff & (key >> 8)]; + ++histograms[2][0xff & (key >> 16)]; + ++histograms[3][key >> 24]; + } + + // convert histograms to offset tables in-place + uint32_t sums[4] = {}; + for(uint32_t i = 0; i < 256; ++i) + { + uint32_t temp0 = histograms[0][i] + sums[0]; + histograms[0][i] = sums[0], sums[0] = temp0; + + uint32_t temp1 = histograms[1][i] + sums[1]; + histograms[1][i] = sums[1], sums[1] = temp1; + + uint32_t temp2 = histograms[2][i] + sums[2]; + histograms[2][i] = sums[2], sums[2] = temp2; + + uint32_t temp3 = histograms[3][i] + sums[3]; + histograms[3][i] = sums[3], sums[3] = temp3; + } + + PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n); + +#if PX_DEBUG + memset(out, 0xff, 2 * n * sizeof(uint32_t)); +#endif + + // sort 8 bits per pass + + 
uint32_t* __restrict indices[] = { out, out + n }; + + for(uint32_t i = 0; i != n; ++i) + indices[1][histograms[0][0xff & first[i]]++] = i; + + for(uint32_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index; + + for(uint32_t i = 0, index; index = indices[0][i], i != n; ++i) + indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index; + + for(uint32_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[3][first[index] >> 24]++] = index; +} + +template <typename Simd4f> +uint32_t longestAxis(const Simd4f& edgeLength) +{ + const float* e = array(edgeLength); + + if(e[0] > e[1]) + return uint32_t(e[0] > e[2] ? 0 : 2); + else + return uint32_t(e[1] > e[2] ? 1 : 2); +} +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::SwInterCollision(const cloth::SwInterCollisionData* instances, uint32_t n, float colDist, + float stiffness, uint32_t iterations, InterCollisionFilter filter, + cloth::SwKernelAllocator& alloc, profile::PxProfileZone* zone) +: mInstances(instances) +, mNumInstances(n) +, mClothIndices(NULL) +, mParticleIndices(NULL) +, mNumParticles(0) +, mTotalParticles(0) +, mFilter(filter) +, mAllocator(alloc) +, mProfiler(zone) +{ + PX_ASSERT(mFilter); + + mCollisionDistance = simd4f(colDist, colDist, colDist, 0.0f); + mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; + mStiffness = simd4f(stiffness); + mNumIterations = iterations; + + // calculate particle size + for(uint32_t i = 0; i < n; ++i) + mTotalParticles += instances[i].mNumParticles; +} + +template <typename Simd4f> +cloth::SwInterCollision<Simd4f>::~SwInterCollision() +{ +} + +namespace +{ +// multiple x by m leaving w component of x intact +template <typename Simd4f> +PX_INLINE Simd4f transform(const Simd4f m[4], const Simd4f& x) +{ + const Simd4f a = m[3] + splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2]; + return select(sMaskXYZ, a, x); +} + +// rotate x by m 
// (continued) rotate x by m, leaving the w component of x intact
template <typename Simd4f>
PX_INLINE Simd4f rotate(const Simd4f m[4], const Simd4f& x)
{
    const Simd4f a = splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2];
    return select(sMaskXYZ, a, x);
}

// orders cloth indices by the lower bound of their bounding box along one axis
template <typename Simd4f>
struct ClothSorter
{
    typedef cloth::BoundingBox<Simd4f> BoundingBox;

    ClothSorter(BoundingBox* bounds, uint32_t n, uint32_t axis) : mBounds(bounds), mNumBounds(n), mAxis(axis)
    {
    }

    bool operator()(uint32_t i, uint32_t j) const
    {
        PX_ASSERT(i < mNumBounds);
        PX_ASSERT(j < mNumBounds);

        return array(mBounds[i].mLower)[mAxis] < array(mBounds[j].mLower)[mAxis];
    }

    BoundingBox* mBounds;
    uint32_t mNumBounds;
    uint32_t mAxis;
};

// for the given cloth array this function calculates the set of particles
// which potentially interact, the potential colliders are returned with their
// cloth index and particle index in clothIndices and particleIndices, the
// function returns the number of potential colliders.
// Side effects: candidate particles (and their previous positions) are
// transformed to world space in-place, and overlapMasks[clothIndex] is filled
// with a bit mask of overlapping cloths.
template <typename Simd4f>
uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, const cloth::SwInterCollisionData* cEnd,
                                     Simd4f colDist, uint16_t* clothIndices, uint32_t* particleIndices,
                                     cloth::BoundingBox<Simd4f>& bounds, uint32_t* overlapMasks,
                                     cloth::InterCollisionFilter filter, cloth::SwKernelAllocator& allocator)
{
    using namespace cloth;

    typedef BoundingBox<Simd4f> BoundingBox;

    uint32_t numParticles = 0;
    const uint32_t numCloths = uint32_t(cEnd - cBegin);

    // bounds of each cloth object in world space
    BoundingBox* const clothBounds = (BoundingBox*)(allocator.allocate(numCloths * sizeof(BoundingBox)));
    BoundingBox* const overlapBounds = (BoundingBox*)(allocator.allocate(numCloths * sizeof(BoundingBox)));

    // union of all cloth world bounds
    BoundingBox totalClothBounds = emptyBounds<Simd4f>();

    uint32_t* sortedIndices = (uint32_t*)allocator.allocate(numCloths * sizeof(uint32_t));

    for(uint32_t i = 0; i < numCloths; ++i)
    {
        const SwInterCollisionData& c = cBegin[i];

        // inflate local bounds by the collision distance, then move to world space
        PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
        PX_ASSERT(!lcBounds.isEmpty());
        PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose, lcBounds);

        BoundingBox cBounds = {(Simd4f)simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f),
                               (Simd4f)simd4f(cWorld.maximum.x, cWorld.maximum.y, cWorld.maximum.z, 0.0f) };

        sortedIndices[i] = i;
        clothBounds[i] = cBounds;

        totalClothBounds = expandBounds(totalClothBounds, cBounds);
    }

    // sort indices by their minimum extent on the longest axis (sweep and prune)
    const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower);

    ClothSorter<Simd4f> predicate(clothBounds, numCloths, sweepAxis);
    nvidia::sort(sortedIndices, numCloths, predicate);

    for(uint32_t i = 0; i < numCloths; ++i)
    {
        PX_ASSERT(sortedIndices[i] < numCloths);

        const SwInterCollisionData& a = cBegin[sortedIndices[i]];

        // local bounds (inflated by collision distance)
        const Simd4f aCenter = load(reinterpret_cast<const float*>(&a.mBoundsCenter));
        const Simd4f aHalfExtent = load(reinterpret_cast<const float*>(&a.mBoundsHalfExtent)) + colDist;
        const BoundingBox aBounds = { aCenter - aHalfExtent, aCenter + aHalfExtent };

        const PxMat44 aToWorld(a.mGlobalPose);
        const PxTransform aToLocal(a.mGlobalPose.getInverse());

        const float axisMin = array(clothBounds[sortedIndices[i]].mLower)[sweepAxis];
        const float axisMax = array(clothBounds[sortedIndices[i]].mUpper)[sweepAxis];

        uint32_t overlapMask = 0;
        uint32_t numOverlaps = 0;

        // scan back to find first intersecting bounding box
        uint32_t startIndex = i;
        while(startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin)
            --startIndex;

        // compute all overlapping bounds
        for(uint32_t j = startIndex; j < numCloths; ++j)
        {
            // ignore self-collision
            if(i == j)
                continue;

            // early out if no more cloths along axis intersect us
            if(array(clothBounds[sortedIndices[j]].mLower)[sweepAxis] > axisMax)
                break;

            const SwInterCollisionData& b = cBegin[sortedIndices[j]];

            // check if collision between these shapes is filtered
            if(!filter(a.mUserData, b.mUserData))
                continue;

            // set mask bit for this cloth
            // NOTE(review): a 32-bit mask limits this to 32 cloth instances — confirm
            overlapMask |= 1 << sortedIndices[j];

            // transform bounds from b local space to local space of a
            PxBounds3 lcBounds =
                PxBounds3::centerExtents(b.mBoundsCenter, b.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
            PX_ASSERT(!lcBounds.isEmpty());
            PxBounds3 bLocal = PxBounds3::transformFast(aToLocal * b.mGlobalPose, lcBounds);

            BoundingBox bBounds = {(Simd4f)simd4f(bLocal.minimum.x, bLocal.minimum.y, bLocal.minimum.z, 0.0f),
                                   (Simd4f)simd4f(bLocal.maximum.x, bLocal.maximum.y, bLocal.maximum.z, 0.0f) };

            BoundingBox iBounds = intersectBounds(aBounds, bBounds);

            // setup bounding box w to make point containment test cheaper
            iBounds.mLower = (iBounds.mLower & sMaskXYZ) | ((Simd4f)sMaskW & simd4f(-FLT_MAX));
            iBounds.mUpper = (iBounds.mUpper & sMaskXYZ) | ((Simd4f)sMaskW & simd4f(FLT_MAX));

            if(!isEmptyBounds(iBounds))
                overlapBounds[numOverlaps++] = iBounds;
        }

        //----------------------------------------------------------------
        // cull all particles to overlapping bounds and transform particles to world space

        const uint32_t clothIndex = sortedIndices[i];
        overlapMasks[clothIndex] = overlapMask;

        Simd4f* pBegin = reinterpret_cast<Simd4f*>(a.mParticles);
        Simd4f* qBegin = reinterpret_cast<Simd4f*>(a.mPrevParticles);

        const Simd4f xform[4] = { load(reinterpret_cast<const float*>(&aToWorld.column0)),
                                  load(reinterpret_cast<const float*>(&aToWorld.column1)),
                                  load(reinterpret_cast<const float*>(&aToWorld.column2)),
                                  load(reinterpret_cast<const float*>(&aToWorld.column3)) };

        Simd4f impulseInvScale = recip(Simd4f(simd4f(cBegin[clothIndex].mImpulseScale)));

        for(uint32_t k = 0; k < a.mNumParticles; ++k)
        {
            // mIndices, when present, maps to the subset of particles to test
            Simd4f* pIt = a.mIndices ? pBegin + a.mIndices[k] : pBegin + k;
            Simd4f* qIt = a.mIndices ? qBegin + a.mIndices[k] : qBegin + k;

            const Simd4f p = *pIt;

            for(const BoundingBox* oIt = overlapBounds, *oEnd = overlapBounds + numOverlaps; oIt != oEnd; ++oIt)
            {
                // point in box test
                if(anyGreater(oIt->mLower, p) != 0)
                    continue;
                if(anyGreater(p, oIt->mUpper) != 0)
                    continue;

                // transform particle to world space in-place
                // (will be transformed back after collision)
                *pIt = transform(xform, p);

                // store the scaled impulse (pos - prevPos) rotated to world space
                Simd4f impulse = (p - *qIt) * impulseInvScale;
                *qIt = rotate(xform, impulse);

                // update world bounds
                bounds = expandBounds(bounds, pIt, pIt + 1);

                // add particle to output arrays
                clothIndices[numParticles] = uint16_t(clothIndex);
                particleIndices[numParticles] = uint32_t(pIt - pBegin);

                // output each particle only once
                ++numParticles;
                break;
            }
        }
    }

    allocator.deallocate(sortedIndices);
    allocator.deallocate(overlapBounds);
    allocator.deallocate(clothBounds);

    return numParticles;
}
}

// resolve the i-th potential collider back to its particle storage
template <typename Simd4f>
PX_INLINE Simd4f& cloth::SwInterCollision<Simd4f>::getParticle(uint32_t index)
{
    PX_ASSERT(index < mNumParticles);

    uint16_t clothIndex = mClothIndices[index];
    uint32_t particleIndex = mParticleIndices[index];

    PX_ASSERT(clothIndex < mNumInstances);

    return reinterpret_cast<Simd4f&>(mInstances[clothIndex].mParticles[particleIndex]);
}

template <typename Simd4f>
void cloth::SwInterCollision<Simd4f>::operator()()
{
    mNumTests = mNumCollisions = 0;

    mClothIndices = static_cast<uint16_t*>(mAllocator.allocate(sizeof(uint16_t) * mTotalParticles));
    mParticleIndices = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t) * mTotalParticles));
    // NOTE(review): sizeof(uint32_t*) looks like a typo for sizeof(uint32_t);
    // it over-allocates on 64-bit platforms (wasteful but harmless) — confirm.
    mOverlapMasks = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t*) * mNumInstances));

    for(uint32_t k = 0; k <
mNumIterations; ++k) + { + // world bounds of particles + BoundingBox<Simd4f> bounds = emptyBounds<Simd4f>(); + + // calculate potentially colliding set + { +#if PX_PROFILE + ProfileZone zone("cloth::SwInterCollision::BroadPhase", mProfiler); +#endif + + mNumParticles = + calculatePotentialColliders(mInstances, mInstances + mNumInstances, mCollisionDistance, mClothIndices, + mParticleIndices, bounds, mOverlapMasks, mFilter, mAllocator); + } + + // collide + if(mNumParticles) + { +#if PX_PROFILE + ProfileZone zone("cloth::SwInterCollision::Collide", mProfiler); +#endif + + Simd4f lowerBound = bounds.mLower; + Simd4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon); + + // sweep along longest axis + uint32_t sweepAxis = longestAxis(edgeLength); + uint32_t hashAxis0 = (sweepAxis + 1) % 3; + uint32_t hashAxis1 = (sweepAxis + 2) % 3; + + // reserve 0, 127, and 65535 for sentinel + Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); + array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; + + Simd4f one = simd4f(_1); + Simd4f gridSize = simd4f(254.0f); + array(gridSize)[sweepAxis] = 65534.0f; + + Simd4f gridScale = recipT<1>(cellSize); + Simd4f gridBias = -lowerBound * gridScale + simd4f(_1); + + void* buffer = mAllocator.allocate(getBufferSize(mNumParticles)); + + uint32_t* __restrict sortedIndices = reinterpret_cast<uint32_t*>(buffer); + uint32_t* __restrict sortedKeys = sortedIndices + mNumParticles; + uint32_t* __restrict keys = PxMax(sortedKeys + mNumParticles, sortedIndices + 2 * mNumParticles + 1024); + + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + // create keys + for(uint32_t i = 0; i < mNumParticles; ++i) + { + // grid coordinate + Simd4f indexf = getParticle(i) * gridScale + gridBias; + + // need to clamp index because shape collision potentially + // pushes particles outside of their original bounds + Simd4i indexi = intFloor(max(one, min(indexf, gridSize))); + + const int32_t* ptr = 
simdi::array(indexi); + keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); + } + + // compute sorted keys indices + radixSort(keys, keys + mNumParticles, sortedIndices); + + // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) + uint32_t firstColumnSize = sortedIndices[2 * mNumParticles + 769]; + + // sort keys + for(uint32_t i = 0; i < mNumParticles; ++i) + sortedKeys[i] = keys[sortedIndices[i]]; + sortedKeys[mNumParticles] = uint32_t(-1); // sentinel + + // calculate the number of buckets we need to search forward + const Simd4i data = intFloor(gridScale * mCollisionDistance); + uint32_t collisionDistance = uint32_t(2 + simdi::array(data)[sweepAxis]); + + // collide particles + collideParticles(sortedKeys, firstColumnSize, sortedIndices, mNumParticles, collisionDistance); + + mAllocator.deallocate(buffer); + } + + /* + // verify against brute force (disable collision response when testing) + uint32_t numCollisions = mNumCollisions; + mNumCollisions = 0; + + for(uint32_t i = 0; i < mNumParticles; ++i) + for(uint32_t j = i+1; j < mNumParticles; ++j) + if (mOverlapMasks[mClothIndices[i]] & (1 << mClothIndices[j])) + collideParticles(getParticle(i), getParticle(j)); + + static uint32_t iter = 0; ++iter; + if(numCollisions != mNumCollisions) + printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions); + */ + + // transform back to local space + { +#if PX_PROFILE + ProfileZone zone("cloth::SwInterCollision::PostTransform", mProfiler); +#endif + Simd4f toLocal[4], impulseScale; + uint16_t lastCloth = uint16_t(0xffff); + + for(uint32_t i = 0; i < mNumParticles; ++i) + { + uint16_t clothIndex = mClothIndices[i]; + const SwInterCollisionData* instance = mInstances + clothIndex; + + // todo: could pre-compute these inverses + if(clothIndex != lastCloth) + { + const PxMat44 xform(instance->mGlobalPose.getInverse()); + + toLocal[0] = load(reinterpret_cast<const float*>(&xform.column0)); + toLocal[1] = 
load(reinterpret_cast<const float*>(&xform.column1)); + toLocal[2] = load(reinterpret_cast<const float*>(&xform.column2)); + toLocal[3] = load(reinterpret_cast<const float*>(&xform.column3)); + + impulseScale = simd4f(instance->mImpulseScale); + + lastCloth = mClothIndices[i]; + } + + uint32_t particleIndex = mParticleIndices[i]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + particle = transform(toLocal, particle); + // avoid w becoming negative due to numerical inaccuracies + impulse = max(sZeroW, particle - rotate(toLocal, Simd4f(impulse * impulseScale))); + } + } + } + + mAllocator.deallocate(mOverlapMasks); + mAllocator.deallocate(mParticleIndices); + mAllocator.deallocate(mClothIndices); +} + +template <typename Simd4f> +size_t cloth::SwInterCollision<Simd4f>::estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n) +{ + // count total particles + uint32_t numParticles = 0; + for(uint32_t i = 0; i < n; ++i) + numParticles += cloths[i].mNumParticles; + + uint32_t boundsSize = 2 * n * sizeof(BoundingBox<Simd4f>) + n * sizeof(uint32_t); + uint32_t clothIndicesSize = numParticles * sizeof(uint16_t); + uint32_t particleIndicesSize = numParticles * sizeof(uint32_t); + uint32_t masksSize = n * sizeof(uint32_t); + + return boundsSize + clothIndicesSize + particleIndicesSize + masksSize + getBufferSize(numParticles); +} + +template <typename Simd4f> +size_t cloth::SwInterCollision<Simd4f>::getBufferSize(uint32_t numParticles) +{ + uint32_t keysSize = numParticles * sizeof(uint32_t); + uint32_t indicesSize = numParticles * sizeof(uint32_t); + uint32_t histogramSize = 1024 * sizeof(uint32_t); + + return keysSize + indicesSize + PxMax(indicesSize + histogramSize, keysSize); +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticle(uint32_t index) +{ + uint16_t clothIndex = mClothIndices[index]; + + if((1 
<< clothIndex) & ~mClothMask) + return; + + const SwInterCollisionData* instance = mInstances + clothIndex; + + uint32_t particleIndex = mParticleIndices[index]; + Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]); + + Simd4f diff = particle - mParticle; + Simd4f distSqr = dot3(diff, diff); + +#if PX_DEBUG + ++mNumTests; +#endif + + if(allGreater(distSqr, mCollisionSquareDistance)) + return; + + Simd4f w0 = splat<3>(mParticle); + Simd4f w1 = splat<3>(particle); + + Simd4f ratio = mCollisionDistance * rsqrtT<1>(distSqr); + Simd4f scale = mStiffness * recipT<1>(sEpsilon + w0 + w1); + Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + + mParticle = mParticle + delta * w0; + particle = particle - delta * w1; + + Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]); + + mImpulse = mImpulse + delta * w0; + impulse = impulse - delta * w1; + +#if PX_DEBUG || PX_PROFILE + ++mNumCollisions; +#endif +} + +template <typename Simd4f> +void cloth::SwInterCollision<Simd4f>::collideParticles(const uint32_t* keys, uint32_t firstColumnSize, + const uint32_t* indices, uint32_t numParticles, + uint32_t collisionDistance) +{ + const uint32_t bucketMask = uint16_t(-1); + + const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 }; + + const uint32_t* __restrict kFirst[5]; + const uint32_t* __restrict kLast[5]; + + { + // optimization: scan forward iterator starting points once instead of 9 times + const uint32_t* __restrict kIt = keys; + + uint32_t key = *kIt; + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + kFirst[0] = kIt; + while(*kIt < lastKey) + ++kIt; + kLast[0] = kIt; + + for(uint32_t k = 1; k < 5; ++k) + { + for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kFirst[k] = kIt; + + for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;) + ++kIt; + kLast[k] = kIt; + + // jump 
forward once to second column + kIt = keys + firstColumnSize; + firstColumnSize = 0; + } + } + + const uint32_t* __restrict iIt = indices; + const uint32_t* __restrict iEnd = indices + numParticles; + + const uint32_t* __restrict jIt; + const uint32_t* __restrict jEnd; + + for(; iIt != iEnd; ++iIt, ++kFirst[0]) + { + // load current particle once outside of inner loop + uint32_t index = *iIt; + PX_ASSERT(index < mNumParticles); + mClothIndex = mClothIndices[index]; + PX_ASSERT(mClothIndex < mNumInstances); + mClothMask = mOverlapMasks[mClothIndex]; + + const SwInterCollisionData* instance = mInstances + mClothIndex; + + mParticleIndex = mParticleIndices[index]; + mParticle = reinterpret_cast<const Simd4f&>(instance->mParticles[mParticleIndex]); + mImpulse = reinterpret_cast<const Simd4f&>(instance->mPrevParticles[mParticleIndex]); + + uint32_t key = *kFirst[0]; + + // range of keys we need to check against for this particle + uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask); + uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask); + + // scan forward end point + while(*kLast[0] < lastKey) + ++kLast[0]; + + // process potential colliders of same cell + jEnd = indices + (kLast[0] - keys); + for(jIt = iIt + 1; jIt != jEnd; ++jIt) + collideParticle(*jIt); + + // process neighbor cells + for(uint32_t k = 1; k < 5; ++k) + { + // scan forward start point + for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;) + ++kFirst[k]; + + // scan forward end point + for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;) + ++kLast[k]; + + // process potential colliders + jEnd = indices + (kLast[k] - keys); + for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt) + collideParticle(*jIt); + } + + // write back particle and impulse + reinterpret_cast<Simd4f&>(instance->mParticles[mParticleIndex]) = mParticle; + reinterpret_cast<Simd4f&>(instance->mPrevParticles[mParticleIndex]) = mImpulse; + } +} + +// explicit template instantiation +#if 
NVMATH_SIMD +template class cloth::SwInterCollision<Simd4f>; +#endif +#if NVMATH_SCALAR +template class cloth::SwInterCollision<Scalar4f>; +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h new file mode 100644 index 00000000..ffc62eb1 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#pragma once

#include "Types.h"

#include "StackAllocator.h"

#include "Simd4i.h"

#include "PxMat44.h"
#include "PxTransform.h"
#include "PxBounds3.h"

namespace physx
{
    namespace profile
    {
        class PxProfileZone;
    }
}

namespace nvidia
{
namespace cloth
{

class SwCloth;
struct SwClothData;

// 16-byte-aligned stack allocator used for all per-frame scratch memory
typedef StackAllocator<16> SwKernelAllocator;

// returns false if collision between the two cloths should be skipped
typedef bool (*InterCollisionFilter)(void* cloth0, void* cloth1);

// Per-cloth view handed to SwInterCollision: raw particle arrays, an optional
// index subset, and the cloth's world pose and (local-space) bounds.
struct SwInterCollisionData
{
    SwInterCollisionData()
    {
    }
    SwInterCollisionData(PxVec4* particles, PxVec4* prevParticles, uint32_t numParticles, uint32_t* indices,
                         const PxTransform& globalPose, const PxVec3& boundsCenter, const PxVec3& boundsHalfExtents,
                         float impulseScale, void* userData)
    : mParticles(particles)
    , mPrevParticles(prevParticles)
    , mNumParticles(numParticles)
    , mIndices(indices)
    , mGlobalPose(globalPose)
    , mBoundsCenter(boundsCenter)
    , mBoundsHalfExtent(boundsHalfExtents)
    , mImpulseScale(impulseScale)
    , mUserData(userData)
    {
    }

    PxVec4* mParticles;       // current positions, w = inverse mass
    PxVec4* mPrevParticles;   // previous positions (used to derive impulses)
    uint32_t mNumParticles;   // number of entries considered for collision
    uint32_t* mIndices;       // optional subset of particle indices; NULL = all
    PxTransform mGlobalPose;  // cloth local-to-world transform
    PxVec3 mBoundsCenter;     // particle bounds center (cloth local space)
    PxVec3 mBoundsHalfExtent; // particle bounds half extent (cloth local space)
    float mImpulseScale;      // scales impulses exchanged with this cloth
    void* mUserData;          // passed to the InterCollisionFilter callback
};

// Collides particles of multiple cloth instances against each other using a
// swept spatial hash; implementation in SwInterCollision.cpp (templated on
// the SIMD vector type).
template <typename Simd4f>
class SwInterCollision
{

  public:
    SwInterCollision(const SwInterCollisionData* cloths, uint32_t n, float colDist, float stiffness, uint32_t iterations,
                     InterCollisionFilter filter, cloth::SwKernelAllocator& alloc, nvidia::profile::PxProfileZone* zone);

    ~SwInterCollision();

    // runs the configured number of inter-collision iterations
    void operator()();

    // upper bound on scratch memory operator() will request from the allocator
    static size_t estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n);

  private:
    SwInterCollision& operator=(const SwInterCollision&); // not implemented

    static size_t getBufferSize(uint32_t);

    void collideParticles(const uint32_t* keys, uint32_t firstColumnSize, const uint32_t* sortedIndices,
                          uint32_t numParticles, uint32_t collisionDistance);

    Simd4f& getParticle(uint32_t index);

    // better wrap these in a struct
    void collideParticle(uint32_t index);

    // currently processed particle, cached across the inner collision loop
    Simd4f mParticle;
    Simd4f mImpulse;

    Simd4f mCollisionDistance;
    Simd4f mCollisionSquareDistance;
    Simd4f mStiffness;

    uint16_t mClothIndex;
    uint32_t mClothMask;   // bitmask of cloths overlapping the current cloth
    uint32_t mParticleIndex;

    uint32_t mNumIterations;

    const SwInterCollisionData* mInstances;
    uint32_t mNumInstances;

    // culled particle set built each iteration by the broad phase
    uint16_t* mClothIndices;
    uint32_t* mParticleIndices;
    uint32_t mNumParticles;
    uint32_t* mOverlapMasks;

    uint32_t mTotalParticles;

    InterCollisionFilter mFilter;

    SwKernelAllocator& mAllocator;

    profile::PxProfileZone* mProfiler;

  public:
    mutable uint32_t mNumTests;
    mutable uint32_t mNumCollisions;
};

} // namespace cloth

} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp
new file mode 100644
index 00000000..939543f4
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp
@@ -0,0 +1,404 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#include "SwSelfCollision.h" +#include "SwCloth.h" +#include "SwClothData.h" +#include "PsIntrinsics.h" +#include "SwCollision.h" //temp fix, needed by SwCollisionHelper implementaitons +#include "Simd4f.h" +#include "SwCollisionHelpers.h" + +#pragma warning(disable:4127) + +using namespace nvidia; +using namespace nvidia; + +namespace +{ +typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + +const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON); + +// returns sorted indices, output needs to be at least 2*(last-first)+1024 +void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out) +{ + uint16_t n = uint16_t(last - first); + + uint16_t* buffer = out + 2 * n; + uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 }; + + intrinsics::memZero(buffer, 1024 * sizeof(uint16_t)); + + // build 3 histograms in one pass + for(const uint32_t* __restrict it = first; it != last; ++it) + { + uint32_t key = *it; + ++histograms[0][0xff & key]; + ++histograms[1][0xff & (key >> 8)]; + ++histograms[2][0xff & (key >> 16)]; + ++histograms[3][key >> 24]; + } + + // convert histograms to offset tables in-place + uint16_t sums[4] = {}; + for(uint32_t i = 0; i < 256; ++i) + { + uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]); + histograms[0][i] = sums[0], sums[0] = temp0; + + uint16_t temp1 = uint16_t(histograms[1][i] + sums[1]); + histograms[1][i] = sums[1], sums[1] = temp1; + + uint16_t temp2 = uint16_t(histograms[2][i] + sums[2]); + histograms[2][i] = sums[2], sums[2] = temp2; + + uint16_t temp3 = uint16_t(histograms[3][i] + sums[3]); + histograms[3][i] = sums[3], sums[3] = temp3; + } + + PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n); + +#if PX_DEBUG + memset(out, 0xff, 2 * n * sizeof(uint16_t)); +#endif + + // sort 8 bits per pass + + uint16_t* __restrict indices[] = { out, out + n }; + + for(uint16_t i = 0; i != n; ++i) + 
indices[1][histograms[0][0xff & first[i]]++] = i; + + for(uint16_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index; + + for(uint16_t i = 0, index; index = indices[0][i], i != n; ++i) + indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index; + + for(uint16_t i = 0, index; index = indices[1][i], i != n; ++i) + indices[0][histograms[3][first[index] >> 24]++] = index; +} + +template <typename Simd4f> +uint32_t longestAxis(const Simd4f& edgeLength) +{ + const float* e = array(edgeLength); + + if(e[0] > e[1]) + return uint32_t(e[0] > e[2] ? 0 : 2); + else + return uint32_t(e[1] > e[2] ? 1 : 2); +} + +bool isSelfCollisionEnabled(const cloth::SwClothData& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, cloth.mSelfCollisionStiffness) > 0.0f; +} + +bool isSelfCollisionEnabled(const cloth::SwCloth& cloth) +{ + return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f; +} + +inline uint32_t align2(uint32_t x) +{ + return (x + 1) & ~1; +} + +} // anonymous namespace + +template <typename Simd4f> +cloth::SwSelfCollision<Simd4f>::SwSelfCollision(cloth::SwClothData& clothData, cloth::SwKernelAllocator& alloc) +: mClothData(clothData), mAllocator(alloc) +{ + mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance); + mCollisionSquareDistance = mCollisionDistance * mCollisionDistance; + mStiffness = (Simd4f)sMaskXYZ & simd4f(mClothData.mSelfCollisionStiffness); +} + +template <typename Simd4f> +cloth::SwSelfCollision<Simd4f>::~SwSelfCollision() +{ +} + +template <typename Simd4f> +void cloth::SwSelfCollision<Simd4f>::operator()() +{ + mNumTests = mNumCollisions = 0; + + if(!isSelfCollisionEnabled(mClothData)) + return; + + Simd4f lowerBound = load(mClothData.mCurBounds); + Simd4f edgeLength = max(load(mClothData.mCurBounds + 3) - lowerBound, sEpsilon); + + // sweep along longest axis + uint32_t sweepAxis = longestAxis(edgeLength); + uint32_t hashAxis0 = (sweepAxis + 
1) % 3; + uint32_t hashAxis1 = (sweepAxis + 2) % 3; + + // reserve 0, 127, and 65535 for sentinel + Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength); + array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533; + + Simd4f one = simd4f(_1); + Simd4f gridSize = simd4f(254.0f); + array(gridSize)[sweepAxis] = 65534.0f; + + Simd4f gridScale = recipT<1>(cellSize); + Simd4f gridBias = -lowerBound * gridScale + simd4f(_1); + + uint32_t numIndices = mClothData.mNumSelfCollisionIndices; + void* buffer = mAllocator.allocate(getBufferSize(numIndices)); + + const uint32_t* __restrict indices = mClothData.mSelfCollisionIndices; + uint32_t* __restrict keys = reinterpret_cast<uint32_t*>(buffer); + uint16_t* __restrict sortedIndices = reinterpret_cast<uint16_t*>(keys + numIndices); + uint32_t* __restrict sortedKeys = reinterpret_cast<uint32_t*>(sortedIndices + align2(numIndices)); + + const Simd4f* particles = reinterpret_cast<const Simd4f*>(mClothData.mCurParticles); + + // create keys + for(uint32_t i = 0; i < numIndices; ++i) + { + uint32_t index = indices ? 
indices[i] : i; + + // grid coordinate + Simd4f keyf = particles[index] * gridScale + gridBias; + + // need to clamp index because shape collision potentially + // pushes particles outside of their original bounds + Simd4i keyi = intFloor(max(one, min(keyf, gridSize))); + + const int32_t* ptr = simdi::array(keyi); + keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24)); + } + + // compute sorted keys indices + radixSort(keys, keys + numIndices, sortedIndices); + + // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel) + uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769]; + + // sort keys + for(uint32_t i = 0; i < numIndices; ++i) + sortedKeys[i] = keys[sortedIndices[i]]; + sortedKeys[numIndices] = uint32_t(-1); // sentinel + + if(indices) + { + // sort indices (into no-longer-needed keys array) + const uint16_t* __restrict permutation = sortedIndices; + sortedIndices = reinterpret_cast<uint16_t*>(keys); + for(uint32_t i = 0; i < numIndices; ++i) + sortedIndices[i] = uint16_t(indices[permutation[i]]); + } + + // calculate the number of buckets we need to search forward + const Simd4i data = intFloor(gridScale * mCollisionDistance); + uint32_t collisionDistance = 2 + (uint32_t)simdi::array(data)[sweepAxis]; + + // collide particles + if(mClothData.mRestPositions) + collideParticles<true>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance); + else + collideParticles<false>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance); + + mAllocator.deallocate(buffer); + + // verify against brute force (disable collision response when testing) + /* + uint32_t numCollisions = mNumCollisions; + mNumCollisions = 0; + + Simd4f* qarticles = reinterpret_cast< + Simd4f*>(mClothData.mCurParticles); + for(uint32_t i = 0; i < numIndices; ++i) + { + uint32_t indexI = indices ? indices[i] : i; + for(uint32_t j = i+1; j < numIndices; ++j) + { + uint32_t indexJ = indices ? 
indices[j] : j; + collideParticles(qarticles[indexI], qarticles[indexJ]); + } + } + + static uint32_t iter = 0; ++iter; + if(numCollisions != mNumCollisions) + printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions); + */ +} + +template <typename Simd4f> +size_t cloth::SwSelfCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + uint32_t numIndices = + cloth.mSelfCollisionIndices.empty() ? cloth.mCurParticles.size() : cloth.mSelfCollisionIndices.size(); + return isSelfCollisionEnabled(cloth) ? getBufferSize(numIndices) : 0; +} + +template <typename Simd4f> +size_t cloth::SwSelfCollision<Simd4f>::getBufferSize(uint32_t numIndices) +{ + uint32_t keysSize = numIndices * sizeof(uint32_t); + uint32_t indicesSize = align2(numIndices) * sizeof(uint16_t); + uint32_t radixSize = (numIndices + 1024) * sizeof(uint16_t); + return keysSize + indicesSize + PxMax(radixSize, keysSize + uint32_t(sizeof(uint32_t))); +} + +template <typename Simd4f> +template <bool useRestParticles> +void cloth::SwSelfCollision<Simd4f>::collideParticles(Simd4f& pos0, Simd4f& pos1, const Simd4f& pos0rest, + const Simd4f& pos1rest) +{ + Simd4f diff = pos1 - pos0; + Simd4f distSqr = dot3(diff, diff); + +#if PX_DEBUG + ++mNumTests; +#endif + + if(allGreater(distSqr, mCollisionSquareDistance)) + return; + + if(useRestParticles) + { + // calculate distance in rest configuration, if less than collision + // distance then ignore collision between particles in deformed config + Simd4f restDiff = pos1rest - pos0rest; + Simd4f restDistSqr = dot3(restDiff, restDiff); + + if(allGreater(mCollisionSquareDistance, restDistSqr)) + return; + } + + Simd4f w0 = splat<3>(pos0); + Simd4f w1 = splat<3>(pos1); + + Simd4f ratio = mCollisionDistance * rsqrt(distSqr); + Simd4f scale = mStiffness * recip(sEpsilon + w0 + w1); + Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ; + + pos0 = pos0 + delta * w0; + pos1 = pos1 - delta * w1; + +#if PX_DEBUG || PX_PROFILE + ++mNumCollisions; +#endif +} + 

// Narrow phase over the sorted key array: for each particle, tests candidates
// in the same grid cell and in four neighbor cells (keyOffsets) along the two
// hash axes, scanning forward along the sweep axis by 'collisionDistance'
// cells. kFirst/kLast are five forward-only cursors into the key array.
template <typename Simd4f>
template <bool useRestParticles>
void cloth::SwSelfCollision<Simd4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize,
                                                      const uint16_t* indices, uint32_t collisionDistance)
{
    Simd4f* __restrict particles = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
    // when rest positions are unused, alias them to the current particles so
    // the pairwise test can be compiled without branching on the pointer
    Simd4f* __restrict restParticles =
        useRestParticles ? reinterpret_cast<Simd4f*>(mClothData.mRestPositions) : particles;

    // low 16 key bits hold the sweep-axis cell
    const uint32_t bucketMask = uint16_t(-1);

    // cell offsets: same column, +1/-1/+256/+257 in the two hash axes
    const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };

    const uint32_t* __restrict kFirst[5];
    const uint32_t* __restrict kLast[5];

    {
        // optimization: scan forward iterator starting points once instead of 9 times
        const uint32_t* __restrict kIt = keys;

        uint32_t key = *kIt;
        uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
        uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);

        kFirst[0] = kIt;
        while(*kIt < lastKey)
            ++kIt;
        kLast[0] = kIt;

        for(uint32_t k = 1; k < 5; ++k)
        {
            for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;)
                ++kIt;
            kFirst[k] = kIt;

            for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;)
                ++kIt;
            kLast[k] = kIt;

            // jump forward once to second column
            kIt = keys + firstColumnSize;
            firstColumnSize = 0;
        }
    }

    const uint16_t* __restrict iIt = indices;
    const uint16_t* __restrict iEnd = indices + mClothData.mNumSelfCollisionIndices;

    const uint16_t* __restrict jIt;
    const uint16_t* __restrict jEnd;

    for(; iIt != iEnd; ++iIt, ++kFirst[0])
    {
        PX_ASSERT(*iIt < mClothData.mNumParticles);

        // load current particle once outside of inner loop
        Simd4f particle = particles[*iIt];
        Simd4f restParticle = restParticles[*iIt];

        uint32_t key = *kFirst[0];

        // range of keys we need to check against for this particle
        uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
        uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);

        // scan forward end point
        while(*kLast[0] < lastKey)
            ++kLast[0];

        // process potential colliders of same cell
        jEnd = indices + (kLast[0] - keys);
        for(jIt = iIt + 1; jIt != jEnd; ++jIt)
            collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);

        // process neighbor cells
        for(uint32_t k = 1; k < 5; ++k)
        {
            // scan forward start point
            for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;)
                ++kFirst[k];

            // scan forward end point
            for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;)
                ++kLast[k];

            // process potential colliders
            jEnd = indices + (kLast[k] - keys);
            for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt)
                collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);
        }

        // store current particle
        particles[*iIt] = particle;
    }
}

// explicit template instantiation
#if NVMATH_SIMD
template class cloth::SwSelfCollision<Simd4f>;
#endif
#if NVMATH_SCALAR
template class cloth::SwSelfCollision<Scalar4f>;
#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h
new file mode 100644
index 00000000..fa023e56
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h
@@ -0,0 +1,68 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#pragma once + +#include "Types.h" +#include "StackAllocator.h" +#include "Simd4i.h" + +namespace nvidia +{ + +namespace cloth +{ + +class SwCloth; +struct SwClothData; + +typedef StackAllocator<16> SwKernelAllocator; + +template <typename Simd4f> +class SwSelfCollision +{ + typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i; + + public: + SwSelfCollision(SwClothData& clothData, SwKernelAllocator& alloc); + ~SwSelfCollision(); + + void operator()(); + + static size_t estimateTemporaryMemory(const SwCloth&); + + private: + SwSelfCollision& operator=(const SwSelfCollision&); // not implemented + static size_t getBufferSize(uint32_t); + + template <bool useRestParticles> + void collideParticles(Simd4f&, Simd4f&, const Simd4f&, const Simd4f&); + + template <bool useRestParticles> + void collideParticles(const uint32_t*, uint16_t, const uint16_t*, uint32_t); + + Simd4f mCollisionDistance; + Simd4f mCollisionSquareDistance; + Simd4f mStiffness; + + SwClothData& mClothData; + SwKernelAllocator& mAllocator; + + public: + mutable uint32_t mNumTests; + mutable uint32_t mNumCollisions; +}; + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp new file mode 100644 index 00000000..35cb1bde --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. 
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "SwSolver.h"
#include "SwCloth.h"
#include "ClothImpl.h"
#include "SwFabric.h"
#include "SwFactory.h"
#include "SwClothData.h"
#include "SwSolverKernel.h"
#include "SwInterCollision.h"
#include "IterationState.h"
#include "PxCpuDispatcher.h"
#include "PxProfileZone.h"
#include "PsFPU.h"
#include "PsSort.h"

namespace nvidia
{
namespace cloth
{
// NEON solver entry point, defined elsewhere
bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&, profile::PxProfileZone*);
}
}

// vector type used for inter-collision, depending on build configuration
#if NVMATH_SIMD
typedef Simd4f Simd4fType;
#else
typedef Scalar4f Simd4fType;
#endif

using namespace nvidia;

cloth::SwSolver::SwSolver(nvidia::profile::PxProfileZone* profiler, PxTaskManager* taskMgr)
: mProfiler(profiler)
, mSimulateEventId(mProfiler ? mProfiler->getEventIdForName("cloth::SwSolver::simulate") : uint16_t(-1))
#if APEX_UE4
, mDt(0.0f)
#endif
, mInterCollisionDistance(0.0f)
, mInterCollisionStiffness(1.0f)
, mInterCollisionIterations(1)
, mInterCollisionScratchMem(NULL)
, mInterCollisionScratchMemSize(0)
{
    mStartSimulationTask.mSolver = this;
    mEndSimulationTask.mSolver = this;

    PX_UNUSED(taskMgr);
}

cloth::SwSolver::~SwSolver()
{
    if(mInterCollisionScratchMem)
        PX_FREE(mInterCollisionScratchMem);

    // all cloths must have been removed before the solver is destroyed
    PX_ASSERT(mCpuClothSimulationTasks.empty());
}

namespace
{
// orders simulation tasks by descending particle count (biggest cloth first)
template <typename T>
bool clothSizeGreater(const T& t0, const T& t1)
{
#if APEX_UE4
    return t0->mCloth->mCurParticles.size() > t1->mCloth->mCurParticles.size();
#else
    return t0.mCloth->mCurParticles.size() > t1.mCloth->mCurParticles.size();
#endif
}

template <typename T>
void sortTasks(nvidia::Array<T, nvidia::NonTrackingAllocator>& tasks)
{
    nvidia::sort(tasks.begin(), tasks.size(), &clothSizeGreater<T>);
}
}

// Registers a cloth with the solver; tasks are kept sorted by size so the
// largest cloths are simulated first.
void cloth::SwSolver::addCloth(Cloth* cloth)
{
    SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;

#if APEX_UE4
    mCpuClothSimulationTasks.pushBack(new CpuClothSimulationTask(swCloth, *this));
#else
    mCpuClothSimulationTasks.pushBack(CpuClothSimulationTask(swCloth, mEndSimulationTask));
#endif

    sortTasks(mCpuClothSimulationTasks);
}

// Unregisters a cloth; no-op if the cloth was never added.
void cloth::SwSolver::removeCloth(Cloth* cloth)
{
    SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;

    CpuClothSimulationTaskVector::Iterator tIt = mCpuClothSimulationTasks.begin();
    CpuClothSimulationTaskVector::Iterator tEnd = mCpuClothSimulationTasks.end();

    while (tIt != tEnd &&
#if APEX_UE4
        (*tIt)->mCloth != &swCloth
#else
        tIt->mCloth != &swCloth
#endif
        )
        ++tIt;

    if(tIt != tEnd)
    {
#if APEX_UE4
        delete *tIt;
#else
        deallocate(tIt->mScratchMemory);
#endif
        mCpuClothSimulationTasks.replaceWithLast(tIt);
        sortTasks(mCpuClothSimulationTasks);
    }
}

// Chains the start/end simulation tasks onto 'continuation' and returns the
// task to run; returns 'continuation' directly when there is nothing to do.
PxBaseTask& cloth::SwSolver::simulate(float dt, PxBaseTask& continuation)
{
    if (mCpuClothSimulationTasks.empty()
#if APEX_UE4
        || dt == 0.0f
#endif
        )
    {
        continuation.addReference();
        return continuation;
    }

    mEndSimulationTask.setContinuation(&continuation);
#if APEX_UE4
    mDt = dt;
#else
    mEndSimulationTask.mDt = dt;
#endif

    mStartSimulationTask.setContinuation(&mEndSimulationTask);

    mEndSimulationTask.removeReference();

    return mStartSimulationTask;
}

// Runs inter-cloth collision over all registered cloths (no-op unless both
// an iteration count and a positive collision distance are configured).
void cloth::SwSolver::interCollision()
{
    if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
        return;

    float elasticity = 1.0f;

    // rebuild cloth instance array
    mInterCollisionInstances.resize(0);
    for(uint32_t i = 0; i < mCpuClothSimulationTasks.size(); ++i)
    {
#if APEX_UE4
        SwCloth* c = mCpuClothSimulationTasks[i]->mCloth;
        float invNumIterations = mCpuClothSimulationTasks[i]->mInvNumIterations;
#else
        SwCloth* c = mCpuClothSimulationTasks[i].mCloth;
        float invNumIterations = mCpuClothSimulationTasks[i].mInvNumIterations;
#endif

        // use the self-collision index subset when present, all particles otherwise
        mInterCollisionInstances.pushBack(SwInterCollisionData(
            c->mCurParticles.begin(), c->mPrevParticles.begin(),
            c->mSelfCollisionIndices.empty() ? c->mCurParticles.size() : c->mSelfCollisionIndices.size(),
            c->mSelfCollisionIndices.empty() ? NULL : &c->mSelfCollisionIndices[0], c->mTargetMotion,
            c->mParticleBoundsCenter, c->mParticleBoundsHalfExtent, elasticity * invNumIterations, c->mUserData));
    }

    const uint32_t requiredTempMemorySize = uint32_t(SwInterCollision<Simd4fType>::estimateTemporaryMemory(
        &mInterCollisionInstances[0], mInterCollisionInstances.size()));

    // realloc temp memory if necessary (grow-only cache)
    if(mInterCollisionScratchMemSize < requiredTempMemorySize)
    {
        if(mInterCollisionScratchMem)
            PX_FREE(mInterCollisionScratchMem);

        mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem");
        mInterCollisionScratchMemSize = requiredTempMemorySize;
    }

    SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize);

    // run inter-collision
    SwInterCollision<Simd4fType> collider(mInterCollisionInstances.begin(), mInterCollisionInstances.size(),
                                          mInterCollisionDistance, mInterCollisionStiffness, mInterCollisionIterations,
                                          mInterCollisionFilter, allocator, mProfiler);

    collider();
}

void cloth::SwSolver::beginFrame() const
{
    if(mProfiler)
        mProfiler->startEvent(mSimulateEventId, uint64_t(intptr_t(this)), uint32_t(intptr_t(this)));
}

void cloth::SwSolver::endFrame() const
{
    if(mProfiler)
        mProfiler->stopEvent(mSimulateEventId, uint64_t(intptr_t(this)), uint32_t(intptr_t(this)));
}

#if APEX_UE4
void cloth::SwSolver::simulate(void* task, float dt)
{
    if (task)
        static_cast<cloth::SwSolver::CpuClothSimulationTask*>(task)->simulate(dt);
}
#endif

void cloth::SwSolver::StartSimulationTask::runInternal()
{
    mSolver->beginFrame();

    CpuClothSimulationTaskVector::Iterator tIt = mSolver->mCpuClothSimulationTasks.begin();
    CpuClothSimulationTaskVector::Iterator tEnd = mSolver->mCpuClothSimulationTasks.end();

    for(; tIt != tEnd; ++tIt)
    {
#if APEX_UE4
+ if (!(*tIt)->mCloth->isSleeping()) + { + (*tIt)->setContinuation(mCont); + (*tIt)->removeReference(); + } +#else + if(!tIt->mCloth->isSleeping()) + { + tIt->setContinuation(mCont); + tIt->removeReference(); + } +#endif + } +} + +const char* cloth::SwSolver::StartSimulationTask::getName() const +{ + return "cloth.SwSolver.startSimulation"; +} + +void cloth::SwSolver::EndSimulationTask::runInternal() +{ + mSolver->interCollision(); + mSolver->endFrame(); +} + +const char* cloth::SwSolver::EndSimulationTask::getName() const +{ + return "cloth.SwSolver.endSimulation"; +} + +#if !APEX_UE4 +cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, EndSimulationTask& continuation) +: mCloth(&cloth), mContinuation(&continuation), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f) +{ +} +#endif + +#if APEX_UE4 +cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, SwSolver& solver) + : mCloth(&cloth), mSolver(&solver), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f) +{ + mCloth->mSimulationTask = this; +} + +cloth::SwSolver::CpuClothSimulationTask::~CpuClothSimulationTask() +{ + deallocate(mScratchMemory); + mCloth->mSimulationTask = NULL; +} + +void cloth::SwSolver::CpuClothSimulationTask::runInternal() +{ + simulate(mSolver->mDt); +} + + +void cloth::SwSolver::CpuClothSimulationTask::simulate(float dt) +{ + // check if we need to reallocate the temp memory buffer + // (number of shapes may have changed) + uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth)); + + if (mScratchMemorySize < requiredTempMemorySize) + { + deallocate(mScratchMemory); + + mScratchMemory = allocate(requiredTempMemorySize); + mScratchMemorySize = requiredTempMemorySize; + } + + IterationStateFactory factory(*mCloth, dt); + mInvNumIterations = factory.mInvNumIterations; + + nvidia::SIMDGuard simdGuard; + + SwClothData data(*mCloth, mCloth->mFabric); + 
SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize)); + nvidia::profile::PxProfileZone* profileZone = mSolver->mProfiler; + + // construct kernel functor and execute +#if PX_ANDROID + // if(!neonSolverKernel(cloth, data, allocator, factory, profileZone)) +#endif + SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory, profileZone)(); + + data.reconcile(*mCloth); // update cloth + + release(); +} + +#else + +void cloth::SwSolver::CpuClothSimulationTask::runInternal() +{ + // check if we need to reallocate the temp memory buffer + // (number of shapes may have changed) + uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth)); + + if(mScratchMemorySize < requiredTempMemorySize) + { + deallocate(mScratchMemory); + + mScratchMemory = allocate(requiredTempMemorySize); + mScratchMemorySize = requiredTempMemorySize; + } + + if(mContinuation->mDt == 0.0f) + return; + + IterationStateFactory factory(*mCloth, mContinuation->mDt); + mInvNumIterations = factory.mInvNumIterations; + + nvidia::SIMDGuard simdGuard; + + SwClothData data(*mCloth, mCloth->mFabric); + SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize)); + nvidia::profile::PxProfileZone* profileZone = mContinuation->mSolver->mProfiler; + + // construct kernel functor and execute +#if PX_ANDROID + // if(!neonSolverKernel(cloth, data, allocator, factory, profileZone)) +#endif + SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory, profileZone)(); + + data.reconcile(*mCloth); // update cloth +} +#endif + +const char* cloth::SwSolver::CpuClothSimulationTask::getName() const +{ + return "cloth.SwSolver.cpuClothSimulation"; +} + +void cloth::SwSolver::CpuClothSimulationTask::release() +{ + mCloth->mMotionConstraints.pop(); + mCloth->mSeparationConstraints.pop(); + + if (!mCloth->mTargetCollisionSpheres.empty()) + { + swap(mCloth->mStartCollisionSpheres, mCloth->mTargetCollisionSpheres); + 
mCloth->mTargetCollisionSpheres.resize(0); + } + + if (!mCloth->mTargetCollisionPlanes.empty()) + { + swap(mCloth->mStartCollisionPlanes, mCloth->mTargetCollisionPlanes); + mCloth->mTargetCollisionPlanes.resize(0); + } + + if (!mCloth->mTargetCollisionTriangles.empty()) + { + swap(mCloth->mStartCollisionTriangles, mCloth->mTargetCollisionTriangles); + mCloth->mTargetCollisionTriangles.resize(0); + } +#if !APEX_UE4 + mContinuation->removeReference(); +#endif +} + +#if APEX_UE4 +void(*const cloth::SwCloth::sSimulationFunction)(void*, float) = &cloth::SwSolver::simulate; +#endif
\ No newline at end of file diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h new file mode 100644 index 00000000..472a5dba --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "Solver.h" +#include "Allocator.h" +#include "SwInterCollision.h" +#include "CmTask.h" + +namespace nvidia +{ +namespace cloth +{ + +class SwCloth; +class SwFactory; + +/// CPU/SSE based cloth solver +class SwSolver : public UserAllocated, public Solver +{ + struct StartSimulationTask : public Cm::Task + { + using PxLightCpuTask::mRefCount; + using PxLightCpuTask::mTm; + + virtual void runInternal(); + virtual const char* getName() const; + + SwSolver* mSolver; + }; + + struct EndSimulationTask : public Cm::Task + { + using PxLightCpuTask::mRefCount; + + virtual void runInternal(); + virtual const char* getName() const; + + SwSolver* mSolver; +#if !APEX_UE4 + float mDt; +#endif + }; + + struct CpuClothSimulationTask : public Cm::Task + { +#if APEX_UE4 + void* operator new(size_t n){ return allocate(n); } + void operator delete(void* ptr) { return deallocate(ptr); } + + CpuClothSimulationTask(SwCloth&, SwSolver&); + ~CpuClothSimulationTask(); + + void simulate(float dt); + + SwSolver* mSolver; +#else + CpuClothSimulationTask(SwCloth&, 
EndSimulationTask&); + + EndSimulationTask* mContinuation; +#endif + virtual void runInternal(); + virtual const char* getName() const; + virtual void release(); + + SwCloth* mCloth; + + uint32_t mScratchMemorySize; + void* mScratchMemory; + float mInvNumIterations; + }; + + public: + SwSolver(nvidia::profile::PxProfileZone*, PxTaskManager*); + virtual ~SwSolver(); + + virtual void addCloth(Cloth*); + virtual void removeCloth(Cloth*); + + virtual PxBaseTask& simulate(float dt, PxBaseTask&); + + virtual void setInterCollisionDistance(float distance) + { + mInterCollisionDistance = distance; + } + virtual float getInterCollisionDistance() const + { + return mInterCollisionDistance; + } + + virtual void setInterCollisionStiffness(float stiffness) + { + mInterCollisionStiffness = stiffness; + } + virtual float getInterCollisionStiffness() const + { + return mInterCollisionStiffness; + } + + virtual void setInterCollisionNbIterations(uint32_t nbIterations) + { + mInterCollisionIterations = nbIterations; + } + virtual uint32_t getInterCollisionNbIterations() const + { + return mInterCollisionIterations; + } + + virtual void setInterCollisionFilter(InterCollisionFilter filter) + { + mInterCollisionFilter = filter; + } + + virtual uint32_t getNumSharedPositions( const Cloth* ) const + { + return uint32_t(-1); + } + + virtual bool hasError() const + { + return false; + } + +#if APEX_UE4 + static void simulate(void*, float); +#endif + + private: + void beginFrame() const; + void endFrame() const; + + void interCollision(); + + private: + StartSimulationTask mStartSimulationTask; + +#if APEX_UE4 + typedef Vector<CpuClothSimulationTask*>::Type CpuClothSimulationTaskVector; + float mDt; +#else + typedef Vector<CpuClothSimulationTask>::Type CpuClothSimulationTaskVector; +#endif + + CpuClothSimulationTaskVector mCpuClothSimulationTasks; + + EndSimulationTask mEndSimulationTask; + + profile::PxProfileZone* mProfiler; + uint16_t mSimulateEventId; + + float mInterCollisionDistance; 
+ float mInterCollisionStiffness; + uint32_t mInterCollisionIterations; + InterCollisionFilter mInterCollisionFilter; + + void* mInterCollisionScratchMem; + uint32_t mInterCollisionScratchMemSize; + nvidia::Array<SwInterCollisionData> mInterCollisionInstances; + +}; +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp new file mode 100644 index 00000000..29f3fdc3 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#include "SwSolverKernel.h" +#include "SwCloth.h" +#include "SwClothData.h" +#include "SwFabric.h" +#include "SwFactory.h" +#include "PointInterpolator.h" +#include "BoundingBox.h" +#include "Simd4i.h" + +#if defined(_MSC_VER) && _MSC_VER >= 1600 && PX_WINDOWS_FAMILY +#define PX_AVX 1 + +namespace avx +{ +// defined in SwSolveConstraints.cpp + +void initialize(); + +template <bool, uint32_t> +void solveConstraints(float* __restrict, const float* __restrict, const float* __restrict, const uint16_t* __restrict, + const __m128&); +} + +namespace +{ +uint32_t getAvxSupport() +{ +// Checking for AVX requires 3 things: +// 1) CPUID indicates that the OS uses XSAVE and XRSTORE +// 2) CPUID indicates support for AVX +// 3) XGETBV indicates registers are saved and restored on context switch + +#if _MSC_FULL_VER < 160040219 || !defined(_XCR_XFEATURE_ENABLED_MASK) + // need at least VC10 SP1 and compile on at least Win7 SP1 + return 0; +#else + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + int avxFlags = 3 << 27; // checking 1) and 2) above + if((cpuInfo[2] & avxFlags) != avxFlags) + return 0; // xgetbv not enabled or no AVX support + + if((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) != 0x6) + return 0; // OS does not save YMM registers + + avx::initialize(); + +#if _MSC_VER < 1700 + return 1; +#else + int fmaFlags = 1 << 12; + if((cpuInfo[2] & fmaFlags) != fmaFlags) + return 1; // no FMA3 support + + /* only using fma at the moment, don't lock out AMD's piledriver by requiring avx2 + __cpuid(cpuInfo, 7); + int avx2Flags = 1 << 5; + if((cpuInfo[1] & avx2Flags) != avx2Flags) + return 1; // no AVX2 support + */ + + return 2; +#endif // _MSC_VER +#endif // _MSC_FULL_VER +} + +const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA +} +#endif + +using namespace nvidia; + +namespace +{ +/* simd constants */ + +typedef Simd4fFactory<detail::FourTuple> Simd4fConstant; + +const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0)); +const Simd4fConstant sMaskXY = 
simd4f(simd4i(~0, ~0, 0, 0)); +const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); +const Simd4fConstant sMaskYZW = simd4f(simd4i(0, ~0, ~0, ~0)); +const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON); +const Simd4fConstant sMinusOneXYZOneW = simd4f(-1.0f, -1.0f, -1.0f, 1.0f); +const Simd4fConstant sFloatMaxW = simd4f(0.0f, 0.0f, 0.0f, FLT_MAX); +const Simd4fConstant sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f); + +/* static worker functions */ + +/** + This function performs explicit Euler integration based on position, where + x_next = x_cur + (x_cur - x_prev) * dt_cur/dt_prev * damping + g * dt * dt + The g * dt * dt term is folded into accelIt. + */ + +template <typename Simd4f, typename AccelerationIterator> +void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt, Simd4f scale, + const AccelerationIterator& aIt, const Simd4f& prevBias) +{ + // local copy to avoid LHS + AccelerationIterator accelIt(aIt); + + for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt) + { + Simd4f current = *curIt; + Simd4f previous = *prevIt; + // if(current.w == 0) current.w = previous.w + current = select(current > sMinusFloatMaxXYZ, current, previous); + Simd4f finiteMass = splat<3>(previous) > sFloatMaxW; + Simd4f delta = (current - previous) * scale + *accelIt; + *curIt = current + (delta & finiteMass); + *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass); + } +} + +template <typename Simd4f, typename AccelerationIterator> +void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt, + const Simd4f (&prevMatrix)[3], const Simd4f (&curMatrix)[3], const AccelerationIterator& aIt, + const Simd4f& prevBias) +{ + // local copy to avoid LHS + AccelerationIterator accelIt(aIt); + + for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt) + { + Simd4f current = *curIt; + Simd4f previous = *prevIt; + // if(current.w == 0) current.w = previous.w + current = 
select(current > sMinusFloatMaxXYZ, current, previous); + Simd4f finiteMass = splat<3>(previous) > sFloatMaxW; + // curMatrix*current + prevMatrix*previous + accel + Simd4f delta = cloth::transform(curMatrix, cloth::transform(prevMatrix, *accelIt, previous), current); + *curIt = current + (delta & finiteMass); + *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass); + } +} + +template <typename Simd4f, typename ConstraintIterator> +void constrainMotion(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres, + Simd4f scaleBiasStiffness) +{ + Simd4f scale = splat<0>(scaleBiasStiffness); + Simd4f bias = splat<1>(scaleBiasStiffness); + Simd4f stiffness = splat<3>(scaleBiasStiffness); + + // local copy of iterator to maintain alignment + ConstraintIterator sphIt = spheres; + + for(; curIt < curEnd; curIt += 4) + { + // todo: use msub where available + Simd4f curPos0 = curIt[0]; + Simd4f curPos1 = curIt[1]; + Simd4f curPos2 = curIt[2]; + Simd4f curPos3 = curIt[3]; + + Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0); + ++sphIt; + Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1); + ++sphIt; + Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2); + ++sphIt; + Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3); + ++sphIt; + + Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3; + transpose(deltaX, deltaY, deltaZ, deltaW); + + Simd4f sqrLength = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + Simd4f radius = max(simd4f(_0), deltaW * scale + bias); + + Simd4f slack = simd4f(_1) - radius * rsqrt(sqrLength); + + // if slack <= 0.0f then we don't want to affect particle + // and can skip if all particles are unaffected + Simd4f isPositive; + if(anyGreater(slack, simd4f(_0), isPositive)) + { + // set invMass to zero if radius is zero + curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ); + curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ); + curPos2 = curPos2 & (splat<2>(radius) > 
sMinusFloatMaxXYZ); + curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ); + + slack = slack * stiffness & isPositive; + + curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack); + curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack); + curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack); + curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack); + } + } +} + +template <typename Simd4f, typename ConstraintIterator> +void constrainSeparation(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres) +{ + // local copy of iterator to maintain alignment + ConstraintIterator sphIt = spheres; + + for(; curIt < curEnd; curIt += 4) + { + // todo: use msub where available + Simd4f curPos0 = curIt[0]; + Simd4f curPos1 = curIt[1]; + Simd4f curPos2 = curIt[2]; + Simd4f curPos3 = curIt[3]; + + Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0); + ++sphIt; + Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1); + ++sphIt; + Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2); + ++sphIt; + Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3); + ++sphIt; + + Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3; + transpose(deltaX, deltaY, deltaZ, deltaW); + + Simd4f sqrLength = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ; + + Simd4f slack = simd4f(_1) - deltaW * rsqrtT<1>(sqrLength); + + // if slack >= 0.0f then we don't want to affect particle + // and can skip if all particles are unaffected + Simd4f isNegative; + if(anyGreater(simd4f(_0), slack, isNegative)) + { + slack = slack & isNegative; + + curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack); + curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack); + curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack); + curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack); + } + } +} + +/** + traditional gauss-seidel internal constraint solver + */ +template <bool useMultiplier, typename Simd4f> +void solveConstraints(float* __restrict posIt, 
const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, Simd4f stiffness) +{ + Simd4f stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = splat<3>(stiffness); + compressionLimit = splat<2>(stiffness); + multiplier = splat<1>(stiffness); + } + stiffness = splat<0>(stiffness); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + uint32_t p0i = iIt[0] * sizeof(PxVec4); + uint32_t p0j = iIt[1] * sizeof(PxVec4); + uint32_t p1i = iIt[2] * sizeof(PxVec4); + uint32_t p1j = iIt[3] * sizeof(PxVec4); + uint32_t p2i = iIt[4] * sizeof(PxVec4); + uint32_t p2j = iIt[5] * sizeof(PxVec4); + uint32_t p3i = iIt[6] * sizeof(PxVec4); + uint32_t p3j = iIt[7] * sizeof(PxVec4); + + Simd4f v0i = loadAligned(posIt, p0i); + Simd4f v0j = loadAligned(posIt, p0j); + Simd4f v1i = loadAligned(posIt, p1i); + Simd4f v1j = loadAligned(posIt, p1j); + Simd4f v2i = loadAligned(posIt, p2i); + Simd4f v2j = loadAligned(posIt, p2j); + Simd4f v3i = loadAligned(posIt, p3i); + Simd4f v3j = loadAligned(posIt, p3j); + + Simd4f h0ij = v0j + v0i * sMinusOneXYZOneW; + Simd4f h1ij = v1j + v1i * sMinusOneXYZOneW; + Simd4f h2ij = v2j + v2i * sMinusOneXYZOneW; + Simd4f h3ij = v3j + v3i * sMinusOneXYZOneW; + + Simd4f hxij = h0ij, hyij = h1ij, hzij = h2ij, vwij = h3ij; + transpose(hxij, hyij, hzij, vwij); + + Simd4f rij = loadAligned(rIt); + Simd4f e2ij = sEpsilon + hxij * hxij + hyij * hyij + hzij * hzij; + Simd4f erij = (simd4f(_1) - rij * rsqrt(e2ij)) & (rij > sEpsilon); // add parentheses for wiiu + + if(useMultiplier) + { + erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit)); + } + Simd4f exij = erij * stiffness * recip(sEpsilon + vwij); + + h0ij = h0ij * splat<0>(exij) & sMaskXYZ; + h1ij = h1ij * splat<1>(exij) & sMaskXYZ; + h2ij = h2ij * splat<2>(exij) & sMaskXYZ; + h3ij = h3ij * splat<3>(exij) & sMaskXYZ; + + storeAligned(posIt, p0i, v0i + h0ij * splat<3>(v0i)); + storeAligned(posIt, p0j, v0j - h0ij * splat<3>(v0j)); + 
storeAligned(posIt, p1i, v1i + h1ij * splat<3>(v1i)); + storeAligned(posIt, p1j, v1j - h1ij * splat<3>(v1j)); + storeAligned(posIt, p2i, v2i + h2ij * splat<3>(v2i)); + storeAligned(posIt, p2j, v2j - h2ij * splat<3>(v2j)); + storeAligned(posIt, p3i, v3i + h3ij * splat<3>(v3i)); + storeAligned(posIt, p3j, v3j - h3ij * splat<3>(v3j)); + } +} + +#if PX_WINDOWS_FAMILY +#include "sse2/SwSolveConstraints.h" +#endif + +// calculates upper bound of all position deltas +template <typename Simd4f> +Simd4f calculateMaxDelta(const Simd4f* prevIt, const Simd4f* curIt, const Simd4f* curEnd) +{ + Simd4f maxDelta(simd4f(_0)); + for(; curIt < curEnd; ++curIt, ++prevIt) + maxDelta = max(maxDelta, abs(*curIt - *prevIt)); + + return maxDelta & sMaskXYZ; +} + +} // anonymous namespace + +template <typename Simd4f> +cloth::SwSolverKernel<Simd4f>::SwSolverKernel(SwCloth const& cloth, SwClothData& clothData, SwKernelAllocator& allocator, + IterationStateFactory& factory, profile::PxProfileZone* profiler) +: mCloth(cloth) +, mClothData(clothData) +, mAllocator(allocator) +, mCollision(clothData, allocator, profiler) +, mSelfCollision(clothData, allocator) +, mState(factory.create<Simd4f>(cloth)) +, mProfiler(profiler) +{ + mClothData.verify(); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::operator()() +{ + simulateCloth(); +} + +template <typename Simd4f> +size_t cloth::SwSolverKernel<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth) +{ + size_t collisionTempMemory = SwCollision<Simd4f>::estimateTemporaryMemory(cloth); + size_t selfCollisionTempMemory = SwSelfCollision<Simd4f>::estimateTemporaryMemory(cloth); + + size_t tempMemory = PxMax(collisionTempMemory, selfCollisionTempMemory); + size_t persistentMemory = SwCollision<Simd4f>::estimatePersistentMemory(cloth); + + // account for any allocator overhead (this could be exposed in the allocator) + size_t maxAllocs = 32; + size_t maxPerAllocationOverhead = 32; + size_t maxAllocatorOverhead = maxAllocs * 
maxPerAllocationOverhead; + + return maxAllocatorOverhead + persistentMemory + tempMemory; +} + +template <typename Simd4f> +template <typename AccelerationIterator> +void cloth::SwSolverKernel<Simd4f>::integrateParticles(AccelerationIterator& accelIt, const Simd4f& prevBias) +{ + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + + if(!mState.mIsTurning) + ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix[0], accelIt, prevBias); + else + ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix, mState.mCurMatrix, accelIt, prevBias); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::integrateParticles() +{ + ProfileZone zone("cloth::SwSolverKernel::integrateParticles", mProfiler); + + const Simd4f* startAccelIt = reinterpret_cast<const Simd4f*>(mClothData.mParticleAccelerations); + + // dt^2 (todo: should this be the smoothed dt used for gravity?) 
+ const Simd4f sqrIterDt = simd4f(sqr(mState.mIterDt)) & (Simd4f)sMaskXYZ; + + if(!startAccelIt) + { + // no per-particle accelerations, use a constant + ConstantIterator<Simd4f> accelIt(mState.mCurBias); + integrateParticles(accelIt, mState.mPrevBias); + } + else + { + // iterator implicitly scales by dt^2 and adds gravity + ScaleBiasIterator<Simd4f, const Simd4f*> accelIt(startAccelIt, sqrIterDt, mState.mCurBias); + integrateParticles(accelIt, mState.mPrevBias); + } + + zone.setValue(mState.mIsTurning); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainTether() +{ + if(0.0f == mClothData.mTetherConstraintStiffness || !mClothData.mNumTethers) + return; + +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::solveTethers", mProfiler); +#endif + + uint32_t numParticles = mClothData.mNumParticles; + uint32_t numTethers = mClothData.mNumTethers; + PX_ASSERT(0 == numTethers % numParticles); + + float* __restrict curIt = mClothData.mCurParticles; + const float* __restrict curFirst = curIt; + const float* __restrict curEnd = curIt + 4 * numParticles; + + typedef const SwTether* __restrict TetherIter; + TetherIter tFirst = mClothData.mTethers; + TetherIter tEnd = tFirst + numTethers; + + Simd4f stiffness = (Simd4f)sMaskXYZ & simd4f(numParticles * mClothData.mTetherConstraintStiffness / numTethers); + Simd4f scale = simd4f(mClothData.mTetherConstraintScale); + + for(; curIt != curEnd; curIt += 4, ++tFirst) + { + Simd4f position = loadAligned(curIt); + Simd4f offset = simd4f(_0); + + for(TetherIter tIt = tFirst; tIt < tEnd; tIt += numParticles) + { + PX_ASSERT(tIt->mAnchor < numParticles); + Simd4f anchor = loadAligned(curFirst, tIt->mAnchor * sizeof(PxVec4)); + Simd4f delta = anchor - position; + Simd4f sqrLength = sEpsilon + dot3(delta, delta); + + Simd4f tetherLength = load(&tIt->mLength); + tetherLength = splat<0>(tetherLength); + + Simd4f radius = tetherLength * scale; + Simd4f slack = simd4f(_1) - radius * rsqrt(sqrLength); + + offset 
= offset + delta * max(slack, simd4f(_0)); + } + + storeAligned(curIt, position + offset * stiffness); + } +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::solveFabric() +{ + ProfileZone zone("cloth::SwSolverKernel::solveFabric", mProfiler); + + float* pIt = mClothData.mCurParticles; + + const PhaseConfig* cIt = mClothData.mConfigBegin; + const PhaseConfig* cEnd = mClothData.mConfigEnd; + + const uint32_t* pBegin = mClothData.mPhases; + const float* rBegin = mClothData.mRestvalues; + + const uint32_t* sBegin = mClothData.mSets; + const uint16_t* iBegin = mClothData.mIndices; + + uint32_t totalConstraints = 0; + + Simd4f stiffnessExponent = simd4f(mCloth.mStiffnessFrequency * mState.mIterDt); + + for(; cIt != cEnd; ++cIt) + { + const uint32_t* sIt = sBegin + pBegin[cIt->mPhaseIndex]; + const float* rIt = rBegin + sIt[0]; + const float* rEnd = rBegin + sIt[1]; + const uint16_t* iIt = iBegin + sIt[0] * 2; + + totalConstraints += uint32_t(rEnd - rIt); + + // (stiffness, multiplier, compressionLimit, stretchLimit) + Simd4f config = load(&cIt->mStiffness); + // stiffness specified as fraction of constraint error per-millisecond + Simd4f scaledConfig = simd4f(_1) - simdf::exp2(config * stiffnessExponent); + Simd4f stiffness = select(sMaskXY, scaledConfig, config); + + int neutralMultiplier = allEqual(sMaskYZW & stiffness, simd4f(_0)); + +#if PX_AVX + switch(sAvxSupport) + { + case 2: +#if _MSC_VER >= 1700 + neutralMultiplier ? avx::solveConstraints<false, 2>(pIt, rIt, rEnd, iIt, stiffness) + : avx::solveConstraints<true, 2>(pIt, rIt, rEnd, iIt, stiffness); + break; +#endif + case 1: + neutralMultiplier ? avx::solveConstraints<false, 1>(pIt, rIt, rEnd, iIt, stiffness) + : avx::solveConstraints<true, 1>(pIt, rIt, rEnd, iIt, stiffness); + break; + default: +#endif + neutralMultiplier ? 
solveConstraints<false>(pIt, rIt, rEnd, iIt, stiffness) + : solveConstraints<true>(pIt, rIt, rEnd, iIt, stiffness); +#if PX_AVX + break; + } +#endif + } + + zone.setValue(totalConstraints); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainMotion() +{ + if(!mClothData.mStartMotionConstraints) + return; + +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::constrainMotion", mProfiler); +#endif + + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + + const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartMotionConstraints); + const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetMotionConstraints); + + Simd4f scaleBias = load(&mCloth.mMotionConstraintScale); + Simd4f stiffness = simd4f(mClothData.mMotionConstraintStiffness); + Simd4f scaleBiasStiffness = select(sMaskXYZ, scaleBias, stiffness); + + if(!mClothData.mTargetMotionConstraints) + // no interpolation, use the start positions + return ::constrainMotion(curIt, curEnd, startIt, scaleBiasStiffness); + + if(mState.mRemainingIterations == 1) + // use the target positions on last iteration + return ::constrainMotion(curIt, curEnd, targetIt, scaleBiasStiffness); + + // otherwise use an interpolating iterator + LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha()); + ::constrainMotion(curIt, curEnd, interpolator, scaleBiasStiffness); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::constrainSeparation() +{ + if(!mClothData.mStartSeparationConstraints) + return; + +#if PX_PROFILE + ProfileZone zone("cloth::SwSolverKernel::constrainSeparation", mProfiler); +#endif + + Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + Simd4f* curEnd = curIt + mClothData.mNumParticles; + + const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartSeparationConstraints); + const Simd4f* targetIt = 
reinterpret_cast<const Simd4f*>(mClothData.mTargetSeparationConstraints); + + if(!mClothData.mTargetSeparationConstraints) + // no interpolation, use the start positions + return ::constrainSeparation(curIt, curEnd, startIt); + + if(mState.mRemainingIterations == 1) + // use the target positions on last iteration + return ::constrainSeparation(curIt, curEnd, targetIt); + + // otherwise use an interpolating iterator + LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha()); + ::constrainSeparation(curIt, curEnd, interpolator); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::collideParticles() +{ + ProfileZone zone("cloth::SwSolverKernel::collideParticles", mProfiler); + + mCollision(mState); + + zone.setValue(mCollision.mNumCollisions); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::selfCollideParticles() +{ + ProfileZone zone("cloth::SwSolverKernel::selfCollideParticles", mProfiler); + + mSelfCollision(); + + zone.setValue(mSelfCollision.mNumCollisions); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::updateSleepState() +{ + ProfileZone zone("cloth::SwSolverKernel::updateSleepState", mProfiler); + + mClothData.mSleepTestCounter += PxMax(1u, uint32_t(mState.mIterDt * 1000)); + if(mClothData.mSleepTestCounter >= mCloth.mSleepTestInterval) + { + const Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles); + const Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles); + const Simd4f* curEnd = curIt + mClothData.mNumParticles; + + // calculate max particle delta since last iteration + Simd4f maxDelta = calculateMaxDelta(prevIt, curIt, curEnd); + + ++mClothData.mSleepPassCounter; + Simd4f threshold = simd4f(mCloth.mSleepThreshold * mState.mIterDt); + if(anyGreaterEqual(maxDelta, threshold)) + mClothData.mSleepPassCounter = 0; + + mClothData.mSleepTestCounter -= mCloth.mSleepTestInterval; + } + + 
zone.setValue(mClothData.mSleepPassCounter); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::iterateCloth() +{ + // note on invMass (stored in current/previous positions.w): + // integrateParticles() + // - if(current.w == 0) current.w = previous.w + // constraintMotion() + // - if(constraint.radius <= 0) current.w = 0 + // computeBounds() + // - if(current.w > 0) current.w = previous.w + // collideParticles() + // - if(collides) current.w *= 1/massScale + // after simulate() + // - previous.w: original invMass as set by user + // - current.w: zeroed by motion constraints and mass-scaled by collision + + // integrate positions + integrateParticles(); + + // motion constraints + constrainMotion(); + + // solve tether constraints + constrainTether(); + + // solve edge constraints + solveFabric(); + + // separation constraints + constrainSeparation(); + + // perform character collision + collideParticles(); + + // perform self collision + selfCollideParticles(); + + // test wake / sleep conditions + updateSleepState(); +} + +template <typename Simd4f> +void cloth::SwSolverKernel<Simd4f>::simulateCloth() +{ + while(mState.mRemainingIterations) + { + iterateCloth(); + mState.update(); + } +} + +// explicit template instantiation +#if NVMATH_SIMD +template class cloth::SwSolverKernel<Simd4f>; +#endif +#if NVMATH_SCALAR +template class cloth::SwSolverKernel<Scalar4f>; +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h new file mode 100644 index 00000000..26b45a88 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#include "IterationState.h" +#include "SwCollision.h" +#include "SwSelfCollision.h" + +namespace nvidia +{ +namespace cloth +{ + +class SwCloth; +struct SwClothData; + +template <typename Simd4f> +class SwSolverKernel +{ + public: + SwSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&, nvidia::profile::PxProfileZone*); + + void operator()(); + + // returns a conservative estimate of the + // total memory requirements during a solve + static size_t estimateTemporaryMemory(const SwCloth& c); + + private: + void integrateParticles(); + void constrainTether(); + void solveFabric(); + void constrainMotion(); + void constrainSeparation(); + void collideParticles(); + void selfCollideParticles(); + void updateSleepState(); + + void iterateCloth(); + void simulateCloth(); + + SwCloth const& mCloth; + SwClothData& mClothData; + SwKernelAllocator& mAllocator; + + SwCollision<Simd4f> mCollision; + SwSelfCollision<Simd4f> mSelfCollision; + IterationState<Simd4f> mState; + + profile::PxProfileZone* mProfiler; + + private: + SwSolverKernel<Simd4f>& operator=(const SwSolverKernel<Simd4f>&); + template <typename AccelerationIterator> + void integrateParticles(AccelerationIterator& accelIt, const Simd4f&); +}; +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp new file mode 100644 index 00000000..d077624e --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
+ * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#include "TripletScheduler.h" +#include "PxMath.h" +#include "PsFPU.h" +#include "PxMat33.h" +#include "PsVecMath.h" +#include "PsUtilities.h" + +using namespace nvidia; +using namespace physx::shdfnd::aos; + +cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets) +: mTriplets(reinterpret_cast<const Vec4u*>(triplets.begin()), reinterpret_cast<const Vec4u*>(triplets.end())) +{ +} + +// SSE version +void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth) +{ + if(mTriplets.empty()) + return; + + Vector<uint32_t>::Type mark(numParticles, uint32_t(-1)); + + uint32_t setIndex = 0, setSize = 0; + for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex) + { + TripletIter tLast = tIt + PxMin(simdWidth, uint32_t(tEnd - tIt)); + TripletIter tSwap = tEnd; + + for(; tIt != tLast && tIt != tSwap; ++tIt, ++setSize) + { + // swap from tail until independent triplet found + while((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap) + swap(*tIt, *tSwap); + + if(tIt == tSwap) + break; // no independent triplet found + + // mark vertices to be used in simdIndex + mark[tIt->x] = setIndex; + mark[tIt->y] = setIndex; + mark[tIt->z] = setIndex; + } + + if(tIt == tSwap) // remaining triplets depend on current set + { + if(setSize > simdWidth) // trim set to multiple of simdWidth + { + uint32_t overflow = setSize % simdWidth; + setSize -= overflow; + tIt -= 
overflow; + } + mSetSizes.pushBack(setSize); + setSize = 0; + } + } +} + +namespace +{ +struct TripletSet +{ + TripletSet() : mMark(0xFFFFFFFF) + { + mNumReplays[0] = mNumReplays[1] = mNumReplays[2] = 1; + memset(mNumConflicts[0], 0, 32); + memset(mNumConflicts[1], 0, 32); + memset(mNumConflicts[2], 0, 32); + } + + uint32_t mMark; // triplet index + uint8_t mNumReplays[3]; + uint8_t mNumConflicts[3][32]; +}; + +/* +struct GreaterSum +{ + typedef cloth::Vector<uint32_t>::Type Container; + + GreaterSum(const Container& cont) + : mContainer(cont) + {} + + bool operator()(const cloth::Vec4u& a, const cloth::Vec4u& b) const + { + return mContainer[a.x] + mContainer[a.y] + mContainer[a.z] + > mContainer[b.x] + mContainer[b.y] + mContainer[b.z]; + } + + const Container& mContainer; +}; +*/ + +// calculate the inclusive prefix sum, equivalent of std::partial_sum +template <typename T> +void prefixSum(const T* first, const T* last, T* dest) +{ + if(first == last) + return; + else + { + *(dest++) = *(first++); + + for(; first != last; ++first, ++dest) + *dest = *(dest - 1) + *first; + } +} +} + +// CUDA version +void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth) +{ + // PX_ASSERT(warpWidth == 32 || warpWidth == 16); + + if(mTriplets.empty()) + return; + + TripletIter tIt, tEnd = mTriplets.end(); + uint32_t tripletIndex; + + // count number of triplets per particle + Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0)); + for(tIt = mTriplets.begin(); tIt != tEnd; ++tIt) + for(int i = 0; i < 3; ++i) + ++adjacentCount[(*tIt)[i]]; + + /* neither of those were really improving number of batches: + // run simd version to pre-sort particles + simd(numParticles, blockWidth); mSetSizes.resize(0); + // sort according to triplet degree (estimated by sum of adjacentCount) + std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount)); + */ + + uint32_t maxTripletCount = *maxElement(adjacentCount.begin(), adjacentCount.end()); + + // compute in 
place prefix sum (inclusive) + prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin()); + + // initialize adjacencies (for each particle, collect touching triplets) + // also converts partial sum in adjacentCount from inclusive to exclusive + Vector<uint32_t>::Type adjacencies(adjacentCount.back()); + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + for(int i = 0; i < 3; ++i) + adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex; + + uint32_t warpMask = warpWidth - 1; + + uint32_t numSets = maxTripletCount; // start with minimum number of sets + Vector<TripletSet>::Type sets(numSets); + Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1)); + mSetSizes.resize(numSets); + + // color triplets (assign to sets) + Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd; + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex) + { + // mark sets of adjacent triplets + for(int i = 0; i < 3; ++i) + { + uint32_t particleIndex = (*tIt)[i]; + aIt = aBegin + adjacentCount[particleIndex]; + aEnd = aBegin + adjacentCount[particleIndex + 1]; + for(uint32_t setIndex; aIt != aEnd; ++aIt) + if(numSets > (setIndex = setIndices[*aIt])) + sets[setIndex].mMark = tripletIndex; + } + + // find valid set with smallest number of bank conflicts + uint32_t bestIndex = numSets; + uint32_t minReplays = 4; + for(uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex) + { + const TripletSet& set = sets[setIndex]; + + if(set.mMark == tripletIndex) + continue; // triplet collision + + uint32_t numReplays = 0; + for(uint32_t i = 0; i < 3; ++i) + numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]]; + + if(minReplays > numReplays) + minReplays = numReplays, bestIndex = setIndex; + } + + // add new set if none found + if(bestIndex == numSets) + { + sets.pushBack(TripletSet()); + mSetSizes.pushBack(0); + ++numSets; + } + + // increment bank 
conflicts or reset if warp filled + TripletSet& set = sets[bestIndex]; + if(++mSetSizes[bestIndex] & warpMask) + for(uint32_t i = 0; i < 3; ++i) + set.mNumReplays[i] = PxMax(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]); + else + set = TripletSet(); + + setIndices[tripletIndex] = bestIndex; + } + + // reorder triplets + Vector<uint32_t>::Type setOffsets(mSetSizes.size()); + prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin()); + + Vector<Vec4u>::Type triplets(mTriplets.size()); + Vector<uint32_t>::Type::ConstIterator iIt = setIndices.begin(); + for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++iIt) + triplets[--setOffsets[*iIt]] = *tIt; + + mTriplets.swap(triplets); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h new file mode 100644 index 00000000..836c9784 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" +#include "Range.h" +#include "Allocator.h" +#include "Vec4T.h" + +namespace nvidia +{ + +namespace cloth +{ + +struct TripletScheduler +{ + typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter; + typedef Vector<Vec4u>::Type::Iterator TripletIter; + + TripletScheduler(Range<const uint32_t[4]>); + void simd(uint32_t numParticles, uint32_t simdWidth); + void warp(uint32_t numParticles, uint32_t warpWidth); + + Vector<Vec4u>::Type mTriplets; + Vector<uint32_t>::Type mSetSizes; +}; +} +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h new file mode 100644 index 00000000..c82b9629 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#include "Types.h" + +namespace nvidia +{ + +namespace cloth +{ + +template <typename T> +struct Vec4T +{ + Vec4T() + { + } + + Vec4T(T a, T b, T c, T d) : x(a), y(b), z(c), w(d) + { + } + + template <typename S> + Vec4T(const Vec4T<S>& other) + { + x = T(other.x); + y = T(other.y); + z = T(other.z); + w = T(other.w); + } + + template <typename Index> + T& operator[](Index i) + { + return reinterpret_cast<T*>(this)[i]; + } + + template <typename Index> + const T& operator[](Index i) const + { + return reinterpret_cast<const T*>(this)[i]; + } + + T x, y, z, w; +}; + +template <typename T> +Vec4T<T> operator*(const Vec4T<T>& vec, T scalar) +{ + return Vec4T<T>(vec.x * scalar, vec.y * scalar, vec.z * scalar, vec.w * scalar); +} + +template <typename T> +Vec4T<T> operator/(const Vec4T<T>& vec, T scalar) +{ + return Vec4T<T>(vec.x / scalar, vec.y / scalar, vec.z / scalar, vec.w / scalar); +} + +template <typename T> +T (&array(Vec4T<T>& vec))[4] +{ + return reinterpret_cast<T(&)[4]>(vec); +} + +template <typename T> +const T (&array(const Vec4T<T>& vec))[4] +{ + return reinterpret_cast<const T(&)[4]>(vec); +} + +typedef Vec4T<uint32_t> Vec4u; +typedef Vec4T<uint16_t> Vec4us; + +} // namespace cloth + +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp new file mode 100644 index 00000000..b9a6ab35 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> +#pragma warning(pop) + +#pragma warning(disable : 4127) // conditional expression is constant + +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; + +namespace avx +{ +__m128 sMaskYZW; +__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY; + +void initialize() +{ + sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0)); + sOne = _mm256_set1_ps(1.0f); + sEpsilon = _mm256_set1_ps(1.192092896e-07f); + sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f); + sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0)); +} + +template <uint32_t> +__m256 fmadd_ps(__m256 a, __m256 b, __m256 c) +{ + return _mm256_add_ps(_mm256_mul_ps(a, b), c); +} +template <uint32_t> +__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c) +{ + return _mm256_sub_ps(c, _mm256_mul_ps(a, b)); +} +#if _MSC_VER >= 1700 +template <> +__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c) +{ + return _mm256_fmadd_ps(a, b, c); +} +template <> +__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c) +{ + return _mm256_fnmadd_ps(a, b, c); +} +#endif + +// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster +template <bool useMultiplier, uint32_t avx> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, const __m128& stiffnessRef) +{ + __m256 stiffness, stretchLimit, 
compressionLimit, multiplier; + + if(useMultiplier) + { + stiffness = _mm256_broadcast_ps(&stiffnessRef); + stretchLimit = _mm256_permute_ps(stiffness, 0xff); + compressionLimit = _mm256_permute_ps(stiffness, 0xaa); + multiplier = _mm256_permute_ps(stiffness, 0x55); + stiffness = _mm256_permute_ps(stiffness, 0x00); + } + else + { + stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef); + } + + for(; rIt < rEnd; rIt += 8, iIt += 16) + { + float* p0i = posIt + iIt[0] * 4; + float* p4i = posIt + iIt[8] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p4j = posIt + iIt[9] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p5i = posIt + iIt[10] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p5j = posIt + iIt[11] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v4i = _mm_load_ps(p4i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v4j = _mm_load_ps(p4j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v5i = _mm_load_ps(p5i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v5j = _mm_load_ps(p5j); + + __m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1); + __m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1); + __m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1); + __m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1); + + __m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j); + __m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j); + + float* p2i = posIt + iIt[4] * 4; + float* p6i = posIt + iIt[12] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p6j = posIt + iIt[13] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p7i = posIt + iIt[14] * 4; + float* p3j = posIt + iIt[7] * 4; + float* p7j = posIt + iIt[15] * 4; + + __m128 v2i = _mm_load_ps(p2i); + __m128 v6i = _mm_load_ps(p6i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v6j = _mm_load_ps(p6j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v7i = _mm_load_ps(p7i); + __m128 v3j = _mm_load_ps(p3j); + __m128 v7j = _mm_load_ps(p7j); + + __m256 v26i = 
_mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1); + __m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1); + __m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1); + __m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1); + + __m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j); + __m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j); + + __m256 a = _mm256_unpacklo_ps(h04ij, h26ij); + __m256 b = _mm256_unpackhi_ps(h04ij, h26ij); + __m256 c = _mm256_unpacklo_ps(h15ij, h37ij); + __m256 d = _mm256_unpackhi_ps(h15ij, h37ij); + + __m256 hxij = _mm256_unpacklo_ps(a, c); + __m256 hyij = _mm256_unpackhi_ps(a, c); + __m256 hzij = _mm256_unpacklo_ps(b, d); + __m256 vwij = _mm256_unpackhi_ps(b, d); + + __m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon))); + + __m256 rij = _mm256_load_ps(rIt); + __m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ); + __m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask); + + if(useMultiplier) + { + erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij); + } + + __m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stiffness, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij)))); + + // replace these two instructions with _mm_maskstore_ps below? 
+ __m256 exlo = _mm256_and_ps(sMaskXY, exij); + __m256 exhi = _mm256_andnot_ps(sMaskXY, exij); + + __m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0)); + __m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i); + __m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j); + + _mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0)); + _mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0)); + _mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1)); + _mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1)); + + __m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5)); + __m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i); + __m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j); + + _mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0)); + _mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0)); + _mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1)); + _mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1)); + + __m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a)); + __m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i); + __m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j); + + _mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0)); + _mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0)); + _mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1)); + _mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1)); + + __m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f)); + __m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i); + __m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j); + + _mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0)); + _mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0)); + _mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1)); + _mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1)); + } + + _mm256_zeroupper(); +} + +#ifdef _M_IX86 + +// clang-format:disable + +/* full template specializations of above functions in assembler */ + +// AVX 
without useMultiplier +template <> +void solveConstraints<false, 1>(float* __restrict posIt, const float* __restrict rIt, + const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef) +{ + __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef); + + __m256 vtmp[8], htmp[4]; + float* ptmp[16]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i + movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j + movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i + movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j + movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp ], ymm0 // h04ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps 
YMMWORD PTR [htmp+32], ymm4 // h15ij + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i + movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j + movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i + movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j + movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij + + vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij + + vunpcklps ymm1, ymm0, ymm2 // a + vunpckhps ymm3, ymm0, ymm2 // b + vunpcklps ymm5, ymm4, ymm6 // c + vunpckhps ymm7, ymm4, ymm6 // d + + vunpcklps ymm0, ymm1, ymm5 // hxij + vunpckhps ymm2, ymm1, ymm5 // hyij + vunpcklps ymm4, ymm3, ymm7 // hzij + vunpckhps ymm6, ymm3, ymm7 // 
vwij + + vmovaps ymm7, sEpsilon + vmovaps ymm5, sOne + vmovaps ymm3, stiffness + vmovaps ymm1, YMMWORD PTR [edx] // rij + + vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij + vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2 + vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4 + + vcmpgt_oqps ymm2, ymm1, ymm7 // mask + vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij + vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2 + vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6 + + vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij + + vmovaps ymm7, sMaskXY + vandps ymm7, ymm7, ymm6 // exlo + vxorps ymm6, ymm6, ymm7 // exhi + + vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i + vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j + + vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i + mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j + mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i + mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j + + vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij + vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i + vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j + + vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i + mov edi, [ptmp+24] __asm vmovaps XMMWORD 
PTR [edi + ecx], xmm1 // v1j + mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i + mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j + + vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij + vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i + vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j + + vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i + mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j + mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i + mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j + + vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij + vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i + vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j + + vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i + mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j + mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i + mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j + + add eax, 32 + add edx, 32 + + cmp edx, esi + jb forBegin +forEnd: + } + + _mm256_zeroupper(); +} + +// AVX with useMultiplier +template <> +void solveConstraints<true, 1>(float* __restrict posIt, const float* __restrict rIt, + const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& 
stiffnessRef) +{ + __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef); + __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff); + __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa); + __m256 multiplier = _mm256_permute_ps(stiffness, 0x55); + stiffness = _mm256_permute_ps(stiffness, 0x00); + + __m256 vtmp[8], htmp[4]; + float* ptmp[16]; + + __asm + { + mov edx, rIt + mov esi, rEnd + + cmp edx, esi + jae forEnd + + mov eax, iIt + mov ecx, posIt + +forBegin: + movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i + movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i + movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j + movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j + movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i + movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i + movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j + movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp ], ymm0 // h04ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 
__asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij + + movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i + movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i + movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j + movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j + movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i + movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i + movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j + movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j + + vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i + vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j + vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i + vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j + + vmovaps ymm7, sMinusOneXYZOneW + vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij + vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij + + vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij + + vunpcklps ymm1, ymm0, ymm2 // a + vunpckhps ymm3, ymm0, ymm2 // b + vunpcklps ymm5, ymm4, ymm6 // c + vunpckhps ymm7, ymm4, ymm6 // d + + vunpcklps ymm0, ymm1, ymm5 // hxij + vunpckhps ymm2, ymm1, ymm5 // hyij + vunpcklps ymm4, ymm3, ymm7 // hzij + vunpckhps ymm6, 
ymm3, ymm7 // vwij + + vmovaps ymm7, sEpsilon + vmovaps ymm5, sOne + vmovaps ymm3, stiffness + vmovaps ymm1, YMMWORD PTR [edx] // rij + + vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij + vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2 + vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4 + + vcmpgt_oqps ymm2, ymm1, ymm7 // mask + vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij + vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2 + vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6 + + vmovaps ymm0, stretchLimit // multiplier block + vmovaps ymm1, compressionLimit + vmovaps ymm2, multiplier + vminps ymm0, ymm0, ymm5 + vmaxps ymm1, ymm1, ymm0 + vmulps ymm2, ymm2, ymm1 + vsubps ymm5, ymm5, ymm2 + + vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij + + vmovaps ymm7, sMaskXY + vandps ymm7, ymm7, ymm6 // exlo + vxorps ymm6, ymm6, ymm7 // exhi + + vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij + vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i + vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j + + vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i + mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j + mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i + mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j + + vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij + vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i + vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j + + vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i + vpermilps ymm3, ymm1, 0xff __asm vmulps 
ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i + mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j + mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i + mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j + + vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij + vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i + vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j + + vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i + mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j + mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i + mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j + + vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij + vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i + vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j + + vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij + vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i + vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j + + vextractf128 xmm2, ymm0, 1 + vextractf128 xmm3, ymm1, 1 + + mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i + mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j + mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i + mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j + + add eax, 32 + add edx, 32 + + cmp edx, esi + jb forBegin +forEnd: + } + + 
_mm256_zeroupper();
}

#if _MSC_VER >= 1700
// AVX2 without useMultiplier
//
// Solves 8 distance constraints per loop iteration using AVX2/FMA via MSVC
// 32-bit inline assembly (hence the _M_IX86-only compilation of this file).
//
// posIt        - particle array; 16-byte entries (xyz position, w is a
//                per-particle weight — presumably inverse mass, TODO confirm)
// rIt/rEnd     - per-constraint rest lengths; 8 floats consumed per iteration
// iIt          - uint16_t particle index pairs (i,j interleaved); 16 indices
//                (= 32 bytes) consumed per iteration
// stiffnessRef - lane 0 holds the stiffness, broadcast to all 8 lanes
template <>
void solveConstraints<false, 2>(float* __restrict posIt, const float* __restrict rIt,
                                const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
    // No multiplier: only the scalar stiffness is needed in all 8 lanes.
    __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);

    // Stack spill areas for the asm block:
    // vtmp - the 16 gathered particles packed as 8 ymm pairs
    // htmp - the 4 edge-vector registers (hij)
    // ptmp - byte offsets of the 16 gathered particles, reused for scatter-back
    __m256 vtmp[8], htmp[4];
    float* ptmp[16];

    __asm
    {
        mov edx, rIt
        mov esi, rEnd

        cmp edx, esi
        jae forEnd

        mov eax, iIt
        mov ecx, posIt

forBegin:
        // Gather the first 8 particles: each uint16 index is scaled by 16
        // (particle stride) and remembered in ptmp for the scatter-back below.
        movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
        movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
        movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
        movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
        movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
        movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
        movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
        movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

        // Pair constraints 0/4, 1/5 into single ymm registers (low/high lanes).
        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

        // h = vi - vj in xyz, vi.w + vj.w in w (sMinusOneXYZOneW negates xyz only).
        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp   ], ymm2 // h04ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij

        // Gather the second 8 particles (constraints 2/6, 3/7), same scheme.
        movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
        movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
        movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
        movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
        movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
        movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
        movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
        movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

        vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
        vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

        // 4x8 transpose: AoS edge vectors -> SoA lanes hx/hy/hz/vw
        // across the 8 constraints.
        vunpcklps ymm1, ymm0, ymm2 // a
        vunpckhps ymm3, ymm0, ymm2 // b
        vunpcklps ymm5, ymm4, ymm6 // c
        vunpckhps ymm7, ymm4, ymm6 // d

        vunpcklps ymm0, ymm1, ymm5 // hxij
        vunpckhps ymm2, ymm1, ymm5 // hyij
        vunpcklps ymm4, ymm3, ymm7 // hzij
        vunpckhps ymm6, ymm3, ymm7 // vwij

        vmovaps ymm7, sEpsilon
        vmovaps ymm5, sOne
        vmovaps ymm3, stiffness
        vmovaps ymm1, YMMWORD PTR [edx] // rij

        // Squared edge length + epsilon (epsilon keeps rsqrt finite).
        vfmadd213ps ymm4, ymm4, ymm7 // e2ij
        vfmadd213ps ymm2, ymm2, ymm4
        vfmadd213ps ymm0, ymm0, ymm2

        // erij = 1 - rij/|h|, zeroed where rij <= epsilon (mask).
        vcmpgt_oqps ymm2, ymm1, ymm7 // mask
        vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
        vandps ymm5, ymm5, ymm2
        vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

        // exij = stiffness * erij / (wi + wj + epsilon)
        vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

        // Split the 8 correction scales into two registers for per-pair splats.
        vmovaps ymm7, sMaskXY
        vandps ymm7, ymm7, ymm6 // exlo
        vxorps ymm6, ymm6, ymm7 // exhi

        // Apply corrections: f = h * splat(ex); ui = vi - vi.w*f; uj = vj + vj.w*f
        vmovaps ymm4, YMMWORD PTR [htmp    ] // h04ij
        vmovaps ymm0, YMMWORD PTR [vtmp    ] // v04i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

        vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
        mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
        mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
        mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

        vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
        vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

        vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
        mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
        mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
        mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

        vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
        vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
        vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

        vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
        mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
        mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
        mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

        vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
        vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
        vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

        vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
        vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
        vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
        mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
        mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
        mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

        // Advance: 16 uint16 indices and 8 rest-length floats per iteration.
        add eax, 32
        add edx, 32

        cmp edx, esi
        jb forBegin
forEnd:
    }

    // Clear upper ymm state before returning to (potentially) SSE code.
    _mm256_zeroupper();
}

// AVX2 with useMultiplier
//
// Same 8-constraints-per-iteration kernel as above, but erij is additionally
// clamped between compressionLimit and stretchLimit and scaled back by
// 'multiplier' before being applied (vfnmadd231ps in the multiplier block).
// stiffnessRef lanes: x = stiffness, y = multiplier, z = compressionLimit,
// w = stretchLimit (see the permutes below).
template <>
void solveConstraints<true, 2>(float* __restrict posIt, const float* __restrict rIt,
                               const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
    __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
    __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);     // lane w
    __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa); // lane z
    __m256 multiplier = _mm256_permute_ps(stiffness, 0x55);       // lane y
    stiffness = _mm256_permute_ps(stiffness, 0x00);               // lane x

    // Stack spill areas; see the <false, 2> kernel above for the layout.
    __m256 vtmp[8], htmp[4];
    float* ptmp[16];

    __asm
    {
        mov edx, rIt
        mov esi, rEnd

        cmp edx, esi
        jae forEnd

        mov eax, iIt
        mov ecx, posIt

forBegin:
        // Gather first 8 particles (index * 16 = byte offset, saved in ptmp).
        movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
        movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
        movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
        movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
        movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
        movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
        movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
        movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

        // h = vi - vj in xyz, vi.w + vj.w in w.
        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp   ], ymm2 // h04ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij

        // Gather the second 8 particles.
        movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
        movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
        movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
        movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
        movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
        movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
        movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
        movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

        vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
        vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
        vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
        vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

        vmovaps ymm7, sMinusOneXYZOneW
        vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
        vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

        vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
        vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

        // 4x8 transpose to SoA lanes.
        vunpcklps ymm1, ymm0, ymm2 // a
        vunpckhps ymm3, ymm0, ymm2 // b
        vunpcklps ymm5, ymm4, ymm6 // c
        vunpckhps ymm7, ymm4, ymm6 // d

        vunpcklps ymm0, ymm1, ymm5 // hxij
        vunpckhps ymm2, ymm1, ymm5 // hyij
        vunpcklps ymm4, ymm3, ymm7 // hzij
        vunpckhps ymm6, ymm3, ymm7 // vwij

        vmovaps ymm7, sEpsilon
        vmovaps ymm5, sOne
        vmovaps ymm3, stiffness
        vmovaps ymm1, YMMWORD PTR [edx] // rij

        // Squared edge length + epsilon via chained FMAs.
        vfmadd213ps ymm4, ymm4, ymm7 // e2ij
        vfmadd213ps ymm2, ymm2, ymm4
        vfmadd213ps ymm0, ymm0, ymm2

        // erij = 1 - rij/|h|, masked where rij <= epsilon.
        vcmpgt_oqps ymm2, ymm1, ymm7 // mask
        vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
        vandps ymm5, ymm5, ymm2
        vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

        // erij -= multiplier * clamp(erij, compressionLimit, stretchLimit)
        vmovaps ymm0, stretchLimit // multiplier block
        vmovaps ymm1, compressionLimit
        vmovaps ymm2, multiplier
        vminps ymm0, ymm0, ymm5
        vmaxps ymm1, ymm1, ymm0
        vfnmadd231ps ymm5, ymm1, ymm2

        vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

        vmovaps ymm7, sMaskXY
        vandps ymm7, ymm7, ymm6 // exlo
        vxorps ymm6, ymm6, ymm7 // exhi

        // Apply corrections per pair; FMA forms of the <false, 2> sequence.
        vmovaps ymm4, YMMWORD PTR [htmp    ] // h04ij
        vmovaps ymm0, YMMWORD PTR [vtmp    ] // v04i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

        vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
        mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
        mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
        mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

        vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
        vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
        vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

        vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
        mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
        mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
        mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

        vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
        vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
        vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

        vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
        mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
        mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
        mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

        vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
        vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
        vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

        vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
        vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
        vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j

        vextractf128 xmm2, ymm0, 1
        vextractf128 xmm3, ymm1, 1

        mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
        mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
        mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
        mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

        add eax, 32
        add edx, 32

        cmp edx, esi
        jb forBegin
forEnd:
    }

    _mm256_zeroupper();
}
#endif // _MSC_VER >= 1700

// clang-format on

#else // _M_IX86

// Non-x86 (or non-MSVC-inline-asm) builds: instantiate the generic C++
// template definitions instead of the hand-written assembly above.
template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const uint16_t* __restrict, const __m128&);

template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const uint16_t* __restrict, const __m128&);

template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                         const uint16_t* __restrict, const __m128&);

template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict,
                                        const uint16_t* __restrict, const __m128&);

#endif // _M_IX86
+ +} // namespace avx diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp new file mode 100644 index 00000000..01f1fb50 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#ifndef __ARM_NEON__ +#error This file needs to be compiled with NEON support! +#endif + +#include "SwCollision.cpp" diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp new file mode 100644 index 00000000..d272bb6d --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 

#ifndef __ARM_NEON__
#error This file needs to be compiled with NEON support!
#endif

#include "SwSelfCollision.cpp"
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
new file mode 100644
index 00000000..068c900a
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
@@ -0,0 +1,33 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#ifndef __ARM_NEON__
#error This file needs to be compiled with NEON support!
#endif

#include "SwSolverKernel.cpp"

#include <cpu-features.h>

namespace nvidia
{
namespace cloth
{
// Runs the NEON-specialized software solver kernel if, and only if, the
// Android CPU feature bits report NEON support at runtime.
// Returns true when the kernel was executed, false otherwise (caller is
// expected to fall back to a non-NEON path).
bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator,
                      IterationStateFactory& factory, PxProfileZone* profileZone)
{
	// Precedence note: '&' binds tighter than '&&', so this is
	// (FEATURE_NEON & features) && (run kernel, yield true) — intended.
	// The comma expression discards the functor call's result and makes
	// the whole right-hand side evaluate to true.
	return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() &&
	       (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true);
}
}
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
new file mode 100644
index 00000000..0c0b884c
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
@@ -0,0 +1,500 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// factory implementation
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Broadcast a single scalar to all four lanes.
template <>
inline Simd4fFactory<const float&>::operator Simd4f() const
{
	return vdupq_n_f32(reinterpret_cast<const float32_t&>(v));
}

// Reinterpret a four-float tuple as a vector (no conversion, just a view).
inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
{
	return reinterpret_cast<const Simd4f&>(v);
}

// Compile-time integer constant broadcast as a raw bit pattern (uint lanes).
template <int i>
inline Simd4fFactory<detail::IntType<i> >::operator Simd4f() const
{
	return vdupq_n_u32(i);
}

// Special case: the constant 1 means the *float* 1.0f, not bit pattern 0x1.
template <>
inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
{
	return vdupq_n_f32(1.0f);
}

// Load four floats from an (unaligned-capable) pointer.
template <>
inline Simd4fFactory<const float*>::operator Simd4f() const
{
	return vld1q_f32((const float32_t*)v);
}

// Load from a pointer the caller asserts is aligned.
template <>
inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
{
	return vld1q_f32((const float32_t*)v.ptr);
}

// Load from base pointer plus a byte offset.
template <>
inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
{
	return vld1q_f32(reinterpret_cast<const float32_t*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// expression templates
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// ~v materialized: all-ones AND-NOT v.
template <>
inline ComplementExpr<Simd4f>::operator Simd4f() const
{
	return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
}

// (~a) & v and v & (~a) both map to a single NEON bit-clear (vbic).
Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
{
	return vbicq_u32(v.u4, complement.v.u4);
}

Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
{
	return vbicq_u32(v.u4, complement.v.u4);
}

// Deferred multiply, materialized on demand.
ProductExpr::operator Simd4f() const
{
	return vmulq_f32(v0.f4, v1.f4);
}

// a*b + c and c + a*b fuse into a single multiply-accumulate (vmla).
Simd4f operator+(const ProductExpr& p, const Simd4f& v)
{
	return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
}

Simd4f operator+(const Simd4f& v, const ProductExpr& p)
{
	return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
}

Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
{
	// cast calls operator Simd4f() which evaluates the other ProductExpr
	return vmlaq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
}

// c - a*b fuses into multiply-subtract (vmls).
Simd4f operator-(const Simd4f& v, const ProductExpr& p)
{
	return vmlsq_f32(v.f4, p.v0.f4, p.v1.f4);
}

Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
{
	// cast calls operator Simd4f() which evaluates the other ProductExpr
	return vmlsq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// operator implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Comparisons return all-ones / all-zeros lane masks (uint results
// stored in the Simd4f union).
Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
{
	return vceqq_f32(v0.f4, v1.f4);
}

Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
{
	return vcltq_f32(v0.f4, v1.f4);
}

Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
{
	return vcleq_f32(v0.f4, v1.f4);
}

Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
{
	return vcgtq_f32(v0.f4, v1.f4);
}

Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
{
	return vcgeq_f32(v0.f4, v1.f4);
}

// ~v is deferred so that (~a) & b can use vbic without a temporary.
ComplementExpr<Simd4f> operator~(const Simd4f& v)
{
	return ComplementExpr<Simd4f>(v);
}

Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
{
	return vandq_u32(v0.u4, v1.u4);
}

Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
{
	return vorrq_u32(v0.u4, v1.u4);
}

Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
{
	return veorq_u32(v0.u4, v1.u4);
}

// Shifts: NEON has no variable right-shift, so >> is a left shift by
// the negated amount.
Simd4f operator<<(const Simd4f& v, int shift)
{
	return vshlq_u32(v.u4, vdupq_n_s32(shift));
}

Simd4f operator>>(const Simd4f& v, int shift)
{
	return vshlq_u32(v.u4, vdupq_n_s32(-shift));
}

Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
{
	return vshlq_u32(v.u4, shift.i4);
}

Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
{
	return vshlq_u32(v.u4, vnegq_s32(shift.i4))
;
}

Simd4f operator+(const Simd4f& v)
{
	return v;
}

Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
{
	return vaddq_f32(v0.f4, v1.f4);
}

Simd4f operator-(const Simd4f& v)
{
	return vnegq_f32(v.f4);
}

Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
{
	return vsubq_f32(v0.f4, v1.f4);
}

// Multiplication is deferred (expression template) so neighbouring +/-
// can fuse it into vmla/vmls.
ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
{
	return ProductExpr(v0, v1);
}

// NOTE: division uses the raw reciprocal *estimate* (low precision,
// ~8-9 bits); use recipT<n> explicitly when refined accuracy is needed.
Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
{
	return v0 * vrecpeq_f32(v1.f4); // reciprocal estimate
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// function implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// Bit-level reinterpretation of an integer vector as a float vector.
Simd4f simd4f(const Simd4i& v)
{
	return v.u4;
}

// View the vector as a plain float[4] (mutable and const variants).
float (&array(Simd4f& v))[4]
{
	return (float(&)[4])v;
}

const float (&array(const Simd4f& v))[4]
{
	return (const float(&)[4])v;
}

void store(float* ptr, Simd4f const& v)
{
	return vst1q_f32((float32_t*)ptr, v.f4);
}

// NEON vst1q has no alignment requirement, so the aligned store is the
// same instruction as the unaligned one.
void storeAligned(float* ptr, Simd4f const& v)
{
	return vst1q_f32((float32_t*)ptr, v.f4);
}

void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
{
	return storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
}

// Broadcast lane i to all four lanes.
template <size_t i>
Simd4f splat(Simd4f const& v)
{
	return vdupq_n_f32(array(v)[i]);
}

// Per-lane select: mask lanes of all-ones pick v0, zeros pick v1.
Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
{
	return vbslq_f32(mask.u4, v0.f4, v1.f4);
}

// lane-wise absolute value
Simd4f abs(const Simd4f& v)
{
    return vabsq_f32(v.f4);
}

// round each lane toward negative infinity: truncate, then subtract one
// from lanes whose sign bit is set
Simd4f floor(const Simd4f& v)
{
    int32x4_t signBits = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
    return vcvtq_f32_s32(vsubq_s32(vcvtq_s32_f32(v.f4), signBits));
}

// lane-wise maximum
Simd4f max(const Simd4f& v0, const Simd4f& v1)
{
    return vmaxq_f32(v0.f4, v1.f4);
}

// lane-wise minimum
Simd4f min(const Simd4f& v0, const Simd4f& v1)
{
    return vminq_f32(v0.f4, v1.f4);
}

// reciprocal (estimate refined once, see recipT)
Simd4f recip(const Simd4f& v)
{
    return recipT<0>(v);
}

// reciprocal with n+1 Newton-Raphson refinements of the initial estimate
template <int n>
Simd4f recipT(const Simd4f& v)
{
    Simd4f estimate = vrecpeq_f32(v.f4);
    for(int iter = 0; iter <= n; ++iter)
        estimate = vrecpsq_f32(v.f4, estimate.f4) * estimate;
    return estimate;
}

// NOTE(review): sqrt is computed as v * rsqrt(v); vrsqrteq_f32(0) is +inf,
// so a zero input lane produces 0 * inf = NaN rather than 0 — confirm
// callers never pass exact zeros.
Simd4f sqrt(const Simd4f& v)
{
    return v * rsqrt(v);
}

// reciprocal square root (estimate refined once, see rsqrtT)
Simd4f rsqrt(const Simd4f& v)
{
    return rsqrtT<0>(v);
}

// reciprocal square root with n+1 Newton-Raphson refinements
template <int n>
Simd4f rsqrtT(const Simd4f& v)
{
    Simd4f estimate = vrsqrteq_f32(v.f4);
    for(int iter = 0; iter <= n; ++iter)
        estimate = vrsqrtsq_f32(vmulq_f32(v.f4, estimate.f4), estimate.f4) * estimate;
    return estimate;
}

// base-2 exponential, Cephes-style (http://www.netlib.org/cephes/):
// split into integer and fractional parts, rational approximation on the
// fraction, exponent bit manipulation for the integer part
Simd4f exp2(const Simd4f& v)
{
    // clamp to the representable exponent range
    Simd4f limit = simd4f(127.4999f);
    Simd4f x = min(max(-limit, v), limit);

    // separate into integer and fractional part
    Simd4f fx = x + simd4f(0.5f);
    Simd4i ix = vsubq_s32(vcvtq_s32_f32(fx.f4), vreinterpretq_s32_u32(vshrq_n_u32(fx.u4, 31)));
    fx = x - vcvtq_f32_s32(ix.i4);

    // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
    Simd4f fx2 = fx * fx;

    Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
                      fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
    Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);

    Simd4f exp2fx = px * recip(qx - px);
    exp2fx = simd4f(_1) + exp2fx + exp2fx;

    // exp2(ix): build the float exponent field directly
    Simd4f exp2ix = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ix.i4, vdupq_n_s32(0x7f)), 23));

    return exp2fx * exp2ix;
}

// base-2 logarithm computed per lane as ln(x) / ln(2)
Simd4f log2(const Simd4f& v)
{
    Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
    const float* lanes = array(v);
    return simd4f(::logf(lanes[0]), ::logf(lanes[1]), ::logf(lanes[2]), ::logf(lanes[3])) * scale;
}

// 3-component dot product, result replicated to all four lanes
Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
{
    Simd4f prod = v0 * v1;
    return splat<0>(prod) + splat<1>(prod) + splat<2>(prod);
}

// 3-component cross product (w lane unspecified), built from 64-bit halves
Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
{
    float32x2_t x0_y0 = vget_low_f32(v0.f4);
    float32x2_t z0_w0 = vget_high_f32(v0.f4);
    float32x2_t x1_y1 = vget_low_f32(v1.f4);
    float32x2_t z1_w1 = vget_high_f32(v1.f4);

    float32x2_t y1_z1 = vext_f32(x1_y1, z1_w1, 1);
    float32x2_t y0_z0 = vext_f32(x0_y0, z0_w0, 1);

    float32x2_t z0x1_w0y1 = vmul_f32(z0_w0, x1_y1);
    float32x2_t x0y1_y0z1 = vmul_f32(x0_y0, y1_z1);

    float32x2_t y2_w2 = vmls_f32(z0x1_w0y1, x0_y0, z1_w1);
    float32x2_t z2_x2 = vmls_f32(x0y1_y0z1, y0_z0, x1_y1);
    float32x2_t x2_y2 = vext_f32(z2_x2, y2_w2, 1);

    return vcombine_f32(x2_y2, z2_x2);
}

// 4x4 in-register transpose via two rounds of zips
void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
{
#if NVMATH_INLINE_ASSEMBLER
    asm volatile("vzip.f32 %q0, %q2 \n\t"
                 "vzip.f32 %q1, %q3 \n\t"
                 "vzip.f32 %q0, %q1 \n\t"
                 "vzip.f32 %q2, %q3 \n\t"
                 : "+w"(x.f4), "+w"(y.f4), "+w"(z.f4), "+w"(w.f4));
#else
    float32x4x2_t xz = vzipq_f32(x.f4, z.f4);
    float32x4x2_t yw = vzipq_f32(y.f4, w.f4);
    float32x4x2_t lo = vzipq_f32(xz.val[0], yw.val[0]);
    float32x4x2_t hi = vzipq_f32(xz.val[1], yw.val[1]);

    x = lo.val[0];
    y = lo.val[1];
    z = hi.val[0];
    w = hi.val[1];
#endif
}

// comparison reductions; the mask overloads also export the lane mask

int allEqual(const Simd4f& v0, const Simd4f& v1)
{
    return allTrue(v0 == v1);
}

int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return allTrue(outMask = v0 == v1);
}

int anyEqual(const Simd4f& v0, const Simd4f& v1)
{
    return anyTrue(v0 == v1);
}

int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return anyTrue(outMask = v0 == v1);
}

int allGreater(const Simd4f& v0, const Simd4f& v1)
{
    return allTrue(v0 > v1);
}

int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return allTrue(outMask = v0 > v1);
}

int anyGreater(const Simd4f& v0, const Simd4f& v1)
{
    return anyTrue(v0 > v1);
}

int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return anyTrue(outMask = v0 > v1);
}

int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
{
    return allTrue(v0 >= v1);
}

int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return allTrue(outMask = v0 >= v1);
}

int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
{
    return anyTrue(v0 >= v1);
}

int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
{
    return anyTrue(outMask = v0 >= v1);
}

// true iff every lane of a (0 / ~0 per lane) mask is set
int allTrue(const Simd4f& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vand.u32 d0, d0, d1 \n\t"
                 "vpmin.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.f4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    // narrow 4x32 -> 8 bytes, then inspect the low 4 bytes as one u32
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) == 0xffffffff;
#endif
}

// true iff at least one lane of a (0 / ~0 per lane) mask is set
int anyTrue(const Simd4f& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vorr.u32 d0, d0, d1 \n\t"
                 "vpmax.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.f4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) != 0x0;
#endif
}
diff --git 
a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
new file mode 100644
index 00000000..7a566256
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
@@ -0,0 +1,276 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// factory implementation
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// broadcast a single int to all four lanes
template <>
inline Simd4iFactory<const int&>::operator Simd4i() const
{
    return vdupq_n_s32(v);
}

// reinterpret a packed four-tuple as a vector
inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
{
    return reinterpret_cast<const Simd4i&>(v);
}

// broadcast a compile-time constant to all four lanes
template <int i>
inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
{
    return vdupq_n_u32(i);
}

// load four ints from (possibly unaligned) memory
template <>
inline Simd4iFactory<const int*>::operator Simd4i() const
{
    return vld1q_s32(v);
}

// load four ints from aligned memory
template <>
inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
{
    return vld1q_s32(v.ptr);
}

// load four ints from aligned memory at a byte offset
template <>
inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
{
    return vld1q_s32(reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// expression template
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// materialize ~v as (all-ones) bic v
template <>
inline ComplementExpr<Simd4i>::operator Simd4i() const
{
    return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
}

// (~a) & b maps directly onto a single bic instruction
Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& rhs)
{
    return vbicq_u32(rhs.u4, complement.v.u4);
}

// a & (~b) likewise uses bic
Simd4i operator&(const Simd4i& lhs, const ComplementExpr<Simd4i>& complement)
{
    return vbicq_u32(lhs.u4, complement.v.u4);
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// operator implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// lane-wise equality mask (0 / ~0 per lane)
Simd4i simdi::operator==(const Simd4i& lhs, const Simd4i& rhs)
{
    return vceqq_u32(lhs.u4, rhs.u4);
}

// signed lane-wise less-than mask
Simd4i simdi::operator<(const Simd4i& lhs, const Simd4i& rhs)
{
    return vcltq_s32(lhs.i4, rhs.i4);
}

// signed lane-wise greater-than mask
Simd4i simdi::operator>(const Simd4i& lhs, const Simd4i& rhs)
{
    return vcgtq_s32(lhs.i4, rhs.i4);
}

// bitwise complement as a lazy expression (fused by the & overloads above)
ComplementExpr<Simd4i> operator~(const Simd4i& x)
{
    return ComplementExpr<Simd4i>(x);
}

Simd4i operator&(const Simd4i& lhs, const Simd4i& rhs)
{
    return vandq_u32(lhs.u4, rhs.u4);
}

Simd4i operator|(const Simd4i& lhs, const Simd4i& rhs)
{
    return vorrq_u32(lhs.u4, rhs.u4);
}

Simd4i operator^(const Simd4i& lhs, const Simd4i& rhs)
{
    return veorq_u32(lhs.u4, rhs.u4);
}

// logical left shift by a uniform count
Simd4i operator<<(const Simd4i& x, int count)
{
    return vshlq_u32(x.u4, vdupq_n_s32(count));
}

// logical right shift (negative count shifts right on NEON)
Simd4i operator>>(const Simd4i& x, int count)
{
    return vshlq_u32(x.u4, vdupq_n_s32(-count));
}

// per-lane left shift
Simd4i operator<<(const Simd4i& x, const Simd4i& count)
{
    return vshlq_u32(x.u4, count.i4);
}

// per-lane right shift via negated counts
Simd4i operator>>(const Simd4i& x, const Simd4i& count)
{
    return vshlq_u32(x.u4, vnegq_s32(count.i4));
}

Simd4i simdi::operator+(const Simd4i& lhs, const Simd4i& rhs)
{
    return vaddq_u32(lhs.u4, rhs.u4);
}

Simd4i simdi::operator-(const Simd4i& x)
{
    return vnegq_s32(x.i4);
}

Simd4i simdi::operator-(const Simd4i& lhs, const Simd4i& rhs)
{
    return vsubq_u32(lhs.u4, rhs.u4);
}

// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// function implementations
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

// reinterpret float lanes as integer lanes (bit pattern preserved)
Simd4i simd4i(const Simd4f& v)
{
    return v.u4;
}

// view the four lanes as an int[4]
int (&simdi::array(Simd4i& v))[4]
{
    return (int(&)[4])v;
}

const int (&simdi::array(const Simd4i& v))[4]
{
    return (const int(&)[4])v;
}

// store four lanes to (possibly unaligned) memory
void store(int* ptr, const Simd4i& v)
{
    vst1q_s32(ptr, v.i4);
}

// store four lanes to 16-byte aligned memory
void storeAligned(int* ptr, const Simd4i& v)
{
    vst1q_s32(ptr, v.i4);
}

// store four lanes to 16-byte aligned memory at a byte offset
void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
{
    storeAligned(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
}

// broadcast lane i into all four lanes
template <size_t i>
Simd4i splat(Simd4i const& v)
{
    return vdupq_n_s32(simdi::array(v)[i]);
}

// bitwise select: bits set in 'mask' take v0, clear bits take v1
Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1)
{
    return vbslq_u32(mask.u4, v0.u4, v1.u4);
}

// comparison reductions; mask overloads also export the lane mask

int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
{
    return allTrue(simdi::operator==(v0, v1));
}

int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return allTrue(outMask = simdi::operator==(v0, v1));
}

int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
{
    return anyTrue(simdi::operator==(v0, v1));
}

int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return anyTrue(outMask = simdi::operator==(v0, v1));
}

int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
{
    return allTrue(simdi::operator>(v0, v1));
}

int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return allTrue(outMask = simdi::operator>(v0, v1));
}

int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
{
    return anyTrue(simdi::operator>(v0, v1));
}

int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
{
    return anyTrue(outMask = simdi::operator>(v0, v1));
}

// true iff every lane of a (0 / ~0 per lane) mask is set
int allTrue(const Simd4i& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vand.u32 d0, d0, d1 \n\t"
                 "vpmin.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.u4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    // narrow 4x32 -> 8 bytes, then inspect the low 4 bytes as one u32
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) == 0xffffffff;
#endif
}

// true iff at least one lane of a (0 / ~0 per lane) mask is set
int anyTrue(const Simd4i& v)
{
#if NVMATH_INLINE_ASSEMBLER
    int result;
    asm volatile("vmovq q0, %q1 \n\t"
                 "vorr.u32 d0, d0, d1 \n\t"
                 "vpmax.u32 d0, d0, d0 \n\t"
                 "vcmp.f32 s0, #0 \n\t"
                 "fmrx %0, fpscr"
                 : "=r"(result)
                 : "w"(v.u4)
                 : "q0");
    return result >> 28 & 0x1;
#else
    uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
    uint16x4_t lo = vmovn_u32(v.u4);
    uint16x8_t combined = vcombine_u16(lo, hi);
    uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
    return vget_lane_u32(reduced, 0) != 0x0;
#endif
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
new file mode 100644
index 00000000..542fac08
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#include <arm_neon.h>

// 128-bit NEON register viewed as four floats; the union also exposes the
// same bits as unsigned/signed 32-bit lanes so comparison masks and bit
// tricks need no explicit vreinterpret at every use site.
union Simd4f
{
    // intentionally uninitialized default constructor
    Simd4f()
    {
    }
    Simd4f(const float32x4_t& v) : f4(v)
    {
    }
#ifndef _M_ARM // all *32x4_t map to the same type
    Simd4f(const uint32x4_t& v) : u4(v)
    {
    }
#endif
    float32x4_t f4;
    uint32x4_t u4;
    int32x4_t i4;
};

// 128-bit NEON register viewed as four 32-bit integers (unsigned and
// signed views of the same bits)
union Simd4i
{
    // intentionally uninitialized default constructor
    Simd4i()
    {
    }
    Simd4i(const uint32x4_t& v) : u4(v)
    {
    }
#ifndef _M_ARM // all *32x4_t map to the same type
    Simd4i(const int32x4_t& v) : i4(v)
    {
    }
#endif
    uint32x4_t u4;
    int32x4_t i4;
};
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
new file mode 100644
index 00000000..b67f96aa
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
@@ -0,0 +1,91 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#pragma once

#ifdef _M_ARM
#include <arm_neon.h>
#endif

namespace nvidia
{
namespace cloth
{

// Returns the index (0..31) of the highest set bit of 'mask'.
// NOTE(review): the result is undefined for mask == 0 (__builtin_clz(0) is
// undefined behavior) — confirm all call sites pass a non-zero mask.
uint32_t findBitSet(uint32_t mask)
{
#ifdef _M_ARM
	__n64 t = { mask };
	return 31 - (vclz_u32(t)).n64_u32[0];
#else
	return 31 - __builtin_clz(mask);
#endif
}

// Fast float->int conversion: truncate toward zero, then subtract one on
// lanes whose sign bit is set.
// NOTE(review): for *exact* negative integers this yields value-1
// (e.g. -2.0f -> -3), which differs from a true floor; this matches the
// NEON Simd4f floor() in this module, so callers appear to rely on the
// same convention — confirm before changing.
Simd4i intFloor(const Simd4f& v)
{
	int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
	return vsubq_s32(vcvtq_s32_f32(v.f4), neg);
}

// OR of all four lanes, replicated into every lane of the result.
Simd4i horizontalOr(Simd4i mask)
{
	using namespace simdi;
	uint32x2_t hi = vget_high_u32(mask.u4);
	uint32x2_t lo = vget_low_u32(mask.u4);
	uint32x2_t tmp = vorr_u32(lo, hi);
	uint32x2_t rev = vrev64_u32(tmp);
	uint32x2_t res = vorr_u32(tmp, rev);
	return vcombine_u32(res, res);
}

// Builds a byte-level permutation mask from four lane indices so that
// operator() below can gather lanes with table-lookup instructions.
// Out-of-range indices (index > sMask) force the permute bytes to 0xff,
// which makes the table lookup return zero for those lanes.
// NOTE(review): the arm64 path uses vtbl1q_u8/vtbl2q_u8, which are not
// standard ACLE intrinsic names (ACLE spells these vqtbl1_u8/vqtbl2_u8) —
// confirm against the target toolchain.
Gather<Simd4i>::Gather(const Simd4i& index)
{
#ifdef __arm64__
	using namespace simdi;
	PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
	uint8x16_t lohiIndex = reinterpret_cast<const uint8x16_t&>(index);
	byteIndex.val[0] = vtbl1q_u8(lohiIndex, byteIndex.val[0]);
	byteIndex.val[1] = vtbl1q_u8(lohiIndex, byteIndex.val[1]);
	mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
	mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
#else
	using namespace simdi;
	PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
	uint8x8x2_t lohiIndex = reinterpret_cast<const uint8x8x2_t&>(index);
	byteIndex.val[0] = vtbl2_u8(lohiIndex, byteIndex.val[0]);
	byteIndex.val[1] = vtbl2_u8(lohiIndex, byteIndex.val[1]);
	mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
	mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
#endif
}

// Gathers four 32-bit lanes from 'ptr' according to the permutation built
// in the constructor, using byte table lookups.
Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
{
#ifdef __arm64__
	PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
	const uint8x16x2_t* table = reinterpret_cast<const uint8x16x2_t*>(ptr);
	result.val[0] = vtbl2q_u8(*table, result.val[0]);
	result.val[1] = vtbl2q_u8(*table, result.val[1]);
	return reinterpret_cast<const Simd4i&>(result);
#else
	PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
	const uint8x8x4_t* table = reinterpret_cast<const uint8x8x4_t*>(ptr);
	result.val[0] = vtbl4_u8(*table, result.val[0]);
	result.val[1] = vtbl4_u8(*table, result.val[1]);
	return reinterpret_cast<const Simd4i&>(result);
#endif
}

} // namespace cloth
} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h
new file mode 100644
index 00000000..d02d5066
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h
@@ -0,0 +1,410 @@
/*
 * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Scalar4f() const +{ + return Scalar4f(v, v, v, v); +} + +inline Simd4fFactory<detail::FourTuple>::operator Scalar4f() const +{ + return reinterpret_cast<const Scalar4f&>(v); +} + +template <int i> +inline Simd4fFactory<detail::IntType<i> >::operator Scalar4f() const +{ + float s = i; + return Scalar4f(s, s, s, s); +} + +template <> +inline Simd4fFactory<detail::IntType<0x80000000u> >::operator Scalar4f() const +{ + int32_t i = 0x80000000u; + return Scalar4f(i, i, i, i); +} + +template <> +inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Scalar4f() const +{ + int32_t i = 0xffffffff; + return Scalar4f(i, i, i, i); +} + +template <> +inline Simd4fFactory<const float*>::operator Scalar4f() const +{ + return Scalar4f(v[0], v[1], v[2], v[3]); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Scalar4f() const +{ + return Scalar4f(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Scalar4f() const +{ + const float* ptr = reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset); + return Scalar4f(ptr[0], ptr[1], ptr[2], ptr[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Scalar4f>::operator Scalar4f() const +{ + return Scalar4f(~v.u4[0], ~v.u4[1], ~v.u4[2], ~v.u4[3]); +} + +inline Scalar4f operator&(const ComplementExpr<Scalar4f>& complement, const Scalar4f& v) +{ + return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2], + v.u4[3] & ~complement.v.u4[3]); +} + +inline Scalar4f 
operator&(const Scalar4f& v, const ComplementExpr<Scalar4f>& complement) +{ + return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2], + v.u4[3] & ~complement.v.u4[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4f operator==(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] == v1.f4[0], v0.f4[1] == v1.f4[1], v0.f4[2] == v1.f4[2], v0.f4[3] == v1.f4[3]); +} + +inline Scalar4f operator<(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] < v1.f4[0], v0.f4[1] < v1.f4[1], v0.f4[2] < v1.f4[2], v0.f4[3] < v1.f4[3]); +} + +inline Scalar4f operator<=(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] <= v1.f4[0], v0.f4[1] <= v1.f4[1], v0.f4[2] <= v1.f4[2], v0.f4[3] <= v1.f4[3]); +} + +inline Scalar4f operator>(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] > v1.f4[0], v0.f4[1] > v1.f4[1], v0.f4[2] > v1.f4[2], v0.f4[3] > v1.f4[3]); +} + +inline Scalar4f operator>=(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] >= v1.f4[0], v0.f4[1] >= v1.f4[1], v0.f4[2] >= v1.f4[2], v0.f4[3] >= v1.f4[3]); +} + +inline ComplementExpr<Scalar4f> operator~(const Scalar4f& v) +{ + return ComplementExpr<Scalar4f>(v); +} + +inline Scalar4f operator&(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] & v1.u4[0], v0.u4[1] & v1.u4[1], v0.u4[2] & v1.u4[2], v0.u4[3] & v1.u4[3]); +} + +inline Scalar4f operator|(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] | v1.u4[0], v0.u4[1] | v1.u4[1], v0.u4[2] | v1.u4[2], v0.u4[3] | v1.u4[3]); +} + +inline Scalar4f operator^(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.u4[0] ^ v1.u4[0], v0.u4[1] ^ v1.u4[1], v0.u4[2] ^ v1.u4[2], v0.u4[3] ^ v1.u4[3]); +} + +inline Scalar4f operator<<(const Scalar4f& v, int 
shift) +{ + return Scalar4f(v.u4[0] << shift, v.u4[1] << shift, v.u4[2] << shift, v.u4[3] << shift); +} + +inline Scalar4f operator>>(const Scalar4f& v, int shift) +{ + return Scalar4f(v.u4[0] >> shift, v.u4[1] >> shift, v.u4[2] >> shift, v.u4[3] >> shift); +} + +inline Scalar4f operator+(const Scalar4f& v) +{ + return v; +} + +inline Scalar4f operator+(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] + v1.f4[0], v0.f4[1] + v1.f4[1], v0.f4[2] + v1.f4[2], v0.f4[3] + v1.f4[3]); +} + +inline Scalar4f operator-(const Scalar4f& v) +{ + return Scalar4f(-v.f4[0], -v.f4[1], -v.f4[2], -v.f4[3]); +} + +inline Scalar4f operator-(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] - v1.f4[0], v0.f4[1] - v1.f4[1], v0.f4[2] - v1.f4[2], v0.f4[3] - v1.f4[3]); +} + +inline Scalar4f operator*(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] * v1.f4[0], v0.f4[1] * v1.f4[1], v0.f4[2] * v1.f4[2], v0.f4[3] * v1.f4[3]); +} + +inline Scalar4f operator/(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(v0.f4[0] / v1.f4[0], v0.f4[1] / v1.f4[1], v0.f4[2] / v1.f4[2], v0.f4[3] / v1.f4[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4f simd4f(const Scalar4i& v) +{ + return v; +} + +inline float (&array(Scalar4f& v))[4] +{ + return v.f4; +} + +inline const float (&array(const Scalar4f& v))[4] +{ + return v.f4; +} + +inline void store(float* ptr, const Scalar4f& v) +{ + ptr[0] = v.f4[0]; + ptr[1] = v.f4[1]; + ptr[2] = v.f4[2]; + ptr[3] = v.f4[3]; +} + +inline void storeAligned(float* ptr, const Scalar4f& v) +{ + store(ptr, v); +} + +inline void storeAligned(float* ptr, unsigned int offset, const Scalar4f& v) +{ + storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +inline Scalar4f splat(const Scalar4f& v) +{ + return 
Scalar4f(v.f4[i], v.f4[i], v.f4[i], v.f4[i]); +} + +inline Scalar4f select(const Scalar4f& mask, const Scalar4f& v0, const Scalar4f& v1) +{ + return ((v0 ^ v1) & mask) ^ v1; +} + +inline Scalar4f abs(const Scalar4f& v) +{ + return Scalar4f(::fabsf(v.f4[0]), ::fabsf(v.f4[1]), ::fabsf(v.f4[2]), ::fabsf(v.f4[3])); +} + +inline Scalar4f floor(const Scalar4f& v) +{ + return Scalar4f(::floorf(v.f4[0]), ::floorf(v.f4[1]), ::floorf(v.f4[2]), ::floorf(v.f4[3])); +} + +inline Scalar4f max(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(std::max(v0.f4[0], v1.f4[0]), std::max(v0.f4[1], v1.f4[1]), std::max(v0.f4[2], v1.f4[2]), + std::max(v0.f4[3], v1.f4[3])); +} + +inline Scalar4f min(const Scalar4f& v0, const Scalar4f& v1) +{ + return Scalar4f(std::min(v0.f4[0], v1.f4[0]), std::min(v0.f4[1], v1.f4[1]), std::min(v0.f4[2], v1.f4[2]), + std::min(v0.f4[3], v1.f4[3])); +} + +inline Scalar4f recip(const Scalar4f& v) +{ + return Scalar4f(1 / v.f4[0], 1 / v.f4[1], 1 / v.f4[2], 1 / v.f4[3]); +} + +template <int n> +inline Scalar4f recipT(const Scalar4f& v) +{ + return recip(v); +} + +inline Scalar4f sqrt(const Scalar4f& v) +{ + return Scalar4f(::sqrtf(v.f4[0]), ::sqrtf(v.f4[1]), ::sqrtf(v.f4[2]), ::sqrtf(v.f4[3])); +} + +inline Scalar4f rsqrt(const Scalar4f& v) +{ + return recip(sqrt(v)); +} + +template <int n> +inline Scalar4f rsqrtT(const Scalar4f& v) +{ + return rsqrt(v); +} + +inline Scalar4f exp2(const Scalar4f& v) +{ + float scale = 0.69314718055994531f; // ::logf(2.0f); + return Scalar4f(::expf(v.f4[0] * scale), ::expf(v.f4[1] * scale), ::expf(v.f4[2] * scale), ::expf(v.f4[3] * scale)); +} + +namespace simdf +{ +// PSP2 is confused resolving about exp2, forwarding works +inline Scalar4f exp2(const Scalar4f& v) +{ + return ::exp2(v); +} +} + +inline Scalar4f log2(const Scalar4f& v) +{ + float scale = 1.44269504088896341f; // 1/ln(2) + return Scalar4f(::logf(v.f4[0]) * scale, ::logf(v.f4[1]) * scale, ::logf(v.f4[2]) * scale, ::logf(v.f4[3]) * scale); +} + +inline 
Scalar4f dot3(const Scalar4f& v0, const Scalar4f& v1) +{ + return simd4f(v0.f4[0] * v1.f4[0] + v0.f4[1] * v1.f4[1] + v0.f4[2] * v1.f4[2]); +} + +inline Scalar4f cross3(const Scalar4f& v0, const Scalar4f& v1) +{ + return simd4f(v0.f4[1] * v1.f4[2] - v0.f4[2] * v1.f4[1], v0.f4[2] * v1.f4[0] - v0.f4[0] * v1.f4[2], + v0.f4[0] * v1.f4[1] - v0.f4[1] * v1.f4[0], 0.0f); +} + +inline void transpose(Scalar4f& x, Scalar4f& y, Scalar4f& z, Scalar4f& w) +{ + float x1 = x.f4[1], x2 = x.f4[2], x3 = x.f4[3]; + float y2 = y.f4[2], y3 = y.f4[3], z3 = z.f4[3]; + + x.f4[1] = y.f4[0]; + x.f4[2] = z.f4[0]; + x.f4[3] = w.f4[0]; + y.f4[0] = x1; + y.f4[2] = z.f4[1]; + y.f4[3] = w.f4[1]; + z.f4[0] = x2; + z.f4[1] = y2; + z.f4[3] = w.f4[2]; + w.f4[0] = x3; + w.f4[1] = y3; + w.f4[2] = z3; +} + +inline int allEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] == v1.f4[0] && v0.f4[1] == v1.f4[1] && v0.f4[2] == v1.f4[2] && v0.f4[3] == v1.f4[3]; +} + +inline int allEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] == v1.f4[0] || v0.f4[1] == v1.f4[1] || v0.f4[2] == v1.f4[2] || v0.f4[3] == v1.f4[3]; +} + +inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreater(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] > v1.f4[0] && v0.f4[1] > v1.f4[1] && v0.f4[2] > v1.f4[2] && v0.f4[3] > v1.f4[3]; +} + +inline int allGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > 
v1.f4[2], b3 = v0.f4[3] > v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] > v1.f4[0] || v0.f4[1] > v1.f4[1] || v0.f4[2] > v1.f4[2] || v0.f4[3] > v1.f4[3]; +} + +inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] >= v1.f4[0] && v0.f4[1] >= v1.f4[1] && v0.f4[2] >= v1.f4[2] && v0.f4[3] >= v1.f4[3]; +} + +inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1) +{ + return v0.f4[0] >= v1.f4[0] || v0.f4[1] >= v1.f4[1] || v0.f4[2] >= v1.f4[2] || v0.f4[3] >= v1.f4[3]; +} + +inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask) +{ + bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allTrue(const Scalar4f& v) +{ + return v.u4[0] & v.u4[1] & v.u4[2] & v.u4[3]; +} + +inline int anyTrue(const Scalar4f& v) +{ + return v.u4[0] | v.u4[1] | v.u4[2] | v.u4[3]; +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h new file mode 100644 index 00000000..80ac2abd --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2008-2015, NVIDIA 
CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Scalar4i() const +{ + return Scalar4i(v, v, v, v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Scalar4i() const +{ + return reinterpret_cast<const Scalar4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Scalar4i() const +{ + return Scalar4i(i, i, i, i); +} + +template <> +inline Simd4iFactory<const int*>::operator Scalar4i() const +{ + return Scalar4i(v[0], v[1], v[2], v[3]); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Scalar4i() const +{ + return Scalar4i(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Scalar4i() const +{ + const int* ptr = reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset); + return Scalar4i(ptr[0], ptr[1], ptr[2], ptr[3]); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +namespace simdi +{ + +inline Scalar4i operator==(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] == v1.i4[0], v0.i4[1] == v1.i4[1], v0.i4[2] == v1.i4[2], v0.i4[3] == 
v1.i4[3]); +} + +inline Scalar4i operator<(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] < v1.i4[0], v0.i4[1] < v1.i4[1], v0.i4[2] < v1.i4[2], v0.i4[3] < v1.i4[3]); +} + +inline Scalar4i operator>(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] > v1.i4[0], v0.i4[1] > v1.i4[1], v0.i4[2] > v1.i4[2], v0.i4[3] > v1.i4[3]); +} + +inline Scalar4i operator+(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] + v1.i4[0], v0.i4[1] + v1.i4[1], v0.i4[2] + v1.i4[2], v0.i4[3] + v1.i4[3]); +} + +inline Scalar4i operator-(const Scalar4i& v) +{ + return Scalar4i(-v.i4[0], -v.i4[1], -v.i4[2], -v.i4[3]); +} + +inline Scalar4i operator-(const Scalar4i& v0, const Scalar4i& v1) +{ + return Scalar4i(v0.i4[0] - v1.i4[0], v0.i4[1] - v1.i4[1], v0.i4[2] - v1.i4[2], v0.i4[3] - v1.i4[3]); +} + +} // namespace simd + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +inline Scalar4i simd4i(const Scalar4f& v) +{ + return v; +} + +namespace simdi +{ + +inline int (&array(Scalar4i& v))[4] +{ + return v.i4; +} + +inline const int (&array(const Scalar4i& v))[4] +{ + return v.i4; +} + +} // namespace simdi + +inline void store(int* ptr, const Scalar4i& v) +{ + ptr[0] = v.i4[0]; + ptr[1] = v.i4[1]; + ptr[2] = v.i4[2]; + ptr[3] = v.i4[3]; +} + +inline void storeAligned(int* ptr, const Scalar4i& v) +{ + store(ptr, v); +} + +inline void storeAligned(int* ptr, unsigned int offset, const Scalar4i& v) +{ + store(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +namespace simdi +{ + +inline int allEqual(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] == v1.i4[0] && v0.i4[1] == v1.i4[1] && v0.i4[2] == v1.i4[2] && v0.i4[3] == v1.i4[3]; +} + +inline int allEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = 
v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] == v1.i4[0] || v0.i4[1] == v1.i4[1] || v0.i4[2] == v1.i4[2] || v0.i4[3] == v1.i4[3]; +} + +inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +inline int allGreater(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] > v1.i4[0] && v0.i4[1] > v1.i4[1] && v0.i4[2] > v1.i4[2] && v0.i4[3] > v1.i4[3]; +} + +inline int allGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 && b1 && b2 && b3; +} + +inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1) +{ + return v0.i4[0] > v1.i4[0] || v0.i4[1] > v1.i4[1] || v0.i4[2] > v1.i4[2] || v0.i4[3] > v1.i4[3]; +} + +inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask) +{ + bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3]; + outMask = Scalar4f(b0, b1, b2, b3); + return b0 || b1 || b2 || b3; +} + +} // namespace simd diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h new file mode 100644 index 00000000..a287766c --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. 
+ * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +#ifdef PX_WIIU +#pragma ghs nowarning 193 // warning #193-D: zero used for undefined preprocessing identifier +#endif + +#include <algorithm> + +#ifdef PX_WIIU +#pragma ghs endnowarning +#endif + +union Scalar4f +{ + Scalar4f() + { + } + + Scalar4f(float x, float y, float z, float w) + { + f4[0] = x; + f4[1] = y; + f4[2] = z; + f4[3] = w; + } + + Scalar4f(int32_t x, int32_t y, int32_t z, int32_t w) + { + i4[0] = x; + i4[1] = y; + i4[2] = z; + i4[3] = w; + } + + Scalar4f(uint32_t x, uint32_t y, uint32_t z, uint32_t w) + { + u4[0] = x; + u4[1] = y; + u4[2] = z; + u4[3] = w; + } + + Scalar4f(bool x, bool y, bool z, bool w) + { + u4[0] = ~(uint32_t(x) - 1); + u4[1] = ~(uint32_t(y) - 1); + u4[2] = ~(uint32_t(z) - 1); + u4[3] = ~(uint32_t(w) - 1); + } + + Scalar4f(const Scalar4f& other) + { + u4[0] = other.u4[0]; + u4[1] = other.u4[1]; + u4[2] = other.u4[2]; + u4[3] = other.u4[3]; + } + + Scalar4f& operator=(const Scalar4f& other) + { + u4[0] = other.u4[0]; + u4[1] = other.u4[1]; + u4[2] = other.u4[2]; + u4[3] = other.u4[3]; + return *this; + } + + float f4[4]; + int32_t i4[4]; + uint32_t u4[4]; +}; + +typedef Scalar4f Scalar4i; diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h new file mode 100644 index 00000000..33b35f72 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h @@ 
-0,0 +1,76 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +namespace nvidia +{ +namespace cloth +{ + +#if !NVMATH_SIMD +uint32_t findBitSet(uint32_t mask) +{ + uint32_t result = 0; + while(mask >>= 1) + ++result; + return result; +} +#endif + +inline Scalar4i intFloor(const Scalar4f& v) +{ + return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3]))); +} + +inline Scalar4i horizontalOr(Scalar4i mask) +{ + return simd4i(mask.i4[0] | mask.i4[1] | mask.i4[2] | mask.i4[3]); +} + +template <> +struct Gather<Scalar4i> +{ + inline Gather(const Scalar4i& index); + inline Scalar4i operator()(const Scalar4i*) const; + + Scalar4i mIndex; + Scalar4i mOutOfRange; +}; + +Gather<Scalar4i>::Gather(const Scalar4i& index) +{ + uint32_t mask = physx::cloth::SwCollision<Scalar4i>::sGridSize - 1; + + mIndex.u4[0] = index.u4[0] & mask; + mIndex.u4[1] = index.u4[1] & mask; + mIndex.u4[2] = index.u4[2] & mask; + mIndex.u4[3] = index.u4[3] & mask; + + mOutOfRange.u4[0] = index.u4[0] & ~mask ? 0 : -1; + mOutOfRange.u4[1] = index.u4[1] & ~mask ? 0 : -1; + mOutOfRange.u4[2] = index.u4[2] & ~mask ? 0 : -1; + mOutOfRange.u4[3] = index.u4[3] & ~mask ? 
0 : -1; +} + +Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const +{ + const int32_t* base = ptr->i4; + const int32_t* index = mIndex.i4; + const int32_t* mask = mOutOfRange.i4; + return Scalar4i(base[index[0]] & mask[0], base[index[1]] & mask[1], base[index[2]] & mask[2], + base[index[3]] & mask[3]); +} + +} // namespace cloth +} // namespace physx diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h new file mode 100644 index 00000000..3f04750f --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4fFactory<const float&>::operator Simd4f() const +{ + return _mm_set1_ps(v); +} + +inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const +{ + return reinterpret_cast<const Simd4f&>(v); +} + +template <> +inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const +{ + return _mm_setzero_ps(); +} + +template <> +inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const +{ + return _mm_set1_ps(1.0f); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0x80000000)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); +} + +template <> +inline Simd4fFactory<detail::IntType<int(0xffffffff)> >::operator Simd4f() const +{ + return _mm_castsi128_ps(_mm_set1_epi32(-1)); +} + +template <> +inline Simd4fFactory<const float*>::operator Simd4f() const +{ + return _mm_loadu_ps(v); +} + +template <> +inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(v.ptr); +} + +template <> +inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const +{ + return _mm_load_ps(reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4f>::operator Simd4f() const +{ + return _mm_andnot_ps(v, _mm_castsi128_ps(_mm_set1_epi32(-1))); +} + +Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v) +{ + return _mm_andnot_ps(complement.v, v); +} + +Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement) +{ + return _mm_andnot_ps(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f operator==(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpeq_ps(v0, v1); +} + +Simd4f operator<(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmplt_ps(v0, v1); +} + +Simd4f operator<=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmple_ps(v0, v1); +} + +Simd4f operator>(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpgt_ps(v0, v1); +} + +Simd4f operator>=(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_cmpge_ps(v0, v1); +} + +ComplementExpr<Simd4f> operator~(const Simd4f& v) +{ + return ComplementExpr<Simd4f>(v); +} + +Simd4f operator&(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_and_ps(v0, v1); +} + +Simd4f operator|(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_or_ps(v0, v1); +} + +Simd4f operator^(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_xor_ps(v0, v1); +} + +Simd4f operator<<(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator>>(const Simd4f& v, int shift) +{ + return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(v), shift)); +} + +Simd4f operator+(const Simd4f& v) +{ + return v; +} + +Simd4f operator+(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_add_ps(v0, v1); +} + +Simd4f operator-(const Simd4f& v) +{ + return _mm_sub_ps(_mm_setzero_ps(), v); +} + +Simd4f operator-(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_sub_ps(v0, v1); +} + +Simd4f operator*(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_mul_ps(v0, v1); +} + +Simd4f operator/(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_div_ps(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4f simd4f(const Simd4i& v) +{ + return _mm_castsi128_ps(v); +} + +float (&array(Simd4f& v))[4] +{ + return 
reinterpret_cast<float(&)[4]>(v); +} + +const float (&array(const Simd4f& v))[4] +{ + return reinterpret_cast<const float(&)[4]>(v); +} + +void store(float* ptr, Simd4f const& v) +{ + _mm_storeu_ps(ptr, v); +} + +void storeAligned(float* ptr, Simd4f const& v) +{ + _mm_store_ps(ptr, v); +} + +void storeAligned(float* ptr, unsigned int offset, Simd4f const& v) +{ + _mm_store_ps(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4f splat(Simd4f const& v) +{ + return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1) +{ + return _mm_xor_ps(v1, _mm_and_ps(mask, _mm_xor_ps(v1, v0))); +} + +Simd4f abs(const Simd4f& v) +{ + return _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), v); +} + +Simd4f floor(const Simd4f& v) +{ + // SSE 4.1: return _mm_floor_ps(v); + Simd4i i = _mm_cvttps_epi32(v); + return _mm_cvtepi32_ps(_mm_sub_epi32(i, _mm_srli_epi32(i, 31))); +} + +Simd4f max(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_max_ps(v0, v1); +} + +Simd4f min(const Simd4f& v0, const Simd4f& v1) +{ + return _mm_min_ps(v0, v1); +} + +Simd4f recip(const Simd4f& v) +{ + return _mm_rcp_ps(v); +} + +template <int n> +Simd4f recipT(const Simd4f& v) +{ + Simd4f two = simd4f(2.0f); + Simd4f recipV = recip(v); + for(int i = 0; i < n; ++i) + recipV = recipV * (two - v * recipV); + return recipV; +} + +Simd4f sqrt(const Simd4f& v) +{ + return _mm_sqrt_ps(v); +} + +Simd4f rsqrt(const Simd4f& v) +{ + return _mm_rsqrt_ps(v); +} + +template <int n> +Simd4f rsqrtT(const Simd4f& v) +{ + Simd4f halfV = v * simd4f(0.5f); + Simd4f threeHalf = simd4f(1.5f); + Simd4f rsqrtV = rsqrt(v); + for(int i = 0; i < n; ++i) + rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV); + return rsqrtV; +} + +Simd4f exp2(const Simd4f& v) +{ + // http://www.netlib.org/cephes/ + + Simd4f limit = simd4f(127.4999f); + Simd4f x = min(max(-limit, v), limit); + + // separate into integer 
and fractional part + + Simd4f fx = x + simd4f(0.5f); + Simd4i ix = _mm_sub_epi32(_mm_cvttps_epi32(fx), _mm_srli_epi32(_mm_castps_si128(fx), 31)); + fx = x - Simd4f(_mm_cvtepi32_ps(ix)); + + // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx)) + + Simd4f fx2 = fx * fx; + + Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) + + fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f))); + Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2); + + Simd4f exp2fx = px * recip(qx - px); + exp2fx = simd4f(_1) + exp2fx + exp2fx; + + // exp2(ix) + + Simd4f exp2ix = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix, _mm_set1_epi32(0x7f)), 23)); + + return exp2fx * exp2ix; +} + +Simd4f log2(const Simd4f& v) +{ + // todo: fast approximate implementation like exp2 + Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2) + const float* ptr = array(v); + return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale; +} + +Simd4f dot3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f tmp = v0 * v1; + return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp); +} + +Simd4f cross3(const Simd4f& v0, const Simd4f& v1) +{ + Simd4f t0 = _mm_shuffle_ps(v0, v0, 0xc9); // w z y x -> w x z y + Simd4f t1 = _mm_shuffle_ps(v1, v1, 0xc9); + Simd4f tmp = v0 * t1 - t0 * v1; + return _mm_shuffle_ps(tmp, tmp, 0xc9); +} + +void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w) +{ + _MM_TRANSPOSE4_PS(x, y, z, w); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 == v1); +} + +int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 == v1); +} + +int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 == v1); +} + +int allGreater(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 > v1); +} + +int allGreater(const 
Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 > v1); +} + +int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 > v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return allTrue(v0 >= v1); +} + +int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return allTrue(outMask = v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1) +{ + return anyTrue(v0 >= v1); +} + +int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask) +{ + return anyTrue(outMask = v0 >= v1); +} + +int allTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v) == 0xf; +} + +int anyTrue(const Simd4f& v) +{ + return _mm_movemask_ps(v); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h new file mode 100644 index 00000000..d4a70a02 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// factory implementation +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline Simd4iFactory<const int&>::operator Simd4i() const +{ + return _mm_set1_epi32(v); +} + +inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const +{ + return reinterpret_cast<const Simd4i&>(v); +} + +template <int i> +inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const +{ + return _mm_set1_epi32(i); +} + +template <> +inline Simd4iFactory<detail::IntType<0> >::operator Simd4i() const +{ + return _mm_setzero_si128(); +} + +template <> +inline Simd4iFactory<const int*>::operator Simd4i() const +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(v)); +} + +template <> +inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(v.ptr)); +} + +template <> +inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const +{ + return _mm_load_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const char*>(v.ptr) + v.offset)); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// expression template +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template <> +inline ComplementExpr<Simd4i>::operator Simd4i() const +{ + return _mm_andnot_si128(v, _mm_set1_epi32(0xffffffff)); +} + +Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v) +{ + return _mm_andnot_si128(complement.v, v); +} + +Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement) +{ + return _mm_andnot_si128(complement.v, v); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// operator implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpeq_epi32(v0, v1); +} 
+ +Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmplt_epi32(v0, v1); +} + +Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_cmpgt_epi32(v0, v1); +} + +ComplementExpr<Simd4i> operator~(const Simd4i& v) +{ + return ComplementExpr<Simd4i>(v); +} + +Simd4i operator&(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_and_si128(v0, v1); +} + +Simd4i operator|(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_or_si128(v0, v1); +} + +Simd4i operator^(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v0, v1); +} + +Simd4i operator<<(const Simd4i& v, int shift) +{ + return _mm_slli_epi32(v, shift); +} + +Simd4i operator>>(const Simd4i& v, int shift) +{ + return _mm_srli_epi32(v, shift); +} + +Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_add_epi32(v0, v1); +} + +Simd4i simdi::operator-(const Simd4i& v) +{ + return _mm_sub_epi32(_mm_setzero_si128(), v); +} + +Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1) +{ + return _mm_sub_epi32(v0, v1); +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +// function implementations +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +Simd4i simd4i(const Simd4f& v) +{ + return _mm_castps_si128(v); +} + +int (&simdi::array(Simd4i& v))[4] +{ + return reinterpret_cast<int(&)[4]>(v); +} + +const int (&simdi::array(const Simd4i& v))[4] +{ + return reinterpret_cast<const int(&)[4]>(v); +} + +void store(int* ptr, const Simd4i& v) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v); +} + +void storeAligned(int* ptr, unsigned int offset, const Simd4i& v) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(ptr) + offset), v); +} + +template <size_t i> +Simd4i splat(const Simd4i& v) +{ + return _mm_shuffle_epi32(v, _MM_SHUFFLE(i, i, i, i)); +} + +Simd4i 
select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1) +{ + return _mm_xor_si128(v1, _mm_and_si128(mask, _mm_xor_si128(v1, v0))); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator==(v0, v1)); +} + +int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator==(v0, v1)); +} + +int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator==(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1) +{ + return allTrue(simdi::operator>(v0, v1)); +} + +int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return allTrue(outMask = simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1) +{ + return anyTrue(simdi::operator>(v0, v1)); +} + +int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask) +{ + return anyTrue(outMask = simdi::operator>(v0, v1)); +} + +int allTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)) == 0xf; +} + +int anyTrue(const Simd4i& v) +{ + return _mm_movemask_ps(_mm_castsi128_ps(v)); +} diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h new file mode 100644 index 00000000..e54edde7 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. 
Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma once + +// SSE + SSE2 (don't include intrin.h!) +#include <emmintrin.h> + +#if defined(_MSC_VER) + +typedef __m128 Simd4f; +typedef __m128i Simd4i; + +#else + +struct Simd4f +{ + Simd4f() + { + } + Simd4f(__m128 x) : m128(x) + { + } + + operator __m128&() + { + return m128; + } + operator const __m128&() const + { + return m128; + } + + private: + __m128 m128; +}; + +struct Simd4i +{ + Simd4i() + { + } + Simd4i(__m128i x) : m128i(x) + { + } + + operator __m128i&() + { + return m128i; + } + operator const __m128i&() const + { + return m128i; + } + + private: + __m128i m128i; +}; + +#endif diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h new file mode 100644 index 00000000..0750fcf5 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
+ +#pragma once + +#ifdef PX_GCC_FAMILY +#include <xmmintrin.h> // _BitScanForward +#else +#pragma warning(push) +#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives' +#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)' +#include <intrin.h> // _BitScanForward +#pragma warning(pop) +#endif + +namespace nvidia +{ +namespace cloth +{ + +uint32_t findBitSet(uint32_t mask) +{ +#if defined(_MSC_VER) + unsigned long result; + _BitScanForward(&result, unsigned long(mask)); + return result; +#else + return __builtin_ffs(mask) - 1; +#endif +} + +Simd4i intFloor(const Simd4f& v) +{ + Simd4i i = _mm_cvttps_epi32(v); + return simdi::operator-(i, _mm_srli_epi32(simd4i(v), 31)); +} + +Simd4i horizontalOr(Simd4i mask) +{ + Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y + return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z +} + +Gather<Simd4i>::Gather(const Simd4i& index) +{ + mSelectQ = _mm_srai_epi32(index << 29, 31); + mSelectD = _mm_srai_epi32(index << 30, 31); + mSelectW = _mm_srai_epi32(index << 31, 31); + mOutOfRange = simdi::operator>(index ^ sIntSignBit, sSignedMask); +} + +Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const +{ + // more efficient with _mm_shuffle_epi8 (SSSE3) + Simd4i lo = ptr[0], hi = ptr[1]; + Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo)); + Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo)); + Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi)); + Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi)); + Simd4i m0123 = select(mSelectD, m23, m01); + Simd4i m4567 = select(mSelectD, m67, m45); + return select(mSelectQ, m4567, m0123) & ~mOutOfRange; +} + +} // namespace cloth +} // namespace nvidia diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h new file mode 100644 index 
00000000..382812bb --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. + +#pragma warning(push) +#pragma warning(disable:4127) // Disable the nag warning 'conditional expression is constant' + +template <bool useMultiplier> +void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd, + const uint16_t* __restrict iIt, __m128 stiffness) +{ + __m128 sOne = _mm_set1_ps(1.0f); + + __m128 stretchLimit, compressionLimit, multiplier; + if(useMultiplier) + { + stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff); + compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); + multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55); + } + stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); + + for(; rIt != rEnd; rIt += 4, iIt += 8) + { + float* p0i = posIt + iIt[0] * 4; + float* p0j = posIt + iIt[1] * 4; + float* p1i = posIt + iIt[2] * 4; + float* p1j = posIt + iIt[3] * 4; + float* p2i = posIt + iIt[4] * 4; + float* p2j = posIt + iIt[5] * 4; + float* p3i = posIt + iIt[6] * 4; + float* p3j = posIt + iIt[7] * 4; + + __m128 v0i = _mm_load_ps(p0i); + __m128 v0j = _mm_load_ps(p0j); + __m128 v1i = _mm_load_ps(p1i); + __m128 v1j = _mm_load_ps(p1j); + __m128 v2i = _mm_load_ps(p2i); + __m128 v2j = _mm_load_ps(p2j); + __m128 v3i = _mm_load_ps(p3i); + __m128 v3j = _mm_load_ps(p3j); + + 
__m128 h0ij = _mm_add_ps(v0j, _mm_mul_ps(v0i, sMinusOneXYZOneW)); + __m128 h1ij = _mm_add_ps(v1j, _mm_mul_ps(v1i, sMinusOneXYZOneW)); + __m128 h2ij = _mm_add_ps(v2j, _mm_mul_ps(v2i, sMinusOneXYZOneW)); + __m128 h3ij = _mm_add_ps(v3j, _mm_mul_ps(v3i, sMinusOneXYZOneW)); + + __m128 a = _mm_unpacklo_ps(h0ij, h2ij); + __m128 b = _mm_unpackhi_ps(h0ij, h2ij); + __m128 c = _mm_unpacklo_ps(h1ij, h3ij); + __m128 d = _mm_unpackhi_ps(h1ij, h3ij); + + __m128 hxij = _mm_unpacklo_ps(a, c); + __m128 hyij = _mm_unpackhi_ps(a, c); + __m128 hzij = _mm_unpacklo_ps(b, d); + __m128 vwij = _mm_unpackhi_ps(b, d); + + __m128 rij = _mm_load_ps(rIt); + __m128 e2ij = _mm_add_ps( + sEpsilon, _mm_add_ps(_mm_mul_ps(hxij, hxij), _mm_add_ps(_mm_mul_ps(hyij, hyij), _mm_mul_ps(hzij, hzij)))); + __m128 mask = _mm_cmpnle_ps(rij, sEpsilon); + __m128 erij = _mm_and_ps(_mm_sub_ps(sOne, _mm_mul_ps(rij, _mm_rsqrt_ps(e2ij))), mask); + + if(useMultiplier) + { + erij = _mm_sub_ps(erij, _mm_mul_ps(multiplier, _mm_max_ps(compressionLimit, _mm_min_ps(erij, stretchLimit)))); + } + __m128 exij = _mm_mul_ps(erij, _mm_mul_ps(stiffness, _mm_rcp_ps(_mm_add_ps(sEpsilon, vwij)))); + + __m128 exlo = _mm_and_ps(sMaskXY, exij); + __m128 exhi = _mm_andnot_ps(sMaskXY, exij); + + __m128 f0ij = _mm_mul_ps(h0ij, _mm_shuffle_ps(exlo, exlo, 0xc0)); + __m128 f1ij = _mm_mul_ps(h1ij, _mm_shuffle_ps(exlo, exlo, 0xd5)); + __m128 f2ij = _mm_mul_ps(h2ij, _mm_shuffle_ps(exhi, exhi, 0x2a)); + __m128 f3ij = _mm_mul_ps(h3ij, _mm_shuffle_ps(exhi, exhi, 0x3f)); + + __m128 u0i = _mm_add_ps(v0i, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0i, v0i, 0xff))); + __m128 u0j = _mm_sub_ps(v0j, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0j, v0j, 0xff))); + __m128 u1i = _mm_add_ps(v1i, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1i, v1i, 0xff))); + __m128 u1j = _mm_sub_ps(v1j, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1j, v1j, 0xff))); + __m128 u2i = _mm_add_ps(v2i, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2i, v2i, 0xff))); + __m128 u2j = _mm_sub_ps(v2j, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2j, v2j, 
0xff)));
	/* tail of the SSE-intrinsics solveConstraints body (begins above this chunk):
	   apply correction f3ij scaled by each particle's w lane (broadcast via
	   _mm_shuffle_ps(v, v, 0xff)), then store all eight updated particles. */
	__m128 u3i = _mm_add_ps(v3i, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3i, v3i, 0xff)));
	__m128 u3j = _mm_sub_ps(v3j, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3j, v3j, 0xff)));

	_mm_store_ps(p0i, u0i);
	_mm_store_ps(p0j, u0j);
	_mm_store_ps(p1i, u1i);
	_mm_store_ps(p1j, u1j);
	_mm_store_ps(p2i, u2i);
	_mm_store_ps(p2j, u2j);
	_mm_store_ps(p3i, u3i);
	_mm_store_ps(p3j, u3j);
	}
}

#if PX_X86

// clang-format:disable

// asm blocks in static condition blocks don't get removed, specialize

// Explicit specialization of solveConstraints<useMultiplier=false>, hand-written
// MSVC 32-bit inline assembly. Processes 4 edge constraints per loop iteration:
//  - iIt:   pairs of uint16_t particle indices (8 indices = 4 (i,j) pairs per
//           iteration; eax advances by 16 bytes)
//  - rIt:   4 rest values per iteration ([edx], advances 16 bytes)
//  - posIt: particle array; indices are shifted left by 4, so each particle is a
//           16-byte xyzw record. The w lane feeds the correction weighting —
//           NOTE(review): presumably inverse mass; confirm against the caller.
// sMinusOneXYZOneW, sEpsilon and sMaskXY are file-scope SSE constants defined
// outside this chunk (by their use here: (-1,-1,-1,+1), a small positive bias,
// and an xy-lane mask respectively — verify at their definitions).
// MSVC inline-asm syntax note: several instructions share a physical line, so
// each one after the first needs its own __asm prefix.
template <>
void solveConstraints<false>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                             const uint16_t* __restrict iIt, __m128 stiffness)
{
	__m128 sOne = _mm_set1_ps(1.0f);
	stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00); // broadcast lane 0

	__m128 htmp[4];  /* spill slots for the four edge vectors hkij */
	float* ptmp[8];  /* byte offsets of the eight particles, for write-back */

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		/* gather the first two particle pairs; index * 16 = byte offset */
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */

		/* hkij = vi - vj in xyz, vi.w + vj.w in w (one madd via (-1,-1,-1,+1)) */
		movaps xmm7, sMinusOneXYZOneW
		mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp   ], xmm0 /* h0ij */
		mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */

		mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
		mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */

		/* 4x4 transpose: from per-edge xyzw to per-component lanes */
		movaps xmm4, xmm0
		movaps xmm5, xmm1

		unpcklps xmm0, xmm2 /* a */
		unpckhps xmm4, xmm2 /* b */
		unpcklps xmm1, xmm3 /* c */
		unpckhps xmm5, xmm3 /* d */

		movaps xmm2, xmm0
		movaps xmm6, xmm4

		unpcklps xmm0, xmm1 /* hxij */
		unpckhps xmm2, xmm1 /* hyij */
		unpcklps xmm4, xmm5 /* hzij */
		unpckhps xmm6, xmm5 /* vwij */

		movaps xmm7, sEpsilon
		movaps xmm5, sOne
		movaps xmm3, stiffness
		movaps xmm1, XMMWORD PTR [edx] /* rij */

		/* e2ij = hx^2 + hy^2 + hz^2 + epsilon (epsilon guards rsqrt(0)) */
		mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
		mulps xmm2, xmm2 __asm addps xmm0, xmm2
		mulps xmm4, xmm4 __asm addps xmm0, xmm4

		/* erij = rij / |h|; mask disables edges with rij <= epsilon */
		rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */
		cmpnleps xmm1, xmm7 /* mask */
		subps xmm5, xmm0 __asm andps xmm5, xmm1  /* (1 - erij) & mask */
		addps xmm6, xmm7 __asm rcpps xmm6, xmm6  /* 1 / (wi + wj + eps) */

		mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */

		/* split the four per-edge scales: xy lanes -> exlo, zw lanes -> exhi */
		movaps xmm7, sMaskXY
		andps xmm7, xmm6 /* exlo */
		xorps xmm6, xmm7 /* exhi */

		movaps xmm0, XMMWORD PTR [htmp   ] /* h0ij */
		movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */
		movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */
		movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */

		/* broadcast edge k's scale to xyz (w lane comes from a zeroed lane) */
		pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */
		pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */
		pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */
		pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */

		/* write-back: reload each particle, move it by +/- fkij * particle.w
		   (pshufd ..., 0xff broadcasts the particle's own w lane) */
		mov edi, [ptmp   ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */
		pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */
		movaps XMMWORD PTR [edi + ecx], xmm4

		mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */
		pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */
		movaps XMMWORD PTR [edi + ecx], xmm6

		/* advance: 8 indices (16 bytes) and 4 rest values (16 bytes) */
		add eax, 16
		add edx, 16

		cmp edx, esi
		jb forBegin
forEnd:
	}
}

// Specialization with the multiplier/limit path enabled. Identical to the
// <false> version except that the per-edge scale is additionally adjusted by
// the "multiplier block" below; the limits and multiplier arrive packed into
// the lanes of the stiffness argument (w = stretchLimit, z = compressionLimit,
// y = multiplier, x = stiffness).
template <>
void solveConstraints<true>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                            const uint16_t* __restrict iIt, __m128 stiffness)
{
	__m128 sOne = _mm_set1_ps(1.0f);
	__m128 stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff);     // lane 3
	__m128 compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa); // lane 2
	__m128 multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55);       // lane 1
	stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);               // lane 0

	__m128 htmp[4];  /* spill slots for the four edge vectors hkij */
	float* ptmp[8];  /* byte offsets of the eight particles, for write-back */

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		/* gather the four particle pairs; index * 16 = byte offset */
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */

		/* hkij = vi - vj in xyz, vi.w + vj.w in w */
		movaps xmm7, sMinusOneXYZOneW
		mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp   ], xmm0 /* h0ij */
		mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */

		mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
		mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */

		/* 4x4 transpose: from per-edge xyzw to per-component lanes */
		movaps xmm4, xmm0
		movaps xmm5, xmm1

		unpcklps xmm0, xmm2 /* a */
		unpckhps xmm4, xmm2 /* b */
		unpcklps xmm1, xmm3 /* c */
		unpckhps xmm5, xmm3 /* d */

		movaps xmm2, xmm0
		movaps xmm6, xmm4

		unpcklps xmm0, xmm1 /* hxij */
		unpckhps xmm2, xmm1 /* hyij */
		unpcklps xmm4, xmm5 /* hzij */
		unpckhps xmm6, xmm5 /* vwij */

		movaps xmm7, sEpsilon
		movaps xmm5, sOne
		movaps xmm3, stiffness
		movaps xmm1, XMMWORD PTR [edx] /* rij */

		/* e2ij = hx^2 + hy^2 + hz^2 + epsilon */
		mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
		mulps xmm2, xmm2 __asm addps xmm0, xmm2
		mulps xmm4, xmm4 __asm addps xmm0, xmm4
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */ + pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */ + movaps XMMWORD PTR [edi + ecx], xmm4 + + mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */ + pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */ + movaps XMMWORD PTR [edi + ecx], xmm6 + + add eax, 16 + add edx, 16 + + cmp edx, esi + jb forBegin +forEnd: + } +} + +// clang-format:enable + +#endif + +#pragma warning(pop) diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h new file mode 100644 index 00000000..59cec2d9 --- /dev/null +++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. +// Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#pragma once

#include "Factory.h"
#include "Allocator.h"

namespace physx
{
	class PxCudaContextManager;
}

namespace nvidia
{
namespace cloth
{

class CuFabric;
class CuCloth;
template <typename>
class ClothImpl;

// CUDA-backed implementation of the abstract cloth Factory interface.
// Constructed around a PxCudaContextManager; keeps a registry of every
// CuFabric it creates (mFabrics) so fabric data can be extracted later.
class CuFactory : public UserAllocated, public Factory
{
  protected:
	// Non-assignable (declared, never defined).
	CuFactory& operator=(const CuFactory&);

  public:
	// Concrete types this factory produces, used by templated client code.
	typedef CuFabric FabricType;
	typedef ClothImpl<CuCloth> ImplType;

	CuFactory(PxCudaContextManager*);
	virtual ~CuFactory();

	// --- Factory interface: creation -------------------------------------
	// Builds a CuFabric from constraint topology (phases/sets/indices),
	// rest values and tether data. See Factory.h for the parameter contract.
	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
	                             Range<const float> restvalues, Range<const uint32_t> indices,
	                             Range<const uint32_t> anchors, Range<const float> tetherLengths);

	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);

	virtual Solver* createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr);

	virtual Cloth* clone(const Cloth& cloth);

	// --- Factory interface: data extraction ------------------------------
	// These copy internal (device-side) state back into caller-provided
	// ranges; see Factory.h for expected range sizes.
	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
	                               Range<float> tetherLengths) const;

	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;

	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;

	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
	                                     Range<PxVec3> destWeights) const;

	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;

	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;

  public:
	// Copies the byte range [srcIt, srcEnd) into dstIt.
	// NOTE(review): by its name and the CUDA context member, presumably a
	// device-to-host copy — confirm against the implementation.
	void copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const;

  public:
	// All fabrics created by (and registered with) this factory.
	Vector<CuFabric*>::Type mFabrics;

	// CUDA context used for all device work; supplied at construction.
	PxCudaContextManager* mContextManager;

	// Kernel launch configuration — NOTE(review): presumably the current and
	// device-maximum CUDA threads per block; confirm in CuFactory.cpp.
	uint32_t mNumThreadsPerBlock;

	const uint32_t mMaxThreadsPerBlock;
};
}
}