aboutsummaryrefslogtreecommitdiff
path: root/APEX_1.4/module/clothing/embedded
diff options
context:
space:
mode:
authorgit perforce import user <a@b>2016-10-25 12:29:14 -0600
committerSheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>2016-10-25 18:56:37 -0500
commit3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
treefa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/module/clothing/embedded
downloadphysx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'APEX_1.4/module/clothing/embedded')
-rw-r--r--APEX_1.4/module/clothing/embedded/CmPhysXCommon.h55
-rw-r--r--APEX_1.4/module/clothing/embedded/CmTask.h227
-rw-r--r--APEX_1.4/module/clothing/embedded/Cooking.cpp678
-rw-r--r--APEX_1.4/module/clothing/embedded/CreateCuFactory.cpp24
-rw-r--r--APEX_1.4/module/clothing/embedded/CreateCuFactory.h26
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothConfig.h94
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothCoreUtilityTypes.h243
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.cpp595
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.h61
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothGeodesicTetherCooker.cpp1006
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.cpp429
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.h57
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothSimpleTetherCooker.cpp138
-rw-r--r--APEX_1.4/module/clothing/embedded/ExtClothTetherCooker.h117
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h309
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h80
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h176
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h40
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h132
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h75
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h51
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp30
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h59
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h54
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h88
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h112
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothImpl.h1247
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp67
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h30
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h375
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h129
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp60
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h153
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h478
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h360
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h150
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h139
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp307
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h202
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.cpp130
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.h122
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.cpp1927
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h178
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h68
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp150
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.h89
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.cpp280
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h73
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.cpp694
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h140
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp404
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h68
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp398
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h173
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp695
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h69
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp229
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h40
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h88
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp916
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp18
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp18
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp33
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h500
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h276
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h51
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h91
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h410
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h188
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h86
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h76
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h411
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h238
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h70
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h76
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h379
-rw-r--r--APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h89
-rw-r--r--APEX_1.4/module/clothing/embedded/PxClothFabric.h141
-rw-r--r--APEX_1.4/module/clothing/embedded/PxClothMeshDesc.h151
-rw-r--r--APEX_1.4/module/clothing/embedded/PxClothTypes.h113
-rw-r--r--APEX_1.4/module/clothing/embedded/Simulation.cpp2488
81 files changed, 21487 insertions, 0 deletions
diff --git a/APEX_1.4/module/clothing/embedded/CmPhysXCommon.h b/APEX_1.4/module/clothing/embedded/CmPhysXCommon.h
new file mode 100644
index 00000000..ad6ff21b
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/CmPhysXCommon.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_COMMON
+#define PX_PHYSICS_COMMON
+
+//! \file Top level internal include file for PhysX SDK
+
+#include "Ps.h"
+#include "PxFoundation.h"
+
+#ifndef CACHE_LOCAL_CONTACTS_XP
+#define CACHE_LOCAL_CONTACTS_XP 1
+#endif
+
+#if PX_CHECKED
+ #define PX_CHECK_MSG(exp, msg) (!!(exp) || (PxGetFoundation().getErrorCallback().reportError(PxErrorCode::eINVALID_PARAMETER, msg, __FILE__, __LINE__), 0) )
+ #define PX_CHECK(exp) PX_CHECK_MSG(exp, #exp)
+ #define PX_CHECK_AND_RETURN(exp,msg) { if(!(exp)) { PX_CHECK_MSG(exp, msg); return; } }
+ #define PX_CHECK_AND_RETURN_NULL(exp,msg) { if(!(exp)) { PX_CHECK_MSG(exp, msg); return 0; } }
+ #define PX_CHECK_AND_RETURN_VAL(exp,msg,r) { if(!(exp)) { PX_CHECK_MSG(exp, msg); return r; } }
+#else
+ #define PX_CHECK_MSG(exp, msg)
+ #define PX_CHECK(exp)
+ #define PX_CHECK_AND_RETURN(exp,msg)
+ #define PX_CHECK_AND_RETURN_NULL(exp,msg)
+ #define PX_CHECK_AND_RETURN_VAL(exp,msg,r)
+#endif
+
+#if PX_VC
+ // VC compiler defines __FUNCTION__ as a string literal so it is possible to concatenate it with another string
+ // Example: #define PX_CHECK_VALID(x) PX_CHECK_MSG(shdfnd::checkValid(x), __FUNCTION__ ": parameter invalid!")
+ #define PX_CHECK_VALID(x) PX_CHECK_MSG(shdfnd::checkValid(x), __FUNCTION__)
+#elif PX_GCC_FAMILY || PX_GHS
+	// GCC compiler defines __FUNCTION__ as a variable, hence, it is NOT possible to concatenate an additional string to it
+ // In GCC, __FUNCTION__ only returns the function name, using __PRETTY_FUNCTION__ will return the full function definition
+ #define PX_CHECK_VALID(x) PX_CHECK_MSG(shdfnd::checkValid(x), __PRETTY_FUNCTION__)
+#else
+ // Generic macro for other compilers
+ #define PX_CHECK_VALID(x) PX_CHECK_MSG(shdfnd::checkValid(x), __FUNCTION__)
+#endif
+
+
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/CmTask.h b/APEX_1.4/module/clothing/embedded/CmTask.h
new file mode 100644
index 00000000..4de05e3b
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/CmTask.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_COMMON_TASK
+#define PX_PHYSICS_COMMON_TASK
+
+#include "PxTask.h"
+#include "PxTaskManager.h"
+#include "CmPhysXCommon.h"
+#include "PsUserAllocated.h"
+#include "PsAtomic.h"
+#include "PsMutex.h"
+#include "PsSync.h"
+#include "PxCpuDispatcher.h"
+#include "PsFPU.h"
+#include "PsInlineArray.h"
+
+namespace nvidia
+{
+namespace Cm
+{
+ // wrapper around the public PxLightCpuTask
+ // internal SDK tasks should be inherited from
+ // this and override the runInternal() method
+ // to ensure that the correct floating point
+ // state is set / reset during execution
+ class Task : public PxLightCpuTask
+ {
+ public:
+
+ virtual void run()
+ {
+ physx::PX_SIMD_GUARD;
+ runInternal();
+ }
+
+ virtual void runInternal()=0;
+ };
+
+ // same as Cm::Task but inheriting from PxBaseTask
+ // instead of PxLightCpuTask
+ class BaseTask : public PxBaseTask
+ {
+ public:
+
+ virtual void run()
+ {
+ physx::PX_SIMD_GUARD;
+ runInternal();
+ }
+
+ virtual void runInternal()=0;
+ };
+
+ template <class T, void (T::*Fn)(PxBaseTask*) >
+ class DelegateTask : public Cm::Task, public shdfnd::UserAllocated
+ {
+ public:
+
+ DelegateTask(T* obj, const char* name) :
+ mObj(obj), mName(name) { }
+
+ virtual void runInternal()
+ {
+ (mObj->*Fn)((PxBaseTask*)mCont);
+ }
+
+ virtual const char* getName() const
+ {
+ return mName;
+ }
+
+ void setObject(T* obj) { mObj = obj; }
+
+ private:
+ T* mObj;
+ const char* mName;
+ };
+
+
+ /**
+ \brief A task that maintains a list of dependent tasks.
+
+ This task maintains a list of dependent tasks that have their reference counts
+ reduced on completion of the task.
+
+ The refcount is incremented every time a dependent task is added.
+ */
+ class FanoutTask : public Cm::BaseTask
+ {
+ PX_NOCOPY(FanoutTask)
+ public:
+ FanoutTask(const char* name) : Cm::BaseTask(), mRefCount(0), mName(name), mNotifySubmission(false) {}
+
+ virtual void runInternal() {}
+
+ virtual const char* getName() const { return mName; }
+
+ /**
+ Swap mDependents with mReferencesToRemove when refcount goes to 0.
+ */
+ virtual void removeReference()
+ {
+ nvidia::Mutex::ScopedLock lock(mMutex);
+ if (!nvidia::atomicDecrement(&mRefCount))
+ {
+ // prevents access to mReferencesToRemove until release
+ nvidia::atomicIncrement(&mRefCount);
+ mNotifySubmission = false;
+ PX_ASSERT(mReferencesToRemove.empty());
+ for (uint32_t i = 0; i < mDependents.size(); i++)
+ mReferencesToRemove.pushBack(mDependents[i]);
+ mDependents.clear();
+ mTm->getCpuDispatcher()->submitTask(*this);
+ }
+ }
+
+ /**
+ \brief Increases reference count
+ */
+ virtual void addReference()
+ {
+ nvidia::Mutex::ScopedLock lock(mMutex);
+ nvidia::atomicIncrement(&mRefCount);
+ mNotifySubmission = true;
+ }
+
+ /**
+ \brief Return the ref-count for this task
+ */
+ PX_INLINE int32_t getReference() const
+ {
+ return mRefCount;
+ }
+
+ /**
+ Sets the task manager. Doesn't increase the reference count.
+ */
+ PX_INLINE void setTaskManager(PxTaskManager& tm)
+ {
+ mTm = &tm;
+ }
+
+ /**
+ Adds a dependent task. It also sets the task manager querying it from the dependent task.
+ The refcount is incremented every time a dependent task is added.
+ */
+ PX_INLINE void addDependent(PxBaseTask& dependent)
+ {
+ nvidia::Mutex::ScopedLock lock(mMutex);
+ nvidia::atomicIncrement(&mRefCount);
+ mTm = dependent.getTaskManager();
+ mDependents.pushBack(&dependent);
+ dependent.addReference();
+ mNotifySubmission = true;
+ }
+
+ /**
+ Reduces reference counts of the continuation task and the dependent tasks, also
+ clearing the copy of continuation and dependents task list.
+ */
+ virtual void release()
+ {
+ nvidia::Mutex::ScopedLock lock(mMutex);
+ for (uint32_t i = 0, n = mReferencesToRemove.size(); i < n; ++i)
+ mReferencesToRemove[i]->removeReference();
+ mReferencesToRemove.clear();
+ // allow access to mReferencesToRemove again
+ if (mNotifySubmission)
+ {
+ removeReference();
+ }
+ else
+ {
+ nvidia::atomicDecrement(&mRefCount);
+ }
+ }
+
+ protected:
+ volatile int32_t mRefCount;
+ const char* mName;
+ nvidia::InlineArray<PxBaseTask*, 4> mDependents;
+ nvidia::InlineArray<PxBaseTask*, 4> mReferencesToRemove;
+ bool mNotifySubmission;
+ nvidia::Mutex mMutex; // guarding mDependents and mNotifySubmission
+ };
+
+
+ /**
+ \brief Specialization of FanoutTask class in order to provide the delegation mechanism.
+ */
+ template <class T, void (T::*Fn)(PxBaseTask*) >
+ class DelegateFanoutTask : public FanoutTask, public shdfnd::UserAllocated
+ {
+ public:
+
+ DelegateFanoutTask(T* obj, const char* name) :
+ FanoutTask(name), mObj(obj) { }
+
+ virtual void runInternal()
+ {
+ PxBaseTask* continuation = mDependents.empty() ? NULL : mDependents[0];
+ (mObj->*Fn)(continuation);
+ }
+
+ void setObject(T* obj) { mObj = obj; }
+
+ private:
+ T* mObj;
+ };
+
+} // namespace Cm
+
+}
+
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/Cooking.cpp b/APEX_1.4/module/clothing/embedded/Cooking.cpp
new file mode 100644
index 00000000..5a9d8251
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/Cooking.cpp
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+
+#include "Cooking.h"
+
+#include <PsArray.h>
+#include <PsMathUtils.h>
+#include <PsSort.h>
+#include <Ps.h>
+
+#include <ApexSDKIntl.h>
+
+#include "ClothingCookedPhysX3Param.h"
+
+#include "ExtClothFabricCooker.h"
+#include "ExtClothMeshQuadifier.h"
+
+#include "ModuleClothingHelpers.h"
+
+#include <ctime>
+
+namespace
+{
+ using namespace nvidia;
+
+ struct VirtualParticle
+ {
+ VirtualParticle(uint32_t i0, uint32_t i1, uint32_t i2)
+ {
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ tableIndex = 0;
+ }
+
+ void rotate(uint32_t count)
+ {
+ while (count--)
+ {
+ const uint32_t temp = indices[2];
+ indices[2] = indices[1];
+ indices[1] = indices[0];
+ indices[0] = temp;
+ }
+ }
+
+ uint32_t indices[3];
+ uint32_t tableIndex;
+ };
+
+ struct EdgeAndLength
+ {
+ EdgeAndLength(uint32_t edgeNumber, float length) : mEdgeNumber(edgeNumber), mLength(length) {}
+ uint32_t mEdgeNumber;
+ float mLength;
+
+ bool operator<(const EdgeAndLength& other) const
+ {
+ return mLength < other.mLength;
+ }
+ };
+}
+
+namespace nvidia
+{
+namespace clothing
+{
+
+bool Cooking::mTetraWarning = false;
+
+NvParameterized::Interface* Cooking::execute()
+{
+ ClothingCookedPhysX3Param* rootCookedData = NULL;
+
+ for (uint32_t meshIndex = 0; meshIndex < mPhysicalMeshes.size(); meshIndex++)
+ {
+ if (mPhysicalMeshes[meshIndex].isTetrahedral)
+ {
+ if (!mTetraWarning)
+ {
+ mTetraWarning = true;
+ APEX_INVALID_OPERATION("Tetrahedral meshes are not (yet) supported with the 3.x solver");
+ }
+ continue;
+ }
+
+ ClothingCookedPhysX3Param* cookedData = NULL;
+
+ cookedData = fiberCooker(meshIndex);
+
+ computeVertexWeights(cookedData, meshIndex);
+ fillOutSetsDesc(cookedData);
+
+ createVirtualParticles(cookedData, meshIndex);
+ createSelfcollisionIndices(cookedData, meshIndex);
+
+ if (rootCookedData == NULL)
+ {
+ rootCookedData = cookedData;
+ }
+ else
+ {
+ ClothingCookedPhysX3Param* addCookedData = rootCookedData;
+ while (addCookedData != NULL && addCookedData->nextCookedData != NULL)
+ {
+ addCookedData = static_cast<ClothingCookedPhysX3Param*>(addCookedData->nextCookedData);
+ }
+ addCookedData->nextCookedData = cookedData;
+ }
+ }
+
+ return rootCookedData;
+}
+
+
+
+ClothingCookedPhysX3Param* Cooking::fiberCooker(uint32_t meshIndex) const
+{
+ const uint32_t numSimulatedVertices = mPhysicalMeshes[meshIndex].numSimulatedVertices;
+ const uint32_t numAttached = mPhysicalMeshes[meshIndex].numMaxDistance0Vertices;
+
+ shdfnd::Array<PxVec4> vertices(numSimulatedVertices);
+ for (uint32_t i = 0; i < numSimulatedVertices; i++)
+ vertices[i] = PxVec4(mPhysicalMeshes[meshIndex].vertices[i], 1.0f);
+
+ if (numAttached > 0)
+ {
+ const uint32_t start = numSimulatedVertices - numAttached;
+ for (uint32_t i = start; i < numSimulatedVertices; i++)
+ vertices[i].w = 0.0f;
+ }
+
+ PxClothMeshDesc desc;
+
+ desc.points.data = vertices.begin();
+ desc.points.count = numSimulatedVertices;
+ desc.points.stride = sizeof(PxVec4);
+
+ desc.invMasses.data = &vertices.begin()->w;
+ desc.invMasses.count = numSimulatedVertices;
+ desc.invMasses.stride = sizeof(PxVec4);
+
+ desc.triangles.data = mPhysicalMeshes[meshIndex].indices;
+ desc.triangles.count = mPhysicalMeshes[meshIndex].numSimulatedIndices / 3;
+ desc.triangles.stride = sizeof(uint32_t) * 3;
+
+ PxClothMeshQuadifier quadifier(desc);
+
+ PxClothFabricCooker cooker(quadifier.getDescriptor(), mGravityDirection);
+
+ PxClothFabricDesc fabric = cooker.getDescriptor();
+
+ int32_t nbConstraints = (int32_t)fabric.sets[fabric.nbSets - 1];
+
+ ClothingCookedPhysX3Param* cookedData = NULL;
+
+ bool success = true;
+ if (success)
+ {
+ cookedData = static_cast<ClothingCookedPhysX3Param*>(GetInternalApexSDK()->getParameterizedTraits()->createNvParameterized(ClothingCookedPhysX3Param::staticClassName()));
+
+ NvParameterized::Handle arrayHandle(cookedData);
+ arrayHandle.getParameter("deformableIndices");
+ arrayHandle.resizeArray(nbConstraints * 2);
+ arrayHandle.setParamU32Array(fabric.indices, nbConstraints * 2);
+
+ arrayHandle.getParameter("deformableRestLengths");
+ arrayHandle.resizeArray(nbConstraints);
+ arrayHandle.setParamF32Array(fabric.restvalues, nbConstraints);
+
+ arrayHandle.getParameter("deformableSets");
+ const int32_t numSets = (int32_t)fabric.nbSets;
+ arrayHandle.resizeArray(numSets);
+ for (int32_t i = 0; i < numSets; i++)
+ {
+ arrayHandle.set(i);
+ arrayHandle.set(0);
+ arrayHandle.setParamU32(fabric.sets[(uint32_t)i]);
+ arrayHandle.popIndex();
+ arrayHandle.popIndex();
+ }
+
+ arrayHandle.getParameter("deformablePhaseDescs");
+ arrayHandle.resizeArray((int32_t)fabric.nbPhases);
+
+ for (uint32_t i = 0; i < fabric.nbPhases; i++)
+ {
+ PxClothFabricPhase phase = fabric.phases[i];
+ cookedData->deformablePhaseDescs.buf[i].phaseType = phase.phaseType;
+ cookedData->deformablePhaseDescs.buf[i].setIndex = phase.setIndex;
+ }
+
+ arrayHandle.getParameter("tetherAnchors");
+ arrayHandle.resizeArray((int32_t)fabric.nbTethers);
+ arrayHandle.setParamU32Array(fabric.tetherAnchors, (int32_t)fabric.nbTethers);
+
+ arrayHandle.getParameter("tetherLengths");
+ arrayHandle.resizeArray((int32_t)fabric.nbTethers);
+ arrayHandle.setParamF32Array(fabric.tetherLengths, (int32_t)fabric.nbTethers);
+
+ cookedData->physicalMeshId = meshIndex;
+ cookedData->numVertices = numSimulatedVertices;
+
+ //dumpObj("c:\\lastCooked.obj", meshIndex);
+ //dumpApx("c:\\lastCooked.apx", cookedData);
+
+ cookedData->cookedDataVersion = getCookingVersion();
+ }
+ else
+ {
+#if PX_WINDOWS_FAMILY
+ static int failureCount = 0;
+ char buf[64];
+ sprintf_s(buf, 64, "c:\\cookingFailure_%d.obj", failureCount++);
+ dumpObj(buf, meshIndex);
+
+ APEX_INTERNAL_ERROR("Fiber cooking failure (mesh %d), the failing mesh has been dumped to \'%s\'", meshIndex, buf);
+#else
+ APEX_INTERNAL_ERROR("Fiber cooking failure (mesh %d)", meshIndex);
+#endif
+
+ }
+
+
+ return cookedData;
+}
+
+void Cooking::computeVertexWeights(ClothingCookedPhysX3Param* cookedData, uint32_t meshIndex) const
+{
+ const uint32_t* indices = mPhysicalMeshes[cookedData->physicalMeshId].indices;
+ const PxVec3* positions = mPhysicalMeshes[cookedData->physicalMeshId].vertices;
+ const uint32_t numSimulatedIndices = mPhysicalMeshes[meshIndex].numSimulatedIndices;
+ const uint32_t numSimulatedVertices = mPhysicalMeshes[meshIndex].numSimulatedVertices;
+
+ nvidia::Array<float> weights(numSimulatedVertices, 0.0f);
+
+ PX_ASSERT(numSimulatedIndices % 3 == 0);
+ for (uint32_t i = 0; i < numSimulatedIndices; i += 3)
+ {
+ const PxVec3 v1 = positions[indices[i + 1]] - positions[indices[i]];
+ const PxVec3 v2 = positions[indices[i + 2]] - positions[indices[i]];
+ const float area = v1.cross(v2).magnitude();
+
+ for (uint32_t j = 0; j < 3; j++)
+ {
+ weights[indices[i + j]] += area;
+ }
+ }
+
+ float weightSum = 0.0f;
+ for (uint32_t i = 0; i < numSimulatedVertices; i++)
+ {
+ weightSum += weights[i];
+ }
+
+ const float weightScale = (float)numSimulatedVertices / weightSum;
+
+ for (uint32_t i = 0; i < numSimulatedVertices; i++)
+ {
+ weights[i] *= weightScale;
+ }
+
+ NvParameterized::Handle handle(*cookedData, "deformableInvVertexWeights");
+ if (handle.resizeArray((int32_t)numSimulatedVertices) == NvParameterized::ERROR_NONE)
+ {
+ for (uint32_t i = 0; i < numSimulatedVertices; i++)
+ {
+ cookedData->deformableInvVertexWeights.buf[i] = 1.0f / weights[i];
+ }
+ }
+}
+
+
+
+void Cooking::createVirtualParticles(ClothingCookedPhysX3Param* cookedData, uint32_t meshIndex)
+{
+ const PxVec3* positions = mPhysicalMeshes[cookedData->physicalMeshId].vertices;
+ const uint32_t* indices = mPhysicalMeshes[cookedData->physicalMeshId].indices;
+ const uint32_t numIndices = mPhysicalMeshes[meshIndex].numSimulatedIndices;
+
+ nvidia::Array<VirtualParticle> particles;
+
+ const float minTriangleArea = mVirtualParticleDensity * mPhysicalMeshes[cookedData->physicalMeshId].smallestTriangleArea / 2.0f +
+ (1.0f - mVirtualParticleDensity) * mPhysicalMeshes[cookedData->physicalMeshId].largestTriangleArea;
+ const float coveredTriangleArea = minTriangleArea;
+
+ for (uint32_t i = 0; i < numIndices; i += 3)
+ {
+ VirtualParticle particle(indices[i], indices[i + 1], indices[i + 2]);
+
+ const PxVec3 edge1 = positions[particle.indices[1]] - positions[particle.indices[0]];
+ const PxVec3 edge2 = positions[particle.indices[2]] - positions[particle.indices[0]];
+ const float triangleArea = edge1.cross(edge2).magnitude();
+
+ const float numSpheres = triangleArea / coveredTriangleArea;
+
+ if (numSpheres <= 1.0f)
+ {
+ // do nothing
+ }
+ else if (numSpheres < 2.0f)
+ {
+ // add one virtual particle
+ particles.pushBack(particle);
+ }
+ else
+ {
+ // add two or three, depending on whether it's a slim triangle.
+ EdgeAndLength eal0(0, edge1.magnitude());
+ EdgeAndLength eal1(1, (positions[particle.indices[2]] - positions[particle.indices[1]]).magnitude());
+ EdgeAndLength eal2(2, edge2.magnitude());
+ EdgeAndLength middle = eal0 < eal1 ? eal0 : eal1; // technically this does not have to be the middle of the three, but for the test below it suffices.
+ EdgeAndLength smallest = middle < eal2 ? middle : eal2;
+ if (smallest.mLength * 2.0f < middle.mLength)
+ {
+ // two
+ particle.rotate(smallest.mEdgeNumber);
+ particle.tableIndex = 2;
+ particles.pushBack(particle);
+ particle.tableIndex = 3;
+ particles.pushBack(particle);
+ }
+ else
+ {
+ // three
+ particle.tableIndex = 1;
+ particles.pushBack(particle);
+ particle.rotate(1);
+ particles.pushBack(particle);
+ particle.rotate(1);
+ particles.pushBack(particle);
+ }
+ }
+ }
+
+ if (!particles.empty())
+ {
+ NvParameterized::Handle handle(cookedData);
+ handle.getParameter("virtualParticleIndices");
+ handle.resizeArray((int32_t)particles.size() * 4);
+ handle.getParameter("virtualParticleWeights");
+ handle.resizeArray(3 * 4);
+
+ // table index 0, the center particle
+ cookedData->virtualParticleWeights.buf[0] = 1.0f / 3.0f;
+ cookedData->virtualParticleWeights.buf[1] = 1.0f / 3.0f;
+ cookedData->virtualParticleWeights.buf[2] = 1.0f / 3.0f;
+
+ // table index 1, three particles
+ cookedData->virtualParticleWeights.buf[3] = 0.1f;
+ cookedData->virtualParticleWeights.buf[4] = 0.3f;
+ cookedData->virtualParticleWeights.buf[5] = 0.6f;
+
+ // table index 2, the pointy particle
+ cookedData->virtualParticleWeights.buf[6] = 0.7f;
+ cookedData->virtualParticleWeights.buf[7] = 0.15f;
+ cookedData->virtualParticleWeights.buf[8] = 0.15f;
+
+ // table index 3, the flat particle
+ cookedData->virtualParticleWeights.buf[9] = 0.3f;
+ cookedData->virtualParticleWeights.buf[10] = 0.35f;
+ cookedData->virtualParticleWeights.buf[11] = 0.35f;
+
+ for (uint32_t i = 0; i < particles.size(); i++)
+ {
+ for (uint32_t j = 0; j < 3; j++)
+ {
+ cookedData->virtualParticleIndices.buf[4 * i + j] = particles[i].indices[j];
+ }
+ cookedData->virtualParticleIndices.buf[4 * i + 3] = particles[i].tableIndex; // the table index
+ }
+ }
+}
+
+
+void Cooking::createSelfcollisionIndices(ClothingCookedPhysX3Param* cookedData, uint32_t meshIndex) const
+{
+ const PxVec3* positions = mPhysicalMeshes[cookedData->physicalMeshId].vertices;
+ const uint32_t numVertices = mPhysicalMeshes[meshIndex].numSimulatedVertices;
+
+
+ // we'll start with a full set of indices, and eliminate the ones we don't want. selfCollisionIndices
+ // is an array of indices, i.e. a second layer of indirection
+ Array<uint32_t> selfCollisionIndices;
+ for (uint32_t i = 0; i < numVertices; ++i)
+ {
+ selfCollisionIndices.pushBack(i);
+ }
+
+ float selfcollisionThicknessSq = mSelfcollisionRadius * mSelfcollisionRadius;
+ for (uint32_t v0ii = 0; v0ii < selfCollisionIndices.size(); ++v0ii)
+ {
+ // ii suffix means "index into indices array", i suffix just means "index into vertex array"
+
+ // load the first vertex
+ uint32_t v0i = selfCollisionIndices[v0ii];
+ const PxVec3& v0 = positions[v0i];
+
+ // no need to start at the beginning of the array, those comparisons have already been made.
+ // don't autoincrement the sequence index. if we eliminate an index, we'll replace it with one from
+ // the end, and reevaluate that element
+		for (uint32_t v1ii = v0ii + 1; v1ii < selfCollisionIndices.size(); ) // don't autoincrement the iterator; see if/else
+ {
+ uint32_t v1i = selfCollisionIndices[v1ii];
+ const PxVec3& v1 = positions[v1i];
+
+ // how close is this particle?
+ float v0v1DistanceSq = (v0 - v1).magnitudeSquared();
+ if (v0v1DistanceSq < selfcollisionThicknessSq )
+ {
+ // too close for missiles
+ selfCollisionIndices.replaceWithLast(v1ii);
+
+ // don't move on to the next - replaceWithLast put a fresh index at v1ii, so reevaluate it
+ }
+ else
+ {
+ // it's comfortably distant, so we'll keep it around (for now).
+
+ // we need to be mindful of which element we visit next in the outer loop. we want to minimize the distance between
+ // self colliding particles and not unnecessarily introduce large gaps between them. the easiest way is to pick
+ // the closest non-eliminated particle to the one currently being evaluated, and evaluate it next. if we find one
+ // that's closer than what's currently next in the list, swap it. both of these elements are prior to the next
+				// inner-loop element, so this doesn't impact the inner loop traversal
+
+				// if we assume the index of the closest known particle is always v0ii + 1, we can just reevaluate its distance to
+ // v0ii every iteration. slightly expensive, but it eliminates the need to maintain redundant
+ // ClosestDistance/ClosestIndex variables
+ uint32_t vNexti = selfCollisionIndices[v0ii + 1];
+ const PxVec3& nextVertexToEvaluate = positions[vNexti];
+
+ float v0vNextDistanceSq = (v0 - nextVertexToEvaluate).magnitudeSquared();
+ if (v0v1DistanceSq < v0vNextDistanceSq)
+ {
+ nvidia::swap(selfCollisionIndices[v0ii + 1], selfCollisionIndices[v1ii]);
+ }
+
+ // move on to the next
+ ++v1ii;
+ }
+ }
+ }
+
+ NvParameterized::Handle arrayHandle(cookedData);
+ arrayHandle.getParameter("selfCollisionIndices");
+ arrayHandle.resizeArray((int32_t)selfCollisionIndices.size());
+ arrayHandle.setParamU32Array(selfCollisionIndices.begin(), (int32_t)selfCollisionIndices.size());
+}
+
+
+bool Cooking::verifyValidity(const ClothingCookedPhysX3Param* cookedData, uint32_t meshIndex)
+{
+ if (cookedData == NULL)
+ {
+ return false;
+ }
+
+ const char* errorMessage = NULL;
+
+ const uint32_t numSetsDescs = (uint32_t)cookedData->deformableSets.arraySizes[0];
+ const uint32_t numDeformableVertices = mPhysicalMeshes[meshIndex].numSimulatedVertices;
+
+ for (uint32_t validSetsDescs = 0; validSetsDescs < numSetsDescs && errorMessage == NULL; ++validSetsDescs)
+ {
+ const uint32_t fromIndex = validSetsDescs ? cookedData->deformableSets.buf[validSetsDescs - 1].fiberEnd : 0;
+ const uint32_t toIndex = cookedData->deformableSets.buf[validSetsDescs].fiberEnd;
+ if (toIndex <= fromIndex)
+ {
+ errorMessage = "Set without fibers";
+ }
+
+ for (uint32_t f = fromIndex; f < toIndex && errorMessage == NULL; ++f)
+ {
+ uint32_t posIndex1 = cookedData->deformableIndices.buf[2 * f];
+ uint32_t posIndex2 = cookedData->deformableIndices.buf[2 * f + 1];
+
+ if (posIndex2 > (uint32_t)cookedData->deformableIndices.arraySizes[0])
+ {
+ errorMessage = "Fiber index out of bounds";
+ }
+
+ if (posIndex1 >= numDeformableVertices)
+ {
+ errorMessage = "Deformable index out of bounds";
+ }
+ }
+ }
+
+ if (errorMessage != NULL)
+ {
+ APEX_INTERNAL_ERROR("Invalid cooked data: %s", errorMessage);
+ }
+
+ return (errorMessage == NULL);
+}
+
+
+
+
+void Cooking::fillOutSetsDesc(ClothingCookedPhysX3Param* cookedData)
+{
+ const PxVec3* vertices = mPhysicalMeshes[cookedData->physicalMeshId].vertices;
+ for (int32_t sd = 0; sd < cookedData->deformableSets.arraySizes[0]; sd++)
+ {
+ const uint32_t firstFiber = sd ? cookedData->deformableSets.buf[sd - 1].fiberEnd : 0;
+ const uint32_t lastFiber = cookedData->deformableSets.buf[sd].fiberEnd;
+
+ uint32_t numEdges = 0;
+ float avgEdgeLength = 0.0f;
+
+ for (uint32_t f = firstFiber; f < lastFiber; f++)
+ {
+ uint32_t from = cookedData->deformableIndices.buf[f * 2];
+ uint32_t to = cookedData->deformableIndices.buf[f*2+1];
+ numEdges ++;
+ avgEdgeLength += (vertices[to] - vertices[from]).magnitude();
+ }
+
+ if (numEdges > 0)
+ {
+ cookedData->deformableSets.buf[sd].longestFiber = 0;
+ cookedData->deformableSets.buf[sd].shortestFiber = 0;
+ cookedData->deformableSets.buf[sd].numEdges = numEdges;
+ cookedData->deformableSets.buf[sd].avgFiberLength = 0;
+ cookedData->deformableSets.buf[sd].avgEdgeLength = avgEdgeLength / (float)numEdges;
+ }
+ }
+}
+
+
+
// Partitions the fiber range [startIndex, endIndex) into phases such that no
// particle is referenced by two fibers within the same phase (a requirement for
// solving a phase's fibers in parallel). Fibers may be reordered in place
// (together with their rest lengths); the end index of each generated phase is
// appended to 'phaseEnds'.
void Cooking::groupPhases(ClothingCookedPhysX3Param* cookedData, uint32_t meshIndex, uint32_t startIndex, uint32_t endIndex, Array<uint32_t>& phaseEnds) const
{
	// Tracks which particles are already referenced by a fiber of the current phase.
	shdfnd::Array<bool> usedInPhase(mPhysicalMeshes[meshIndex].numSimulatedVertices, false);
	for (uint32_t f = startIndex; f < endIndex; f++)
	{
		uint32_t index1 = cookedData->deformableIndices.buf[2 * f + 0];
		uint32_t index2 = cookedData->deformableIndices.buf[2 * f + 1];

		if (usedInPhase[index1] || usedInPhase[index2])
		{
			bool swapped = false;

			// need to replace this with one further ahead
			// (find a later fiber whose particles are still unused in this phase
			// and swap it into slot f)
			for (uint32_t scanAhead = f + 1; scanAhead < endIndex; scanAhead++)
			{
				const uint32_t i1 = cookedData->deformableIndices.buf[2 * scanAhead + 0];
				const uint32_t i2 = cookedData->deformableIndices.buf[2 * scanAhead + 1];
				if (!usedInPhase[i1] && !usedInPhase[i2])
				{
					// swap
					cookedData->deformableIndices.buf[2 * f + 0] = i1;
					cookedData->deformableIndices.buf[2 * f + 1] = i2;

					cookedData->deformableIndices.buf[2 * scanAhead + 0] = index1;
					cookedData->deformableIndices.buf[2 * scanAhead + 1] = index2;

					// NOTE(review): only the rest-length slot at 2*f is swapped; if two
					// rest values are stored per fiber the companion slot 2*f+1 stays
					// behind -- confirm the deformableRestLengths layout.
					nvidia::swap(cookedData->deformableRestLengths.buf[2 * f], cookedData->deformableRestLengths.buf[2 * scanAhead]);

					index1 = i1;
					index2 = i2;

					swapped = true;

					break;
				}
			}

			if (!swapped)
			{
				// No conflict-free fiber remains: close the current phase at f and
				// start a new, empty one. f-- makes the outer loop revisit this fiber
				// as the first candidate of the next phase.
				phaseEnds.pushBack(f);
				f--;

				for (uint32_t i = 0; i < usedInPhase.size(); i++)
				{
					usedInPhase[i] = false;
				}

				continue;
			}
		}

		usedInPhase[index1] = true;
		usedInPhase[index2] = true;
	}
	phaseEnds.pushBack(endIndex);
}
+
+
+
// Debug helper: writes physical mesh 'meshIndex' to 'filename' in Wavefront OBJ
// format (vertices and triangle faces, 1-based indices). Windows-only; a no-op
// on other platforms and on any file-open failure.
void Cooking::dumpObj(const char* filename, uint32_t meshIndex) const
{
	PX_UNUSED(filename);

#if PX_WINDOWS_FAMILY
	FILE* outputFile = NULL;
	fopen_s(&outputFile, filename, "w");

	if (outputFile == NULL)
	{
		return;
	}

	// NOTE(review): %d paired with a uint32_t argument here and below; fine for
	// values < 2^31, %u would be strictly correct.
	fprintf(outputFile, "# PhysX3 Cooking input mesh\n");
	fprintf(outputFile, "# Mesh %d\n", meshIndex);

	{
		time_t rawtime;
		struct tm* timeinfo;

		// NOTE(review): localtime/asctime return pointers to static buffers and are
		// not thread-safe; acceptable for a debug-only dump path.
		time(&rawtime);
		timeinfo = localtime(&rawtime);
		fprintf(outputFile, "# File Created: %s", asctime(timeinfo));
	}

	fprintf(outputFile, "\n\n\n");

	const uint32_t numVertices = mPhysicalMeshes[meshIndex].numVertices;
	const uint32_t numIndices = mPhysicalMeshes[meshIndex].numIndices;
//	const uint32_t numSimulatedVertices = mPhysicalMeshes[meshIndex].numSimulatedVertices;
//	const uint32_t numSimulatedIndices = mPhysicalMeshes[meshIndex].numSimulatedIndices;

	// vertex positions: "v x y z"
	const PxVec3* vert = mPhysicalMeshes[meshIndex].vertices;
	for (uint32_t i = 0; i < numVertices; i++)
	{
		fprintf(outputFile, "v %f %f %f\n", vert[i].x, vert[i].y, vert[i].z);
	}

	fprintf(outputFile, "\n\n\n");

	// triangle faces: "f a b c" -- OBJ indices are 1-based, hence the +1
	const uint32_t* indices = mPhysicalMeshes[meshIndex].indices;
	for (uint32_t i = 0; i < numIndices; i += 3)
	{
		fprintf(outputFile, "f %d %d %d\n", indices[i] + 1, indices[i + 1] + 1, indices[i + 2] + 1);
	}

	fclose(outputFile);
#endif
}
+
+
+
+void Cooking::dumpApx(const char* filename, const NvParameterized::Interface* data) const
+{
+ NvParameterized::Serializer::SerializeType serType = NvParameterized::Serializer::NST_XML;
+
+ if (data == NULL)
+ {
+ return;
+ }
+
+ PxFileBuf* filebuffer = GetInternalApexSDK()->createStream(filename, PxFileBuf::OPEN_WRITE_ONLY);
+
+ if (filebuffer != NULL)
+ {
+ if (filebuffer->isOpen())
+ {
+ NvParameterized::Serializer* serializer = GetInternalApexSDK()->createSerializer(serType);
+ serializer->serialize(*filebuffer, &data, 1);
+
+ serializer->release();
+ }
+
+ filebuffer->release();
+ filebuffer = NULL;
+ }
+}
+
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/CreateCuFactory.cpp b/APEX_1.4/module/clothing/embedded/CreateCuFactory.cpp
new file mode 100644
index 00000000..8f46c4b5
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/CreateCuFactory.cpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "CuFactory.h"
+#include "CreateCuFactory.h"
+
+#if PX_WINDOWS_FAMILY
+
+nvidia::cloth::CuFactory* PxCreateCuFactory(physx::PxCudaContextManager* contextManager)
+{
+ return new nvidia::cloth::CuFactory(contextManager);
+}
+
+#endif //PX_WINDOWS_FAMILY
diff --git a/APEX_1.4/module/clothing/embedded/CreateCuFactory.h b/APEX_1.4/module/clothing/embedded/CreateCuFactory.h
new file mode 100644
index 00000000..60a76426
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/CreateCuFactory.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Factory.h"
+#include "Allocator.h"
+
+#if PX_WINDOWS_FAMILY
+
+/**
+Create CuFactory interface class. This is defined so the CUDA cloth solver can be isolated in its own DLL
+*/
+PX_C_EXPORT __declspec(dllexport) nvidia::cloth::CuFactory* PX_CALL_CONV PxCreateCuFactory(physx::PxCudaContextManager* contextManager);
+
+#endif //PX_WINDOWS_FAMILY
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothConfig.h b/APEX_1.4/module/clothing/embedded/ExtClothConfig.h
new file mode 100644
index 00000000..96d3005e
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothConfig.h
@@ -0,0 +1,94 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2015 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef EXT_CLOTH_CONFIG_NX
+#define EXT_CLOTH_CONFIG_NX
+
+/** \addtogroup common
+@{ */
+
+#include "Px.h"
+
+// Exposing of the Cloth API. Run API meta data generation in Tools/PhysXMetaDataGenerator when changing.
+#define APEX_USE_CLOTH_API 1
+
+// define API function declaration (public API only needed because of extensions)
+#if defined EXT_CLOTH_STATIC_LIB
+ #define EXT_CLOTH_CORE_API
+#else
+ #if PX_WINDOWS_FAMILY || PX_WINRT
+ #if defined EXT_CLOTH_CORE_EXPORTS
+ #define EXT_CLOTH_CORE_API PX_DLL_EXPORT
+ #else
+ #define EXT_CLOTH_CORE_API PX_DLL_IMPORT
+ #endif
+ #elif PX_UNIX_FAMILY
+ #define EXT_CLOTH_CORE_API PX_UNIX_EXPORT
+ #else
+ #define EXT_CLOTH_CORE_API
+ #endif
+#endif
+
// NOTE: '&&' binds tighter than '||' in preprocessor expressions, so the
// unparenthesized form "PX_WINDOWS_FAMILY || PX_WINRT && !defined(__CUDACC__)"
// would apply the CUDA exclusion to WinRT only. Parenthesize so it applies to
// both Windows families, matching the EXT_CLOTH_CORE_API grouping above.
#if (PX_WINDOWS_FAMILY || PX_WINRT) && !defined(__CUDACC__)
	#if defined EXT_CLOTH_COMMON_EXPORTS
		#define EXT_CLOTH_COMMON_API __declspec(dllexport)
	#else
		#define EXT_CLOTH_COMMON_API __declspec(dllimport)
	#endif
#elif PX_UNIX_FAMILY
	#define EXT_CLOTH_COMMON_API PX_UNIX_EXPORT
#else
	#define EXT_CLOTH_COMMON_API
#endif
+
+// Changing these parameters requires recompilation of the SDK
+
+#ifndef PX_DOXYGEN
+namespace physx
+{
+#endif
+ class PxCollection;
+ class PxBase;
+
+ class PxHeightField;
+ class PxHeightFieldDesc;
+
+ class PxTriangleMesh;
+ class PxConvexMesh;
+
+ typedef uint32_t PxTriangleID;
+ typedef uint16_t PxMaterialTableIndex;
+
+#ifndef PX_DOXYGEN
+} // namespace physx
+#endif
+
+/** @} */
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothCoreUtilityTypes.h b/APEX_1.4/module/clothing/embedded/ExtClothCoreUtilityTypes.h
new file mode 100644
index 00000000..b764f8f4
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothCoreUtilityTypes.h
@@ -0,0 +1,243 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2015 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef EXT_CLOTH_CORE_UTILTY_TYPES_H
+#define EXT_CLOTH_CORE_UTILTY_TYPES_H
+/** \addtogroup common
+@{
+*/
+
+#include "PxAssert.h"
+#include "PxFlags.h"
+
+#if !PX_DOXYGEN
+namespace nvidia
+{
+#endif
+
+
+struct PxStridedData
+{
+ /**
+ \brief The offset in bytes between consecutive samples in the data.
+
+ <b>Default:</b> 0
+ */
+ uint32_t stride;
+ const void* data;
+
+ PxStridedData() : stride( 0 ), data( NULL ) {}
+
+ template<typename TDataType>
+ PX_INLINE const TDataType& at( uint32_t idx ) const
+ {
+ uint32_t theStride( stride );
+ if ( theStride == 0 )
+ theStride = sizeof( TDataType );
+ uint32_t offset( theStride * idx );
+ return *(reinterpret_cast<const TDataType*>( reinterpret_cast< const uint8_t* >( data ) + offset ));
+ }
+};
+
/**
\brief Typed variant of PxStridedData: a pointer plus a byte stride.
*/
template<typename TDataType>
struct PxTypedStridedData
{
	uint32_t stride;		// byte offset between consecutive samples; 0 = tightly packed
	const TDataType* data;	// first sample, or NULL

	PxTypedStridedData() : stride( 0 ), data( NULL ) {}
};
+
/**
\brief Strided data block with an explicit element count.
*/
struct PxBoundedData : public PxStridedData
{
	uint32_t count;	// number of valid elements reachable through 'data'
	PxBoundedData() : count( 0 ) {}
};
+
/**
\brief Fixed-size block of explicit padding bytes, zero-initialized on construction.
*/
template<uint8_t TNumBytes>
struct PxPadding
{
	uint8_t mPadding[TNumBytes];

	PxPadding()
	{
		// zero the padding so containing structs serialize/compare deterministically
		for ( uint8_t byteIndex = 0; byteIndex < TNumBytes; ++byteIndex )
		{
			mPadding[byteIndex] = 0;
		}
	}
};
+
// Fixed-capacity table of (x, y) pairs supporting piecewise-linear lookup of y
// for a given x (see getYVal). Capacity is NB_ELEMENTS pairs.
template <uint32_t NB_ELEMENTS> class PxFixedSizeLookupTable
{
//= ATTENTION! =====================================================================================
// Changing the data layout of this class breaks the binary serialization format. See comments for
// PX_BINARY_SERIAL_VERSION. If a modification is required, please adjust the getBinaryMetaData
// function. If the modification is made on a custom branch, please change PX_BINARY_SERIAL_VERSION
// accordingly.
//==================================================================================================
public:

	PxFixedSizeLookupTable()
	: mNbDataPairs(0)
	{
	}

	// Serialization constructor: deliberately leaves all members uninitialized
	// (they are filled in by the deserializer).
	PxFixedSizeLookupTable(const physx::PxEMPTY) {}

	// Initializes from an interleaved array of (x, y) pairs.
	PxFixedSizeLookupTable(const float* dataPairs, const uint32_t numDataPairs)
	{
		memcpy(mDataPairs,dataPairs,sizeof(float)*2*numDataPairs);
		mNbDataPairs=numDataPairs;
	}

	PxFixedSizeLookupTable(const PxFixedSizeLookupTable& src)
	{
		memcpy(mDataPairs,src.mDataPairs,sizeof(float)*2*src.mNbDataPairs);
		mNbDataPairs=src.mNbDataPairs;
	}

	~PxFixedSizeLookupTable()
	{
	}

	PxFixedSizeLookupTable& operator=(const PxFixedSizeLookupTable& src)
	{
		memcpy(mDataPairs,src.mDataPairs,sizeof(float)*2*src.mNbDataPairs);
		mNbDataPairs=src.mNbDataPairs;
		return *this;
	}

	// Appends an (x, y) pair; asserts (debug only) when the table is full.
	PX_FORCE_INLINE void addPair(const float x, const float y)
	{
		PX_ASSERT(mNbDataPairs<NB_ELEMENTS);
		mDataPairs[2*mNbDataPairs+0]=x;
		mDataPairs[2*mNbDataPairs+1]=y;
		mNbDataPairs++;
	}

	// Piecewise-linear interpolation of y at the given x. Clamps to the first/last
	// y value when x is outside the stored range. Assumes pairs were added with
	// ascending x values.
	PX_FORCE_INLINE float getYVal(const float x) const
	{
		if(0==mNbDataPairs)
		{
			PX_ASSERT(false);
			return 0;
		}

		if(1==mNbDataPairs || x<getX(0))
		{
			return getY(0);
		}

		float x0=getX(0);
		float y0=getY(0);

		for (uint32_t i = 1; i<mNbDataPairs; i++)
		{
			const float x1=getX(i);
			const float y1=getY(i);

			// x falls in segment [x0, x1): interpolate linearly
			if((x>=x0)&&(x<x1))
			{
				return (y0+(y1-y0)*(x-x0)/(x1-x0));
			}

			x0=x1;
			y0=y1;
		}

		PX_ASSERT(x>=getX(mNbDataPairs-1));
		return getY(mNbDataPairs-1);
	}

	uint32_t getNbDataPairs() const {return mNbDataPairs;}

	void clear()
	{
		memset(mDataPairs, 0, NB_ELEMENTS*2*sizeof(float));
		mNbDataPairs = 0;
	}

	PX_FORCE_INLINE float getX(const uint32_t i) const
	{
		return mDataPairs[2*i];
	}
	PX_FORCE_INLINE float getY(const uint32_t i) const
	{
		return mDataPairs[2*i+1];
	}

	// Interleaved (x, y) storage; layout is part of the binary serialization format.
	float mDataPairs[2*NB_ELEMENTS];
	uint32_t mNbDataPairs;
	uint32_t mPad[3];


};
+
/**
\brief Flags controlling mesh interpretation (see PxMeshFlags).
*/
struct PxMeshFlag
{
	enum Enum
	{
		/**
		\brief Specifies if the SDK should flip normals.

		The PhysX libraries assume that the face normal of a triangle with vertices [a,b,c] can be computed as:
		edge1 = b-a
		edge2 = c-a
		face_normal = edge1 x edge2.

		Note: This is the same as a counterclockwise winding in a right handed coordinate system or
		alternatively a clockwise winding order in a left handed coordinate system.

		If this does not match the winding order for your triangles, raise the below flag.
		*/
		eFLIPNORMALS		=	(1<<0),
		e16_BIT_INDICES		=	(1<<1)	//!< Denotes the use of 16-bit vertex indices
	};
};
+
+/**
+\brief collection of set bits defined in PxMeshFlag.
+
+@see PxMeshFlag
+*/
+typedef physx::PxFlags<PxMeshFlag::Enum, uint16_t> PxMeshFlags;
+using physx::PxFlags;
+PX_FLAGS_OPERATORS(PxMeshFlag::Enum, uint16_t)
+
+#if !PX_DOXYGEN
+} // namespace nvidia
+#endif
+
+
+/** @} */
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.cpp b/APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.cpp
new file mode 100644
index 00000000..4df323d7
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.cpp
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "ExtClothConfig.h"
+#if APEX_USE_CLOTH_API
+
+#include "ExtClothFabricCooker.h"
+#include "ExtClothTetherCooker.h"
+#include "PxVec4.h"
+#include "PsFoundation.h"
+#include "PxErrorCallback.h"
+#include "PsArray.h"
+#include "PsHashMap.h"
+#include "PsSort.h"
+#include "PxIO.h"
+
+#include "PxStrideIterator.h"
+
+#pragma warning(disable:4127)
+
+using namespace nvidia;
+using namespace physx;
+
// Implementation state of PxClothFabricCooker. Populated by cook(); the arrays
// below back the pointers handed out by getDescriptor().
struct nvidia::PxFabricCookerImpl
{
	bool cook(const PxClothMeshDesc& desc, PxVec3 gravity, bool useGeodesicTether);

	PxClothFabricDesc getDescriptor() const;
	void save(physx::PxOutputStream& stream, bool platformMismatch) const;

public:
	uint32_t mNumParticles;

	shdfnd::Array<PxClothFabricPhase> mPhases;	// one entry per constraint color/set
	shdfnd::Array<uint32_t> mSets; // with 0 prefix
	shdfnd::Array<float> mRestvalues;			// one rest length per constraint
	shdfnd::Array<uint32_t> mIndices;			// two particle indices per constraint

	shdfnd::Array<uint32_t> mTetherAnchors;
	shdfnd::Array<float> mTetherLengths;
};
+
// Public facade: construction immediately cooks the mesh into fabric data.
PxClothFabricCooker::PxClothFabricCooker(const PxClothMeshDesc& desc, const PxVec3& gravity, bool useGeodesicTether)
: mImpl(new PxFabricCookerImpl())
{
	mImpl->cook(desc, gravity, useGeodesicTether);
}

PxClothFabricCooker::~PxClothFabricCooker()
{
	delete mImpl;
}

// Forwards to the implementation; the returned descriptor references the
// implementation's internal arrays.
PxClothFabricDesc PxClothFabricCooker::getDescriptor() const
{
	return mImpl->getDescriptor();
}

// Forwards to the implementation; writes the version-1 binary fabric stream.
void PxClothFabricCooker::save(physx::PxOutputStream& stream, bool platformMismatch) const
{
	mImpl->save(stream, platformMismatch);
}
+
+
+namespace
+{
+ // calculate the inclusive prefix sum, equivalent of std::partial_sum
+ template <typename T>
+ void prefixSum(const T* first, const T* last, T* dest)
+ {
+ if (first != last)
+ {
+ *(dest++) = *(first++);
+ for (; first != last; ++first, ++dest)
+ *dest = *(dest-1) + *first;
+ }
+ }
+
+ template <typename T>
+ void gatherAdjacencies(shdfnd::Array<uint32_t>& valency, shdfnd::Array<uint32_t>& adjacencies,
+ const PxBoundedData& triangles, const PxBoundedData& quads)
+ {
+ // count number of edges per vertex
+ PxStrideIterator<const T> tIt, qIt;
+ tIt = physx::PxMakeIterator((const T*)triangles.data, triangles.stride);
+ for(uint32_t i=0; i<triangles.count; ++i, ++tIt, ++qIt)
+ {
+ for(uint32_t j=0; j<3; ++j)
+ valency[tIt.ptr()[j]] += 2;
+ }
+ qIt = physx::PxMakeIterator((const T*)quads.data, quads.stride);
+ for(uint32_t i=0; i<quads.count; ++i, ++tIt, ++qIt)
+ {
+ for(uint32_t j=0; j<4; ++j)
+ valency[qIt.ptr()[j]] += 2;
+ }
+
+ prefixSum(valency.begin(), valency.end(), valency.begin());
+ adjacencies.resize(valency.back());
+
+ // gather adjacent vertices
+ tIt = physx::PxMakeIterator((const T*)triangles.data, triangles.stride);
+ for(uint32_t i=0; i<triangles.count; ++i, ++tIt)
+ {
+ for(uint32_t j=0; j<3; ++j)
+ {
+ adjacencies[--valency[tIt.ptr()[j]]] = tIt.ptr()[(j+1)%3];
+ adjacencies[--valency[tIt.ptr()[j]]] = tIt.ptr()[(j+2)%3];
+ }
+ }
+ qIt = physx::PxMakeIterator((const T*)quads.data, quads.stride);
+ for(uint32_t i=0; i<quads.count; ++i, ++qIt)
+ {
+ for(uint32_t j=0; j<4; ++j)
+ {
+ adjacencies[--valency[qIt.ptr()[j]]] = qIt.ptr()[(j+1)%4];
+ adjacencies[--valency[qIt.ptr()[j]]] = qIt.ptr()[(j+3)%4];
+ }
+ }
+ }
+
+
	// Accumulates evidence for how an edge between two particles should be used:
	// as a stretch, bend, or shear constraint. The weights below are empirical
	// tuning constants; the largest accumulated score wins (see the cook() loop).
	struct Edge
	{
		Edge() : mStretching(0.0f), mBending(0.0f), mShearing(0.0f) {}

		// A direct (1-ring) mesh edge: vote for stretching.
		void classify()
		{
			mStretching += 0.1f;
		}

		// classify v0-v2 edge based on alternative v0-v1-v2 path
		void classify(const PxVec4& v0, const PxVec4& v1, const PxVec4& v2)
		{
			// only the xyz position part of the PxVec4 particles is used here
			const PxVec3& p0 = reinterpret_cast<const PxVec3&>(v0);
			const PxVec3& p1 = reinterpret_cast<const PxVec3&>(v1);
			const PxVec3& p2 = reinterpret_cast<const PxVec3&>(v2);

			float area = (p1-p0).cross(p2-p1).magnitude();
			// triangle height / base length
			// 1.0 = quad edge, 0.2 = quad diagonal + quad edge,
			float ratio = area / (p2-p0).magnitudeSquared();

			// 0.5 = quad diagonal
			mShearing += PxMax(0.0f, 0.15f - fabsf(0.45f - ratio));
			// 0.0 = collinear points
			mBending += PxMax(0.0f, 0.1f - ratio) * 3;
		}

		float mStretching;
		float mBending;
		float mShearing;
	};
+
+ typedef shdfnd::Pair<uint32_t, uint32_t> Pair;
+ typedef shdfnd::Pair<Pair, PxClothFabricPhaseType::Enum> Entry;
+
+ // maintain heap status after elements have been pushed (heapify)
+ template<typename T>
+ void pushHeap(shdfnd::Array<T> &heap, const T &value)
+ {
+ heap.pushBack(value);
+ T* begin = heap.begin();
+ T* end = heap.end();
+
+ if (end <= begin)
+ return;
+
+ uint32_t current = uint32_t(end - begin) - 1;
+ while (current > 0)
+ {
+ const uint32_t parent = (current - 1) / 2;
+ if (!(begin[parent] < begin[current]))
+ break;
+
+ shdfnd::swap(begin[parent], begin[current]);
+ current = parent;
+ }
+ }
+
+ // pop one element from the heap
	// pop one element from the heap
	// Removes and returns the maximum element of the binary max-heap in 'heap'.
	// Precondition: the heap is non-empty (begin()[0] is read unconditionally).
	template<typename T>
	T popHeap(shdfnd::Array<T> &heap)
	{
		T* begin = heap.begin();
		T* end = heap.end();

		shdfnd::swap(begin[0], end[-1]); // exchange elements

		// shift down
		end--;

		// sift the element moved to the root down past its larger child
		uint32_t current = 0;
		while (begin + (current * 2 + 1) < end)
		{
			uint32_t child = current * 2 + 1;
			if (begin + child + 1 < end && begin[child] < begin[child + 1])
				++child;

			if (!(begin[current] < begin[child]))
				break;

			shdfnd::swap(begin[current], begin[child]);
			current = child;
		}

		// the former maximum now sits in the last slot; remove and return it
		return heap.popBack();
	}
+
+ // ---------------------------------------------------------------------------------------
+ // Heap element to sort constraint based on graph color count
+ struct ConstraintGraphColorCount
+ {
+ ConstraintGraphColorCount(int cid, int count)
+ : constraint((uint32_t)cid), colorCount((uint32_t)count) {}
+
+ uint32_t constraint;
+ uint32_t colorCount;
+
+ bool operator < (const ConstraintGraphColorCount& c) const
+ {
+ return colorCount < c.colorCount;
+ }
+ };
+
+ struct ConstraintSorter
+ {
+ public:
+
+ ConstraintSorter(uint32_t* constraints_) : constraints(constraints_) {}
+
+ bool operator()(uint32_t i, uint32_t j) const
+ {
+ uint32_t ci = i*2;
+ uint32_t cj = j*2;
+
+ if (constraints[ci] == constraints[cj])
+ return constraints[ci+1] < constraints[cj+1];
+ else
+ return constraints[ci] < constraints[cj];
+ }
+
+ uint32_t* constraints;
+ };
+
+} // anonymous namespace
+
// Cooks a cloth mesh descriptor into fabric data:
//  1. gathers particles (position + inverse mass) and mesh connectivity,
//  2. derives stretch/shear/bend constraints from 1-ring and 2-ring edges,
//  3. graph-colors the constraints into independent sets (phases) so each set
//     can be solved in parallel,
//  4. computes per-constraint rest lengths and tether constraints.
// Returns false only when the input descriptor is invalid.
bool nvidia::PxFabricCookerImpl::cook(const PxClothMeshDesc& desc, PxVec3 gravity, bool useGeodesicTether)
{
	if(!desc.isValid())
	{
		shdfnd::getFoundation().getErrorCallback().reportError(PxErrorCode::eINVALID_PARAMETER,
			"PxFabricCookerImpl::cook: desc.isValid() failed!", __FILE__, __LINE__);
		return false;
	}

	gravity = gravity.getNormalized();

	mNumParticles = desc.points.count;

	// assemble points: xyz = position, w = inverse mass (defaults to 1 when no
	// invMasses stream is supplied)
	shdfnd::Array<PxVec4> particles;
	particles.reserve(mNumParticles);
	PxStrideIterator<const PxVec3> pIt((const PxVec3*)desc.points.data, desc.points.stride);
	PxStrideIterator<const float> wIt((const float*)desc.invMasses.data, desc.invMasses.stride);
	for(uint32_t i=0; i<mNumParticles; ++i)
		particles.pushBack(PxVec4(*pIt++, wIt.ptr() ? *wIt++ : 1.0f));

	// build adjacent vertex list
	shdfnd::Array<uint32_t> valency(mNumParticles+1, 0);
	shdfnd::Array<uint32_t> adjacencies;
	if(desc.flags & PxMeshFlag::e16_BIT_INDICES)
		gatherAdjacencies<uint16_t>(valency, adjacencies, desc.triangles, desc.quads);
	else
		gatherAdjacencies<uint32_t>(valency, adjacencies, desc.triangles, desc.quads);

	// build unique neighbors from adjacencies
	// mark[k] remembers the last vertex that inserted k, so duplicates are skipped
	shdfnd::Array<uint32_t> mark(valency.size(), 0);
	shdfnd::Array<uint32_t> neighbors; neighbors.reserve(adjacencies.size());
	for(uint32_t i=1, j=0; i<valency.size(); ++i)
	{
		for(; j<valency[i]; ++j)
		{
			uint32_t k = adjacencies[j];
			if(mark[k] != i)
			{
				mark[k] = i;
				neighbors.pushBack(k);
			}
		}
		valency[i] = neighbors.size();
	}

	// build map of unique edges and classify
	shdfnd::HashMap<Pair, Edge> edges;
	for(uint32_t i=0; i<mNumParticles; ++i)
	{
		float wi = particles[i].w;
		// iterate all neighbors
		uint32_t jlast = valency[i+1];
		for(uint32_t j=valency[i]; j<jlast; ++j)
		{
			// add 1-ring edge (skipped when both endpoints are fully attached,
			// i.e. the sum of inverse masses is zero)
			uint32_t m = neighbors[j];
			if(wi + particles[m].w > 0.0f)
				edges[Pair(PxMin(i, m), PxMax(i, m))].classify();

			// iterate all neighbors of neighbor
			uint32_t klast = valency[m+1];
			for(uint32_t k=valency[m]; k<klast; ++k)
			{
				uint32_t n = neighbors[k];
				if(n != i && wi + particles[n].w > 0.0f)
				{
					// add 2-ring edge
					edges[Pair(PxMin(i, n), PxMax(i, n))].classify(
						particles[i], particles[m], particles[n]);
				}
			}
		}
	}

	// copy classified edges to constraints array
	// build histogram of constraints per vertex
	shdfnd::Array<Entry> constraints;
	constraints.reserve(edges.size());
	valency.resize(0); valency.resize(mNumParticles+1, 0);

	// NOTE(review): despite the name this is sqrt(0.4) (~0.632), not sqrt(0.5);
	// presumably a tuned threshold -- confirm before renaming.
	const float sqrtHalf = PxSqrt(0.4f);
	for(shdfnd::HashMap<Pair, Edge>::Iterator eIt = edges.getIterator(); !eIt.done(); ++eIt)
	{
		const Edge& edge = eIt->second;
		const Pair& pair = eIt->first;
		if((edge.mStretching + edge.mBending + edge.mShearing) > 0.0f)
		{
			// pick the dominant classification; stretch edges are further split into
			// horizontal/vertical relative to the gravity direction
			PxClothFabricPhaseType::Enum type = PxClothFabricPhaseType::eINVALID;
			if(edge.mBending > PxMax(edge.mStretching, edge.mShearing))
				type = PxClothFabricPhaseType::eBENDING;
			else if(edge.mShearing > PxMax(edge.mStretching, edge.mBending))
				type = PxClothFabricPhaseType::eSHEARING;
			else
			{
				PxVec4 diff = particles[pair.first]-particles[pair.second];
				float dot = gravity.dot(reinterpret_cast<const PxVec3&>(diff).getNormalized());
				type = fabsf(dot) < sqrtHalf ? PxClothFabricPhaseType::eHORIZONTAL : PxClothFabricPhaseType::eVERTICAL;
			}
			++valency[pair.first];
			++valency[pair.second];
			constraints.pushBack(Entry(pair, type));
		}
	}

	prefixSum(valency.begin(), valency.end(), valency.begin());

	uint32_t numConstraints = constraints.size();

	// build adjacent constraint list (constraint indices grouped per vertex,
	// filled back-to-front; valency becomes the exclusive prefix sum)
	adjacencies.resize(0); adjacencies.resize(valency.back(), 0);
	for(uint32_t i=0; i<numConstraints; ++i)
	{
		adjacencies[--valency[constraints[i].first.first]] = i;
		adjacencies[--valency[constraints[i].first.second]] = i;
	}

	shdfnd::Array<uint32_t>::ConstIterator aFirst = adjacencies.begin();
	shdfnd::Array<uint32_t> colors(numConstraints, numConstraints); // constraint -> color, initially not colored
	mark.resize(0); mark.resize(numConstraints+1, UINT32_MAX); // color -> constraint index
	shdfnd::Array<uint32_t> adjColorCount(numConstraints, 0); // # of neighbors that are already colored

	shdfnd::Array<ConstraintGraphColorCount> constraintHeap;
	constraintHeap.reserve(numConstraints); // set of constraints to color (added in edge distance order)

	// Do graph coloring based on edge distance.
	// For each constraint, we add its uncolored neighbors to the heap
	// ,and we pick the constraint with most colored neighbors from the heap.
	while ( true )
	{
		uint32_t constraint = 0;
		while ( (constraint < numConstraints) && (colors[constraint] != numConstraints))
			constraint++; // start with the first uncolored constraint

		// all constraints colored: done
		if (constraint >= numConstraints)
			break;

		constraintHeap.clear();
		pushHeap(constraintHeap, ConstraintGraphColorCount((int)constraint, (int)adjColorCount[constraint]));
		PxClothFabricPhaseType::Enum type = constraints[constraint].second;

		while (!constraintHeap.empty())
		{
			ConstraintGraphColorCount heapItem = popHeap(constraintHeap);
			constraint = heapItem.constraint;
			if (colors[constraint] != numConstraints)
				continue; // skip if already colored

			const Pair& pair = constraints[constraint].first;
			for(uint32_t j=0; j<2; ++j)
			{
				uint32_t index = j ? pair.first : pair.second;
				if(particles[index].w == 0.0f)
					continue; // don't mark adjacent particles if attached

				// visit all constraints sharing this particle
				for(shdfnd::Array<uint32_t>::ConstIterator aIt = aFirst + valency[index], aEnd = aFirst + valency[index+1]; aIt != aEnd; ++aIt)
				{
					uint32_t adjacentConstraint = *aIt;
					if ((constraints[adjacentConstraint].second != type) || (adjacentConstraint == constraint))
						continue;

					// record the neighbor's color as taken for this constraint, and
					// bump the neighbor's priority for being colored next
					mark[colors[adjacentConstraint]] = constraint;
					++adjColorCount[adjacentConstraint];
					pushHeap(constraintHeap, ConstraintGraphColorCount((int)adjacentConstraint, (int)adjColorCount[adjacentConstraint]));
				}
			}

			// find smallest color with matching type
			uint32_t color = 0;
			while((color < mPhases.size() && mPhases[color].phaseType != type) || mark[color] == constraint)
				++color;

			// create a new color set
			if(color == mPhases.size())
			{
				PxClothFabricPhase phase(type, mPhases.size());
				mPhases.pushBack(phase);
				mSets.pushBack(0);
			}

			colors[constraint] = color;
			++mSets[color];
		}
	}

#if 0 // PX_DEBUG
	printf("set[%u] = ", mSets.size());
	for(uint32_t i=0; i<mSets.size(); ++i)
		printf("%u ", mSets[i]);
#endif

	prefixSum(mSets.begin(), mSets.end(), mSets.begin());

#if 0 // PX_DEBUG
	printf(" = %u\n", mSets.back());
#endif

	// write indices and rest lengths
	// convert mSets to exclusive sum
	uint32_t back = mSets.back();
	mSets.pushBack(back);
	mIndices.resize(numConstraints*2);
	mRestvalues.resize(numConstraints);
	for(uint32_t i=0; i<numConstraints; ++i)
	{
		uint32_t first = constraints[i].first.first;
		uint32_t second = constraints[i].first.second;

		// scatter constraint i to the back of its color's slot range
		uint32_t index = --mSets[colors[i]];

		mIndices[2*index ] = first;
		mIndices[2*index+1] = second;

		// rest length = distance between the two particle positions
		PxVec4 diff = particles[second] - particles[first];
		mRestvalues[index] = reinterpret_cast<
			const PxVec3&>(diff).magnitude();
	}

	// reorder constraints and rest values for more efficient cache access (linear)
	shdfnd::Array<uint32_t> newIndices(mIndices.size());
	shdfnd::Array<float> newRestValues(mRestvalues.size());

	// sort each constraint set in vertex order
	for (uint32_t i=0; i < mSets.size()-1; ++i)
	{
		// create a re-ordering list
		shdfnd::Array<uint32_t> reorder(mSets[i+1]-mSets[i]);

		for (uint32_t r=0; r < reorder.size(); ++r)
			reorder[r] = r;

		const uint32_t indicesOffset = mSets[i]*2;
		const uint32_t restOffset = mSets[i];

		ConstraintSorter predicate(&mIndices[indicesOffset]);
		shdfnd::sort(&reorder[0], reorder.size(), predicate);

		for (uint32_t r=0; r < reorder.size(); ++r)
		{
			newIndices[indicesOffset + r*2] = mIndices[indicesOffset + reorder[r]*2];
			newIndices[indicesOffset + r*2+1] = mIndices[indicesOffset + reorder[r]*2+1];
			newRestValues[restOffset + r] = mRestvalues[restOffset + reorder[r]];
		}
	}

	mIndices = newIndices;
	mRestvalues = newRestValues;

	PX_ASSERT(mIndices.size() == mRestvalues.size()*2);
	PX_ASSERT(mRestvalues.size() == mSets.back());

#if 0 // PX_DEBUG
	for (uint32_t i = 1; i < mSets.size(); i++)
	{
		PxClothFabricPhase phase = mPhases[i-1];
		printf("%d : type %d, size %d\n",
			i-1, phase.phaseType, mSets[i] - mSets[i-1]);
	}
#endif

	// prefer geodesic tethers; fall back to the simple (Euclidean) cooker when
	// the geodesic cooker rejects the mesh (e.g. non-manifold input)
	if (useGeodesicTether)
	{
		PxClothGeodesicTetherCooker tetherCooker(desc);
		if (tetherCooker.getCookerStatus() == 0)
		{
			uint32_t numTethersPerParticle = tetherCooker.getNbTethersPerParticle();
			uint32_t tetherSize = mNumParticles * numTethersPerParticle;
			mTetherAnchors.resize(tetherSize);
			mTetherLengths.resize(tetherSize);
			tetherCooker.getTetherData(mTetherAnchors.begin(), mTetherLengths.begin());
		}
		else
			useGeodesicTether = false;
	}

	if (!useGeodesicTether)
	{
		PxClothSimpleTetherCooker tetherCooker(desc);
		if (tetherCooker.getCookerStatus() == 0)
		{
			mTetherAnchors.resize(mNumParticles);
			mTetherLengths.resize(mNumParticles);
			tetherCooker.getTetherData(mTetherAnchors.begin(), mTetherLengths.begin());
		}
	}

	return true;
}
+
// Builds a descriptor whose pointers reference this cooker's internal arrays;
// it therefore remains valid only as long as this PxFabricCookerImpl is alive.
PxClothFabricDesc nvidia::PxFabricCookerImpl::getDescriptor() const
{
	PxClothFabricDesc result;

	result.nbParticles = mNumParticles;
	result.nbPhases = mPhases.size();
	result.phases = mPhases.begin();
	// mSets carries a leading 0 entry (see the member comment); skipping it makes
	// sets[i] the end offset of constraint set i
	result.nbSets = mSets.size()-1;
	result.sets = mSets.begin()+1;
	result.restvalues = mRestvalues.begin();
	result.indices = mIndices.begin();
	result.nbTethers = mTetherAnchors.size();
	result.tetherAnchors = mTetherAnchors.begin();
	result.tetherLengths = mTetherLengths.begin();

	return result;
}
+
// Writes the cooked fabric to 'stream' in the version-1 binary layout: version,
// particle/phase/set/tether counts, then phases, set offsets, rest values,
// constraint indices and tether data. The matching loader lives in
// ScClothFabricCore.cpp.
void nvidia::PxFabricCookerImpl::save( physx::PxOutputStream& stream, bool /*platformMismatch*/ ) const
{
	// version 1 is equivalent to 0x030300 and 0x030301 (PX_PHYSICS_VERSION of 3.3.0 and 3.3.1).
	// If the stream format changes, the loader code in ScClothFabricCore.cpp
	// and the version number need to change too.
	uint32_t version = 1;
	stream.write(&version, sizeof(uint32_t));

	PxClothFabricDesc desc = getDescriptor();

	// write explicit sizes, others are implicit
	stream.write(&mNumParticles, sizeof(uint32_t));
	stream.write(&desc.nbPhases, sizeof(uint32_t));
	stream.write(&desc.nbSets, sizeof(uint32_t));
	stream.write(&desc.nbTethers, sizeof(uint32_t));

	// sets[] holds inclusive end offsets, so the last entry is the total constraint count
	uint32_t nbConstraints = desc.sets[desc.nbSets-1];

	// write actual data
	PX_COMPILE_TIME_ASSERT(sizeof(PxClothFabricPhaseType::Enum) == sizeof(uint32_t));
	stream.write(desc.phases, desc.nbPhases*sizeof(PxClothFabricPhase));
	stream.write(desc.sets, desc.nbSets*sizeof(uint32_t));

	stream.write(desc.restvalues, nbConstraints*sizeof(float));
	stream.write(desc.indices, nbConstraints*2*sizeof(uint32_t));

	stream.write(desc.tetherAnchors, desc.nbTethers*sizeof(uint32_t));
	stream.write(desc.tetherLengths, desc.nbTethers*sizeof(float));
}
+
+#endif //APEX_USE_CLOTH_API
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.h b/APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.h
new file mode 100644
index 00000000..dc6ec2cc
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothFabricCooker.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_EXTENSIONS_CLOTH_FABRIC_COOKER_H
+#define PX_PHYSICS_EXTENSIONS_CLOTH_FABRIC_COOKER_H
+
+/** \addtogroup extensions
+ @{
+*/
+
+#include "PxClothMeshDesc.h"
+#include "PxClothFabric.h"
+
+#if PX_DOXYGEN == 0
+namespace nvidia
+{
+#endif
+
+struct PxFabricCookerImpl;
+
class PxClothFabricCooker
{
public:
	/**
	\brief Cooks a triangle mesh to a PxClothFabricDesc.
	\param desc The cloth mesh descriptor on which the generation of the cooked mesh depends.
	\param gravity A normalized vector which specifies the direction of gravity.
	This information allows the cooker to generate a fabric with higher quality simulation behavior.
	\param useGeodesicTether A flag to indicate whether to compute geodesic distance for tether constraints.
	\note The geodesic option for tether only works for manifold input. For non-manifold input, a simple Euclidean distance will be used.
	For more detailed cooker status for such cases, try running PxClothGeodesicTetherCooker directly.
	*/
	PxClothFabricCooker(const PxClothMeshDesc& desc, const physx::PxVec3& gravity, bool useGeodesicTether = true);
	/** \brief Destroys the internal cooker implementation and all cooked data it owns. */
	~PxClothFabricCooker();

	/** \brief Returns the fabric descriptor to create the fabric.
	\note The descriptor's pointers reference memory owned by this cooker and
	are invalidated when the cooker is destroyed. */
	PxClothFabricDesc getDescriptor() const;
	/** \brief Saves the fabric data to a platform and version dependent stream.
	\note The platformMismatch argument appears unused by the implementation — TODO confirm. */
	void save(physx::PxOutputStream& stream, bool platformMismatch) const;

private:
	// pimpl: owned implementation, allocated in the constructor
	PxFabricCookerImpl* mImpl;
};
+
+#if PX_DOXYGEN == 0
+} // namespace nvidia
+#endif
+
+/** @} */
+#endif // PX_PHYSICS_EXTENSIONS_CLOTH_FABRIC_COOKER_H
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothGeodesicTetherCooker.cpp b/APEX_1.4/module/clothing/embedded/ExtClothGeodesicTetherCooker.cpp
new file mode 100644
index 00000000..e0fea857
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothGeodesicTetherCooker.cpp
@@ -0,0 +1,1006 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "ExtClothConfig.h"
+#if APEX_USE_CLOTH_API
+
+#include "ExtClothTetherCooker.h"
+#include "PxStrideIterator.h"
+
+// from shared foundation
+
+#include <PsArray.h>
+#include <PsSort.h>
+#include <Ps.h>
+#include <PsMathUtils.h>
+#include "PxVec4.h"
+#include "PsIntrinsics.h"
+
+using namespace nvidia;
+using namespace physx;
+
+namespace
+{
+ // calculate the inclusive prefix sum, equivalent of std::partial_sum
+ template <typename T>
+ void prefixSum(const T* first, const T* last, T* dest)
+ {
+ if (first != last)
+ {
+ *(dest++) = *(first++);
+ for (; first != last; ++first, ++dest)
+ *dest = *(dest-1) + *first;
+ }
+ }
+
+ template <typename T>
+ void gatherAdjacencies(shdfnd::Array<uint32_t>& valency, shdfnd::Array<uint32_t>& adjacencies,
+ const PxBoundedData& triangles, const PxBoundedData& quads)
+ {
+ // count number of edges per vertex
+ PxStrideIterator<const T> tIt, qIt;
+ tIt = physx::PxMakeIterator((const T*)triangles.data, triangles.stride);
+ for(uint32_t i=0; i<triangles.count; ++i, ++tIt, ++qIt)
+ {
+ for(uint32_t j=0; j<3; ++j)
+ valency[tIt.ptr()[j]] += 2;
+ }
+ qIt = physx::PxMakeIterator((const T*)quads.data, quads.stride);
+ for(uint32_t i=0; i<quads.count; ++i, ++tIt, ++qIt)
+ {
+ for(uint32_t j=0; j<4; ++j)
+ valency[qIt.ptr()[j]] += 2;
+ }
+
+ prefixSum(valency.begin(), valency.end(), valency.begin());
+ adjacencies.resize(valency.back());
+
+ // gather adjacent vertices
+ tIt = physx::PxMakeIterator((const T*)triangles.data, triangles.stride);
+ for(uint32_t i=0; i<triangles.count; ++i, ++tIt)
+ {
+ for(uint32_t j=0; j<3; ++j)
+ {
+ adjacencies[--valency[tIt.ptr()[j]]] = tIt.ptr()[(j+1)%3];
+ adjacencies[--valency[tIt.ptr()[j]]] = tIt.ptr()[(j+2)%3];
+ }
+ }
+ qIt = physx::PxMakeIterator((const T*)quads.data, quads.stride);
+ for(uint32_t i=0; i<quads.count; ++i, ++qIt)
+ {
+ for(uint32_t j=0; j<4; ++j)
+ {
+ adjacencies[--valency[qIt.ptr()[j]]] = qIt.ptr()[(j+1)%4];
+ adjacencies[--valency[qIt.ptr()[j]]] = qIt.ptr()[(j+3)%4];
+ }
+ }
+ }
+
+ template <typename T>
+ void gatherIndices(shdfnd::Array<uint32_t>& indices,
+ const PxBoundedData& triangles, const PxBoundedData& quads)
+ {
+ PxStrideIterator<const T> tIt, qIt;
+
+ indices.reserve(triangles.count * 3 + quads.count * 6);
+
+ tIt = physx::PxMakeIterator((const T*)triangles.data, triangles.stride);
+ for(uint32_t i=0; i<triangles.count; ++i, ++tIt)
+ {
+ indices.pushBack(tIt.ptr()[0]);
+ indices.pushBack(tIt.ptr()[1]);
+ indices.pushBack(tIt.ptr()[2]);
+ }
+ qIt = physx::PxMakeIterator((const T*)quads.data, quads.stride);
+ for(uint32_t i=0; i<quads.count; ++i, ++qIt)
+ {
+ indices.pushBack(qIt.ptr()[0]);
+ indices.pushBack(qIt.ptr()[1]);
+ indices.pushBack(qIt.ptr()[2]);
+ indices.pushBack(qIt.ptr()[0]);
+ indices.pushBack(qIt.ptr()[2]);
+ indices.pushBack(qIt.ptr()[3]);
+ }
+ }
+
+ // maintain heap status after elements have been pushed (heapify)
+ template<typename T>
+ void pushHeap(shdfnd::Array<T> &heap, const T &value)
+ {
+ heap.pushBack(value);
+ T* begin = heap.begin();
+ T* end = heap.end();
+
+ if (end <= begin)
+ return;
+
+ uint32_t current = uint32_t(end - begin) - 1;
+ while (current > 0)
+ {
+ const uint32_t parent = (current - 1) / 2;
+ if (!(begin[parent] < begin[current]))
+ break;
+
+ shdfnd::swap(begin[parent], begin[current]);
+ current = parent;
+ }
+ }
+
	// pop one element from the heap
	// Removes and returns the top element (the maximum under operator<),
	// then restores the heap invariant by sifting the moved element down.
	// Precondition: heap is non-empty — all call sites check before popping.
	template<typename T>
	T popHeap(shdfnd::Array<T> &heap)
	{
		T* begin = heap.begin();
		T* end = heap.end();

		shdfnd::swap(begin[0], end[-1]); // exchange elements

		// shift down
		end--;

		uint32_t current = 0;
		while (begin + (current * 2 + 1) < end)
		{
			// pick the larger of the two children
			uint32_t child = current * 2 + 1;
			if (begin + child + 1 < end && begin[child] < begin[child + 1])
				++child;

			if (!(begin[current] < begin[child]))
				break;

			shdfnd::swap(begin[current], begin[child]);
			current = child;
		}

		// the old top now sits in the last slot; remove and return it
		return heap.popBack();
	}
+
+ // ---------------------------------------------------------------------------------------
+ struct VertexDistanceCount
+ {
+ VertexDistanceCount(int vert, float dist, int count)
+ : vertNr(vert), distance(dist), edgeCount(count) {}
+
+ int vertNr;
+ float distance;
+ int edgeCount;
+ bool operator < (const VertexDistanceCount& v) const
+ {
+ return v.distance < distance;
+ }
+ };
+
+ // ---------------------------------------------------------------------------------------
+ struct PathIntersection
+ {
+ uint32_t vertOrTriangle;
+ uint32_t index; // vertex id or triangle edge id
+ float s; // only used for edge intersection
+ float distance; // computed distance
+
+ public:
+ PathIntersection() {}
+
+ PathIntersection(uint32_t vort, uint32_t in_index, float in_distance, float in_s = 0.0f)
+ : vertOrTriangle(vort), index(in_index), s(in_s), distance(in_distance)
+ {
+ }
+ };
+
+ //---------------------------------------------------------------------------------------
+ struct VertTriangle
+ {
+ VertTriangle(int vert, int triangle)
+ : mVertIndex(vert), mTriangleIndex(triangle)
+ {
+ }
+
+ bool operator<(const VertTriangle &vt) const
+ {
+ return mVertIndex == vt.mVertIndex ?
+ mTriangleIndex < vt.mTriangleIndex : mVertIndex < vt.mVertIndex;
+ }
+
+ int mVertIndex;
+ int mTriangleIndex;
+ };
+
+ // ---------------------------------------------------------------------------------------
+ struct MeshEdge
+ {
+ MeshEdge(int v0, int v1, int halfEdgeIndex)
+ : mFromVertIndex(v0), mToVertIndex(v1), mHalfEdgeIndex(halfEdgeIndex)
+ {
+ if(mFromVertIndex > mToVertIndex)
+ shdfnd::swap(mFromVertIndex, mToVertIndex);
+ }
+
+ bool operator<(const MeshEdge& e) const
+ {
+ return mFromVertIndex == e.mFromVertIndex ?
+ mToVertIndex < e.mToVertIndex : mFromVertIndex < e.mFromVertIndex;
+ }
+
+ bool operator==(const MeshEdge& e) const
+ {
+ return mFromVertIndex == e.mFromVertIndex
+ && mToVertIndex == e.mToVertIndex;
+ }
+
+ int mFromVertIndex, mToVertIndex;
+ int mHalfEdgeIndex;
+ };
+
+ // check if the edge is following triangle order or not
+ bool checkEdgeOrientation(const MeshEdge &e, const shdfnd::Array<uint32_t> &indices)
+ {
+ int offset0 = e.mHalfEdgeIndex % 3;
+ int offset1 = (offset0 < 2) ? 1 : -2;
+
+ int v0 = (int)indices[uint32_t(e.mHalfEdgeIndex)];
+ int v1 = (int)indices[uint32_t(e.mHalfEdgeIndex + offset1)];
+
+ if ((e.mFromVertIndex == v0) && (e.mToVertIndex == v1))
+ return true;
+
+ return false;
+ }
+
+ // check if two index pairs represent same edge regardless of order.
+ inline bool checkEdge(int ei0, int ei1, int ej0, int ej1)
+ {
+ return ( (ei0 == ej0) && (ei1 == ej1) ) ||
+ ( (ei0 == ej1) && (ei1 == ej0) );
+ }
+
+ // compute ray edge intersection
+ bool intersectRayEdge(const PxVec3 &O, const PxVec3 &D, const PxVec3 &A, const PxVec3 &B, float &s, float &t)
+ {
+ // point on edge P = A + s * AB
+ // point on ray R = o + t * d
+ // for this two points to intersect, we have
+ // |AB -d| | s t | = o - A
+ const float eps = 1e-4;
+
+ PxVec3 OA = O - A;
+ PxVec3 AB = B - A;
+
+ float a = AB.dot(AB), b = -AB.dot(D);
+ float c = b, d = D.dot(D);
+
+ float e = AB.dot(OA);
+ float f = -D.dot(OA);
+
+ float det = a * d - b * c;
+ if (fabs(det) < eps) // coplanar case
+ return false;
+
+ float iPX_det = 1.0f / det;
+
+ s = (d * iPX_det) * e + (-b * iPX_det) * f;
+ t = (-c * iPX_det) * e + (a * iPX_det) * f;
+
+ return true;
+ }
+}
+
+
// Implementation of the geodesic tether cooker: traces geodesic paths from
// every particle to its closest attached particles and emits tether
// anchor/length pairs.
struct nvidia::PxClothGeodesicTetherCookerImpl
{

	PxClothGeodesicTetherCookerImpl(const PxClothMeshDesc& desc);

	uint32_t getCookerStatus() const;
	uint32_t getNbTethersPerParticle() const;
	void getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const;

public:
	// input
	const PxClothMeshDesc& mDesc;

	// internal variables
	uint32_t mNumParticles;                 // desc.points.count
	shdfnd::Array<PxVec3> mVertices;        // particle positions copied from the descriptor
	shdfnd::Array<uint32_t> mIndices;       // flattened triangle list (quads split in gatherIndices)
	shdfnd::Array<uint8_t> mAttached;       // 1 where the particle's inverse mass is zero
	shdfnd::Array<uint32_t> mFirstVertTriAdj; // per vertex: start offset into mVertTriAdjs
	shdfnd::Array<uint32_t> mVertTriAdjs;   // triangle ids incident to each vertex
	shdfnd::Array<uint32_t> mTriNeighbors; // needs changing for non-manifold support
	                                       // per half-edge: adjacent triangle id, uint32_t(-1) on boundary

	// error status
	uint32_t mCookerStatus; // 0 = ok; non-zero codes come from findTriNeighbors()

	// output
	shdfnd::Array<uint32_t> mTetherAnchors; // nbTethersPerParticle blocks of mNumParticles anchor ids
	shdfnd::Array<float> mTetherLengths;    // matching tether rest lengths

protected:
	void createTetherData(const PxClothMeshDesc &desc);
	int computeVertexIntersection(uint32_t parent, uint32_t src, PathIntersection &path);
	int computeEdgeIntersection(uint32_t parent, uint32_t edge, float in_s, PathIntersection &path);
	float computeGeodesicDistance(uint32_t i, uint32_t parent, int &errorCode);
	uint32_t findTriNeighbors();
	void findVertTriNeighbors();

private:
	// non-copyable (holds a reference member)
	PxClothGeodesicTetherCookerImpl& operator=(const PxClothGeodesicTetherCookerImpl&);
};
+
// Public pimpl wrappers: all work happens in PxClothGeodesicTetherCookerImpl,
// which the constructor owns and the destructor releases.
PxClothGeodesicTetherCooker::PxClothGeodesicTetherCooker(const PxClothMeshDesc& desc)
: mImpl(new PxClothGeodesicTetherCookerImpl(desc))
{
}

PxClothGeodesicTetherCooker::~PxClothGeodesicTetherCooker()
{
	delete mImpl;
}

uint32_t PxClothGeodesicTetherCooker::getCookerStatus() const
{
	return mImpl->getCookerStatus();
}

uint32_t PxClothGeodesicTetherCooker::getNbTethersPerParticle() const
{
	return mImpl->getNbTethersPerParticle();
}

void PxClothGeodesicTetherCooker::getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const
{
	mImpl->getTetherData(userTetherAnchors, userTetherLengths);
}

///////////////////////////////////////////////////////////////////////////////
// Cooking runs entirely in the constructor; query the status afterwards.
PxClothGeodesicTetherCookerImpl::PxClothGeodesicTetherCookerImpl(const PxClothMeshDesc &desc)
	:mDesc(desc),
	mCookerStatus(0)
{
	createTetherData(desc);
}
+
+///////////////////////////////////////////////////////////////////////////////
// Builds the tether data: identifies islands of attached particles, runs a
// Dijkstra pass per island over the mesh edges, then emits up to four
// (anchor, length) tethers per particle using the longest of the Euclidean
// and geodesic distances to each anchor.
void PxClothGeodesicTetherCookerImpl::createTetherData(const PxClothMeshDesc &desc)
{
	mNumParticles = desc.points.count;

	// without inverse masses we cannot tell which particles are attached
	if (!desc.invMasses.data)
		return;

	// assemble points
	mVertices.resize(mNumParticles);
	mAttached.resize(mNumParticles);
	PxStrideIterator<const PxVec3> pIt((const PxVec3*)desc.points.data, desc.points.stride);
	PxStrideIterator<const float> wIt((const float*)desc.invMasses.data, desc.invMasses.stride);
	for(uint32_t i=0; i<mNumParticles; ++i)
	{
		mVertices[i] = *pIt++;
		// zero inverse mass marks an attached (fully constrained) particle
		mAttached[i] = uint8_t(wIt.ptr() ? (*wIt++ == 0.0f) : 0);
	}

	// build triangle indices
	if(desc.flags & PxMeshFlag::e16_BIT_INDICES)
		gatherIndices<uint16_t>(mIndices, desc.triangles, desc.quads);
	else
		gatherIndices<uint32_t>(mIndices, desc.triangles, desc.quads);

	// build vertex-triangle adjacencies
	findVertTriNeighbors();

	// build triangle-triangle adjacencies
	mCookerStatus = findTriNeighbors();
	if (mCookerStatus != 0)
		return;

	// build adjacent vertex list
	shdfnd::Array<uint32_t> valency(mNumParticles+1, 0);
	shdfnd::Array<uint32_t> adjacencies;
	if(desc.flags & PxMeshFlag::e16_BIT_INDICES)
		gatherAdjacencies<uint16_t>(valency, adjacencies, desc.triangles, desc.quads);
	else
		gatherAdjacencies<uint32_t>(valency, adjacencies, desc.triangles, desc.quads);

	// build unique neighbors from adjacencies
	// (mark[k] remembers the last vertex k was emitted for, so duplicates
	// within one vertex's adjacency range are skipped)
	shdfnd::Array<uint32_t> mark(valency.size(), 0);
	shdfnd::Array<uint32_t> neighbors; neighbors.reserve(adjacencies.size());
	for(uint32_t i=1, j=0; i<valency.size(); ++i)
	{
		for(; j<valency[i]; ++j)
		{
			uint32_t k = adjacencies[j];
			if(mark[k] != i)
			{
				mark[k] = i;
				neighbors.pushBack(k);
			}
		}
		valency[i] = neighbors.size();
	}

	// create islands of attachment points
	shdfnd::Array<uint32_t> vertexIsland(mNumParticles);
	shdfnd::Array<VertexDistanceCount> vertexIslandHeap;

	// put all the attachments in heap
	for (uint32_t i = 0; i < mNumParticles; ++i)
	{
		// we put each attached point with large distance so that
		// we can prioritize things that are added during mesh traversal.
		vertexIsland[i] = uint32_t(-1);
		if (mAttached[i])
			vertexIslandHeap.pushBack(VertexDistanceCount((int)i, FLT_MAX, 0));
	}
	uint32_t attachedCnt = vertexIslandHeap.size();

	// no attached vertices
	if (vertexIslandHeap.empty())
		return;

	// identify islands of attached vertices
	// islandIndices holds attached vertices grouped by island;
	// islandFirst[i] is the start of island i within islandIndices.
	shdfnd::Array<uint32_t> islandIndices;
	shdfnd::Array<uint32_t> islandFirst;
	uint32_t islandCnt = 0;
	uint32_t islandIndexCnt = 0;

	islandIndices.reserve(attachedCnt);
	islandFirst.reserve(attachedCnt+1);

	// while the island heap is not empty
	while (!vertexIslandHeap.empty())
	{
		// pop vi from heap
		VertexDistanceCount vi = popHeap(vertexIslandHeap);

		// new cluster
		if (vertexIsland[(uint32_t)vi.vertNr] == uint32_t(-1))
		{
			islandFirst.pushBack(islandIndexCnt++);
			vertexIsland[(uint32_t)vi.vertNr] = islandCnt++;
			vi.distance = 0;
			islandIndices.pushBack((uint32_t)vi.vertNr);
		}

		// for each adjacent vj that's not visited
		const uint32_t begin = (uint32_t)valency[(uint32_t)vi.vertNr];
		const uint32_t end = (uint32_t)valency[uint32_t(vi.vertNr + 1)];
		for (uint32_t j = begin; j < end; ++j)
		{
			const uint32_t vj = neighbors[j];

			// do not expand unattached vertices
			if (!mAttached[vj])
				continue;

			// already visited
			if (vertexIsland[vj] != uint32_t(-1))
				continue;

			islandIndices.pushBack(vj);
			islandIndexCnt++;
			vertexIsland[vj] = vertexIsland[uint32_t(vi.vertNr)];
			pushHeap(vertexIslandHeap, VertexDistanceCount((int)vj, vi.distance + 1.0f, 0));
		}
	}

	islandFirst.pushBack(islandIndexCnt);

	PX_ASSERT(islandCnt == (islandFirst.size() - 1));

	/////////////////////////////////////////////////////////
	// per island: Dijkstra over the mesh edges from the island's attached
	// vertices, recording each vertex's closest anchor (parent) and distance
	uint32_t bufferSize = mNumParticles * islandCnt;
	PX_ASSERT(bufferSize > 0);

	shdfnd::Array<float> vertexDistanceBuffer(bufferSize, PX_MAX_F32);
	shdfnd::Array<uint32_t> vertexParentBuffer(bufferSize, 0);
	shdfnd::Array<VertexDistanceCount> vertexHeap;

	// now process each island
	for (uint32_t i = 0; i < islandCnt; i++)
	{
		vertexHeap.clear();
		float* vertexDistance = &vertexDistanceBuffer[0] + (i * mNumParticles);
		uint32_t* vertexParent = &vertexParentBuffer[0] + (i * mNumParticles);

		// initialize parent and distance
		for (uint32_t j = 0; j < mNumParticles; ++j)
		{
			vertexParent[j] = j;
			vertexDistance[j] = PX_MAX_F32;
		}

		// put all the attached vertices in this island to heap
		const uint32_t beginIsland = islandFirst[i];
		const uint32_t endIsland = islandFirst[i+1];
		for (uint32_t j = beginIsland; j < endIsland; j++)
		{
			uint32_t vj = islandIndices[j];
			vertexDistance[vj] = 0.0f;
			vertexHeap.pushBack(VertexDistanceCount((int)vj, 0.0f, 0));
		}

		// no attached vertices in this island (error?)
		PX_ASSERT(vertexHeap.empty() == false);
		if (vertexHeap.empty())
			continue;

		// while heap is not empty
		while (!vertexHeap.empty())
		{
			// pop vi from heap
			VertexDistanceCount vi = popHeap(vertexHeap);

			// obsolete entry ( we already found better distance)
			if (vi.distance > vertexDistance[vi.vertNr])
				continue;

			// for each adjacent vj that's not visited
			const int32_t begin = (int32_t)valency[(uint32_t)vi.vertNr];
			const int32_t end = (int32_t)valency[uint32_t(vi.vertNr + 1)];
			for (int32_t j = begin; j < end; ++j)
			{
				const int32_t vj = (int32_t)neighbors[(uint32_t)j];
				PxVec3 edge = mVertices[(uint32_t)vj] - mVertices[(uint32_t)vi.vertNr];
				const float edgeLength = edge.magnitude();
				float newDistance = vi.distance + edgeLength;

				if (newDistance < vertexDistance[vj])
				{
					vertexDistance[vj] = newDistance;
					vertexParent[vj] = vertexParent[vi.vertNr];

					pushHeap(vertexHeap, VertexDistanceCount(vj, newDistance, 0));
				}
			}
		}
	}

	const uint32_t maxTethersPerParticle = 4; // max tethers
	const uint32_t nbTethersPerParticle = (islandCnt > maxTethersPerParticle) ? maxTethersPerParticle : islandCnt;

	uint32_t nbTethers = nbTethersPerParticle * mNumParticles;
	mTetherAnchors.resize(nbTethers);
	mTetherLengths.resize(nbTethers);

	// now process the parent and distance and add to fibers
	for (uint32_t i = 0; i < mNumParticles; i++)
	{
		// we use the heap to sort out N-closest island
		vertexHeap.clear();
		for (uint32_t j = 0; j < islandCnt; j++)
		{
			int parent = (int)vertexParentBuffer[j * mNumParticles + i];
			float edgeDistance = vertexDistanceBuffer[j * mNumParticles + i];
			pushHeap(vertexHeap, VertexDistanceCount(parent, edgeDistance, 0));
		}

		// take out N-closest island from the heap
		for (uint32_t j = 0; j < nbTethersPerParticle; j++)
		{
			VertexDistanceCount vi = popHeap(vertexHeap);
			uint32_t parent = (uint32_t)vi.vertNr;
			float distance = 0.0f;

			if (parent != i)
			{
				// tether length is the longest of Euclidean and geodesic
				// distance; fall back to the Dijkstra (edge-path) distance
				// when geodesic tracing fails
				float euclideanDistance = (mVertices[i] - mVertices[parent]).magnitude();
				float dijkstraDistance = vi.distance;
				int errorCode = 0;
				float geodesicDistance = computeGeodesicDistance(i,parent, errorCode);
				if (errorCode < 0)
					geodesicDistance = dijkstraDistance;
				distance = PxMax(euclideanDistance, geodesicDistance);
			}

			uint32_t tetherLoc = j * mNumParticles + i;
			mTetherAnchors[ tetherLoc ] = parent;
			mTetherLengths[ tetherLoc ] = distance;
		}
	}
}
+
+///////////////////////////////////////////////////////////////////////////////
// Returns 0 on success; non-zero codes come from findTriNeighbors()
// (1 = non-manifold edge, 2 = inconsistent winding).
uint32_t PxClothGeodesicTetherCookerImpl::getCookerStatus() const
{
	return mCookerStatus;
}
+
+///////////////////////////////////////////////////////////////////////////////
+uint32_t PxClothGeodesicTetherCookerImpl::getNbTethersPerParticle() const
+{
+ return mTetherAnchors.size() / mNumParticles;
+}
+
+///////////////////////////////////////////////////////////////////////////////
// Copies the cooked tether data into caller-provided buffers. Each buffer
// must hold at least getNbTethersPerParticle() * mNumParticles entries.
void
PxClothGeodesicTetherCookerImpl::getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const
{
	intrinsics::memCopy(userTetherAnchors, mTetherAnchors.begin(), mTetherAnchors.size() * sizeof(uint32_t));
	intrinsics::memCopy(userTetherLengths, mTetherLengths.begin(), mTetherLengths.size() * sizeof(float));
}
+
+///////////////////////////////////////////////////////////////////////////////
+// find triangle-triangle adjacency (return non-zero if there is an error)
// Fills mTriNeighbors: for each half-edge (3*triangle + slot), the id of the
// triangle sharing that edge, or uint32_t(-1) on a boundary.
// Returns 0 on success, 1 if an edge is shared by more than two triangles
// (non-manifold), 2 if two sharing triangles have the same edge orientation
// (inconsistent winding).
uint32_t PxClothGeodesicTetherCookerImpl::findTriNeighbors()
{
	shdfnd::Array<MeshEdge> edges;

	mTriNeighbors.resize(mIndices.size(), uint32_t(-1));

	// assemble all edges
	uint32_t numTriangles = mIndices.size() / 3;
	for (uint32_t i = 0; i < numTriangles; ++i)
	{
		uint32_t i0 = mIndices[3 * i];
		uint32_t i1 = mIndices[3 * i + 1];
		uint32_t i2 = mIndices[3 * i + 2];
		edges.pushBack(MeshEdge((int)i0, (int)i1, int(3*i)));
		edges.pushBack(MeshEdge((int)i1, (int)i2, int(3*i+1)));
		edges.pushBack(MeshEdge((int)i2, (int)i0, int(3*i+2)));
	}

	// sorting brings the half-edges of each shared edge next to each other
	shdfnd::sort(edges.begin(), edges.size());

	int numEdges = (int)edges.size();
	for(int i=0; i < numEdges; )
	{
		const MeshEdge& e0 = edges[(uint32_t)i];
		bool orientation0 = checkEdgeOrientation(e0, mIndices);

		// advance i past the run of half-edges equal to e0
		int j = i;
		while(++i < numEdges && edges[(uint32_t)i] == e0)
			;

		if(i - j > 2)
			return 1; // non-manifold

		// link the (at most one) partner half-edge with e0's triangle
		while(++j < i)
		{
			const MeshEdge& e1 = edges[(uint32_t)j];
			bool orientation1 = checkEdgeOrientation(e1, mIndices);
			mTriNeighbors[(uint32_t)e0.mHalfEdgeIndex] = (uint32_t)e1.mHalfEdgeIndex/3;
			mTriNeighbors[(uint32_t)e1.mHalfEdgeIndex] = (uint32_t)e0.mHalfEdgeIndex/3;

			// consistently wound triangles traverse a shared edge in
			// opposite directions
			if (orientation0 == orientation1)
				return 2; // bad winding
		}
	}

	return 0;
}
+
+///////////////////////////////////////////////////////////////////////////////
+// find vertex triangle adjacency information
// Fills mVertTriAdjs with the triangle ids incident to each vertex, grouped
// by vertex, and mFirstVertTriAdj with each vertex's start offset into it.
void PxClothGeodesicTetherCookerImpl::findVertTriNeighbors()
{
	shdfnd::Array<VertTriangle> vertTriangles;
	vertTriangles.reserve(mIndices.size());

	// one (vertex, triangle) pair per triangle corner
	int numTriangles = (int)mIndices.size() / 3;
	for (int i = 0; i < numTriangles; ++i)
	{
		vertTriangles.pushBack(VertTriangle((int)mIndices[uint32_t(3*i)], i));
		vertTriangles.pushBack(VertTriangle((int)mIndices[uint32_t(3*i+1)], i));
		vertTriangles.pushBack(VertTriangle((int)mIndices[uint32_t(3*i+2)], i));
	}

	// sorting groups all triangles of a vertex together
	shdfnd::sort(vertTriangles.begin(), vertTriangles.size(), shdfnd::Less<VertTriangle>());
	mFirstVertTriAdj.resize(mNumParticles);
	mVertTriAdjs.reserve(mIndices.size());

	for (uint32_t i = 0; i < (uint32_t)vertTriangles.size(); )
	{
		int v = vertTriangles[i].mVertIndex;

		mFirstVertTriAdj[(uint32_t)v] = i;

		// emit this vertex's whole run
		// (mIndices.size() == vertTriangles.size(), three entries per triangle)
		while ((i < mIndices.size()) && (vertTriangles[i].mVertIndex == v))
		{
			int t = vertTriangles[i].mTriangleIndex;
			mVertTriAdjs.pushBack((uint32_t)t);
			i++;
		}
	}
}
+
+///////////////////////////////////////////////////////////////////////////////
+// compute intersection of a ray from a source vertex in direction toward parent
+int PxClothGeodesicTetherCookerImpl::computeVertexIntersection(uint32_t parent, uint32_t src, PathIntersection &path)
+{
+ if (src == parent)
+ {
+ path = PathIntersection(true, src, 0.0);
+ return 0;
+ }
+
+ float maxdot = -1.0f;
+ int closestVert = -1;
+
+ // gradient is toward the parent vertex
+ PxVec3 g = (mVertices[parent] - mVertices[src]).getNormalized();
+
+ // for every triangle incident on this vertex, we intersect against opposite edge of the triangle
+ uint32_t sfirst = mFirstVertTriAdj[src];
+ uint32_t slast = (src < ((uint32_t)mNumParticles-1)) ? mFirstVertTriAdj[src+1] : (uint32_t)mVertTriAdjs.size();
+ for (uint32_t adj = sfirst; adj < slast; adj++)
+ {
+ uint32_t tid = mVertTriAdjs[adj];
+
+ uint32_t i0 = mIndices[tid*3];
+ uint32_t i1 = mIndices[tid*3+1];
+ uint32_t i2 = mIndices[tid*3+2];
+
+ int eid = 0;
+ if (i0 == src) eid = 1;
+ else if (i1 == src) eid = 2;
+ else if (i2 == src) eid = 0;
+ else continue; // error
+
+ // reshuffle so that src is located at i2
+ i0 = mIndices[tid*3 + eid];
+ i1 = mIndices[tid*3 + (eid+1)%3];
+ i2 = src;
+
+ PxVec3 p0 = mVertices[i0];
+ PxVec3 p1 = mVertices[i1];
+ PxVec3 p2 = mVertices[i2];
+
+ // check if we hit source immediately from this triangle
+ if (i0 == parent)
+ {
+ path = PathIntersection(true, parent, (p0 - p2).magnitude());
+ return 1;
+ }
+
+ if (i1 == parent)
+ {
+ path = PathIntersection(true, parent, (p1 - p2).magnitude());
+ return 1;
+ }
+
+ // ray direction is the gradient projected on the plane of this triangle
+ PxVec3 n = ((p0 - p2).cross(p1 - p2)).getNormalized();
+ PxVec3 d = (g - g.dot(n) * n).getNormalized();
+
+ // find intersection of ray (p2, d) against the edge (p0,p1)
+ float s, t;
+ bool result = intersectRayEdge(p2, d, p0, p1, s, t);
+ if (result == false)
+ continue;
+
+ // t should be positive, otherwise we just hit the triangle in opposite direction, so ignore
+ const float eps = 1e-5;
+ if (t > -eps)
+ {
+ PxVec3 ip; // intersection point
+ if (( s > -eps ) && (s < (1.0f + eps)))
+ {
+ // if intersection point is too close to each vertex, we record a vertex intersection
+ if ( ( s < eps) || (s > (1.0f-eps)))
+ {
+ path.vertOrTriangle = true;
+ path.index = (s < eps) ? i0 : i1;
+ path.distance = (p2 - mVertices[path.index]).magnitude();
+ }
+ else // found an edge instersection
+ {
+ ip = p0 + s * (p1 - p0);
+ path = PathIntersection(false, tid*3 + eid, (p2 - ip).magnitude(), s);
+ }
+ return 1;
+ }
+ }
+
+ // for fall back (see below)
+ PxVec3 d0 = (p0 - p2).getNormalized();
+ PxVec3 d1 = (p1 - p2).getNormalized();
+ float d0dotg = d0.dot(d);
+ float d1dotg = d1.dot(d);
+
+ if (d0dotg > maxdot)
+ {
+ closestVert = (int)i0;
+ maxdot = d0dotg;
+ }
+ if (d1dotg > maxdot)
+ {
+ closestVert = (int)i1;
+ maxdot = d1dotg;
+ }
+ } // end for (uint32_t adj = sfirst...
+
+ // Fall back to use greedy (Dijkstra-like) path selection.
+ // This happens as triangles are curved and we may not find intersection on any triangle.
+ // In this case, we choose a vertex closest to the gradient direction.
+ if (closestVert > 0)
+ {
+ path = PathIntersection(true, (uint32_t)closestVert, (mVertices[src] - mVertices[(uint32_t)closestVert]).magnitude());
+ return 1;
+ }
+
+ // Error, (possibly dangling vertex)
+ return -1;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// compute intersection of a ray from a source vertex in direction toward parent
// Continues a geodesic trace that previously crossed half-edge 'edge' at
// parameter in_s: steps into the neighboring triangle and finds the next
// vertex or edge hit on the way toward 'parent'. Always returns 1 with
// 'path' filled in; boundary edges and non-intersecting cases fall back to
// the nearer edge endpoint.
int PxClothGeodesicTetherCookerImpl::computeEdgeIntersection(uint32_t parent, uint32_t edge, float in_s, PathIntersection &path)
{
	int tid = (int)edge / 3;
	int eid = (int)edge % 3;

	uint32_t e0 = mIndices[uint32_t(tid*3 + eid)];
	uint32_t e1 = mIndices[uint32_t(tid*3 + (eid+1)%3)];

	PxVec3 v0 = mVertices[e0];
	PxVec3 v1 = mVertices[e1];

	// current point on the edge, and the gradient toward the parent
	PxVec3 v = v0 + in_s * (v1 - v0);
	PxVec3 g = mVertices[parent] - v;

	uint32_t triNbr = mTriNeighbors[edge];

	if (triNbr == uint32_t(-1)) // boundary edge
	{
		// cannot cross: walk to whichever edge endpoint lies along the gradient
		float dir = g.dot(v1-v0);
		uint32_t vid = (dir > 0) ? e1 : e0;
		path = PathIntersection(true, vid, (mVertices[vid] - v).magnitude());
		return 1;
	}

	uint32_t i0 = mIndices[triNbr*3];
	uint32_t i1 = mIndices[triNbr*3+1];
	uint32_t i2 = mIndices[triNbr*3+2];

	// vertices are sorted s.t. i0,i1 contain the edge endpoints and i2 is opposite
	if ( checkEdge((int)i0, (int)i1, (int)e0, (int)e1)) {
		eid = 0;
	}
	else if ( checkEdge((int)i1, (int)i2, (int)e0, (int)e1)) {
		eid = 1;
		uint32_t tmp = i2;
		i2 = i0;
		i0 = i1;
		i1 = tmp;
	}
	else if ( checkEdge((int)i2, (int)i0, (int)e0, (int)e1))
	{
		eid = 2;
		uint32_t tmp = i0;
		i0 = i2;
		i2 = i1;
		i1 = tmp;
	}

	// we hit the parent
	if (i2 == parent)
	{
		path = PathIntersection(true, i2, (mVertices[i2] - v).magnitude());
		return 1;
	}

	PxVec3 p0 = mVertices[i0];
	PxVec3 p1 = mVertices[i1];
	PxVec3 p2 = mVertices[i2];

	// project gradient vector on the plane of the triangle
	PxVec3 n = ((p0 - p2).cross(p1 - p2)).getNormalized();
	g = (g - g.dot(n) * n).getNormalized();

	float s = 0.0f, t = 0.0f;
	const float eps = 1e-5;
	PxVec3 ip;

	// intersect against edge from p2 to p0
	if (intersectRayEdge(v, g, p2, p0, s, t) && ( s >= -eps) && ( s <= (1.0f+eps) ) && (t > -eps))
	{
		// near-endpoint hits are recorded as vertex intersections
		if ( ( s < eps) || (s > (1.0f-eps)))
		{
			path.vertOrTriangle = true;
			path.index = (s < eps) ? i2 : i0;
			path.distance = (mVertices[path.index] - v).magnitude();
		}
		else
		{
			ip = p2 + s * (p0 - p2);
			path = PathIntersection(false, triNbr*3 + (eid + 2) % 3, (ip - v).magnitude(), s);

		}

		return 1;
	}

	// intersect against edge from p1 to p2
	if (intersectRayEdge(v, g, p1, p2, s, t) && ( s >= -eps) && ( s <= (1.0f+eps) ) && (t > -eps))
	{
		if ( ( s < eps) || (s > (1.0f-eps)))
		{
			path.vertOrTriangle = true;
			path.index = (s < eps) ? i1 : i2;
			path.distance = (mVertices[path.index] - v).magnitude();
		}
		else
		{
			ip = p1 + s * (p2 - p1);
			path = PathIntersection(false, triNbr*3 + (eid + 1) % 3, (ip - v).magnitude(), s);
		}

		return 1;
	}

	// fallback to pick closer vertex when no edges intersect
	float dir = g.dot(v1-v0);
	path.vertOrTriangle = true;
	path.index = (dir > 0) ? e1 : e0;
	path.distance = (mVertices[path.index] - v).magnitude();

	return 1;
}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// compute geodesic distance and path from vertex i to its parent
// Traces a geodesic path over the surface from vertex i to 'parent' and
// accumulates its length. On failure errorCode is set negative (-1: no
// initial intersection, -2: path lost mid-trace, -3: too many steps,
// suggesting a cycle) and 0 is returned; the caller falls back to the
// Dijkstra distance in that case.
float PxClothGeodesicTetherCookerImpl::computeGeodesicDistance(uint32_t i, uint32_t parent, int &errorCode)
{
	if (i == parent)
		return 0.0f;

	PathIntersection path;

	errorCode = 0;

	// find initial intersection
	int status = computeVertexIntersection(parent, i, path);
	if (status < 0)
	{
		errorCode = -1;
		return 0;
	}

	int pathcnt = 0;
	float geodesicDistance = 0;

	// status becomes 0 when the trace reaches the parent vertex
	while (status > 0)
	{
		geodesicDistance += path.distance;

		if (path.vertOrTriangle)
			status = computeVertexIntersection(parent, path.index, path);
		else
			status = computeEdgeIntersection(parent, path.index, path.s, path);

		// cannot find valid path
		if (status < 0)
		{
			errorCode = -2;
			return 0.0f;
		}

		// possibly cycles, too many path
		if (pathcnt > 1000)
		{
			errorCode = -3;
			return 0.0f;
		}

		pathcnt++;
	}

	return geodesicDistance;
}
+
+
+
+
+#endif //APEX_USE_CLOTH_API
+
+
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.cpp b/APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.cpp
new file mode 100644
index 00000000..f94cb1f0
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.cpp
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "ExtClothConfig.h"
+#if APEX_USE_CLOTH_API
+
+#include "ExtClothMeshQuadifier.h"
+#include "PxStrideIterator.h"
+
+// from shared foundation
+#include <PsArray.h>
+#include <PsSort.h>
+#include <Ps.h>
+#include <PsMathUtils.h>
+
+using namespace nvidia;
+using namespace physx;
+
+// Pimpl backing store for PxClothMeshQuadifier: holds the source mesh
+// descriptor plus the quad/triangle index buffers that the descriptor
+// returned by getDescriptor() points into.
+struct nvidia::PxClothMeshQuadifierImpl
+{
+	PxClothMeshQuadifierImpl(const PxClothMeshDesc& desc);
+	PxClothMeshDesc getDescriptor() const;
+
+public:
+	PxClothMeshDesc mDesc;              // original descriptor (points, flags, ...)
+	shdfnd::Array<uint32_t> mQuads;     // 4 indices per quad
+	shdfnd::Array<uint32_t> mTriangles; // 3 indices per remaining triangle
+};
+
+// Thin public wrappers: all work happens in the impl constructor.
+PxClothMeshQuadifier::PxClothMeshQuadifier(const PxClothMeshDesc& desc)
+: mImpl(new PxClothMeshQuadifierImpl(desc))
+{
+}
+
+PxClothMeshQuadifier::~PxClothMeshQuadifier()
+{
+	delete mImpl;
+}
+
+PxClothMeshDesc PxClothMeshQuadifier::getDescriptor() const
+{
+	return mImpl->getDescriptor();
+}
+
+namespace
+{
+	// Edge keyed by its (sorted) endpoint pair. vertex2 holds the apex vertex
+	// of the first incident triangle, vertex3 (0xffffffff when absent) the
+	// apex of the second; an interior edge flagged isQuadDiagonal is a
+	// candidate for merging its two triangles into a quad.
+	struct UniqueEdge
+	{
+		// functor form so a dummy instance can serve as sort predicate
+		PX_FORCE_INLINE bool operator()(const UniqueEdge& e1, const UniqueEdge& e2) const
+		{
+			return e1 < e2;
+		}
+
+		// equality/ordering consider only the endpoint pair
+		PX_FORCE_INLINE bool operator==(const UniqueEdge& other) const
+		{
+			return vertex0 == other.vertex0 && vertex1 == other.vertex1;
+		}
+		PX_FORCE_INLINE bool operator<(const UniqueEdge& other) const
+		{
+			if (vertex0 != other.vertex0)
+			{
+				return vertex0 < other.vertex0;
+			}
+
+			return vertex1 < other.vertex1;
+		}
+
+		///////////////////////////////////////////////////////////////////////////////
+		UniqueEdge()
+		: vertex0(0), vertex1(0), vertex2(0), vertex3(0xffffffff),
+		maxAngle(0.0f), isQuadDiagonal(false), isUsed(false) {}
+
+		// endpoints are stored sorted so (v0,v1) and (v1,v0) compare equal
+		UniqueEdge(uint32_t v0, uint32_t v1, uint32_t v2)
+		: vertex0(PxMin(v0, v1)), vertex1(PxMax(v0, v1)), vertex2(v2), vertex3(0xffffffff),
+		maxAngle(0.0f), isQuadDiagonal(false), isUsed(false) {}
+
+
+		uint32_t vertex0, vertex1;
+		uint32_t vertex2, vertex3;
+		float maxAngle;      // largest corner cosine of the candidate quad (see refineUniqueEdges)
+		bool isQuadDiagonal; // true if this edge should be hidden inside a quad
+		bool isUsed;
+	};
+
+	// Orders edge indices by ascending maxAngle, i.e. most rectangular
+	// candidate quads first.
+	struct SortHiddenEdges
+	{
+		SortHiddenEdges(shdfnd::Array<UniqueEdge>& uniqueEdges) : mUniqueEdges(uniqueEdges) {}
+
+		bool operator()(uint32_t a, uint32_t b) const
+		{
+			return mUniqueEdges[a].maxAngle < mUniqueEdges[b].maxAngle;
+		}
+
+	private:
+		SortHiddenEdges& operator=(const SortHiddenEdges&);
+
+		shdfnd::Array<UniqueEdge>& mUniqueEdges;
+	};
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ template <typename T>
+ void copyIndices(const PxClothMeshDesc &desc, shdfnd::Array<uint32_t> &triangles, shdfnd::Array<uint32_t> &quads)
+ {
+ triangles.resize(desc.triangles.count*3);
+ PxStrideIterator<const T> tIt = physx::PxMakeIterator((const T*)desc.triangles.data, desc.triangles.stride);
+ for(uint32_t i=0; i<desc.triangles.count; ++i, ++tIt)
+ for(uint32_t j=0; j<3; ++j)
+ triangles[i*3+j] = tIt.ptr()[j];
+
+ quads.resize(desc.quads.count*4);
+ PxStrideIterator<const T> qIt = physx::PxMakeIterator((const T*)desc.quads.data, desc.quads.stride);
+ for(uint32_t i=0; i<desc.quads.count; ++i, ++qIt)
+ for(uint32_t j=0; j<4; ++j)
+ quads[i*4+j] = qIt.ptr()[j];
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ void computeUniqueEdges(shdfnd::Array<UniqueEdge> &uniqueEdges, const PxVec3* positions, const shdfnd::Array<uint32_t>& triangles)
+ {
+ uniqueEdges.resize(0);
+ uniqueEdges.reserve(triangles.size());
+ uint32_t indexMap[3][3] = { { 0, 1, 2 }, { 1, 2, 0 }, { 0, 2, 1 } };
+
+ const float rightAngle = PxCos(physx::shdfnd::degToRad(85.0f));
+
+ for(uint32_t i=0; i<triangles.size(); i+=3)
+ {
+ UniqueEdge edges[3];
+ float edgeLengths[3];
+ float edgeAngles[3];
+
+ for (uint32_t j = 0; j < 3; j++)
+ {
+ edges[j] = UniqueEdge(triangles[i+indexMap[j][0]], triangles[i+indexMap[j][1]], triangles[i+indexMap[j][2]]);
+ edgeLengths[j] = (positions[edges[j].vertex0] - positions[edges[j].vertex1]).magnitude();
+ const PxVec3 v1 = positions[edges[j].vertex2] - positions[edges[j].vertex0];
+ const PxVec3 v2 = positions[edges[j].vertex2] - positions[edges[j].vertex1];
+ edgeAngles[j] = PxAbs(v1.dot(v2)) / (v1.magnitude() * v2.magnitude());
+ }
+
+ // find the longest edge
+ uint32_t longest = 0;
+ for (uint32_t j = 1; j < 3; j++)
+ {
+ if (edgeLengths[j] > edgeLengths[longest])
+ longest = j;
+ }
+
+ // check it's angle
+ if (edgeAngles[longest] < rightAngle)
+ edges[longest].isQuadDiagonal = true;
+
+ for (uint32_t j = 0; j < 3; j++)
+ uniqueEdges.pushBack(edges[j]);
+ }
+
+ physx::shdfnd::sort(uniqueEdges.begin(), uniqueEdges.size(), UniqueEdge(0, 0, 0));
+
+ uint32_t writeIndex = 0, readStart = 0, readEnd = 0;
+ uint32_t numQuadEdges = 0;
+ while (readEnd < uniqueEdges.size())
+ {
+ while (readEnd < uniqueEdges.size() && uniqueEdges[readStart] == uniqueEdges[readEnd])
+ readEnd++;
+
+ const uint32_t count = readEnd - readStart;
+
+ UniqueEdge uniqueEdge = uniqueEdges[readStart];
+
+ if (count == 2)
+ // know the other diagonal
+ uniqueEdge.vertex3 = uniqueEdges[readStart + 1].vertex2;
+ else
+ uniqueEdge.isQuadDiagonal = false;
+
+ for (uint32_t i = 1; i < count; i++)
+ uniqueEdge.isQuadDiagonal &= uniqueEdges[readStart + i].isQuadDiagonal;
+
+ numQuadEdges += uniqueEdge.isQuadDiagonal ? 1 : 0;
+
+ uniqueEdges[writeIndex] = uniqueEdge;
+
+ writeIndex++;
+ readStart = readEnd;
+ }
+
+ uniqueEdges.resize(writeIndex, UniqueEdge(0, 0, 0));
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ uint32_t findUniqueEdge(const shdfnd::Array<UniqueEdge> &uniqueEdges, uint32_t index1, uint32_t index2)
+ {
+ UniqueEdge searchFor(index1, index2, 0);
+
+ uint32_t curMin = 0;
+ uint32_t curMax = uniqueEdges.size();
+ while (curMax > curMin)
+ {
+ uint32_t middle = (curMin + curMax) >> 1;
+
+ const UniqueEdge& probe = uniqueEdges[middle];
+ if (probe < searchFor)
+ curMin = middle + 1;
+ else
+ curMax = middle;
+ }
+
+ return curMin;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ void refineUniqueEdges(shdfnd::Array<UniqueEdge> &uniqueEdges, const PxVec3* positions)
+ {
+ shdfnd::Array<uint32_t> hideEdges;
+ hideEdges.reserve(uniqueEdges.size());
+
+ for (uint32_t i = 0; i < uniqueEdges.size(); i++)
+ {
+ UniqueEdge& uniqueEdge = uniqueEdges[i];
+ uniqueEdge.maxAngle = 0.0f;
+ uniqueEdge.isQuadDiagonal = false; // just to be sure
+
+ if (uniqueEdge.vertex3 != 0xffffffff)
+ {
+ uint32_t indices[4] = { uniqueEdge.vertex0, uniqueEdge.vertex2, uniqueEdge.vertex1, uniqueEdge.vertex3 };
+
+ // compute max angle of the quad
+ for (uint32_t j = 0; j < 4; j++)
+ {
+ PxVec3 e0 = positions[indices[ j + 0 ]] - positions[indices[(j + 1) % 4]];
+ PxVec3 e1 = positions[indices[(j + 1) % 4]] - positions[indices[(j + 2) % 4]];
+
+ float denominator = e0.magnitude() * e1.magnitude();
+ if (denominator != 0.0f)
+ {
+ float cosAngle = PxAbs(e0.dot(e1)) / denominator;
+ uniqueEdge.maxAngle = PxMax(uniqueEdge.maxAngle, cosAngle);
+ }
+ }
+
+ hideEdges.pushBack(i);
+ }
+ }
+
+ shdfnd::sort(hideEdges.begin(), hideEdges.size(), SortHiddenEdges(uniqueEdges));
+
+ const float maxAngle = PxSin(physx::shdfnd::degToRad(60.0f));
+
+ uint32_t numHiddenEdges = 0;
+
+ for (uint32_t i = 0; i < hideEdges.size(); i++)
+ {
+ UniqueEdge& uniqueEdge = uniqueEdges[hideEdges[i]];
+
+ // find some stop criterion
+ if (uniqueEdge.maxAngle > maxAngle)
+ break;
+
+ // check if all four adjacent edges are still visible?
+ uint32_t indices[5] = { uniqueEdge.vertex0, uniqueEdge.vertex2, uniqueEdge.vertex1, uniqueEdge.vertex3, uniqueEdge.vertex0 };
+
+ uint32_t numVisible = 0;
+ for (uint32_t j = 0; j < 4; j++)
+ {
+ const uint32_t edgeIndex = findUniqueEdge(uniqueEdges, indices[j], indices[j + 1]);
+ PX_ASSERT(edgeIndex < uniqueEdges.size());
+
+ numVisible += uniqueEdges[edgeIndex].isQuadDiagonal ? 0 : 1;
+ }
+
+ if (numVisible == 4)
+ {
+ uniqueEdge.isQuadDiagonal = true;
+ numHiddenEdges++;
+ }
+ }
+ }
+
+
+	// calculate the inclusive prefix sum, equivalent of std::partial_sum
+	// (dest may alias first, as in the in-place use below)
+	template <typename T>
+	void prefixSum(const T* first, const T* last, T* dest)
+	{
+		if (first != last)
+		{
+			*(dest++) = *(first++);
+			for (; first != last; ++first, ++dest)
+				*dest = *(dest-1) + *first;
+		}
+	}
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ void quadifyTriangles(const shdfnd::Array<UniqueEdge> &uniqueEdges, shdfnd::Array<uint32_t>& triangles, shdfnd::Array<uint32_t> &quads)
+ {
+ shdfnd::Array<uint32_t> valency(uniqueEdges.size()+1, 0); // edge valency
+ shdfnd::Array<uint32_t> adjacencies; // adjacency from unique edge to triangles
+ uint32_t numTriangles = triangles.size() / 3;
+
+ // compute edge valency w.r.t triangles
+ for(uint32_t i=0; i<numTriangles; ++i)
+ {
+ for (uint32_t j=0; j < 3; j++)
+ {
+ uint32_t uniqueEdgeIndex = findUniqueEdge(uniqueEdges, triangles[i*3+j], triangles[i*3+(j+1)%3]);
+ ++valency[uniqueEdgeIndex];
+ }
+ }
+
+ // compute adjacency from each edge to triangle, the value also encodes which side of the triangle this edge belongs to
+ prefixSum(valency.begin(), valency.end(), valency.begin());
+ adjacencies.resize(valency.back());
+ for(uint32_t i=0; i<numTriangles; ++i)
+ {
+ for (uint32_t j=0; j < 3; j++)
+ {
+ uint32_t uniqueEdgeIndex = findUniqueEdge(uniqueEdges, triangles[i*3+j], triangles[i*3+(j+1)%3]);
+ adjacencies[--valency[uniqueEdgeIndex]] = i*3+j;
+ }
+ }
+
+ // now go through unique edges that are identified as diagonal, and build a quad out of two adjacent triangles
+ shdfnd::Array<uint32_t> mark(numTriangles, 0);
+ for (uint32_t i = 0; i < uniqueEdges.size(); i++)
+ {
+ const UniqueEdge& edge = uniqueEdges[i];
+ if (edge.isQuadDiagonal)
+ {
+ uint32_t vi = valency[i];
+ if ((valency[i+1]-vi) != 2)
+ continue; // we do not quadify around non-manifold edges
+
+ uint32_t adj0 = adjacencies[vi], adj1 = adjacencies[vi+1];
+ uint32_t tid0 = adj0 / 3, tid1 = adj1 / 3;
+ uint32_t eid0 = adj0 % 3, eid1 = adj1 % 3;
+
+ quads.pushBack(triangles[tid0 * 3 + eid0]);
+ quads.pushBack(triangles[tid1 * 3 + (eid1+2)%3]);
+ quads.pushBack(triangles[tid0 * 3 + (eid0+1)%3]);
+ quads.pushBack(triangles[tid0 * 3 + (eid0+2)%3]);
+
+ mark[tid0] = 1;
+ mark[tid1] = 1;
+#if 0 // PX_DEBUG
+ printf("Deleting %d, %d, %d - %d, %d, %d, creating %d, %d, %d, %d\n",
+ triangles[tid0*3],triangles[tid0*3+1],triangles[tid0*3+2],
+ triangles[tid1*3],triangles[tid1*3+1],triangles[tid1*3+2],
+ v0,v3,v1,v2);
+#endif
+ }
+ }
+
+ // add remaining triangles that are not marked as already quadified
+ shdfnd::Array<uint32_t> oldTriangles = triangles;
+ triangles.resize(0);
+ for (uint32_t i = 0; i < numTriangles; i++)
+ {
+ if (mark[i]) continue;
+
+ triangles.pushBack(oldTriangles[i*3]);
+ triangles.pushBack(oldTriangles[i*3+1]);
+ triangles.pushBack(oldTriangles[i*3+2]);
+ }
+ }
+
+} // namespace
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Copies the mesh indices, finds quad diagonals, and performs the merge.
+PxClothMeshQuadifierImpl::PxClothMeshQuadifierImpl(const PxClothMeshDesc &desc)
+	:mDesc(desc)
+{
+	// gather particle positions from the (possibly strided) descriptor
+	shdfnd::Array<PxVec3> particles(desc.points.count);
+	PxStrideIterator<const PxVec3> pIt((const PxVec3*)desc.points.data, desc.points.stride);
+	for(uint32_t i=0; i<desc.points.count; ++i)
+		particles[i] = *pIt++;
+
+	// copy triangle indices
+	if(desc.flags & PxMeshFlag::e16_BIT_INDICES)
+		copyIndices<uint16_t>(desc, mTriangles, mQuads);
+	else
+		copyIndices<uint32_t>(desc, mTriangles, mQuads);
+
+	shdfnd::Array<UniqueEdge> uniqueEdges;
+
+	computeUniqueEdges(uniqueEdges, particles.begin(), mTriangles);
+
+	refineUniqueEdges(uniqueEdges, particles.begin());
+
+//	printf("before %d triangles, %d quads\n", mTriangles.size()/3, mQuads.size()/4);
+	quadifyTriangles(uniqueEdges, mTriangles, mQuads);
+
+//	printf("after %d triangles, %d quads\n", mTriangles.size()/3, mQuads.size()/4);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Returns a descriptor aliasing this object's index arrays; it is only valid
+// while the quadifier is alive (see header note).
+PxClothMeshDesc
+PxClothMeshQuadifierImpl::getDescriptor() const
+{
+	// copy points and other data
+	PxClothMeshDesc desc = mDesc;
+
+	// for now use only 32 bit for temporary indices out of quadifier
+	desc.flags &= ~PxMeshFlag::e16_BIT_INDICES;
+
+	desc.triangles.count = mTriangles.size() / 3;
+	desc.triangles.data = mTriangles.begin();
+	desc.triangles.stride = 3 * sizeof(uint32_t);
+
+	desc.quads.count = mQuads.size() / 4;
+	desc.quads.data = mQuads.begin();
+	desc.quads.stride = 4 * sizeof(uint32_t);
+
+	PX_ASSERT(desc.isValid());
+
+	return desc;
+}
+#endif //APEX_USE_CLOTH_API
+
+
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.h b/APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.h
new file mode 100644
index 00000000..653622f6
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothMeshQuadifier.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_EXTENSIONS_CLOTH_EDGE_QUADIFIER_H
+#define PX_PHYSICS_EXTENSIONS_CLOTH_EDGE_QUADIFIER_H
+
+#include "ExtClothConfig.h"
+#include "PxClothMeshDesc.h"
+
+#if PX_DOXYGEN == 0
+namespace nvidia
+{
+#endif
+
+struct PxClothMeshQuadifierImpl;
+
+// Utility that merges suitable triangle pairs of a cloth mesh into quads.
+class PxClothMeshQuadifier
+{
+public:
+	/**
+	\brief Convert triangles of PxClothMeshDesc to quads.
+	\details In PxCloth, quad dominant mesh representations are preferable to pre-triangulated versions.
+	In cases where the mesh has been already triangulated, this class provides a mechanism to
+	convert (quadify) some triangles back to quad representations.
+	\see PxClothFabricCooker
+	\param desc The cloth mesh descriptor prepared for cooking
+	*/
+	PxClothMeshQuadifier(const PxClothMeshDesc &desc);
+	~PxClothMeshQuadifier();
+
+	/**
+	\brief Returns a mesh descriptor with some triangle pairs converted to quads.
+	\note The returned descriptor is valid only within the lifespan of PxClothMeshQuadifier class.
+	*/
+	PxClothMeshDesc getDescriptor() const;
+
+private:
+	PxClothMeshQuadifierImpl* mImpl; // owned; deleted in destructor
+
+};
+
+#if PX_DOXYGEN == 0
+} // namespace nvidia
+#endif
+
+#endif // PX_PHYSICS_EXTENSIONS_CLOTH_EDGE_QUADIFIER_H
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothSimpleTetherCooker.cpp b/APEX_1.4/module/clothing/embedded/ExtClothSimpleTetherCooker.cpp
new file mode 100644
index 00000000..e5570186
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothSimpleTetherCooker.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "ExtClothConfig.h"
+#if APEX_USE_CLOTH_API
+
+#include "ExtClothTetherCooker.h"
+#include "PxStrideIterator.h"
+#include "PxVec4.h"
+#include "PsIntrinsics.h"
+#include "PsArray.h"
+
+using namespace nvidia;
+using namespace physx;
+
+// Pimpl backing for PxClothSimpleTetherCooker; owns the computed anchor
+// indices and tether lengths (one of each per particle on success).
+struct nvidia::PxClothSimpleTetherCookerImpl
+{
+	PxClothSimpleTetherCookerImpl(const PxClothMeshDesc& desc);
+
+	uint32_t getCookerStatus() const;
+	void getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const;
+
+public:
+	// output
+	shdfnd::Array<uint32_t> mTetherAnchors;
+	shdfnd::Array<float> mTetherLengths;
+
+protected:
+	void createTetherData(const PxClothMeshDesc &desc);
+
+	uint32_t mCookerStatus; // 0 = success, non-zero = failure
+};
+
+// Thin public wrappers forwarding to the impl.
+PxClothSimpleTetherCooker::PxClothSimpleTetherCooker(const PxClothMeshDesc& desc)
+: mImpl(new PxClothSimpleTetherCookerImpl(desc))
+{
+}
+
+PxClothSimpleTetherCooker::~PxClothSimpleTetherCooker()
+{
+	delete mImpl;
+}
+
+uint32_t PxClothSimpleTetherCooker::getCookerStatus() const
+{
+	return mImpl->getCookerStatus();
+}
+
+void PxClothSimpleTetherCooker::getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const
+{
+	mImpl->getTetherData(userTetherAnchors, userTetherLengths);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Status starts at 1 (failure) and is cleared by createTetherData on success.
+PxClothSimpleTetherCookerImpl::PxClothSimpleTetherCookerImpl(const PxClothMeshDesc &desc) : mCookerStatus(1)
+{
+	createTetherData(desc);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Build the tether data: for every particle, find the closest attached
+// (zero inverse-mass) particle by Euclidean distance and record its index and
+// distance. Sets mCookerStatus to 0 on success; leaves it non-zero when no
+// inverse masses are provided or when no particle is attached.
+void PxClothSimpleTetherCookerImpl::createTetherData(const PxClothMeshDesc &desc)
+{
+	uint32_t numParticles = desc.points.count;
+
+	if (!desc.invMasses.data)
+		return;
+
+	// assemble points
+	shdfnd::Array<PxVec4> particles;
+	particles.reserve(numParticles);
+	PxStrideIterator<const PxVec3> pIt((const PxVec3*)desc.points.data, desc.points.stride);
+	PxStrideIterator<const float> wIt((const float*)desc.invMasses.data, desc.invMasses.stride);
+	for(uint32_t i=0; i<numParticles; ++i)
+		particles.pushBack(PxVec4(*pIt++, wIt.ptr() ? *wIt++ : 1.0f));
+
+	// collect attached (fixed) particles: inverse mass of zero
+	shdfnd::Array<uint32_t> attachedIndices;
+	for(uint32_t i=0; i < numParticles; ++i)
+		if(particles[i].w == 0.0f)
+			attachedIndices.pushBack(i);
+
+	// no anchors -> no tether data, cooker status stays non-zero
+	uint32_t n = attachedIndices.empty() ? 0 : numParticles;
+
+	// reserve once up front (previously re-reserved every loop iteration)
+	mTetherAnchors.reserve(n);
+	mTetherLengths.reserve(n);
+
+	for(uint32_t i=0; i < n; ++i)
+	{
+		// brute-force scan for the nearest attached particle
+		PxVec3 position = reinterpret_cast<const PxVec3&>(particles[i]);
+		float minSqrDist = FLT_MAX;
+		uint32_t minIndex = numParticles;
+		const uint32_t *aIt, *aEnd = attachedIndices.end();
+		for(aIt = attachedIndices.begin(); aIt != aEnd; ++aIt)
+		{
+			float sqrDist = (reinterpret_cast<const PxVec3&>(
+				particles[*aIt]) - position).magnitudeSquared();
+			if(minSqrDist > sqrDist)
+				minSqrDist = sqrDist, minIndex = *aIt;
+		}
+
+		mTetherAnchors.pushBack(minIndex);
+		mTetherLengths.pushBack(PxSqrt(minSqrDist));
+	}
+
+	PX_ASSERT(mTetherAnchors.size() == mTetherLengths.size());
+	if (numParticles == mTetherAnchors.size() && numParticles == mTetherLengths.size())
+	{
+		mCookerStatus = 0;
+	}
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Non-zero indicates cooking failed (see createTetherData).
+uint32_t PxClothSimpleTetherCookerImpl::getCookerStatus() const
+{
+	return mCookerStatus;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Copies one anchor index and one length per particle into the user buffers;
+// per the header docs, the buffers must hold at least one entry per particle.
+void
+PxClothSimpleTetherCookerImpl::getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const
+{
+	intrinsics::memCopy(userTetherAnchors, mTetherAnchors.begin(), mTetherAnchors.size() * sizeof(uint32_t));
+	intrinsics::memCopy(userTetherLengths, mTetherLengths.begin(), mTetherLengths.size() * sizeof(float));
+}
+
+
+#endif //APEX_USE_CLOTH_API
+
+
diff --git a/APEX_1.4/module/clothing/embedded/ExtClothTetherCooker.h b/APEX_1.4/module/clothing/embedded/ExtClothTetherCooker.h
new file mode 100644
index 00000000..2c5d720a
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/ExtClothTetherCooker.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_EXTENSIONS_CLOTH_TETHER_COOKER_H
+#define PX_PHYSICS_EXTENSIONS_CLOTH_TETHER_COOKER_H
+
+#include "ExtClothConfig.h"
+#include "PxClothMeshDesc.h"
+
+#if PX_DOXYGEN == 0
+namespace nvidia
+{
+#endif
+
+struct PxClothSimpleTetherCookerImpl;
+
+// Euclidean-distance tether cooker (fast, one anchor per particle).
+class PxClothSimpleTetherCooker
+{
+public:
+	/**
+	\brief Compute tether data from PxClothMeshDesc with simple distance measure.
+	\details The tether constraint in PxCloth requires rest distance and anchor index to be precomputed during cooking time.
+	This cooker computes a simple Euclidean distance to closest anchor point.
+	The Euclidean distance measure works reasonably for flat cloth and flags and computation time is very fast.
+	With this cooker, there is only one tether anchor point per particle.
+	\see PxClothTetherGeodesicCooker for more accurate distance estimation.
+	\param desc The cloth mesh descriptor prepared for cooking
+	*/
+	PxClothSimpleTetherCooker(const PxClothMeshDesc &desc);
+	~PxClothSimpleTetherCooker();
+
+	/**
+	\brief Returns cooker status
+	\details This function returns cooker status after cooker computation is done.
+	A non-zero return value indicates a failure.
+	*/
+	uint32_t getCookerStatus() const;
+
+	/**
+	\brief Returns computed tether data.
+	\details This function returns anchor indices for each particle as well as desired distance between the tether anchor and the particle.
+	The user buffers should be at least as large as number of particles.
+	*/
+	void getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const;
+
+private:
+	PxClothSimpleTetherCookerImpl* mImpl; // owned; deleted in destructor
+
+};
+
+
+struct PxClothGeodesicTetherCookerImpl;
+
+// Geodesic-distance tether cooker (slower, but best for curved meshes).
+class PxClothGeodesicTetherCooker
+{
+public:
+	/**
+	\brief Compute tether data from PxClothMeshDesc using geodesic distance.
+	\details The tether constraint in PxCloth requires rest distance and anchor index to be precomputed during cooking time.
+	The provided tether cooker computes optimal tether distance with geodesic distance computation.
+	For curved and complex meshes, geodesic distance provides the best behavior for tether constraints.
+	But the cooking time is slower than the simple cooker.
+	\see PxClothSimpleTetherCooker
+	\param desc The cloth mesh descriptor prepared for cooking
+	\note The geodesic distance is optimized to work for intended use in tether constraint.
+	This is by no means a general purpose geodesic computation code for arbitrary meshes.
+	\note The geodesic cooker does not work with non-manifold input such as edges having more than two incident triangles,
+	or adjacent triangles following inconsistent winding order (e.g. clockwise vs counter-clockwise).
+	*/
+	PxClothGeodesicTetherCooker(const PxClothMeshDesc &desc);
+	~PxClothGeodesicTetherCooker();
+
+	/**
+	\brief Returns cooker status
+	\details This function returns cooker status after cooker computation is done.
+	A non-zero return value indicates a failure, 1 for non-manifold and 2 for inconsistent winding.
+	*/
+	uint32_t getCookerStatus() const;
+
+	/**
+	\brief Returns number of tether anchors per particle
+	\note Returned number indicates the maximum anchors.
+	If some particles are assigned fewer anchors, the anchor indices will be uint32_t(-1)
+	\note If there is no attached point in the input mesh descriptor, this will return 0 and no tether data will be generated.
+	*/
+	uint32_t getNbTethersPerParticle() const;
+
+	/**
+	\brief Returns computed tether data.
+	\details This function returns anchor indices for each particle as well as desired distance between the tether anchor and the particle.
+	The user buffers should be at least as large as number of particles * number of tethers per particle.
+	\see getNbTethersPerParticle()
+	*/
+	void getTetherData(uint32_t* userTetherAnchors, float* userTetherLengths) const;
+
+private:
+	PxClothGeodesicTetherCookerImpl* mImpl; // owned; deleted in destructor
+
+};
+
+
+#if PX_DOXYGEN == 0
+} // namespace nvidia
+#endif
+
+#endif // PX_PHYSICS_EXTENSIONS_CLOTH_TETHER_COOKER_H
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h
new file mode 100644
index 00000000..6f24e51f
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Cloth.h
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Range.h"
+#include "PhaseConfig.h"
+
+struct ID3D11Buffer;
+
+namespace nvidia
+{
+#if APEX_UE4
+ namespace Cm
+ {
+ class Task;
+ }
+#endif
+
+namespace cloth
+{
+
+class Factory;
+class Fabric;
+class Cloth;
+
+// Range over mapped particle data that keeps the owning cloth locked for as
+// long as any copy of the range is alive.
+// NOTE(review): the main constructor does not invoke mLock -- the Cloth
+// accessor creating the range is presumably expected to have locked already;
+// each copy takes one additional lock and every destruction releases one.
+// Confirm against the factory/cloth implementations.
+template <typename T>
+struct MappedRange : public Range<T>
+{
+	MappedRange(T* first, T* last, const Cloth& cloth, void (Cloth::*lock)() const, void (Cloth::*unlock)() const)
+	: Range<T>(first, last), mCloth(cloth), mLock(lock), mUnlock(unlock)
+	{
+	}
+
+	MappedRange(const MappedRange& other)
+	: Range<T>(other), mCloth(other.mCloth), mLock(other.mLock), mUnlock(other.mUnlock)
+	{
+		(mCloth.*mLock)();
+	}
+
+	~MappedRange()
+	{
+		(mCloth.*mUnlock)();
+	}
+
+  private:
+	MappedRange& operator=(const MappedRange&);
+
+	const Cloth& mCloth;
+	void (Cloth::*mLock)() const;
+	void (Cloth::*mUnlock)() const;
+};
+
+// Raw particle buffers returned by Cloth::getGpuParticles().
+struct GpuParticles
+{
+	PxVec4* mCurrent;
+	PxVec4* mPrevious;
+	ID3D11Buffer* mBuffer; // D3D11 interop buffer -- presumably null for non-DX factories; confirm
+};
+
+// abstract cloth instance
+// Pure-virtual interface implemented per backend; non-copyable (assignment
+// deleted, copy construction restricted to subclasses via clone()).
+class Cloth
+{
+	Cloth& operator=(const Cloth&);
+
+  protected:
+	Cloth()
+	{
+	}
+	Cloth(const Cloth&)
+	{
+	}
+
+  public:
+	virtual ~Cloth()
+	{
+	}
+
+	// same as factory.clone(*this)
+	virtual Cloth* clone(Factory& factory) const = 0;
+
+	virtual Fabric& getFabric() const = 0;
+	virtual Factory& getFactory() const = 0;
+
+	/* particle properties */
+
+	virtual uint32_t getNumParticles() const = 0;
+	virtual void lockParticles() const = 0;
+	virtual void unlockParticles() const = 0;
+	// return particle data for current and previous frame
+	// setting current invMass to zero locks particle.
+	virtual MappedRange<PxVec4> getCurrentParticles() = 0;
+	virtual MappedRange<const PxVec4> getCurrentParticles() const = 0;
+	virtual MappedRange<PxVec4> getPreviousParticles() = 0;
+	virtual MappedRange<const PxVec4> getPreviousParticles() const = 0;
+	virtual GpuParticles getGpuParticles() = 0;
+
+	// set position of cloth after next call to simulate()
+	virtual void setTranslation(const PxVec3& trans) = 0;
+	virtual void setRotation(const PxQuat& rot) = 0;
+
+	// get current position of cloth
+	virtual const PxVec3& getTranslation() const = 0;
+	virtual const PxQuat& getRotation() const = 0;
+
+	// zero inertia derived from method calls above (once)
+	virtual void clearInertia() = 0;
+
+	// adjust the position of the cloth without affecting the dynamics (to call after a world origin shift, for example)
+	virtual void teleport(const PxVec3& delta) = 0;
+
+	/* solver parameters */
+
+	// return delta time used for previous iteration
+	virtual float getPreviousIterationDt() const = 0;
+
+	// gravity in global coordinates
+	virtual void setGravity(const PxVec3&) = 0;
+	virtual PxVec3 getGravity() const = 0;
+
+	// damping of local particle velocity (1/stiffnessFrequency)
+	// 0 (default): velocity is unaffected, 1: velocity is zero'ed
+	virtual void setDamping(const PxVec3&) = 0;
+	virtual PxVec3 getDamping() const = 0;
+
+	// portion of local frame velocity applied to particles
+	// 0 (default): particles are unaffected
+	// same as damping: damp global particle velocity
+	virtual void setLinearDrag(const PxVec3&) = 0;
+	virtual PxVec3 getLinearDrag() const = 0;
+	virtual void setAngularDrag(const PxVec3&) = 0;
+	virtual PxVec3 getAngularDrag() const = 0;
+
+	// portion of local frame accelerations applied to particles
+	// 0: particles are unaffected, 1 (default): physically correct
+	virtual void setLinearInertia(const PxVec3&) = 0;
+	virtual PxVec3 getLinearInertia() const = 0;
+	virtual void setAngularInertia(const PxVec3&) = 0;
+	virtual PxVec3 getAngularInertia() const = 0;
+	virtual void setCentrifugalInertia(const PxVec3&) = 0;
+	virtual PxVec3 getCentrifugalInertia() const = 0;
+
+	// target solver iterations per second
+	virtual void setSolverFrequency(float) = 0;
+	virtual float getSolverFrequency() const = 0;
+
+	// damp, drag, stiffness exponent per second
+	virtual void setStiffnessFrequency(float) = 0;
+	virtual float getStiffnessFrequency() const = 0;
+
+	// filter width for averaging dt^2 factor of gravity and
+	// external acceleration, in numbers of iterations (default=30).
+	// NOTE(review): setter name is missing an 'r' ("Acceleation") while the
+	// getter is spelled correctly; renaming would break existing callers, so
+	// the misspelling is kept as-is.
+	virtual void setAcceleationFilterWidth(uint32_t) = 0;
+	virtual uint32_t getAccelerationFilterWidth() const = 0;
+
+	// setup edge constraint solver iteration
+	virtual void setPhaseConfig(Range<const PhaseConfig> configs) = 0;
+
+	/* collision parameters */
+
+	virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last) = 0;
+	virtual uint32_t getNumSpheres() const = 0;
+
+	virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last) = 0;
+	virtual uint32_t getNumCapsules() const = 0;
+
+	virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last) = 0;
+	virtual uint32_t getNumPlanes() const = 0;
+
+	virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last) = 0;
+	virtual uint32_t getNumConvexes() const = 0;
+
+	virtual void setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last) = 0;
+	virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first) = 0;
+	virtual uint32_t getNumTriangles() const = 0;
+
+	// check if we use ccd or not
+	virtual bool isContinuousCollisionEnabled() const = 0;
+	// set if we use ccd or not (disabled by default)
+	virtual void enableContinuousCollision(bool) = 0;
+
+	// controls how quickly mass is increased during collisions
+	virtual float getCollisionMassScale() const = 0;
+	virtual void setCollisionMassScale(float) = 0;
+
+	// friction
+	virtual void setFriction(float) = 0;
+	virtual float getFriction() const = 0;
+
+	// set virtual particles for collision handling.
+	// each indices element consists of 3 particle
+	// indices and an index into the lerp weights array.
+	virtual void setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights) = 0;
+	virtual uint32_t getNumVirtualParticles() const = 0;
+	virtual uint32_t getNumVirtualParticleWeights() const = 0;
+
+	/* tether constraint parameters */
+
+	virtual void setTetherConstraintScale(float scale) = 0;
+	virtual float getTetherConstraintScale() const = 0;
+	virtual void setTetherConstraintStiffness(float stiffness) = 0;
+	virtual float getTetherConstraintStiffness() const = 0;
+
+	/* motion constraint parameters */
+
+	// return reference to motion constraints (position, radius)
+	// The entire range must be written after calling this function.
+	virtual Range<PxVec4> getMotionConstraints() = 0;
+	virtual void clearMotionConstraints() = 0;
+	virtual uint32_t getNumMotionConstraints() const = 0;
+	virtual void setMotionConstraintScaleBias(float scale, float bias) = 0;
+	virtual float getMotionConstraintScale() const = 0;
+	virtual float getMotionConstraintBias() const = 0;
+	virtual void setMotionConstraintStiffness(float stiffness) = 0;
+	virtual float getMotionConstraintStiffness() const = 0;
+
+	/* separation constraint parameters */
+
+	// return reference to separation constraints (position, radius)
+	// The entire range must be written after calling this function.
+	virtual Range<PxVec4> getSeparationConstraints() = 0;
+	virtual void clearSeparationConstraints() = 0;
+	virtual uint32_t getNumSeparationConstraints() const = 0;
+
+	/* clear interpolation */
+
+	// assign current to previous positions for
+	// collision spheres, motion, and separation constraints
+	virtual void clearInterpolation() = 0;
+
+	/* particle acceleration parameters */
+
+	// return reference to particle accelerations (in local coordinates)
+	// The entire range must be written after calling this function.
+	virtual Range<PxVec4> getParticleAccelerations() = 0;
+	virtual void clearParticleAccelerations() = 0;
+	virtual uint32_t getNumParticleAccelerations() const = 0;
+
+	/* self collision */
+
+	virtual void setSelfCollisionDistance(float distance) = 0;
+	virtual float getSelfCollisionDistance() const = 0;
+	virtual void setSelfCollisionStiffness(float stiffness) = 0;
+	virtual float getSelfCollisionStiffness() const = 0;
+
+	virtual void setSelfCollisionIndices(Range<const uint32_t>) = 0;
+	virtual uint32_t getNumSelfCollisionIndices() const = 0;
+
+	/* rest positions */
+
+	// set rest particle positions used during self-collision
+	virtual void setRestPositions(Range<const PxVec4>) = 0;
+	virtual uint32_t getNumRestPositions() const = 0;
+
+	/* bounding box */
+
+	// current particle position bounds in local space
+	virtual const PxVec3& getBoundingBoxCenter() const = 0;
+	virtual const PxVec3& getBoundingBoxScale() const = 0;
+
+	/* sleeping (disabled by default) */
+
+	// max particle velocity (per axis) to pass sleep test
+	virtual void setSleepThreshold(float) = 0;
+	virtual float getSleepThreshold() const = 0;
+	// test sleep condition every nth millisecond
+	virtual void setSleepTestInterval(uint32_t) = 0;
+	virtual uint32_t getSleepTestInterval() const = 0;
+	// put cloth to sleep when n consecutive sleep tests pass
+	virtual void setSleepAfterCount(uint32_t) = 0;
+	virtual uint32_t getSleepAfterCount() const = 0;
+	virtual uint32_t getSleepPassCount() const = 0;
+	virtual bool isAsleep() const = 0;
+	virtual void putToSleep() = 0;
+	virtual void wakeUp() = 0;
+
+	virtual void setHalfPrecisionOption(bool isAllowed) = 0;
+	virtual bool getHalfPrecisionOption() const = 0;
+
+#if APEX_UE4
+	virtual void simulate(float dt) = 0;
+#endif
+
+	virtual void setUserData(void*) = 0;
+	virtual void* getUserData() const = 0;
+};
+
+// wrappers to prevent non-const overload from marking particles dirty
+// (forces selection of the const getCurrentParticles/getPreviousParticles)
+inline MappedRange<const PxVec4> readCurrentParticles(const Cloth& cloth)
+{
+	return cloth.getCurrentParticles();
+}
+inline MappedRange<const PxVec4> readPreviousParticles(const Cloth& cloth)
+{
+	return cloth.getPreviousParticles();
+}
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h
new file mode 100644
index 00000000..f271b397
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Fabric.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "PxAssert.h"
+#include "Range.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+class Factory;
+
+// abstract cloth constraints and triangle indices
+class Fabric
+{
+  protected:
+ // non-copyable: declared but not defined
+ Fabric(const Fabric&);
+ Fabric& operator=(const Fabric&);
+
+  protected:
+ // fabrics start with no references; ownership is tracked via inc/decRefCount
+ Fabric() : mRefCount(0)
+ {
+ }
+
+  public:
+ // all references must have been released before destruction
+ virtual ~Fabric()
+ {
+ PX_ASSERT(!mRefCount);
+ }
+
+ // factory that created this fabric
+ virtual Factory& getFactory() const = 0;
+
+ virtual uint32_t getNumPhases() const = 0;
+ virtual uint32_t getNumRestvalues() const = 0;
+
+ virtual uint32_t getNumSets() const = 0;
+ virtual uint32_t getNumIndices() const = 0;
+
+ virtual uint32_t getNumParticles() const = 0;
+
+ virtual uint32_t getNumTethers() const = 0;
+
+ // uniformly scale constraint rest values / tether lengths in place
+ virtual void scaleRestvalues(float) = 0;
+ virtual void scaleTetherLengths(float) = 0;
+
+ uint16_t getRefCount() const
+ {
+ return mRefCount;
+ }
+ void incRefCount()
+ {
+ ++mRefCount;
+ // overflow check: uint16_t wrap would make the count read as 0
+ PX_ASSERT(mRefCount > 0);
+ }
+ // NOTE(review): decRefCount does NOT destroy the fabric when the count
+ // reaches zero — the owner (presumably the factory or the last cloth)
+ // is responsible for deletion; confirm against the factory implementation.
+ void decRefCount()
+ {
+ PX_ASSERT(mRefCount > 0);
+ --mRefCount;
+ }
+
+  protected:
+ uint16_t mRefCount;
+};
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h
new file mode 100644
index 00000000..651b3b0c
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Factory.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "Range.h"
+
+typedef struct CUstream_st* CUstream;
+
+namespace physx
+{
+ namespace profile
+ {
+ class PxProfileZone;
+ }
+ class PxTaskManager;
+}
+
+namespace nvidia
+{
+namespace cloth
+{
+
+class Fabric;
+class Cloth;
+class Solver;
+class Character;
+
+/// abstract factory to create context-specific simulation components
+/// such as cloth, solver, collision, etc.
+class Factory
+{
+  public:
+ enum Platform
+ {
+ CPU,
+ CUDA,
+ DirectCompute
+ };
+
+  protected:
+ Factory(Platform platform) : mPlatform(platform)
+ {
+ }
+ // non-copyable: declared but not defined
+ Factory(const Factory&);
+ Factory& operator=(const Factory&);
+
+  public:
+ // create a factory for the given platform; the second argument is an
+ // optional platform-specific context pointer (0 selects the default) —
+ // NOTE(review): confirm the expected pointee type per platform (e.g. CUDA context)
+ static Factory* createFactory(Platform, void* = 0);
+
+ virtual ~Factory()
+ {
+ }
+
+ Platform getPlatform() const
+ {
+ return mPlatform;
+ }
+
+ /**
+ Create fabric data used to setup cloth object.
+ @param numParticles number of particles, must be larger than any particle index
+ @param phases map from phase to set index
+ @param sets inclusive prefix sum of restvalue count per set
+ @param restvalues array of constraint rest values
+ @param indices array of particle index pair per constraint
+ @param anchors anchor particle index per tether constraint (presumably one entry per tether; verify against the tether cooker output)
+ @param tetherLengths rest length per tether constraint (matches anchors element-wise; verify against the tether cooker output)
+ */
+ virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+ Range<const float> restvalues, Range<const uint32_t> indices,
+ Range<const uint32_t> anchors, Range<const float> tetherLengths) = 0;
+
+ /**
+ Create cloth object.
+ @param particles initial particle positions.
+ @param fabric edge distance constraint structure
+ */
+ virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric) = 0;
+
+ /**
+ Create cloth solver object.
+ @param profiler performance event receiver.
+ @param taskMgr PxTaskManager used for simulation.
+ */
+ virtual Solver* createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr) = 0;
+
+ /**
+ Create a copy of a cloth instance
+ @param cloth the instance to be cloned, need not match the factory type
+ */
+ virtual Cloth* clone(const Cloth& cloth) = 0;
+
+ /**
+ Extract original data from a fabric object
+ @param fabric to extract from, must match factory type
+ @param phases pre-allocated memory range to write phases
+ @param sets pre-allocated memory range to write sets
+ @param restvalues pre-allocated memory range to write restvalues
+ @param indices pre-allocated memory range to write indices
+ @param anchors pre-allocated memory range to write tether anchors
+ @param tetherLengths pre-allocated memory range to write tether lengths
+ */
+ virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+ Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+ Range<float> tetherLengths) const = 0;
+
+ /**
+ Extract current collision spheres and capsules from a cloth object
+ @param cloth the instance to extract from, must match factory type
+ @param spheres pre-allocated memory range to write spheres
+ @param capsules pre-allocated memory range to write capsules
+ @param planes pre-allocated memory range to write planes
+ @param convexes pre-allocated memory range to write convexes
+ @param triangles pre-allocated memory range to write triangles
+ */
+ virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+ Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const = 0;
+
+ /**
+ Extract current motion constraints from a cloth object
+ @param cloth the instance to extract from, must match factory type
+ @param destConstraints pre-allocated memory range to write constraints
+ */
+ virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const = 0;
+
+ /**
+ Extract current separation constraints from a cloth object
+ @param cloth the instance to extract from, must match factory type
+ @param destConstraints pre-allocated memory range to write constraints
+ */
+ virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const = 0;
+
+ /**
+ Extract current particle accelerations from a cloth object
+ @param cloth the instance to extract from, must match factory type
+ @param destAccelerations pre-allocated memory range to write accelerations
+ */
+ virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const = 0;
+
+ /**
+ Extract virtual particles from a cloth object
+ @param cloth the instance to extract from, must match factory type
+ @param destIndices pre-allocated memory range to write indices
+ @param destWeights pre-allocated memory range to write weights
+ */
+ virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+ Range<PxVec3> destWeights) const = 0;
+
+ /**
+ Extract self collision indices from cloth object.
+ @param cloth the instance to extract from, must match factory type
+ @param destIndices pre-allocated memory range to write indices
+ */
+ virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const = 0;
+
+ /**
+ Extract particle rest positions from cloth object.
+ @param cloth the instance to extract from, must match factory type
+ @param destRestPositions pre-allocated memory range to write rest positions
+ */
+ virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const = 0;
+
+  protected:
+ const Platform mPlatform;
+};
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h
new file mode 100644
index 00000000..4edf4802
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/PhaseConfig.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+struct PhaseConfig
+{
+ // defaults to uint16_t(-1), an out-of-range sentinel index
+ PhaseConfig(uint16_t index = uint16_t(-1));
+
+ uint16_t mPhaseIndex; // fabric phase this configuration applies to
+ uint16_t mPadding; // keeps the float members 4-byte aligned
+
+ // target convergence rate per iteration (1/solverFrequency)
+ float mStiffness;
+
+ float mStiffnessMultiplier; // NOTE(review): presumably a per-constraint scale on mStiffness — confirm in the solver
+
+ float mCompressionLimit;
+ float mStretchLimit;
+};
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h
new file mode 100644
index 00000000..7d48e195
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Range.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "PxAssert.h"
+#include "Types.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+// Non-owning view over a contiguous half-open range [mFirst, mLast).
+// Cheap to copy; does not manage the lifetime of the referenced elements.
+template <class T>
+struct Range
+{
+    // empty range
+    Range() : mFirst(0), mLast(0)
+    {
+    }
+
+    // view over [first, last); 'last' points one past the final element
+    Range(T* first, T* last) : mFirst(first), mLast(last)
+    {
+    }
+
+    // converting constructor, e.g. Range<T> -> Range<const T>
+    template <typename S>
+    Range(const Range<S>& other) : mFirst(other.begin()), mLast(other.end())
+    {
+    }
+
+    // number of elements (truncated to 32 bits)
+    uint32_t size() const
+    {
+        return uint32_t(mLast - mFirst);
+    }
+
+    bool empty() const
+    {
+        return mFirst >= mLast;
+    }
+
+    // shrink the view from the front; range must be non-empty
+    void popFront()
+    {
+        PX_ASSERT(mFirst < mLast);
+        ++mFirst;
+    }
+
+    // shrink the view from the back; range must be non-empty
+    void popBack()
+    {
+        PX_ASSERT(mFirst < mLast);
+        --mLast;
+    }
+
+    T* begin() const
+    {
+        return mFirst;
+    }
+
+    T* end() const
+    {
+        return mLast;
+    }
+
+    // first element; range must be non-empty
+    T& front() const
+    {
+        PX_ASSERT(mFirst < mLast);
+        return *mFirst;
+    }
+
+    // last element; range must be non-empty
+    T& back() const
+    {
+        PX_ASSERT(mFirst < mLast);
+        return mLast[-1];
+    }
+
+    // unchecked-in-release indexed access; i must be < size()
+    T& operator[](uint32_t i) const
+    {
+        PX_ASSERT(mFirst + i < mLast);
+        return mFirst[i];
+    }
+
+  private:
+    T* mFirst;
+    T* mLast; // past last element
+};
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h
new file mode 100644
index 00000000..585aab63
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Solver.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace physx
+{
+ class PxBaseTask;
+}
+
+namespace nvidia
+{
+namespace cloth
+{
+
+class Cloth;
+
+// called during inter-collision, user0 and user1 are the user data from each cloth
+typedef bool (*InterCollisionFilter)(void* user0, void* user1);
+
+/// base class for solvers
+class Solver
+{
+  protected:
+ // non-copyable: declared but not defined
+ Solver(const Solver&);
+ Solver& operator=(const Solver&);
+
+  protected:
+ Solver()
+ {
+ }
+
+  public:
+ virtual ~Solver()
+ {
+ }
+
+ /// add cloth object to this solver (note: returns void, not a success flag)
+ virtual void addCloth(Cloth*) = 0;
+
+ /// remove cloth object
+ virtual void removeCloth(Cloth*) = 0;
+
+ /// simulate one time step; returns a PxBaseTask reference —
+ /// NOTE(review): presumably the task completing the step, chained after the given continuation; confirm with a solver implementation
+ virtual PxBaseTask& simulate(float dt, PxBaseTask&) = 0;
+
+ // inter-collision parameters
+ virtual void setInterCollisionDistance(float distance) = 0;
+ virtual float getInterCollisionDistance() const = 0;
+ virtual void setInterCollisionStiffness(float stiffness) = 0;
+ virtual float getInterCollisionStiffness() const = 0;
+ virtual void setInterCollisionNbIterations(uint32_t nbIterations) = 0;
+ virtual uint32_t getInterCollisionNbIterations() const = 0;
+ virtual void setInterCollisionFilter(InterCollisionFilter filter) = 0;
+
+// virtual uint32_t getNumSharedPositions( const Cloth* ) const = 0;
+
+ /// returns true if an unrecoverable error has occurred
+ virtual bool hasError() const = 0;
+};
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h
new file mode 100644
index 00000000..e80a3009
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/include/Types.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifndef __CUDACC__
+#include "ApexUsingNamespace.h"
+#include "Px.h"
+#include "PxVec3.h"
+#include "PxVec4.h"
+#include "PxQuat.h"
+#endif
+
+// Factory.cpp gets included in both PhysXGPU and LowLevelCloth projects
+// CuFactory can only be created in PhysXGPU project
+// DxFactory can only be created in PhysXGPU (win) or LowLevelCloth (xbox1)
+#if defined(PX_PHYSX_GPU_EXPORTS) || PX_XBOXONE
+#define ENABLE_CUFACTORY ((PX_WINDOWS_FAMILY && (PX_WINRT==0)) || PX_LINUX)
+
+//TEMPORARY DISABLE DXFACTORY
+#define ENABLE_DXFACTORY 0
+//#define ENABLE_DXFACTORY ((PX_WINDOWS_FAMILY && (PX_WINRT==0)) || PX_XBOXONE)
+#else
+#define ENABLE_CUFACTORY 0
+#define ENABLE_DXFACTORY 0
+#endif
+
+#ifndef _MSC_VER
+#include <stdint.h>
+#else
+// typedef standard integer types
+typedef unsigned __int8 uint8_t;
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+typedef __int16 int16_t;
+typedef __int32 int32_t;
+#if _MSC_VER < 1600
+#define nullptr NULL
+#endif
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp
new file mode 100644
index 00000000..c6c297ca
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "Allocator.h"
+#include "PsAllocator.h"
+
+namespace nvidia
+{
+
+// Allocate n bytes through the PhysX foundation allocator.
+// Zero-sized requests return null without touching the allocator.
+// Note: __FILE__/__LINE__ always report this call site, not the caller's.
+void* cloth::allocate(size_t n)
+{
+    if(!n)
+        return 0;
+    return nvidia::getAllocator().allocate(n, "", __FILE__, __LINE__);
+}
+
+// Release memory obtained from cloth::allocate; null is a no-op.
+void cloth::deallocate(void* ptr)
+{
+    if(!ptr)
+        return;
+    nvidia::getAllocator().deallocate(ptr);
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h
new file mode 100644
index 00000000..c0488b43
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Allocator.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "PsArray.h"
+#include "PsAllocator.h"
+#include "PsAlignedMalloc.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+// allocation entry points backed by the PhysX foundation allocator (Allocator.cpp)
+void* allocate(size_t);
+void deallocate(void*);
+
+/* templated typedefs for convenience */
+
+// growable array using the non-tracking foundation allocator
+template <typename T>
+struct Vector
+{
+ typedef nvidia::Array<T, nvidia::NonTrackingAllocator> Type;
+};
+
+// growable array with over-aligned storage (e.g. for SIMD element types)
+template <typename T, size_t alignment>
+struct AlignedVector
+{
+ typedef nvidia::Array<T, nvidia::AlignedAllocator<alignment> > Type;
+};
+
+// base class routing operator new/delete through cloth::allocate/deallocate
+struct UserAllocated
+{
+ // virtual so deleting a derived object through a base pointer is safe
+ virtual ~UserAllocated()
+ {
+ }
+ static void* operator new(size_t n)
+ {
+ return allocate(n);
+ }
+ static void operator delete(void* ptr)
+ {
+ deallocate(ptr);
+ }
+};
+
+} // namespace cloth
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h
new file mode 100644
index 00000000..e9da59aa
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Array.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "PxVec4.h"
+#include "PxQuat.h"
+#include "PxVec3.h"
+#include "ApexUsingNamespace.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
+// View a Px vector/quaternion as a plain C array of floats (and back via
+// the const overloads). NOTE(review): relies on PxVec3/PxVec4/PxQuat being
+// laid out as tightly packed floats with no padding — confirm against the
+// foundation math headers; this is reinterpret_cast type punning.
+inline float (&array(PxVec3& v))[3]
+{
+ return reinterpret_cast<float(&)[3]>(v);
+}
+inline const float (&array(const PxVec3& v))[3]
+{
+ return reinterpret_cast<const float(&)[3]>(v);
+}
+inline float (&array(PxVec4& v))[4]
+{
+ return reinterpret_cast<float(&)[4]>(v);
+}
+inline const float (&array(const PxVec4& v))[4]
+{
+ return reinterpret_cast<const float(&)[4]>(v);
+}
+inline float (&array(PxQuat& q))[4]
+{
+ return reinterpret_cast<float(&)[4]>(q);
+}
+inline const float (&array(const PxQuat& q))[4]
+{
+ return reinterpret_cast<const float(&)[4]>(q);
+}
+
+} // namespace cloth
+
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h
new file mode 100644
index 00000000..339f6f12
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/BoundingBox.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Simd4f.h"
+#include <float.h>
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
+// axis-aligned bounding box stored as two SIMD corner vectors
+template <typename Simd4f>
+struct BoundingBox
+{
+ Simd4f mLower; // minimum corner
+ Simd4f mUpper; // maximum corner
+};
+
+// Load a box from 6 packed floats: lower xyz at ptr[0..2], upper xyz at
+// ptr[3..5]. Each load() reads 4 floats, so the two reads overlap at ptr[3];
+// the caller's buffer must therefore hold at least 7 readable floats.
+template <typename Simd4f>
+inline BoundingBox<Simd4f> loadBounds(const float* src)
+{
+    BoundingBox<Simd4f> box;
+    box.mLower = load(src);
+    box.mUpper = load(src + 3);
+    return box;
+}
+
+// Canonical empty box: lower = +FLT_MAX, upper = -FLT_MAX, so any point
+// expands it and isEmptyBounds() reports true.
+template <typename Simd4f>
+inline BoundingBox<Simd4f> emptyBounds()
+{
+    BoundingBox<Simd4f> box;
+    box.mLower = simd4f(FLT_MAX);
+    box.mUpper = -box.mLower;
+    return box;
+}
+
+// Return a copy of 'bounds' grown to enclose every point in [it, end).
+template <typename Simd4f>
+inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bounds, const Simd4f* it, const Simd4f* end)
+{
+    BoundingBox<Simd4f> box = bounds;
+    while(it != end)
+    {
+        box.mLower = min(box.mLower, *it);
+        box.mUpper = max(box.mUpper, *it);
+        ++it;
+    }
+    return box;
+}
+
+// Union of two boxes: the smallest box enclosing both a and b.
+template <typename Simd4f>
+inline BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b)
+{
+    BoundingBox<Simd4f> merged;
+    merged.mLower = min(a.mLower, b.mLower);
+    merged.mUpper = max(a.mUpper, b.mUpper);
+    return merged;
+}
+
+// Intersection of two boxes; may yield lower > upper (an empty box)
+// when a and b do not overlap.
+template <typename Simd4f>
+inline BoundingBox<Simd4f> intersectBounds(const BoundingBox<Simd4f>& a, const BoundingBox<Simd4f>& b)
+{
+    BoundingBox<Simd4f> overlap;
+    overlap.mLower = max(a.mLower, b.mLower);
+    overlap.mUpper = min(a.mUpper, b.mUpper);
+    return overlap;
+}
+
+// A box is empty when any component of the lower corner exceeds the upper.
+template <typename Simd4f>
+inline bool isEmptyBounds(const BoundingBox<Simd4f>& a)
+{
+    return 0 != anyGreater(a.mLower, a.mUpper);
+}
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h
new file mode 100644
index 00000000..641fc70f
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothBase.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "PsMathUtils.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+/* helper functions shared between SwCloth and CuCloth */
+
+// Initialize the shared member state of a freshly created cloth (SwCloth or
+// CuCloth) to its documented defaults, and compute the initial particle
+// bounding box from the positions in [it, end).
+template <typename ClothT>
+void initialize(ClothT& c, const PxVec4* it, const PxVec4* end)
+{
+    // particle bounds: fold min/max over all initial positions
+    PxVec4 lo(FLT_MAX), hi = -lo;
+    for(; it != end; ++it)
+    {
+        lo = lo.minimum(*it);
+        hi = hi.maximum(*it);
+    }
+    PxVec4 mid = (hi + lo) * 0.5f;
+    PxVec4 halfExtent = (hi - lo) * 0.5f;
+    // only xyz of the PxVec4 results are kept
+    c.mParticleBoundsCenter = reinterpret_cast<const PxVec3&>(mid);
+    c.mParticleBoundsHalfExtent = reinterpret_cast<const PxVec3&>(halfExtent);
+
+    // forces and drag: off by default
+    c.mGravity = PxVec3(0.0f);
+    c.mLogDamping = PxVec3(0.0f);
+    c.mLinearLogDrag = PxVec3(0.0f);
+    c.mAngularLogDrag = PxVec3(0.0f);
+    c.mLinearInertia = PxVec3(1.0f);
+    c.mAngularInertia = PxVec3(1.0f);
+    c.mCentrifugalInertia = PxVec3(1.0f);
+    c.mSolverFrequency = 60.0f;
+    c.mStiffnessFrequency = 10.0f;
+    // motion: identity transform, at rest
+    c.mTargetMotion = PxTransform(PxIdentity);
+    c.mCurrentMotion = PxTransform(PxIdentity);
+    c.mLinearVelocity = PxVec3(0.0f);
+    c.mAngularVelocity = PxVec3(0.0f);
+    c.mPrevIterDt = 0.0f;
+    c.mIterDtAvg = MovingAverage(30);
+    // -FLT_MAX_EXP log-stiffness encodes "constraint disabled"
+    c.mTetherConstraintLogStiffness = float(-FLT_MAX_EXP);
+    c.mTetherConstraintScale = 1.0f;
+    c.mMotionConstraintScale = 1.0f;
+    c.mMotionConstraintBias = 0.0f;
+    c.mMotionConstraintLogStiffness = float(-FLT_MAX_EXP);
+    c.mEnableContinuousCollision = false;
+    c.mCollisionMassScale = 0.0f;
+    c.mFriction = 0.0f;
+    c.mSelfCollisionDistance = 0.0f;
+    c.mSelfCollisionLogStiffness = float(-FLT_MAX_EXP);
+    // uint32_t(-1) interval/count effectively disables sleeping
+    c.mSleepTestInterval = uint32_t(-1);
+    c.mSleepAfterCount = uint32_t(-1);
+    c.mSleepThreshold = 0.0f;
+    c.mSleepPassCounter = 0;
+    c.mSleepTestCounter = 0;
+}
+
+// Member-wise copy of the shared cloth state between implementations
+// (e.g. SwCloth <-> CuCloth); used when cloning a cloth across factories.
+// Must mirror the member list written by initialize() above, plus the
+// half-precision flag and user data.
+template <typename DstT, typename SrcT>
+void copy(DstT& dst, const SrcT& src)
+{
+    dst.mParticleBoundsCenter = src.mParticleBoundsCenter;
+    dst.mParticleBoundsHalfExtent = src.mParticleBoundsHalfExtent;
+    dst.mGravity = src.mGravity;
+    dst.mLogDamping = src.mLogDamping;
+    dst.mLinearLogDrag = src.mLinearLogDrag;
+    dst.mAngularLogDrag = src.mAngularLogDrag;
+    dst.mLinearInertia = src.mLinearInertia;
+    dst.mAngularInertia = src.mAngularInertia;
+    dst.mCentrifugalInertia = src.mCentrifugalInertia;
+    dst.mSolverFrequency = src.mSolverFrequency;
+    dst.mStiffnessFrequency = src.mStiffnessFrequency;
+    dst.mTargetMotion = src.mTargetMotion;
+    dst.mCurrentMotion = src.mCurrentMotion;
+    dst.mLinearVelocity = src.mLinearVelocity;
+    dst.mAngularVelocity = src.mAngularVelocity;
+    dst.mPrevIterDt = src.mPrevIterDt;
+    dst.mIterDtAvg = src.mIterDtAvg;
+    dst.mTetherConstraintLogStiffness = src.mTetherConstraintLogStiffness;
+    dst.mTetherConstraintScale = src.mTetherConstraintScale;
+    dst.mMotionConstraintScale = src.mMotionConstraintScale;
+    dst.mMotionConstraintBias = src.mMotionConstraintBias;
+    dst.mMotionConstraintLogStiffness = src.mMotionConstraintLogStiffness;
+    dst.mEnableContinuousCollision = src.mEnableContinuousCollision;
+    dst.mCollisionMassScale = src.mCollisionMassScale;
+    dst.mFriction = src.mFriction;
+    dst.mSelfCollisionDistance = src.mSelfCollisionDistance;
+    dst.mSelfCollisionLogStiffness = src.mSelfCollisionLogStiffness;
+    dst.mSleepTestInterval = src.mSleepTestInterval;
+    dst.mSleepAfterCount = src.mSleepAfterCount;
+    dst.mSleepThreshold = src.mSleepThreshold;
+    dst.mSleepPassCounter = src.mSleepPassCounter;
+    dst.mSleepTestCounter = src.mSleepTestCounter;
+    dst.mIsAllowedHalfPrecisionSolver = src.mIsAllowedHalfPrecisionSolver;
+    dst.mUserData = src.mUserData;
+}
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothImpl.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothImpl.h
new file mode 100644
index 00000000..22206016
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/ClothImpl.h
@@ -0,0 +1,1247 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Cloth.h"
+#include "Fabric.h"
+#include "Allocator.h"
+#include "PsMathUtils.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+// SwCloth or CuCloth aggregate implementing the Cloth interface
+// Member specializations are implemented in Sw/CuCloth.cpp
+template <typename T>
+class ClothImpl : public UserAllocated, public Cloth
+{
+ ClothImpl(const ClothImpl&);
+
+ public:
+ ClothImpl& operator=(const ClothImpl&);
+
+ typedef T ClothType;
+ typedef typename ClothType::FactoryType FactoryType;
+ typedef typename ClothType::FabricType FabricType;
+ typedef typename ClothType::ContextLockType ContextLockType;
+
+ ClothImpl(Factory&, Fabric&, Range<const PxVec4>);
+ ClothImpl(Factory&, const ClothImpl&);
+
+ virtual Cloth* clone(Factory& factory) const;
+
+ virtual Fabric& getFabric() const;
+ virtual Factory& getFactory() const;
+
+ virtual uint32_t getNumParticles() const;
+ virtual void lockParticles() const;
+ virtual void unlockParticles() const;
+ virtual MappedRange<PxVec4> getCurrentParticles();
+ virtual MappedRange<const PxVec4> getCurrentParticles() const;
+ virtual MappedRange<PxVec4> getPreviousParticles();
+ virtual MappedRange<const PxVec4> getPreviousParticles() const;
+ virtual GpuParticles getGpuParticles();
+
+ virtual void setTranslation(const PxVec3& trans);
+ virtual void setRotation(const PxQuat& rot);
+
+ virtual const PxVec3& getTranslation() const;
+ virtual const PxQuat& getRotation() const;
+
+ virtual void clearInertia();
+
+ virtual void teleport(const PxVec3& delta);
+
+ virtual float getPreviousIterationDt() const;
+ virtual void setGravity(const PxVec3& gravity);
+ virtual PxVec3 getGravity() const;
+ virtual void setDamping(const PxVec3& damping);
+ virtual PxVec3 getDamping() const;
+ virtual void setLinearDrag(const PxVec3& drag);
+ virtual PxVec3 getLinearDrag() const;
+ virtual void setAngularDrag(const PxVec3& drag);
+ virtual PxVec3 getAngularDrag() const;
+ virtual void setLinearInertia(const PxVec3& inertia);
+ virtual PxVec3 getLinearInertia() const;
+ virtual void setAngularInertia(const PxVec3& inertia);
+ virtual PxVec3 getAngularInertia() const;
+ virtual void setCentrifugalInertia(const PxVec3& inertia);
+ virtual PxVec3 getCentrifugalInertia() const;
+
+ virtual void setSolverFrequency(float frequency);
+ virtual float getSolverFrequency() const;
+
+ virtual void setStiffnessFrequency(float frequency);
+ virtual float getStiffnessFrequency() const;
+
+ virtual void setAcceleationFilterWidth(uint32_t);
+ virtual uint32_t getAccelerationFilterWidth() const;
+
+ virtual void setPhaseConfig(Range<const PhaseConfig> configs);
+
+ virtual void setSpheres(Range<const PxVec4>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumSpheres() const;
+
+ virtual void setCapsules(Range<const uint32_t>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumCapsules() const;
+
+ virtual void setPlanes(Range<const PxVec4>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumPlanes() const;
+
+ virtual void setConvexes(Range<const uint32_t>, uint32_t first, uint32_t last);
+ virtual uint32_t getNumConvexes() const;
+
+ virtual void setTriangles(Range<const PxVec3>, uint32_t first, uint32_t last);
+ virtual void setTriangles(Range<const PxVec3>, Range<const PxVec3>, uint32_t first);
+ virtual uint32_t getNumTriangles() const;
+
+ virtual bool isContinuousCollisionEnabled() const;
+ virtual void enableContinuousCollision(bool);
+
+ virtual float getCollisionMassScale() const;
+ virtual void setCollisionMassScale(float);
+ virtual void setFriction(float friction);
+ virtual float getFriction() const;
+
+ virtual void setVirtualParticles(Range<const uint32_t[4]>, Range<const PxVec3>);
+ virtual uint32_t getNumVirtualParticles() const;
+ virtual uint32_t getNumVirtualParticleWeights() const;
+
+ virtual void setTetherConstraintScale(float scale);
+ virtual float getTetherConstraintScale() const;
+ virtual void setTetherConstraintStiffness(float stiffness);
+ virtual float getTetherConstraintStiffness() const;
+
+ virtual Range<PxVec4> getMotionConstraints();
+ virtual void clearMotionConstraints();
+ virtual uint32_t getNumMotionConstraints() const;
+ virtual void setMotionConstraintScaleBias(float scale, float bias);
+ virtual float getMotionConstraintScale() const;
+ virtual float getMotionConstraintBias() const;
+ virtual void setMotionConstraintStiffness(float stiffness);
+ virtual float getMotionConstraintStiffness() const;
+
+ virtual Range<PxVec4> getSeparationConstraints();
+ virtual void clearSeparationConstraints();
+ virtual uint32_t getNumSeparationConstraints() const;
+
+ virtual void clearInterpolation();
+
+ virtual Range<PxVec4> getParticleAccelerations();
+ virtual void clearParticleAccelerations();
+ virtual uint32_t getNumParticleAccelerations() const;
+
+ virtual void setSelfCollisionDistance(float);
+ virtual float getSelfCollisionDistance() const;
+ virtual void setSelfCollisionStiffness(float);
+ virtual float getSelfCollisionStiffness() const;
+
+ virtual void setSelfCollisionIndices(Range<const uint32_t>);
+ virtual uint32_t getNumSelfCollisionIndices() const;
+
+ virtual void setRestPositions(Range<const PxVec4>);
+ virtual uint32_t getNumRestPositions() const;
+
+ virtual const PxVec3& getBoundingBoxCenter() const;
+ virtual const PxVec3& getBoundingBoxScale() const;
+
+ virtual void setSleepThreshold(float);
+ virtual float getSleepThreshold() const;
+ virtual void setSleepTestInterval(uint32_t);
+ virtual uint32_t getSleepTestInterval() const;
+ virtual void setSleepAfterCount(uint32_t);
+ virtual uint32_t getSleepAfterCount() const;
+ virtual uint32_t getSleepPassCount() const;
+ virtual bool isAsleep() const;
+ virtual void putToSleep();
+ virtual void wakeUp();
+
+ virtual void setHalfPrecisionOption(bool isAllowed);
+ virtual bool getHalfPrecisionOption() const;
+
+#if APEX_UE4
+ virtual void simulate(float dt);
+#endif
+
+ virtual void setUserData(void*);
+ virtual void* getUserData() const;
+
+ // helper function
+ template <typename U>
+ MappedRange<U> getMappedParticles(U* data) const;
+
+ ClothType mCloth;
+};
+
+class SwCloth;
+typedef ClothImpl<SwCloth> SwClothImpl;
+
+class CuCloth;
+typedef ClothImpl<CuCloth> CuClothImpl;
+
+class DxCloth;
+typedef ClothImpl<DxCloth> DxClothImpl;
+
+template <typename T>
+ClothImpl<T>::ClothImpl(Factory& factory, Fabric& fabric, Range<const PxVec4> particles)
+: mCloth(static_cast<FactoryType&>(factory), static_cast<FabricType&>(fabric), particles)
+{
+ // fabric and cloth need to be created by the same factory
+ PX_ASSERT(&fabric.getFactory() == &factory);
+}
+
+template <typename T>
+ClothImpl<T>::ClothImpl(Factory& factory, const ClothImpl& impl)
+: mCloth(static_cast<FactoryType&>(factory), impl.mCloth)
+{
+}
+
+template <typename T>
+inline Fabric& ClothImpl<T>::getFabric() const
+{
+ return mCloth.mFabric;
+}
+
+template <typename T>
+inline Factory& ClothImpl<T>::getFactory() const
+{
+ return mCloth.mFactory;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setTranslation(const PxVec3& trans)
+{
+ PxVec3 t = reinterpret_cast<const PxVec3&>(trans);
+ if(t == mCloth.mTargetMotion.p)
+ return;
+
+ mCloth.mTargetMotion.p = t;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline void ClothImpl<T>::setRotation(const PxQuat& q)
+{
+ if((q - mCloth.mTargetMotion.q).magnitudeSquared() == 0.0f)
+ return;
+
+ mCloth.mTargetMotion.q = q;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline const PxVec3& ClothImpl<T>::getTranslation() const
+{
+ return mCloth.mTargetMotion.p;
+}
+
+template <typename T>
+inline const PxQuat& ClothImpl<T>::getRotation() const
+{
+ return mCloth.mTargetMotion.q;
+}
+
+template <typename T>
+inline void ClothImpl<T>::clearInertia()
+{
+ mCloth.mCurrentMotion = mCloth.mTargetMotion;
+ mCloth.mLinearVelocity = PxVec3(0.0f);
+ mCloth.mAngularVelocity = PxVec3(0.0f);
+
+ mCloth.wakeUp();
+}
+
+// Fixed 4505:local function has been removed
+template <typename T>
+inline void ClothImpl<T>::teleport(const PxVec3& delta)
+{
+ mCloth.mCurrentMotion.p += delta;
+ mCloth.mTargetMotion.p += delta;
+}
+
+template <typename T>
+inline float ClothImpl<T>::getPreviousIterationDt() const
+{
+ return mCloth.mPrevIterDt;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setGravity(const PxVec3& gravity)
+{
+ PxVec3 value = gravity;
+ if(value == mCloth.mGravity)
+ return;
+
+ mCloth.mGravity = value;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline PxVec3 ClothImpl<T>::getGravity() const
+{
+ return mCloth.mGravity;
+}
+
+inline float safeLog2(float x)
+{
+ return x ? physx::shdfnd::log2(x) : -FLT_MAX_EXP;
+}
+
+inline PxVec3 safeLog2(const PxVec3& v)
+{
+ return PxVec3(safeLog2(v.x), safeLog2(v.y), safeLog2(v.z));
+}
+
+inline float safeExp2(float x)
+{
+ if(x <= -FLT_MAX_EXP)
+ return 0.0f;
+ else
+ return physx::shdfnd::exp2(x);
+}
+
+inline PxVec3 safeExp2(const PxVec3& v)
+{
+ return PxVec3(safeExp2(v.x), safeExp2(v.y), safeExp2(v.z));
+}
+
+template <typename T>
+inline void ClothImpl<T>::setDamping(const PxVec3& damping)
+{
+ PxVec3 value = safeLog2(PxVec3(1.f) - damping);
+ if(value == mCloth.mLogDamping)
+ return;
+
+ mCloth.mLogDamping = value;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline PxVec3 ClothImpl<T>::getDamping() const
+{
+ return PxVec3(1.f) - safeExp2(mCloth.mLogDamping);
+}
+
+template <typename T>
+inline void ClothImpl<T>::setLinearDrag(const PxVec3& drag)
+{
+ PxVec3 value = safeLog2(PxVec3(1.f) - drag);
+ if(value == mCloth.mLinearLogDrag)
+ return;
+
+ mCloth.mLinearLogDrag = value;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline PxVec3 ClothImpl<T>::getLinearDrag() const
+{
+ return PxVec3(1.f) - safeExp2(mCloth.mLinearLogDrag);
+}
+
+template <typename T>
+inline void ClothImpl<T>::setAngularDrag(const PxVec3& drag)
+{
+ PxVec3 value = safeLog2(PxVec3(1.f) - drag);
+ if(value == mCloth.mAngularLogDrag)
+ return;
+
+ mCloth.mAngularLogDrag = value;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline PxVec3 ClothImpl<T>::getAngularDrag() const
+{
+ return PxVec3(1.f) - safeExp2(mCloth.mAngularLogDrag);
+}
+
+template <typename T>
+inline void ClothImpl<T>::setLinearInertia(const PxVec3& inertia)
+{
+ PxVec3 value = inertia;
+ if(value == mCloth.mLinearInertia)
+ return;
+
+ mCloth.mLinearInertia = value;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline PxVec3 ClothImpl<T>::getLinearInertia() const
+{
+ return mCloth.mLinearInertia;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setAngularInertia(const PxVec3& inertia)
+{
+ PxVec3 value = inertia;
+ if(value == mCloth.mAngularInertia)
+ return;
+
+ mCloth.mAngularInertia = value;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline PxVec3 ClothImpl<T>::getAngularInertia() const
+{
+ return mCloth.mAngularInertia;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setCentrifugalInertia(const PxVec3& inertia)
+{
+ PxVec3 value = inertia;
+ if(value == mCloth.mCentrifugalInertia)
+ return;
+
+ mCloth.mCentrifugalInertia = value;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline PxVec3 ClothImpl<T>::getCentrifugalInertia() const
+{
+ return mCloth.mCentrifugalInertia;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setSolverFrequency(float frequency)
+{
+ if(frequency == mCloth.mSolverFrequency)
+ return;
+
+ mCloth.mSolverFrequency = frequency;
+ mCloth.mClothCostDirty = true;
+ mCloth.mIterDtAvg.reset();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getSolverFrequency() const
+{
+ return mCloth.mSolverFrequency;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setStiffnessFrequency(float frequency)
+{
+ if(frequency == mCloth.mStiffnessFrequency)
+ return;
+
+ mCloth.mStiffnessFrequency = frequency;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getStiffnessFrequency() const
+{
+ return mCloth.mStiffnessFrequency;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setAcceleationFilterWidth(uint32_t n)
+{
+ mCloth.mIterDtAvg.resize(n);
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getAccelerationFilterWidth() const
+{
+ return mCloth.mIterDtAvg.size();
+}
+
+// move a subarray
+template <typename Iter>
+void move(Iter it, uint32_t first, uint32_t last, uint32_t result)
+{
+ if(result > first)
+ {
+ result += last - first;
+ while(first < last)
+ it[--result] = it[--last];
+ }
+ else
+ {
+ while(first < last)
+ it[result++] = it[first++];
+ }
+}
+
+// update capsule index
+inline bool updateIndex(uint32_t& index, uint32_t first, int32_t delta)
+{
+ return index >= first && int32_t(index += delta) < int32_t(first);
+}
+
+template <typename T>
+inline void ClothImpl<T>::setSpheres(Range<const PxVec4> spheres, uint32_t first, uint32_t last)
+{
+ uint32_t oldSize = uint32_t(mCloth.mStartCollisionSpheres.size());
+ uint32_t newSize = uint32_t(spheres.size()) + oldSize - last + first;
+
+ PX_ASSERT(newSize <= 32);
+ PX_ASSERT(first <= oldSize);
+ PX_ASSERT(last <= oldSize);
+
+#if PX_DEBUG
+ for(const PxVec4* it = spheres.begin(); it < spheres.end(); ++it)
+ PX_ASSERT(it->w >= 0.0f);
+#endif
+
+ if(!oldSize && !newSize)
+ return;
+
+ if(!oldSize)
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mStartCollisionSpheres.assign(spheres.begin(), spheres.end());
+ mCloth.notifyChanged();
+ }
+ else
+ {
+ if(PxMax(oldSize, newSize) >
+ PxMin(mCloth.mStartCollisionSpheres.capacity(), mCloth.mTargetCollisionSpheres.capacity()))
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mStartCollisionSpheres.reserve(newSize);
+ mCloth.mTargetCollisionSpheres.reserve(PxMax(oldSize, newSize));
+ }
+
+ typename T::MappedVec4fVectorType start = mCloth.mStartCollisionSpheres;
+ typename T::MappedVec4fVectorType target = mCloth.mTargetCollisionSpheres;
+
+ // fill target from start
+ for(uint32_t i = target.size(); i < oldSize; ++i)
+ target.pushBack(start[i]);
+
+ // resize to larger of oldSize and newSize
+ start.resize(PxMax(oldSize, newSize), PxVec4(0.0f));
+ target.resize(PxMax(oldSize, newSize), PxVec4(0.0f));
+
+ if(int32_t delta = int32_t(newSize - oldSize))
+ {
+ // move past-range elements to new place
+ move(start.begin(), last, oldSize, last + delta);
+ move(target.begin(), last, oldSize, last + delta);
+
+ // fill new elements from spheres
+ for(uint32_t i = last; i < last + delta; ++i)
+ start[i] = spheres[i - first];
+
+ // adjust capsule indices
+ typename T::MappedIndexVectorType indices = mCloth.mCapsuleIndices;
+ Vector<IndexPair>::Type::Iterator cIt, cEnd = indices.end();
+ for(cIt = indices.begin(); cIt != cEnd;)
+ {
+ bool removed = false;
+ removed |= updateIndex(cIt->first, last + PxMin(0, delta), int32_t(delta));
+ removed |= updateIndex(cIt->second, last + PxMin(0, delta), int32_t(delta));
+ if(!removed)
+ ++cIt;
+ else
+ {
+ indices.replaceWithLast(cIt);
+ cEnd = indices.end();
+ }
+ }
+
+ start.resize(newSize);
+ target.resize(newSize);
+
+ mCloth.notifyChanged();
+ }
+
+ // fill target elements with spheres
+ for(uint32_t i = 0; i < spheres.size(); ++i)
+ target[first + i] = spheres[i];
+ }
+
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumSpheres() const
+{
+ return uint32_t(mCloth.mStartCollisionSpheres.size());
+}
+
+// Fixed 4505:local function has been removed
+template <typename T>
+inline void ClothImpl<T>::setCapsules(Range<const uint32_t> capsules, uint32_t first, uint32_t last)
+{
+ uint32_t oldSize = mCloth.mCapsuleIndices.size();
+ uint32_t newSize = uint32_t(capsules.size() / 2) + oldSize - last + first;
+
+ PX_ASSERT(newSize <= 32);
+ PX_ASSERT(first <= oldSize);
+ PX_ASSERT(last <= oldSize);
+
+ const IndexPair* srcIndices = reinterpret_cast<const IndexPair*>(capsules.begin());
+
+ if(mCloth.mCapsuleIndices.capacity() < newSize)
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mCapsuleIndices.reserve(newSize);
+ }
+
+ // resize to larger of oldSize and newSize
+ mCloth.mCapsuleIndices.resize(PxMax(oldSize, newSize));
+
+ typename T::MappedIndexVectorType dstIndices = mCloth.mCapsuleIndices;
+
+ if(uint32_t delta = newSize - oldSize)
+ {
+ // move past-range elements to new place
+ move(dstIndices.begin(), last, oldSize, last + delta);
+
+ // fill new elements from capsules
+ for(uint32_t i = last; i < last + delta; ++i)
+ dstIndices[i] = srcIndices[i - first];
+
+ dstIndices.resize(newSize);
+ mCloth.notifyChanged();
+ }
+
+ // fill existing elements from capsules
+ for(uint32_t i = first; i < last; ++i)
+ dstIndices[i] = srcIndices[i - first];
+
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumCapsules() const
+{
+ return uint32_t(mCloth.mCapsuleIndices.size());
+}
+
+template <typename T>
+inline void ClothImpl<T>::setPlanes(Range<const PxVec4> planes, uint32_t first, uint32_t last)
+{
+ uint32_t oldSize = uint32_t(mCloth.mStartCollisionPlanes.size());
+ uint32_t newSize = uint32_t(planes.size()) + oldSize - last + first;
+
+ PX_ASSERT(newSize <= 32);
+ PX_ASSERT(first <= oldSize);
+ PX_ASSERT(last <= oldSize);
+
+ if(!oldSize && !newSize)
+ return;
+
+ if(!oldSize)
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mStartCollisionPlanes.assign(planes.begin(), planes.end());
+ mCloth.notifyChanged();
+ }
+ else
+ {
+ if(PxMax(oldSize, newSize) >
+ PxMin(mCloth.mStartCollisionPlanes.capacity(), mCloth.mTargetCollisionPlanes.capacity()))
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mStartCollisionPlanes.reserve(newSize);
+ mCloth.mTargetCollisionPlanes.reserve(PxMax(oldSize, newSize));
+ }
+
+ // fill target from start
+ for(uint32_t i = mCloth.mTargetCollisionPlanes.size(); i < oldSize; ++i)
+ mCloth.mTargetCollisionPlanes.pushBack(mCloth.mStartCollisionPlanes[i]);
+
+ // resize to larger of oldSize and newSize
+ mCloth.mStartCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);
+ mCloth.mTargetCollisionPlanes.resize(PxMax(oldSize, newSize), PxZero);
+
+ if(int32_t delta = int32_t(newSize - oldSize))
+ {
+ // move past-range elements to new place
+ move(mCloth.mStartCollisionPlanes.begin(), last, oldSize, last + delta);
+ move(mCloth.mTargetCollisionPlanes.begin(), last, oldSize, last + delta);
+
+ // fill new elements from planes
+ for(uint32_t i = last; i < last + delta; ++i)
+ mCloth.mStartCollisionPlanes[i] = planes[i - first];
+
+ // adjust convex indices
+ uint32_t mask = (uint32_t(1) << (last + PxMin(delta, 0))) - 1;
+ Vector<uint32_t>::Type::Iterator cIt, cEnd = mCloth.mConvexMasks.end();
+ for(cIt = mCloth.mConvexMasks.begin(); cIt != cEnd;)
+ {
+ uint32_t convex = (*cIt & mask);
+ if(delta < 0)
+ convex |= *cIt >> -delta & ~mask;
+ else
+ convex |= (*cIt & ~mask) << delta;
+ if(convex)
+ *cIt++ = convex;
+ else
+ {
+ mCloth.mConvexMasks.replaceWithLast(cIt);
+ cEnd = mCloth.mConvexMasks.end();
+ }
+ }
+
+ mCloth.mStartCollisionPlanes.resize(newSize);
+ mCloth.mTargetCollisionPlanes.resize(newSize);
+
+ mCloth.notifyChanged();
+ }
+
+ // fill target elements with planes
+ for(uint32_t i = 0; i < planes.size(); ++i)
+ mCloth.mTargetCollisionPlanes[first + i] = planes[i];
+ }
+
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumPlanes() const
+{
+ return uint32_t(mCloth.mStartCollisionPlanes.size());
+}
+
+template <typename T>
+inline void ClothImpl<T>::setConvexes(Range<const uint32_t> convexes, uint32_t first, uint32_t last)
+{
+ uint32_t oldSize = mCloth.mConvexMasks.size();
+ uint32_t newSize = uint32_t(convexes.size()) + oldSize - last + first;
+
+ PX_ASSERT(newSize <= 32);
+ PX_ASSERT(first <= oldSize);
+ PX_ASSERT(last <= oldSize);
+
+ if(mCloth.mConvexMasks.capacity() < newSize)
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mConvexMasks.reserve(newSize);
+ }
+
+ // resize to larger of oldSize and newSize
+ mCloth.mConvexMasks.resize(PxMax(oldSize, newSize));
+
+ if(uint32_t delta = newSize - oldSize)
+ {
+ // move past-range elements to new place
+ move(mCloth.mConvexMasks.begin(), last, oldSize, last + delta);
+
+ // fill new elements from capsules
+ for(uint32_t i = last; i < last + delta; ++i)
+ mCloth.mConvexMasks[i] = convexes[i - first];
+
+ mCloth.mConvexMasks.resize(newSize);
+ mCloth.notifyChanged();
+ }
+
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumConvexes() const
+{
+ return uint32_t(mCloth.mConvexMasks.size());
+}
+
+template <typename T>
+inline void ClothImpl<T>::setTriangles(Range<const PxVec3> triangles, uint32_t first, uint32_t last)
+{
+ // convert from triangle to vertex count
+ first *= 3;
+ last *= 3;
+
+ triangles = mCloth.clampTriangleCount(triangles, last - first);
+ PX_ASSERT(0 == triangles.size() % 3);
+
+ uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
+ uint32_t newSize = uint32_t(triangles.size()) + oldSize - last + first;
+
+ PX_ASSERT(first <= oldSize);
+ PX_ASSERT(last <= oldSize);
+
+ if(!oldSize && !newSize)
+ return;
+
+ if(!oldSize)
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mStartCollisionTriangles.assign(triangles.begin(), triangles.end());
+ mCloth.notifyChanged();
+ }
+ else
+ {
+ if(PxMax(oldSize, newSize) >
+ PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mStartCollisionTriangles.reserve(newSize);
+ mCloth.mTargetCollisionTriangles.reserve(PxMax(oldSize, newSize));
+ }
+
+ // fill target from start
+ for(uint32_t i = mCloth.mTargetCollisionTriangles.size(); i < oldSize; ++i)
+ mCloth.mTargetCollisionTriangles.pushBack(mCloth.mStartCollisionTriangles[i]);
+
+ // resize to larger of oldSize and newSize
+ mCloth.mStartCollisionTriangles.resize(PxMax(oldSize, newSize));
+ mCloth.mTargetCollisionTriangles.resize(PxMax(oldSize, newSize));
+
+ if(uint32_t delta = newSize - oldSize)
+ {
+ // move past-range elements to new place
+ move(mCloth.mStartCollisionTriangles.begin(), last, oldSize, last + delta);
+ move(mCloth.mTargetCollisionTriangles.begin(), last, oldSize, last + delta);
+
+ // fill new elements from triangles
+ for(uint32_t i = last; i < last + delta; ++i)
+ mCloth.mStartCollisionTriangles[i] = triangles[i - first];
+
+ mCloth.mStartCollisionTriangles.resize(newSize);
+ mCloth.mTargetCollisionTriangles.resize(newSize);
+
+ mCloth.notifyChanged();
+ }
+
+ // fill target elements with triangles
+ for(uint32_t i = 0; i < triangles.size(); ++i)
+ mCloth.mTargetCollisionTriangles[first + i] = triangles[i];
+ }
+
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline void ClothImpl<T>::setTriangles(Range<const PxVec3> startTriangles, Range<const PxVec3> targetTriangles,
+ uint32_t first)
+{
+ PX_ASSERT(startTriangles.size() == targetTriangles.size());
+
+ // convert from triangle to vertex count
+ first *= 3;
+
+ uint32_t last = uint32_t(mCloth.mStartCollisionTriangles.size());
+
+ startTriangles = mCloth.clampTriangleCount(startTriangles, last - first);
+ targetTriangles = mCloth.clampTriangleCount(targetTriangles, last - first);
+
+ uint32_t oldSize = uint32_t(mCloth.mStartCollisionTriangles.size());
+ uint32_t newSize = uint32_t(startTriangles.size()) + oldSize - last + first;
+
+ PX_ASSERT(first <= oldSize);
+ PX_ASSERT(last == oldSize); // this path only supports replacing the tail
+
+ if(!oldSize && !newSize)
+ return;
+
+ if(newSize > PxMin(mCloth.mStartCollisionTriangles.capacity(), mCloth.mTargetCollisionTriangles.capacity()))
+ {
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mStartCollisionTriangles.reserve(newSize);
+ mCloth.mTargetCollisionTriangles.reserve(newSize);
+ }
+
+ uint32_t retainSize = oldSize - last + first;
+ mCloth.mStartCollisionTriangles.resize(retainSize);
+ mCloth.mTargetCollisionTriangles.resize(retainSize);
+
+ for(uint32_t i = 0, n = startTriangles.size(); i < n; ++i)
+ {
+ mCloth.mStartCollisionTriangles.pushBack(startTriangles[i]);
+ mCloth.mTargetCollisionTriangles.pushBack(targetTriangles[i]);
+ }
+
+ if(newSize - oldSize)
+ mCloth.notifyChanged();
+
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumTriangles() const
+{
+ return uint32_t(mCloth.mStartCollisionTriangles.size()) / 3;
+}
+
+template <typename T>
+inline bool ClothImpl<T>::isContinuousCollisionEnabled() const
+{
+ return mCloth.mEnableContinuousCollision;
+}
+
+template <typename T>
+inline void ClothImpl<T>::enableContinuousCollision(bool enable)
+{
+ if(enable == mCloth.mEnableContinuousCollision)
+ return;
+
+ mCloth.mEnableContinuousCollision = enable;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getCollisionMassScale() const
+{
+ return mCloth.mCollisionMassScale;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setCollisionMassScale(float scale)
+{
+ if(scale == mCloth.mCollisionMassScale)
+ return;
+
+ mCloth.mCollisionMassScale = scale;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline void ClothImpl<T>::setFriction(float friction)
+{
+ mCloth.mFriction = friction;
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getFriction() const
+{
+ return mCloth.mFriction;
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumVirtualParticleWeights() const
+{
+ return uint32_t(mCloth.mVirtualParticleWeights.size());
+}
+
+template <typename T>
+inline void ClothImpl<T>::setTetherConstraintScale(float scale)
+{
+ if(scale == mCloth.mTetherConstraintScale)
+ return;
+
+ mCloth.mTetherConstraintScale = scale;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getTetherConstraintScale() const
+{
+ return mCloth.mTetherConstraintScale;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setTetherConstraintStiffness(float stiffness)
+{
+ float value = safeLog2(1 - stiffness);
+ if(value == mCloth.mTetherConstraintLogStiffness)
+ return;
+
+ mCloth.mTetherConstraintLogStiffness = value;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getTetherConstraintStiffness() const
+{
+ return 1 - safeExp2(mCloth.mTetherConstraintLogStiffness);
+}
+
+template <typename T>
+inline Range<PxVec4> ClothImpl<T>::getMotionConstraints()
+{
+ mCloth.wakeUp();
+ return mCloth.push(mCloth.mMotionConstraints);
+}
+
+template <typename T>
+inline void ClothImpl<T>::clearMotionConstraints()
+{
+ mCloth.clear(mCloth.mMotionConstraints);
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumMotionConstraints() const
+{
+ return uint32_t(mCloth.mMotionConstraints.mStart.size());
+}
+
+template <typename T>
+inline void ClothImpl<T>::setMotionConstraintScaleBias(float scale, float bias)
+{
+ if(scale == mCloth.mMotionConstraintScale && bias == mCloth.mMotionConstraintBias)
+ return;
+
+ mCloth.mMotionConstraintScale = scale;
+ mCloth.mMotionConstraintBias = bias;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getMotionConstraintScale() const
+{
+ return mCloth.mMotionConstraintScale;
+}
+
+template <typename T>
+inline float ClothImpl<T>::getMotionConstraintBias() const
+{
+ return mCloth.mMotionConstraintBias;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setMotionConstraintStiffness(float stiffness)
+{
+ float value = safeLog2(1 - stiffness);
+ if(value == mCloth.mMotionConstraintLogStiffness)
+ return;
+
+ mCloth.mMotionConstraintLogStiffness = value;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getMotionConstraintStiffness() const
+{
+ return 1 - safeExp2(mCloth.mMotionConstraintLogStiffness);
+}
+
+template <typename T>
+inline Range<PxVec4> ClothImpl<T>::getSeparationConstraints()
+{
+ mCloth.wakeUp();
+ return mCloth.push(mCloth.mSeparationConstraints);
+}
+
+template <typename T>
+inline void ClothImpl<T>::clearSeparationConstraints()
+{
+ mCloth.clear(mCloth.mSeparationConstraints);
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline void ClothImpl<T>::clearInterpolation()
+{
+ if(!mCloth.mTargetCollisionSpheres.empty())
+ {
+ nvidia::swap(mCloth.mStartCollisionSpheres, mCloth.mTargetCollisionSpheres);
+ mCloth.mTargetCollisionSpheres.resize(0);
+ }
+ mCloth.mMotionConstraints.pop();
+ mCloth.mSeparationConstraints.pop();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumSeparationConstraints() const
+{
+ return uint32_t(mCloth.mSeparationConstraints.mStart.size());
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumParticleAccelerations() const
+{
+ return uint32_t(mCloth.mParticleAccelerations.size());
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumSelfCollisionIndices() const
+{
+ return uint32_t(mCloth.mSelfCollisionIndices.size());
+}
+
+// Fixed 4505:local function has been removed
+template <typename T>
+inline void ClothImpl<T>::setRestPositions(Range<const PxVec4> restPositions)
+{
+ PX_ASSERT(restPositions.empty() || restPositions.size() == getNumParticles());
+ ContextLockType contextLock(mCloth.mFactory);
+ mCloth.mRestPositions.assign(restPositions.begin(), restPositions.end());
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getNumRestPositions() const
+{
+ return uint32_t(mCloth.mRestPositions.size());
+}
+
+template <typename T>
+inline void ClothImpl<T>::setSelfCollisionDistance(float distance)
+{
+ if(distance == mCloth.mSelfCollisionDistance)
+ return;
+
+ mCloth.mSelfCollisionDistance = distance;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getSelfCollisionDistance() const
+{
+ return mCloth.mSelfCollisionDistance;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setSelfCollisionStiffness(float stiffness)
+{
+ float value = safeLog2(1 - stiffness);
+ if(value == mCloth.mSelfCollisionLogStiffness)
+ return;
+
+ mCloth.mSelfCollisionLogStiffness = value;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getSelfCollisionStiffness() const
+{
+ return 1 - safeExp2(mCloth.mSelfCollisionLogStiffness);
+}
+
+template <typename T>
+inline const PxVec3& ClothImpl<T>::getBoundingBoxCenter() const
+{
+ return mCloth.mParticleBoundsCenter;
+}
+
+template <typename T>
+inline const PxVec3& ClothImpl<T>::getBoundingBoxScale() const
+{
+ return mCloth.mParticleBoundsHalfExtent;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setSleepThreshold(float threshold)
+{
+ if(threshold == mCloth.mSleepThreshold)
+ return;
+
+ mCloth.mSleepThreshold = threshold;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline float ClothImpl<T>::getSleepThreshold() const
+{
+ return mCloth.mSleepThreshold;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setSleepTestInterval(uint32_t interval)
+{
+ if(interval == mCloth.mSleepTestInterval)
+ return;
+
+ mCloth.mSleepTestInterval = interval;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getSleepTestInterval() const
+{
+ return mCloth.mSleepTestInterval;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setSleepAfterCount(uint32_t afterCount)
+{
+ if(afterCount == mCloth.mSleepAfterCount)
+ return;
+
+ mCloth.mSleepAfterCount = afterCount;
+ mCloth.notifyChanged();
+ mCloth.wakeUp();
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getSleepAfterCount() const
+{
+ return mCloth.mSleepAfterCount;
+}
+
+template <typename T>
+inline uint32_t ClothImpl<T>::getSleepPassCount() const
+{
+ return mCloth.mSleepPassCounter;
+}
+
+template <typename T>
+inline bool ClothImpl<T>::isAsleep() const
+{
+ return mCloth.isSleeping();
+}
+
+template <typename T>
+inline void ClothImpl<T>::putToSleep()
+{
+ mCloth.mSleepPassCounter = mCloth.mSleepAfterCount;
+}
+
+template <typename T>
+inline void ClothImpl<T>::wakeUp()
+{
+ mCloth.wakeUp();
+}
+
+
+template <typename T>
+inline void ClothImpl<T>::setHalfPrecisionOption(bool isAllowed)
+{
+ mCloth.mIsAllowedHalfPrecisionSolver = isAllowed;
+}
+
+template <typename T>
+inline bool ClothImpl<T>::getHalfPrecisionOption() const
+{
+ return mCloth.mIsAllowedHalfPrecisionSolver;
+}
+
+template <typename T>
+inline void ClothImpl<T>::setUserData(void* data)
+{
+ mCloth.mUserData = data;
+}
+
+template <typename T>
+inline void* ClothImpl<T>::getUserData() const
+{
+ return mCloth.mUserData;
+}
+
+template <typename T>
+template <typename U>
+inline MappedRange<U> ClothImpl<T>::getMappedParticles(U* data) const
+{
+ return MappedRange<U>(data, data + getNumParticles(), *this, &Cloth::lockParticles, &Cloth::unlockParticles);
+}
+
+} // namespace cloth
+
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp
new file mode 100644
index 00000000..6e49c85f
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Factory.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwFactory.h"
+
+#if ENABLE_CUFACTORY
+#include "CuFactory.h"
+#endif
+
+#if ENABLE_DXFACTORY
+#include "windows/DxFactory.h"
+//#include "PxGraphicsContextManager.h"
+#pragma warning(disable : 4668 4917 4365 4061 4005)
+#if PX_XBOXONE
+#include <d3d11_x.h>
+#else
+#include <d3d11.h>
+#endif
+#endif
+
+namespace nvidia
+{
+namespace cloth
+{
// Hands out process-wide sequential fabric ids, starting at 0.
// Not synchronized: concurrent callers may observe duplicate ids.
uint32_t getNextFabricId()
{
	static uint32_t sFabricIdCounter = 0;
	uint32_t id = sFabricIdCounter;
	++sFabricIdCounter;
	return id;
}
+}
+}
+
+using namespace nvidia;
+
// Creates a cloth factory for the requested platform.
// Returns a heap-allocated factory owned by the caller, or 0 (null) when the
// platform is unknown or its backend was not compiled into this build.
// contextManager: cast to PxCudaContextManager* for the CUDA backend;
// ignored by the CPU path.
cloth::Factory* cloth::Factory::createFactory(Platform platform, void* contextManager)
{
	PX_UNUSED(contextManager); // only referenced when a GPU backend is enabled

	if(platform == Factory::CPU)
		return new SwFactory;

#if ENABLE_CUFACTORY
	if(platform == Factory::CUDA)
		return new CuFactory((PxCudaContextManager*)contextManager);
#endif

#if ENABLE_DXFACTORY
	if(platform == Factory::DirectCompute)
	{
		// DirectCompute backend is currently disabled (graphics context
		// manager API commented out); falls through to the null return.
		//physx::PxGraphicsContextManager* graphicsContextManager = (physx::PxGraphicsContextManager*)contextManager;
		//if(graphicsContextManager->getDevice()->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0)
		// return new DxFactory(graphicsContextManager);
	}
#endif

	return 0;
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h
new file mode 100644
index 00000000..89dd9090
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IndexPair.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
// Plain aggregate pair of 32-bit indices.
struct IndexPair
{
	uint32_t first;
	uint32_t second;
};
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h
new file mode 100644
index 00000000..527cf163
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/IterationState.h
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "Array.h"
+#include "PxTransform.h"
+#include "PxMat44.h"
+#include "PsMathUtils.h"
+#include "Simd4f.h"
+#include "Simd4i.h"
+
+namespace nvidia
+{
+
+/* function object to perform solver iterations on one cloth */
+
+// todo: performance optimization: cache this object and test if velocity/iterDt has changed
+// c'tor takes about 5% of the iteration time of a 20x20 cloth
+
+namespace cloth
+{
+
+/* helper functions */
+
// Quaternion logarithm: maps a unit quaternion to a rotation vector
// (axis scaled by half the rotation angle); inverse of exp() below.
inline PxVec3 log(const PxQuat& q)
{
	// |imaginary part| = sin(half angle)
	float theta = q.getImaginaryPart().magnitude();
	// guard the division for tiny angles (scale -> 1 as theta -> 0)
	float scale = theta > PX_EPS_REAL ? PxAsin(theta) / theta : 1.0f;
	// flip sign when q.w < 0 so the shorter equivalent rotation is returned
	scale = intrinsics::fsel(q.w, scale, -scale);
	return PxVec3(q.x * scale, q.y * scale, q.z * scale);
}
+
+inline PxQuat exp(const PxVec3& v)
+{
+ float theta = v.magnitude();
+ float scale = theta > PX_EPS_REAL ? PxSin(theta) / theta : 1.0f;
+ return PxQuat(v.x * scale, v.y * scale, v.z * scale, cos(theta));
+}
+
// Copies the first N columns of a PxMat44 into an array of SIMD registers.
template <typename Simd4f, uint32_t N>
inline void assign(Simd4f (&columns)[N], const PxMat44& matrix)
{
	for(uint32_t i = 0; i < N; ++i)
		columns[i] = load(array(matrix[i]));
}
+
// Matrix-vector product with a 3x3 matrix stored as column registers:
// result = columns * vec (only the x/y/z lanes of vec are used).
template <typename Simd4f>
inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& vec)
{
	return splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
}
+
// Affine transform: translate + columns * vec (column-register 3x3 matrix;
// only the x/y/z lanes of vec are used).
template <typename Simd4f>
inline Simd4f transform(const Simd4f (&columns)[3], const Simd4f& translate, const Simd4f& vec)
{
	return translate + splat<0>(vec) * columns[0] + splat<1>(vec) * columns[1] + splat<2>(vec) * columns[2];
}
+
+template <typename>
+struct IterationState; // forward declaration
+
// Measures per-frame quantities (iteration count, per-iteration dt, frame
// velocities) from a cloth and builds an IterationState for a concrete SIMD
// type via create().
struct IterationStateFactory
{
	// Derives the frame parameters and advances the cloth's motion state.
	template <typename MyCloth>
	IterationStateFactory(MyCloth& cloth, float frameDt);

	// Builds the per-iteration solver state (bias vectors, update matrices).
	template <typename Simd4f, typename MyCloth>
	IterationState<Simd4f> create(MyCloth const& cloth) const;

	// squared length of the first three lanes of v
	template <typename Simd4f>
	static Simd4f lengthSqr(Simd4f const& v)
	{
		return dot3(v, v);
	}

	// reinterprets the first three floats of a SIMD register as a PxVec3
	template <typename Simd4f>
	static PxVec3 castToPxVec3(const Simd4f& v)
	{
		return *reinterpret_cast<const PxVec3*>(reinterpret_cast<const char*>(&v));
	}

	int mNumIterations;    // iterations for this frame (>= 1)
	float mInvNumIterations; // 1 / mNumIterations
	float mIterDt, mIterDtRatio, mIterDtAverage; // per-iteration dt, ratio vs previous frame, moving average
	PxQuat mCurrentRotation;    // frame rotation at the start of the frame
	PxVec3 mPrevLinearVelocity; // frame velocities of the previous frame
	PxVec3 mPrevAngularVelocity;
};
+
/* solver iterations helper functor: per-iteration integration constants
   produced by IterationStateFactory::create() and advanced by update() */
template <typename Simd4f>
struct IterationState
{
	// call after each iteration
	void update();

	// fraction of the frame completed after / before the current iteration
	inline float getCurrentAlpha() const;
	inline float getPreviousAlpha() const;

  public:
	// inverse frame rotation per iteration while turning; holds a -1
	// placeholder in lane 0 otherwise (see IterationStateFactory::create())
	Simd4f mRotationMatrix[3];
	Simd4f mCurBias;  // in local space
	Simd4f mPrevBias; // in local space

	// position-update matrices applied to previous/current particle positions
	Simd4f mPrevMatrix[3];
	Simd4f mCurMatrix[3];
	// damp-scale correction applied once after the first iteration (update())
	Simd4f mDampScaleUpdate;

	// iteration counter
	uint32_t mRemainingIterations;

	// reciprocal total number of iterations
	float mInvNumIterations;

	// time step size per iteration
	float mIterDt;

	bool mIsTurning; // if false, mPositionScale = mPrevMatrix[0]
};
+
+} // namespace cloth
+
template <typename Simd4f>
inline float cloth::IterationState<Simd4f>::getCurrentAlpha() const
{
	// fraction of the frame completed after the current iteration
	return getPreviousAlpha() + mInvNumIterations;
}
+
template <typename Simd4f>
inline float cloth::IterationState<Simd4f>::getPreviousAlpha() const
{
	// fraction of the frame completed before the current iteration
	return 1.0f - mRemainingIterations * mInvNumIterations;
}
+
// Derives iteration count and per-iteration time step from the frame dt and
// the cloth's solver frequency, then advances the cloth's frame state
// (velocities, dt history, current motion) to the new target.
template <typename MyCloth>
cloth::IterationStateFactory::IterationStateFactory(MyCloth& cloth, float frameDt)
{
	// round to the nearest iteration count, but always run at least one
	mNumIterations = PxMax(1, int(frameDt * cloth.mSolverFrequency + 0.5f));
	mInvNumIterations = 1.0f / mNumIterations;
	mIterDt = frameDt * mInvNumIterations;

	// ratio to the previous frame's iteration dt (1 on the very first frame)
	mIterDtRatio = cloth.mPrevIterDt ? mIterDt / cloth.mPrevIterDt : 1.0f;
	mIterDtAverage = cloth.mIterDtAvg.empty() ? mIterDt : cloth.mIterDtAvg.average();

	// remember frame state before it is overwritten below
	mCurrentRotation = cloth.mCurrentMotion.q;
	mPrevLinearVelocity = cloth.mLinearVelocity;
	mPrevAngularVelocity = cloth.mAngularVelocity;

	// update cloth
	float invFrameDt = 1.0f / frameDt;
	cloth.mLinearVelocity = invFrameDt * (cloth.mTargetMotion.p - cloth.mCurrentMotion.p);
	// angular velocity from the relative rotation between current and target
	PxQuat dq = cloth.mTargetMotion.q * cloth.mCurrentMotion.q.getConjugate();
	cloth.mAngularVelocity = log(dq) * invFrameDt;

	cloth.mPrevIterDt = mIterDt;
	cloth.mIterDtAvg.push((uint32_t)mNumIterations, mIterDt);
	cloth.mCurrentMotion = cloth.mTargetMotion;
}
+
+/*
+momentum conservation:
+m2*x2 - m1*x1 = m1*x1 - m0*x0 + g*dt2, m = r+t
+r2*x2+t2 = 2(r1*x1+t1) - (r0*x0+t0) + g*dt2
+r2*x2 = r1*x1 + r1*x1 - r0*x0 - (t2-2t1+t0) + g*dt2
+substitute r1*x1 - r0*x0 = r1*(x1-x0) + (r1-r0)*x0
+and r1*x1 = r2*x1 - (r2-r1)*x1
+
+x2 = x1 + r2'*g*dt2
+ + r2'r1*(x1-x0) //< damp
+ + (r2'r1-r2'r0)*x0 - (1-r2'r1)*x1 - r2'*(t2-2t1+t0) //< inertia
+ + (1-r2'r1)x1 + t2-t1 //< drag (not momentum conserving)
+
+x2 = x0 + a0*x0 + a1*x1 + b with
+a0 = (inertia-damp)*r2'r1 - inertia*r2'r0 - eye
+a1 = (1-inertia-drag)*eye + (damp+inertia+drag)*r2'r1
+b = r2'*(g*dt2 - (inertia+drag)*(t2-t1) + inertia*(t1-t0))
+
+Velocities are used to deal with multiple iterations and varying dt. Only b needs
+to be updated from one iteration to the next. Specifically, it is multiplied
+by (r2'r1)^1/numIterations. a0 and a1 are unaffected by that multiplication.
+
+The centrifugal and coriolis forces of non-inertial (turning) reference frame are
+not generally captured in these formulas. The 'inertia' term above contains radial
+acceleration plus centrifugal and coriolis force for a single iteration.
+For multiple iterations, or when the centrifugal forces are scaled differently
+than angular inertia, we need to add explicit centrifugal and coriolis forces.
+We only use them to correct the above formula because their discretization is
+not accurate.
+
+Possible improvements: multiply coriolis and centrifugal matrix by curInvRotation
+from the left. Do the alpha trick of linearInertia also for angularInertia, write
+prevParticle after multiplying it with matrix.
+
+If you change anything in this function, make sure that ClothCustomFloating and
+ClothInertia haven't regressed for any choice of solver frequency.
+*/
+
// Builds the per-iteration integration state for one cloth: gravity/drag
// bias vectors and the position-update matrices combining damping, inertia,
// centrifugal and coriolis terms. See the derivation comment above.
template <typename Simd4f, typename MyCloth>
cloth::IterationState<Simd4f> cloth::IterationStateFactory::create(MyCloth const& cloth) const
{
	IterationState<Simd4f> result;

	result.mRemainingIterations = (uint32_t)mNumIterations;
	result.mInvNumIterations = mInvNumIterations;
	result.mIterDt = mIterDt;

	Simd4f curLinearVelocity = load(array(cloth.mLinearVelocity));
	Simd4f prevLinearVelocity = load(array(mPrevLinearVelocity));

	Simd4f iterDt = simd4f(mIterDt);
	Simd4f dampExponent = simd4f(cloth.mStiffnessFrequency) * iterDt;

	// gravity delta per iteration
	Simd4f gravity = load(array(cloth.mGravity)) * (Simd4f)simd4f(sqr(mIterDtAverage));

	// scale of local particle velocity per iteration
	Simd4f dampScale = simdf::exp2(load(array(cloth.mLogDamping)) * dampExponent);
	// adjust for the change in time step during the first iteration
	Simd4f firstDampScale = dampScale * simd4f(mIterDtRatio);

	// portion of negative frame velocity to transfer to particle
	Simd4f linearDrag =
	    (simd4f(_1) - simdf::exp2(load(array(cloth.mLinearLogDrag)) * dampExponent)) * iterDt * curLinearVelocity;

	// portion of frame acceleration to transfer to particle
	Simd4f linearInertia = load(array(cloth.mLinearInertia)) * iterDt * (prevLinearVelocity - curLinearVelocity);

	// for inertia, we want to violate newton physics to
	// match velocity and position as given by the user, which means:
	// vt = v0 + a*t and xt = x0 + v0*t + (!) a*t^2
	// this is achieved by applying a different portion to cur and prev
	// position, compared to the normal +0.5 and -0.5 for '... 1/2 a*t^2'.
	// specifically, the portion is alpha=(n+1)/2n and 1-alpha.

	float linearAlpha = (mNumIterations + 1) * 0.5f * mInvNumIterations;
	Simd4f curLinearInertia = linearInertia * simd4f(linearAlpha);

	// rotate to local space (use mRotationMatrix temporarily to hold matrix)
	PxMat44 invRotation(mCurrentRotation.getConjugate());
	assign(result.mRotationMatrix, invRotation);

	// lane mask that zeroes the w component
	Simd4f maskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));

	// Previously, we split the bias between previous and current position to
	// get correct discretized position and velocity. However, this made a
	// hanging cloth experience a downward velocity, which is problematic
	// when scaled by the iterDt ratio and results in jitter under variable
	// timesteps. Instead, we now apply the entire bias to current position
	// and accept a less noticeable error for a free falling cloth.

	Simd4f bias = gravity - linearDrag;
	result.mCurBias = transform(result.mRotationMatrix, curLinearInertia + bias) & maskXYZ;
	result.mPrevBias = transform(result.mRotationMatrix, linearInertia - curLinearInertia) & maskXYZ;

	result.mIsTurning = mPrevAngularVelocity.magnitudeSquared() + cloth.mAngularVelocity.magnitudeSquared() > 0.0f;

	if(result.mIsTurning)
	{
		// angular velocities rotated into local space
		Simd4f curAngularVelocity = load(array(invRotation.rotate(cloth.mAngularVelocity)));
		Simd4f prevAngularVelocity = load(array(invRotation.rotate(mPrevAngularVelocity)));

		// rotation for one iteration in local space
		Simd4f curInvAngle = -iterDt * curAngularVelocity;
		Simd4f prevInvAngle = -iterDt * prevAngularVelocity;

		PxQuat curInvRotation = exp(castToPxVec3(curInvAngle));
		PxQuat prevInvRotation = exp(castToPxVec3(prevInvAngle));

		PxMat44 curMatrix(curInvRotation);
		PxMat44 prevMatrix(prevInvRotation * curInvRotation);

		assign(result.mRotationMatrix, curMatrix);

		Simd4f angularDrag = simd4f(_1) - simdf::exp2(load(array(cloth.mAngularLogDrag)) * dampExponent);
		Simd4f centrifugalInertia = load(array(cloth.mCentrifugalInertia));
		Simd4f angularInertia = load(array(cloth.mAngularInertia));
		Simd4f angularAcceleration = curAngularVelocity - prevAngularVelocity;

		Simd4f epsilon = simd4f(sqrt(FLT_MIN)); // requirement: sqr(epsilon) > 0
		Simd4f velocityLengthSqr = lengthSqr(curAngularVelocity) + epsilon;
		Simd4f dragLengthSqr = lengthSqr(Simd4f(curAngularVelocity * angularDrag)) + epsilon;
		Simd4f centrifugalLengthSqr = lengthSqr(Simd4f(curAngularVelocity * centrifugalInertia)) + epsilon;
		Simd4f accelerationLengthSqr = lengthSqr(angularAcceleration) + epsilon;
		Simd4f inertiaLengthSqr = lengthSqr(Simd4f(angularAcceleration * angularInertia)) + epsilon;

		// rsqrt(a*b)*b == sqrt(b/a): ratio of scaled to unscaled magnitude
		float dragScale = array(rsqrt(velocityLengthSqr * dragLengthSqr) * dragLengthSqr)[0];
		float inertiaScale =
		    mInvNumIterations * array(rsqrt(accelerationLengthSqr * inertiaLengthSqr) * inertiaLengthSqr)[0];

		// magic factor found by comparing to global space simulation:
		// some centrifugal force is in inertia part, remainder is 2*(n-1)/n
		// after scaling the inertia part, we get for centrifugal:
		float centrifugalAlpha = (2 * mNumIterations - 1) * mInvNumIterations;
		float centrifugalScale =
		    centrifugalAlpha * array(rsqrt(velocityLengthSqr * centrifugalLengthSqr) * centrifugalLengthSqr)[0] -
		    inertiaScale;

		// slightly better in ClothCustomFloating than curInvAngle alone
		Simd4f centrifugalVelocity = (prevInvAngle + curInvAngle) * simd4f(0.5f);
		const Simd4f data = lengthSqr(centrifugalVelocity);
		float centrifugalSqrLength = array(data)[0] * centrifugalScale;

		Simd4f coriolisVelocity = centrifugalVelocity * simd4f(centrifugalScale);
		PxMat33 coriolisMatrix = physx::shdfnd::star(castToPxVec3(coriolisVelocity));

		const float* dampScalePtr = array(firstDampScale);
		const float* centrifugalPtr = array(centrifugalVelocity);

		// fold damping, coriolis, centrifugal and inertia terms into the
		// per-iteration position-update matrices (see derivation above)
		for(unsigned int j = 0; j < 3; ++j)
		{
			float centrifugalJ = -centrifugalPtr[j] * centrifugalScale;
			for(unsigned int i = 0; i < 3; ++i)
			{
				float damping = dampScalePtr[j];
				float coriolis = coriolisMatrix(i, j);
				float centrifugal = centrifugalPtr[i] * centrifugalJ;

				prevMatrix(i, j) = centrifugal - coriolis + curMatrix(i, j) * (inertiaScale - damping) -
				                   prevMatrix(i, j) * inertiaScale;
				curMatrix(i, j) = centrifugal + coriolis + curMatrix(i, j) * (inertiaScale + damping + dragScale);
			}
			curMatrix(j, j) += centrifugalSqrLength - inertiaScale - dragScale;
			prevMatrix(j, j) += centrifugalSqrLength;
		}

		assign(result.mPrevMatrix, prevMatrix);
		assign(result.mCurMatrix, curMatrix);
	}
	else
	{
		// not turning: only damping remains; lane 0 of the rotation matrix
		// is set to -1 (appears to act as a sentinel for the solver)
		Simd4f minusOne = -(Simd4f)simd4f(_1);
		result.mRotationMatrix[0] = minusOne;
		result.mPrevMatrix[0] = select(maskXYZ, firstDampScale, minusOne);
	}

	// difference of damp scale between first and other iterations
	result.mDampScaleUpdate = (dampScale - firstDampScale) & maskXYZ;

	return result;
}
+
// Advances the state to the next iteration: rotates the bias vectors into
// the new local frame and applies the one-time damp-scale correction.
template <typename Simd4f>
void cloth::IterationState<Simd4f>::update()
{
	if(mIsTurning)
	{
		// only need to turn bias, matrix is unaffected (todo: verify)
		mCurBias = transform(mRotationMatrix, mCurBias);
		mPrevBias = transform(mRotationMatrix, mPrevBias);
	}

	// remove time step ratio in damp scale after first iteration
	for(uint32_t i = 0; i < 3; ++i)
	{
		mPrevMatrix[i] = mPrevMatrix[i] - mRotationMatrix[i] * mDampScaleUpdate;
		mCurMatrix[i] = mCurMatrix[i] + mRotationMatrix[i] * mDampScaleUpdate;
	}
	mDampScaleUpdate = simd4f(_0); // only once

	--mRemainingIterations;
}
+
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h
new file mode 100644
index 00000000..76eb7f4c
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/MovingAverage.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Allocator.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
// Run-length-encoded moving average over the most recent mSize samples.
struct MovingAverage
{
	// one run of identical samples: mValue repeated mCount times
	struct Element
	{
		uint32_t mCount;
		float mValue;
	};

  public:
	MovingAverage(uint32_t n = 1) : mCount(0), mSize(n)
	{
	}

	// true if no samples are stored
	bool empty() const
	{
		return mData.empty();
	}

	// window capacity (maximum number of samples averaged over)
	uint32_t size() const
	{
		return mSize;
	}

	// changes the window capacity, dropping the oldest samples if needed
	void resize(uint32_t n)
	{
		PX_ASSERT(n);
		mSize = n;
		trim();
	}

	// discards all samples
	void reset()
	{
		mData.resize(0);
		mCount = 0;
	}

	// adds 'value' n times (n clamped to the window size); a value equal to
	// the newest run is merged into it instead of starting a new run
	void push(uint32_t n, float value)
	{
		n = PxMin(n, mSize);

		if(mData.empty() || mData.back().mValue != value)
		{
			Element element = { n, value };
			mData.pushBack(element);
		}
		else
		{
			mData.back().mCount += n;
		}

		mCount += n;
		trim();
	}

	// weighted average of the stored samples; must not be called when empty()
	float average() const
	{
		PX_ASSERT(!mData.empty());

		// plain weighted sum over all runs
		float sum = 0.0f;
		Vector<Element>::Type::ConstIterator it = mData.begin(), end = mData.end();
		for(; it != end; ++it)
			sum += it->mCount * it->mValue;

		// linear weight ramps at both ends for smoother average
		// (roughly the oldest and newest mCount/8 samples are down-weighted)
		uint32_t n = mCount / 8;
		float ramp = 0.0f, temp = 0.0f;
		uint32_t countLo = (it = mData.begin())->mCount;
		uint32_t countHi = (--end)->mCount;
		for(uint32_t i = 0; i < n; ++i)
		{
			// step to the run containing the i-th sample from either end
			if(i == countLo)
				countLo += (++it)->mCount;
			if(i == countHi)
				countHi += (--end)->mCount;

			temp += it->mValue + end->mValue;
			ramp += temp;
		}

		uint32_t num = (mCount - n) * (n + 1);
		return (sum * (n + 1) - ramp) / num;
	}

  private:
	// remove oldest (front) values until mCount<=mSize
	void trim()
	{
		Vector<Element>::Type::Iterator it = mData.begin();
		// 'it += k <= mCount' advances only while the run is dropped
		// entirely; a partially kept run has its count reduced in place
		for(uint32_t k = mSize; k < mCount; it += k <= mCount)
		{
			k += it->mCount;
			it->mCount = k - mCount;
		}

		if(it != mData.begin())
			mData.assign(it, mData.end());

		mCount = PxMin(mCount, mSize);
	}

	Vector<Element>::Type mData; // runs, oldest first

	uint32_t mCount; // total samples currently stored
	uint32_t mSize;  // window capacity
};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp
new file mode 100644
index 00000000..310c43d6
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PhaseConfig.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PhaseConfig.h"
+#include "ApexUsingNamespace.h"
+#include "PsMathUtils.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+PhaseConfig transform(const PhaseConfig&);
+}
+}
+
+using namespace nvidia;
+
+namespace
+{
+float safeLog2(float x)
+{
+ float saturated = PxMax(0.0f, PxMin(x, 1.0f));
+ return saturated ? physx::shdfnd::log2(saturated) : -FLT_MAX_EXP;
+}
+}
+
// Default-configures a phase: full stiffness, no compression/stretch limit
// scaling; mPadding is a fill pattern, not a meaningful value.
cloth::PhaseConfig::PhaseConfig(uint16_t index)
: mPhaseIndex(index)
, mPadding(0xffff)
, mStiffness(1.0f)
, mStiffnessMultiplier(1.0f)
, mCompressionLimit(1.0f)
, mStretchLimit(1.0f)
{
}
+
+// convert from user input to solver format
+cloth::PhaseConfig cloth::transform(const PhaseConfig& config)
+{
+ PhaseConfig result(config.mPhaseIndex);
+
+ result.mStiffness = safeLog2(1.0f - config.mStiffness);
+ result.mStiffnessMultiplier = safeLog2(config.mStiffnessMultiplier);
+
+ // negative for compression, positive for stretch
+ result.mCompressionLimit = 1.f - 1.f / config.mCompressionLimit;
+ result.mStretchLimit = 1.f - 1.f / config.mStretchLimit;
+
+ return result;
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h
new file mode 100644
index 00000000..fe130156
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/PointInterpolator.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "Simd4f.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
// acts as a poor mans random access iterator: yields points linearly
// interpolated between two parallel sequences at a fixed alpha
template <typename Simd4f, typename BaseIterator>
class LerpIterator
{

	LerpIterator& operator=(const LerpIterator&); // not implemented

  public:
	LerpIterator(BaseIterator start, BaseIterator target, float alpha)
	: mAlpha(simd4f(alpha)), mStart(start), mTarget(target)
	{
	}

	// return the interpolated point at a given index:
	// start + (target - start) * alpha
	inline Simd4f operator[](size_t index) const
	{
		return mStart[index] + (mTarget[index] - mStart[index]) * mAlpha;
	}

	inline Simd4f operator*() const
	{
		return (*this)[0];
	}

	// prefix increment only
	inline LerpIterator& operator++()
	{
		++mStart;
		++mTarget;
		return *this;
	}

  private:
	// interpolation parameter, splatted across all lanes
	const Simd4f mAlpha;

	BaseIterator mStart;
	BaseIterator mTarget;
};
+
// Iterates SIMD loads over a float buffer with a compile-time element
// stride (NOTE(review): presumably load() tolerates unaligned addresses,
// as the class name suggests — confirm against the Simd4f backend).
template <typename Simd4f, size_t Stride>
class UnalignedIterator
{

	UnalignedIterator& operator=(const UnalignedIterator&); // not implemented

  public:
	UnalignedIterator(const float* pointer) : mPointer(pointer)
	{
	}

	// loads the vector starting Stride*index floats past the current position
	inline Simd4f operator[](size_t index) const
	{
		return load(mPointer + index * Stride);
	}

	inline Simd4f operator*() const
	{
		return (*this)[0];
	}

	// prefix increment only
	inline UnalignedIterator& operator++()
	{
		mPointer += Stride;
		return *this;
	}

  private:
	const float* mPointer;
};
+
// Iterator facade over a single fixed value: dereferencing always yields
// the value given at construction, and advancing has no effect.
template <typename Simd4f>
class ConstantIterator
{
	ConstantIterator& operator=(const ConstantIterator&); // not implemented

  public:
	ConstantIterator(const Simd4f& value) : mConstant(value)
	{
	}

	// always returns the stored constant
	inline Simd4f operator*() const
	{
		return mConstant;
	}

	// advancing is a no-op
	inline ConstantIterator& operator++()
	{
		return *this;
	}

  private:
	const Simd4f mConstant;
};
+
// Iterator adaptor applying y = x * scale + bias to every value produced
// by the wrapped iterator.
template <typename Simd4f, typename BaseIterator>
class ScaleBiasIterator
{
  public:
	ScaleBiasIterator(BaseIterator base, const Simd4f& scale, const Simd4f& bias)
	: mScale(scale), mBias(bias), mBaseIterator(base)
	{
	}

	// dereference the wrapped iterator, then scale and shift the result
	inline Simd4f operator*() const
	{
		Simd4f value = *mBaseIterator;
		return value * mScale + mBias;
	}

	// advance the wrapped iterator
	inline ScaleBiasIterator& operator++()
	{
		++mBaseIterator;
		return *this;
	}

  private:
	ScaleBiasIterator& operator=(const ScaleBiasIterator&); // not implemented

	const Simd4f mScale;
	const Simd4f mBias;

	BaseIterator mBaseIterator;
};
+
+} // namespace cloth
+
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h
new file mode 100644
index 00000000..8755a010
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4f.h
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "SimdTypes.h"
+
+#if NVMATH_FUSE_MULTIPLY_ADD
+
+/*! \brief Expression template to fuse multiply-adds.
+ * \relates Simd4f */
+struct ProductExpr
+{
+ inline ProductExpr(Simd4f const& v0_, Simd4f const& v1_) : v0(v0_), v1(v1_)
+ {
+ }
+ inline operator Simd4f() const; // evaluates the plain product when no add/sub absorbs it
+ const Simd4f v0, v1; // deferred operands, consumed by the fused operator+/- overloads declared below
+
+ private:
+ ProductExpr& operator=(const ProductExpr&); // not implemented
+};
+
+inline Simd4f operator+(const ProductExpr&, const Simd4f&);
+inline Simd4f operator+(const Simd4f& v, const ProductExpr&);
+inline Simd4f operator+(const ProductExpr&, const ProductExpr&);
+inline Simd4f operator-(const Simd4f& v, const ProductExpr&);
+inline Simd4f operator-(const ProductExpr&, const ProductExpr&);
+
+#else // NVMATH_FUSE_MULTIPLY_ADD
+typedef Simd4f ProductExpr;
+#endif // NVMATH_FUSE_MULTIPLY_ADD
+
+template <typename T>
+struct Simd4fFactory
+{
+ Simd4fFactory(T v_) : v(v_)
+ {
+ }
+ inline operator Simd4f() const; // conversion to the SIMD vector type
+ inline operator Scalar4f() const; // conversion to the scalar fallback type
+ Simd4fFactory& operator=(const Simd4fFactory&); // not implemented
+ T v; // captured scalar reference, pointer, or tag to convert from
+};
+
+template <>
+struct Simd4fFactory<detail::FourTuple>
+{
+ Simd4fFactory(float x, float y, float z, float w)
+ {
+  v[0] = x, v[1] = y, v[2] = z, v[3] = w;
+ }
+ Simd4fFactory(const Simd4fFactory<const float&>& f) // replicates a scalar into all four lanes
+ {
+  v[3] = v[2] = v[1] = v[0] = f.v;
+ }
+ inline operator Simd4f() const;
+ inline operator Scalar4f() const;
+ Simd4fFactory& operator=(const Simd4fFactory&); // not implemented
+ PX_ALIGN(16, float) v[4]; // 16-byte aligned backing store
+};
+
+template <int i>
+struct Simd4fFactory<detail::IntType<i> > // compile-time constant: no storage needed
+{
+ inline operator Simd4f() const;
+ inline operator Scalar4f() const;
+};
+
+// forward declaration
+template <typename>
+struct Simd4iFactory;
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SIMD
+inline Simd4f operator&(const ComplementExpr<Simd4f>&, const Simd4f&);
+inline Simd4f operator&(const Simd4f&, const ComplementExpr<Simd4f>&);
+#endif
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operators
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// note: operator?= missing because they don't have corresponding intrinsics.
+
+/*! \brief Test for equality of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline Simd4f operator==(const Simd4f& v0, const Simd4f& v1);
+
+// no operator!= because VMX128 does not support it, use ~operator== and handle QNaNs
+
+/*! \brief Less-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator<(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Less-or-equal-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator<=(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Greater-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator>(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Greater-or-equal-compare all elements of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline Simd4f operator>=(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector bit-wise NOT operator
+* \return A vector holding the bit-negate of \a v.
+* \relates Simd4f */
+inline ComplementExpr<Simd4f> operator~(const Simd4f& v);
+
+/*! \brief Vector bit-wise AND operator
+* \return A vector holding the bit-wise AND of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator&(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector bit-wise OR operator
+* \return A vector holding the bit-wise OR of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator|(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector bit-wise XOR operator
+* \return A vector holding the bit-wise XOR of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator^(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator<<(const Simd4f& v, int shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator>>(const Simd4f& v, int shift);
+
+#if NVMATH_SHIFT_BY_VECTOR
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator<<(const Simd4f& v, const Simd4f& shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits.
+* \relates Simd4f */
+inline Simd4f operator>>(const Simd4f& v, const Simd4f& shift);
+#endif
+
+/*! \brief Unary vector addition operator.
+* \return A vector holding the component-wise copy of \a v.
+* \relates Simd4f */
+inline Simd4f operator+(const Simd4f& v);
+
+/*! \brief Vector addition operator
+* \return A vector holding the component-wise sum of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator+(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Unary vector negation operator.
+* \return A vector holding the component-wise negation of \a v.
+* \relates Simd4f */
+inline Simd4f operator-(const Simd4f& v);
+
+/*! \brief Vector subtraction operator.
+* \return A vector holding the component-wise difference of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator-(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector multiplication.
+* \return Element-wise product of \a v0 and \a v1.
+* \note For VMX, returns expression template to fuse multiply-add.
+* \relates Simd4f */
+inline ProductExpr operator*(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Vector division.
+* \return Element-wise division of \a v0 and \a v1.
+* \relates Simd4f */
+inline Simd4f operator/(const Simd4f& v0, const Simd4f& v1);
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// functions
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/*! \brief Load float value into all vector components.
+* \relates Simd4f */
+inline Simd4fFactory<const float&> simd4f(const float& s)
+{
+ return Simd4fFactory<const float&>(s);
+}
+
+/*! \brief Load 4 float values into vector.
+* \relates Simd4f */
+inline Simd4fFactory<detail::FourTuple> simd4f(float x, float y, float z, float w)
+{
+ return Simd4fFactory<detail::FourTuple>(x, y, z, w);
+}
+
+/*! \brief Create vector from literal.
+* \return Vector with all elements set to \c i.
+* \relates Simd4f */
+template <int i>
+inline Simd4fFactory<detail::IntType<i> > simd4f(detail::IntType<i> const&)
+{
+ return Simd4fFactory<detail::IntType<i> >();
+}
+
+/*! \brief Reinterpret Simd4i as Simd4f.
+* \return A copy of \a v, but cast as Simd4f.
+* \relates Simd4f */
+inline Simd4f simd4f(const Simd4i& v);
+
+/*! \brief Reinterpret Simd4iFactory as Simd4fFactory.
+* \relates Simd4f */
+template <typename T>
+inline Simd4fFactory<T> simd4f(const Simd4iFactory<T>& v)
+{
+ return reinterpret_cast<const Simd4fFactory<T>&>(v); // type-pun: assumes both factory templates share layout for every T
+}
+
+/*! \brief return reference to contiguous array of vector elements
+* \relates Simd4f */
+inline float (&array(Simd4f& v))[4];
+
+/*! \brief return constant reference to contiguous array of vector elements
+* \relates Simd4f */
+inline const float (&array(const Simd4f& v))[4];
+
+/*! \brief Create vector from float array.
+* \relates Simd4f */
+inline Simd4fFactory<const float*> load(const float* ptr)
+{
+ return ptr; // implicit conversion through Simd4fFactory's T = const float* constructor
+}
+
+/*! \brief Create vector from aligned float array.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4f */
+inline Simd4fFactory<detail::AlignedPointer<float> > loadAligned(const float* ptr)
+{
+ return detail::AlignedPointer<float>(ptr);
+}
+
+/*! \brief Create vector from aligned float array.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4f */
+inline Simd4fFactory<detail::OffsetPointer<float> > loadAligned(const float* ptr, unsigned int offset)
+{
+ return detail::OffsetPointer<float>(ptr, offset);
+}
+
+/*! \brief Store vector \a v to float array \a ptr.
+* \relates Simd4f */
+inline void store(float* ptr, Simd4f const& v);
+
+/*! \brief Store vector \a v to aligned float array \a ptr.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4f */
+inline void storeAligned(float* ptr, Simd4f const& v);
+
+/*! \brief Store vector \a v to aligned float array \a ptr.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4f */
+inline void storeAligned(float* ptr, unsigned int offset, Simd4f const& v);
+
+/*! \brief replicate i-th component into all vector components.
+* \return Vector with all elements set to \a v[i].
+* \relates Simd4f */
+template <size_t i>
+inline Simd4f splat(Simd4f const& v);
+
+/*! \brief Select \a v0 or \a v1 based on \a mask.
+* \return mask ? v0 : v1
+* \relates Simd4f */
+inline Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1);
+
+/*! \brief Per element absolute value.
+* \return Vector with absolute values of \a v.
+* \relates Simd4f */
+inline Simd4f abs(const Simd4f& v);
+
+/*! \brief Per element floor value.
+* \note Result undefined for QNaN elements.
+* \note On SSE and NEON, returns v-1 if v is negative integer value
+* \relates Simd4f */
+inline Simd4f floor(const Simd4f& v);
+
+/*! \brief Per-component maximum of two vectors
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f max(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Per-component minimum of two vectors
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f min(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Return reciprocal estimate of a vector.
+* \return Vector of per-element reciprocal estimate.
+* \relates Simd4f */
+inline Simd4f recip(const Simd4f& v);
+
+/*! \brief Return reciprocal of a vector.
+* \return Vector of per-element reciprocal.
+* \note Performs \a n Newton-Raphson iterations on initial estimate.
+* \relates Simd4f */
+template <int n>
+inline Simd4f recipT(const Simd4f& v);
+
+/*! \brief Return square root of a vector.
+* \return Vector of per-element square root.
+* \note The behavior is undefined for negative elements.
+* \relates Simd4f */
+inline Simd4f sqrt(const Simd4f& v);
+
+/*! \brief Return inverse square root estimate of a vector.
+* \return Vector of per-element inverse square root estimate.
+* \note The behavior is undefined for negative, zero, and infinity elements.
+* \relates Simd4f */
+inline Simd4f rsqrt(const Simd4f& v);
+
+/*! \brief Return inverse square root of a vector.
+* \return Vector of per-element inverse square root.
+* \note Performs \a n Newton-Raphson iterations on initial estimate.
+* \note The behavior is undefined for negative and infinity elements.
+* \relates Simd4f */
+template <int n>
+inline Simd4f rsqrtT(const Simd4f& v);
+
+/*! \brief Return 2 raised to the power of v.
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f exp2(const Simd4f& v);
+
+#if NVMATH_SIMD
+namespace simdf
+{
+// the PSP2 compiler has trouble resolving ::exp2 directly; forwarding through this wrapper works
+inline Simd4f exp2(const Simd4f& v)
+{
+ return ::exp2(v);
+}
+}
+#endif
+
+/*! \brief Return logarithm of v to base 2.
+* \note Result undefined for QNaN elements.
+* \relates Simd4f */
+inline Simd4f log2(const Simd4f& v);
+
+/*! \brief Return dot product of two 3-vectors.
+* \note The result is replicated across all 4 components.
+* \relates Simd4f */
+inline Simd4f dot3(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Return cross product of two 3-vectors.
+* \note The 4th component is undefined.
+* \relates Simd4f */
+inline Simd4f cross3(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief Transposes 4x4 matrix represented by \a x, \a y, \a z, and \a w.
+* \relates Simd4f */
+inline void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w);
+
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int allEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int anyEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \note QNaNs aren't handled on SPU: comparing two QNaNs will return true.
+* \relates Simd4f */
+inline int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater than those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreater(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreater(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater or equal to those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 are greater or equal to those of \a v1
+* \param outMask holds the result of \a v0 >= \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater or equal to those of \a v1
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater or equal to those of \a v1
+* \param outMask holds the result of \a v0 >= \a v1.
+* \note QNaNs aren't handled on SPU: comparisons against QNaNs don't necessarily return false.
+* \relates Simd4f */
+inline int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask);
+
+/*! \brief returns non-zero if all elements are true
+* \note Undefined if parameter is not result of a comparison.
+* \relates Simd4f */
+inline int allTrue(const Simd4f& v);
+
+/*! \brief returns non-zero if any element is true
+* \note Undefined if parameter is not result of a comparison.
+* \relates Simd4f */
+inline int anyTrue(const Simd4f& v);
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// platform specific includes
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SSE2
+#include "sse2/Simd4f.h"
+#elif NVMATH_NEON
+#include "neon/Simd4f.h"
+#endif
+
+#if NVMATH_SCALAR
+#include "scalar/Simd4f.h"
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h
new file mode 100644
index 00000000..d237e1fa
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Simd4i.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "SimdTypes.h"
+
+template <typename T>
+struct Simd4iFactory
+{
+ Simd4iFactory(T v_) : v(v_)
+ {
+ }
+ inline operator Simd4i() const; // conversion to the SIMD vector type
+ inline operator Scalar4i() const; // conversion to the scalar fallback type
+ Simd4iFactory& operator=(const Simd4iFactory&); // not implemented
+ T v; // captured scalar reference, pointer, or tag to convert from
+};
+
+template <>
+struct Simd4iFactory<detail::FourTuple>
+{
+ Simd4iFactory(int x, int y, int z, int w)
+ {
+  v[0] = x, v[1] = y, v[2] = z, v[3] = w;
+ }
+ Simd4iFactory(const Simd4iFactory<const int&>& f) // replicates a scalar into all four lanes
+ {
+  v[3] = v[2] = v[1] = v[0] = f.v;
+ }
+ inline operator Simd4i() const;
+ inline operator Scalar4i() const;
+ Simd4iFactory& operator=(const Simd4iFactory&); // not implemented
+ PX_ALIGN(16, int) v[4]; // 16-byte aligned backing store
+};
+
+template <int i>
+struct Simd4iFactory<detail::IntType<i> > // compile-time constant: no storage needed
+{
+ inline operator Simd4i() const;
+ inline operator Scalar4i() const;
+};
+
+// forward declaration
+template <typename>
+struct Simd4fFactory;
+
+// trait: maps a float vector type (Simd4f/Scalar4f) to its integer counterpart (Simd4i/Scalar4i)
+template <typename>
+struct Simd4fToSimd4i;
+template <>
+struct Simd4fToSimd4i<Simd4f>
+{
+ typedef Simd4i Type;
+};
+template <>
+struct Simd4fToSimd4i<Scalar4f>
+{
+ typedef Scalar4i Type;
+};
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_DISTINCT_TYPES
+inline Simd4i operator&(const ComplementExpr<Simd4i>&, const Simd4i&);
+inline Simd4i operator&(const Simd4i&, const ComplementExpr<Simd4i>&);
+#endif
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operators
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_DISTINCT_TYPES
+
+/*! \brief Vector bit-wise NOT operator
+* \return A vector holding the bit-negate of \a v.
+* \relates Simd4i */
+inline ComplementExpr<Simd4i> operator~(const Simd4i& v);
+
+/*! \brief Vector bit-wise AND operator
+* \return A vector holding the bit-wise AND of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator&(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector bit-wise OR operator
+* \return A vector holding the bit-wise OR of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator|(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector bit-wise XOR operator
+* \return A vector holding the bit-wise XOR of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator^(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator<<(const Simd4i& v, int shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator>>(const Simd4i& v, int shift);
+
+#if NVMATH_SHIFT_BY_VECTOR
+
+/*! \brief Vector logical left shift.
+* \return A vector with 4 elements of \a v0, each shifted left by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator<<(const Simd4i& v, const Simd4i& shift);
+
+/*! \brief Vector logical right shift.
+* \return A vector with 4 elements of \a v0, each shifted right by \a shift bits.
+* \relates Simd4i */
+inline Simd4i operator>>(const Simd4i& v, const Simd4i& shift);
+
+#endif // NVMATH_SHIFT_BY_VECTOR
+
+#endif // NVMATH_DISTINCT_TYPES
+
+namespace simdi // disambiguate for VMX
+{
+// note: operator?= missing because they don't have corresponding intrinsics.
+
+/*! \brief Test for equality of two vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \relates Simd4i */
+inline Simd4i operator==(const Simd4i& v0, const Simd4i& v1);
+
+// no !=, <=, >= because VMX128/SSE don't support it, use ~equal etc.
+
+/*! \brief Less-compare all elements of two *signed* vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \relates Simd4i */
+inline Simd4i operator<(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Greater-compare all elements of two *signed* vectors.
+* \return Vector of per element result mask (all bits set for 'true', none set for 'false').
+* \relates Simd4i */
+inline Simd4i operator>(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Vector addition operator
+* \return A vector holding the component-wise sum of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator+(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief Unary vector negation operator.
+* \return A vector holding the component-wise negation of \a v.
+* \relates Simd4i */
+inline Simd4i operator-(const Simd4i& v);
+
+/*! \brief Vector subtraction operator.
+* \return A vector holding the component-wise difference of \a v0 and \a v1.
+* \relates Simd4i */
+inline Simd4i operator-(const Simd4i& v0, const Simd4i& v1);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// functions
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/*! \brief Load int value into all vector components.
+* \relates Simd4i */
+inline Simd4iFactory<const int&> simd4i(const int& s)
+{
+ return Simd4iFactory<const int&>(s);
+}
+
+/*! \brief Load 4 int values into vector.
+* \relates Simd4i */
+inline Simd4iFactory<detail::FourTuple> simd4i(int x, int y, int z, int w)
+{
+ return Simd4iFactory<detail::FourTuple>(x, y, z, w);
+}
+
+/*! \brief Create vector from literal.
+* \return Vector with all elements set to \c i.
+* \relates Simd4i */
+template <int i>
+inline Simd4iFactory<detail::IntType<i> > simd4i(const detail::IntType<i>&)
+{
+ return Simd4iFactory<detail::IntType<i> >();
+}
+
+template <> // NOTE(review): the explicit specializations below are identical to the primary template; presumably a compiler workaround — confirm before removing
+inline Simd4iFactory<detail::IntType<1> > simd4i(const detail::IntType<1>&)
+{
+ return Simd4iFactory<detail::IntType<1> >();
+}
+
+template <>
+inline Simd4iFactory<detail::IntType<int(0x80000000)> > simd4i(const detail::IntType<int(0x80000000)>&)
+{
+ return Simd4iFactory<detail::IntType<int(0x80000000)> >();
+}
+
+template <>
+inline Simd4iFactory<detail::IntType<-1> > simd4i(const detail::IntType<-1>&)
+{
+ return Simd4iFactory<detail::IntType<-1> >();
+}
+
+/*! \brief Reinterpret Simd4f as Simd4i.
+* \return A copy of \a v, but cast as Simd4i.
+* \relates Simd4i */
+inline Simd4i simd4i(const Simd4f& v);
+
+/*! \brief Reinterpret Simd4fFactory as Simd4iFactory.
+* \relates Simd4i */
+template <typename T>
+inline Simd4iFactory<T> simd4i(const Simd4fFactory<T>& v)
+{
+ return reinterpret_cast<const Simd4iFactory<T>&>(v); // type-pun: assumes both factory templates share layout for every T
+}
+
+namespace simdi
+{
+
+/*! \brief return reference to contiguous array of vector elements
+* \relates Simd4i */
+inline int (&array(Simd4i& v))[4];
+
+/*! \brief return constant reference to contiguous array of vector elements
+* \relates Simd4i */
+inline const int (&array(const Simd4i& v))[4];
+
+} // namespace simdi
+
+/*! \brief Create vector from int array.
+* \relates Simd4i */
+inline Simd4iFactory<const int*> load(const int* ptr)
+{
+ return ptr; // implicit conversion through Simd4iFactory's T = const int* constructor
+}
+
+/*! \brief Create vector from aligned int array.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4i */
+inline Simd4iFactory<detail::AlignedPointer<int> > loadAligned(const int* ptr)
+{
+ return detail::AlignedPointer<int>(ptr);
+}
+
+/*! \brief Create vector from aligned int array.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4i */
+inline Simd4iFactory<detail::OffsetPointer<int> > loadAligned(const int* ptr, unsigned int offset)
+{
+ return detail::OffsetPointer<int>(ptr, offset);
+}
+
+/*! \brief Store vector \a v to int array \a ptr.
+* \relates Simd4i */
+inline void store(int* ptr, const Simd4i& v);
+
+/*! \brief Store vector \a v to aligned int array \a ptr.
+* \note \a ptr needs to be 16 byte aligned.
+* \relates Simd4i */
+inline void storeAligned(int* ptr, const Simd4i& v);
+
+/*! \brief Store vector \a v to aligned int array \a ptr.
+* \param offset pointer offset in bytes.
+* \note \a ptr+offset needs to be 16 byte aligned.
+* \relates Simd4i */
+inline void storeAligned(int* ptr, unsigned int offset, const Simd4i& v);
+
+#if NVMATH_DISTINCT_TYPES
+
+/*! \brief replicate i-th component into all vector components.
+* \return Vector with all elements set to \a v[i].
+* \relates Simd4i */
+template <size_t i>
+inline Simd4i splat(const Simd4i& v);
+
+/*! \brief Select \a v0 or \a v1 based on \a mask.
+* \return mask ? v0 : v1
+* \relates Simd4i */
+inline Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1);
+
+#endif // NVMATH_DISTINCT_TYPES
+
+namespace simdi // disambiguate for VMX
+{
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \relates Simd4i */
+inline int allEqual(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if all elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \relates Simd4i */
+inline int allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \relates Simd4i */
+inline int anyEqual(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 and \a v1 are equal
+* \param outMask holds the result of \a v0 == \a v1.
+* \relates Simd4i */
+inline int anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+
+/*! \brief returns non-zero if all *signed* elements of \a v0 are greater than those of \a v1
+* \relates Simd4i */
+inline int allGreater(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if all *signed* elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \relates Simd4i */
+inline int allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \relates Simd4i */
+inline int anyGreater(const Simd4i& v0, const Simd4i& v1);
+
+/*! \brief returns non-zero if any elements of \a v0 are greater than those of \a v1
+* \param outMask holds the result of \a v0 > \a v1.
+* \relates Simd4i */
+inline int anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask);
+}
+
+#if NVMATH_DISTINCT_TYPES
+
+/*! \brief returns non-zero if all elements are true
+* \note undefined if parameter is not result of a comparison.
+* \relates Simd4i */
+inline int allTrue(const Simd4i& v);
+
+/*! \brief returns non-zero if any element is true
+* \note undefined if parameter is not result of a comparison.
+* \relates Simd4i */
+inline int anyTrue(const Simd4i& v);
+
+#endif // NVMATH_DISTINCT_TYPES
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// platform specific includes
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SSE2
+#include "sse2/Simd4i.h"
+#elif NVMATH_NEON
+#include "neon/Simd4i.h"
+#endif
+
+#if NVMATH_SCALAR
+#include "scalar/Simd4i.h"
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h
new file mode 100644
index 00000000..e44e876a
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SimdTypes.h
@@ -0,0 +1,150 @@
+// This code contains NVIDIA Confidential Information and is disclosed to you
+// under a form of NVIDIA software license agreement provided separately to you.
+//
+// Notice
+// NVIDIA Corporation and its licensors retain all intellectual property and
+// proprietary rights in and to this software and related documentation and
+// any modifications thereto. Any use, reproduction, disclosure, or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA Corporation is strictly prohibited.
+//
+// ALL NVIDIA DESIGN SPECIFICATIONS, CODE ARE PROVIDED "AS IS.". NVIDIA MAKES
+// NO WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ALL IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, AND FITNESS FOR A PARTICULAR PURPOSE.
+//
+// Information and code furnished is believed to be accurate and reliable.
+// However, NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright (c) 2008-2015 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
#pragma once

#include <cmath>

// ps4 compiler defines _M_X64 without value
#if((defined _M_IX86) || (defined _M_X64) || (defined __i386__) || (defined __x86_64__))
#define NVMATH_SSE2 1
#else
#define NVMATH_SSE2 0
#endif
// NOTE(review): a 'defined' operator produced by macro expansion inside #if is
// undefined behavior per the C/C++ standards ([cpp.cond]); gcc diagnoses it
// with -Wexpansion-to-defined. It works on the supported compilers, but the
// explicit #if/#else pattern used for NVMATH_SSE2 above is the portable form.
// The same caveat applies to NVMATH_INLINE_ASSEMBLER below.
#define NVMATH_NEON (defined _M_ARM || defined __ARM_NEON__)

// which simd types are implemented (one or both are all valid options)
#define NVMATH_SIMD (NVMATH_SSE2 || NVMATH_NEON)
#define NVMATH_SCALAR !NVMATH_SIMD
// #define NVMATH_SCALAR 1

// use template expression to fuse multiply-adds into a single instruction
#define NVMATH_FUSE_MULTIPLY_ADD (NVMATH_NEON)
// support shift by vector operations
#define NVMATH_SHIFT_BY_VECTOR (NVMATH_NEON)
// Simd4f and Simd4i map to different types
#define NVMATH_DISTINCT_TYPES (NVMATH_SSE2 || NVMATH_NEON)
// support inline assembler
#define NVMATH_INLINE_ASSEMBLER !((defined _M_ARM) || (defined SN_TARGET_PSP2) || (defined __arm64__))
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+/*! \brief Expression template to fuse and-not. */
/*! \brief Expression template to fuse and-not.
 * Wraps the operand to be complemented; operator overloads in the platform
 * headers recognize the wrapper and can emit a single fused and-not where
 * the ISA provides one. The conversion operator (defined per platform)
 * materializes the plain complement when no fusion takes place. */
template <typename T>
struct ComplementExpr
{
    inline ComplementExpr(T const& v_) : v(v_)
    {
    }
    // lazily evaluates the complement; implementation is platform-specific
    inline operator T() const;
    const T v;

  private:
    ComplementExpr& operator=(const ComplementExpr&); // not implemented
};
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// helper functions
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
/*! \brief Returns the square of \a value (value * value). */
template <typename T>
T sqr(const T& value)
{
    return value * value;
}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// details
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
namespace detail
{
// Tag wrapper marking a pointer as SIMD-aligned; presumably used to select
// aligned load/store overloads in the platform-specific headers — confirm there.
template <typename T>
struct AlignedPointer
{
    AlignedPointer(const T* p) : ptr(p)
    {
    }
    const T* ptr;
};

// Pointer plus an element offset, applied by the platform load/store code.
template <typename T>
struct OffsetPointer
{
    OffsetPointer(const T* p, unsigned int off) : ptr(p), offset(off)
    {
    }
    const T* ptr;
    unsigned int offset;
};

// Tag type: construct a simd value from four scalar components.
struct FourTuple
{
};

// zero and one literals
// Tag type carrying a compile-time integer; used for the _0/_1/_sign/_true
// literal objects so factories can generate constants at compile time.
template <int i>
struct IntType
{
};
}
+
// Suppress warnings
#if defined(__GNUC__) || defined(__SNC__)
#define NVMATH_UNUSED __attribute__((unused))
#else
#define NVMATH_UNUSED
#endif

// Compile-time literal objects; passing e.g. _1 to a simd factory presumably
// selects a constant-generating overload instead of a runtime broadcast.
// Each translation unit gets its own (stateless, zero-size) copies.
static detail::IntType<0> _0 NVMATH_UNUSED;
static detail::IntType<1> _1 NVMATH_UNUSED;
// NOTE(review): int(0x80000000) narrows an unsigned literal; the conversion is
// implementation-defined before C++20 (two's-complement wrap on all supported
// compilers).
static detail::IntType<int(0x80000000)> _sign NVMATH_UNUSED;
static detail::IntType<-1> _true NVMATH_UNUSED;
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// platform specific includes
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+#if NVMATH_SSE2
+#include "sse2/SimdTypes.h"
+#elif NVMATH_NEON
+#include "neon/SimdTypes.h"
+#else
+struct Simd4f;
+struct Simd4i;
+#endif
+
+#if NVMATH_SCALAR
+#include "scalar/SimdTypes.h"
+#else
+struct Scalar4f;
+struct Scalar4i;
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h
new file mode 100644
index 00000000..f8c6b2dc
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/StackAllocator.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <PxAssert.h>
+
+#if PX_LINUX_FAMILY
+#include <stdint.h> // intptr_t
+#endif
+
// Fixed-capacity LIFO allocator carving allocations out of a caller-provided
// buffer. Each allocation is preceded by a Header linking to the previous
// allocation. Frees may arrive out of order; space is only reclaimed lazily
// once the freed block (and everything allocated after it) reaches the top
// of the stack. 'align' must be a power of two.
template <size_t align>
class StackAllocator
{
    typedef unsigned char byte;

    // todo: switch to offsets so size is consistent on x64
    // mSize is just for book keeping so could be 4 bytes
    struct Header
    {
        Header* mPrev;     // previously allocated block (next one down the stack)
        size_t mSize : 31; // user-requested byte count (31 bits => < 2GB per alloc)
        size_t mFree : 1;  // set by deallocate(); block reclaimed lazily
    };

    StackAllocator(const StackAllocator&);
    StackAllocator& operator=(const StackAllocator&);

  public:
    StackAllocator(void* buffer, size_t bufferSize)
    : mBuffer(reinterpret_cast<byte*>(buffer)), mBufferSize(bufferSize), mFreeStart(mBuffer), mTop(0)
    {
    }

    ~StackAllocator()
    {
        // all allocations must have been returned by now
        PX_ASSERT(userBytes() == 0);
    }

    // Returns 'align'-aligned storage, or 0 for a zero-size request.
    // NOTE(review): buffer exhaustion is only caught by the assert below;
    // in release builds an oversized request silently overruns the buffer.
    void* allocate(size_t numBytes)
    {
        // this is non-standard
        if(!numBytes)
            return 0;

        uintptr_t unalignedStart = uintptr_t(mFreeStart) + sizeof(Header);

        // round up to the next multiple of 'align'
        byte* allocStart = reinterpret_cast<byte*>((unalignedStart + (align - 1)) & ~(align - 1));
        byte* allocEnd = allocStart + numBytes;

        // ensure there is space for the alloc
        PX_ASSERT(allocEnd <= mBuffer + mBufferSize);

        Header* h = getHeader(allocStart);
        h->mPrev = mTop;
        h->mSize = numBytes;
        h->mFree = false;

        mTop = h;
        mFreeStart = allocEnd;

        return allocStart;
    }

    // Marks the block free; its space is reclaimed only once every block
    // allocated after it has been freed as well (lazy unwind).
    void deallocate(void* p)
    {
        if(!p)
            return;

        Header* h = getHeader(p);
        h->mFree = true;

        // unwind the stack to the next live alloc
        while(mTop && mTop->mFree)
        {
            mFreeStart = reinterpret_cast<byte*>(mTop);
            mTop = mTop->mPrev;
        }
    }

  private:
    // return the header for an allocation
    inline Header* getHeader(void* p) const
    {
        PX_ASSERT((reinterpret_cast<uintptr_t>(p) & (align - 1)) == 0);
        PX_ASSERT(reinterpret_cast<byte*>(p) >= mBuffer + sizeof(Header));
        PX_ASSERT(reinterpret_cast<byte*>(p) < mBuffer + mBufferSize);

        return reinterpret_cast<Header*>(p) - 1;
    }

  public:
    // total user-allocated bytes not including any overhead
    // (walks the header chain, so blocks freed but not yet reclaimed
    // are still counted until the lazy unwind removes them)
    size_t userBytes() const
    {
        size_t total = 0;
        Header* iter = mTop;
        while(iter)
        {
            total += iter->mSize;
            iter = iter->mPrev;
        }

        return total;
    }

    // total user-allocated bytes + overhead
    size_t totalUsedBytes() const
    {
        return mFreeStart - mBuffer;
    }

    size_t remainingBytes() const
    {
        return mBufferSize - totalUsedBytes();
    }

    // overhead: headers plus alignment padding
    size_t wastedBytes() const
    {
        return totalUsedBytes() - userBytes();
    }

  private:
    byte* const mBuffer;
    const size_t mBufferSize;

    byte* mFreeStart; // start of free space
    Header* mTop;     // top allocation header
};
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp
new file mode 100644
index 00000000..2283a319
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwCloth.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "TripletScheduler.h"
+#include "ClothBase.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+PhaseConfig transform(const PhaseConfig&); // from PhaseConfig.cpp
+}
+}
+
+using namespace nvidia;
+using namespace physx::shdfnd;
+using namespace nvidia;
+
// Construct a software cloth over 'particles' (xyz = position, w = inverse
// mass). The cloth takes a reference on 'fabric' for its lifetime.
cloth::SwCloth::SwCloth(SwFactory& factory, SwFabric& fabric, Range<const PxVec4> particles)
: mFactory(factory)
, mFabric(fabric)
, mNumVirtualParticles(0)
#if APEX_UE4
, mSimulationTask(NULL)
#endif
, mUserData(0)
{
    PX_ASSERT(!particles.empty());

    // shared setup (ClothBase) — presumably initializes the remaining members;
    // NOTE(review): mClothCostDirty is not in the init list, confirm
    // initialize() covers it (the copy constructor sets it explicitly).
    initialize(*this, particles.begin(), particles.end());

#if PX_WINDOWS_FAMILY
    const uint32_t kSimdWidth = 8; // avx
#else
    const uint32_t kSimdWidth = 4; // sse
#endif

    // reserve kSimdWidth-1 extra slots so the SIMD solver can touch full
    // registers past the end of the particle array
    mCurParticles.reserve(particles.size() + kSimdWidth - 1);
    mCurParticles.assign(reinterpret_cast<const PxVec4*>(particles.begin()),
                         reinterpret_cast<const PxVec4*>(particles.end()));

    // 7 dummy particles used in SIMD solver
    // grow into the padding with zeros, then shrink back: the size stays
    // particles.size() but the tail slots now hold defined data
    mCurParticles.resize(particles.size() + kSimdWidth - 1, PxVec4(0.0f));
    mPrevParticles = mCurParticles;

    mCurParticles.resize(particles.size());
    mPrevParticles.resize(particles.size());

    // released in the destructor
    mFabric.incRefCount();
}
+
+namespace
+{
+// copy vector and make same capacity
+void copyVector(cloth::Vec4fAlignedVector& dst, const cloth::Vec4fAlignedVector& src)
+{
+ dst.reserve(src.capacity());
+ dst.assign(src.begin(), src.end());
+
+ // ensure valid dummy data
+ dst.resize(src.capacity(), PxVec4(0.0f));
+ dst.resize(src.size());
+}
+}
+
// copy constructor, supports rebinding to a different factory
cloth::SwCloth::SwCloth(SwFactory& factory, const SwCloth& cloth)
: mFactory(factory)
, mFabric(cloth.mFabric)
, mClothCostDirty(true)
, mPhaseConfigs(cloth.mPhaseConfigs)
, mCapsuleIndices(cloth.mCapsuleIndices)
, mStartCollisionSpheres(cloth.mStartCollisionSpheres)
, mTargetCollisionSpheres(cloth.mTargetCollisionSpheres)
, mStartCollisionPlanes(cloth.mStartCollisionPlanes)
, mTargetCollisionPlanes(cloth.mTargetCollisionPlanes)
, mStartCollisionTriangles(cloth.mStartCollisionTriangles)
, mTargetCollisionTriangles(cloth.mTargetCollisionTriangles)
, mVirtualParticleIndices(cloth.mVirtualParticleIndices)
, mVirtualParticleWeights(cloth.mVirtualParticleWeights)
, mNumVirtualParticles(cloth.mNumVirtualParticles)
, mSelfCollisionIndices(cloth.mSelfCollisionIndices)
, mRestPositions(cloth.mRestPositions)
#if APEX_UE4
, mSimulationTask(NULL)
#endif
{
    // copies the remaining simulation state (shared ClothBase helper);
    // NOTE(review): mConvexMasks and mUserData are not in the init list —
    // confirm copy() handles them, mUserData in particular looks unset here.
    copy(*this, cloth);

    // carry over capacity (using as dummy particles)
    copyVector(mCurParticles, cloth.mCurParticles);
    copyVector(mPrevParticles, cloth.mPrevParticles);
    copyVector(mMotionConstraints.mStart, cloth.mMotionConstraints.mStart);
    copyVector(mMotionConstraints.mTarget, cloth.mMotionConstraints.mTarget);
    copyVector(mSeparationConstraints.mStart, cloth.mSeparationConstraints.mStart);
    copyVector(mSeparationConstraints.mTarget, cloth.mSeparationConstraints.mTarget);
    copyVector(mParticleAccelerations, cloth.mParticleAccelerations);

    // released in the destructor
    mFabric.incRefCount();
}
+
cloth::SwCloth::~SwCloth()
{
    // release the fabric reference taken in the constructors
    mFabric.decRefCount();
}
+
// Returns a writable range over the target constraint buffer, allocating it
// (rounded up to a multiple of 4 elements for the SIMD solver) on first use.
// If no start buffer exists yet, the freshly exposed buffer is swapped into
// mStart instead — the returned range then aliases mStart's storage.
cloth::Range<PxVec4> cloth::SwCloth::push(SwConstraints& constraints)
{
    uint32_t n = mCurParticles.size();

    if(!constraints.mTarget.capacity())
        constraints.mTarget.resize((n + 3) & ~3, PxVec4(0.0f)); // reserve multiple of 4 for SIMD

    constraints.mTarget.resizeUninitialized(n);
    PxVec4* data = &constraints.mTarget.front();
    Range<PxVec4> result(data, data + constraints.mTarget.size());

    if(constraints.mStart.empty()) // initialize start first
        constraints.mStart.swap(constraints.mTarget);

    return result;
}
+
+void cloth::SwCloth::clear(SwConstraints& constraints)
+{
+ Vec4fAlignedVector().swap(constraints.mStart);
+ Vec4fAlignedVector().swap(constraints.mTarget);
+}
+
// The CPU solver imposes no limit on the number of collision triangles,
// so the range is passed through unclamped (the count argument is ignored).
cloth::Range<const PxVec3> cloth::SwCloth::clampTriangleCount(Range<const PxVec3> range, uint32_t)
{
    return range;
}
+
+#include "ClothImpl.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
// SwCloth specializations of the shared ClothImpl<> interface.

// Delegate so the clone can be created by (and owned by) another factory.
template <>
Cloth* ClothImpl<SwCloth>::clone(Factory& factory) const
{
    return factory.clone(*this);
}

template <>
uint32_t ClothImpl<SwCloth>::getNumParticles() const
{
    return mCloth.mCurParticles.size();
}

// lock/unlock are no-ops for the software implementation: particle memory
// lives on the host and is always directly accessible.
template <>
void ClothImpl<SwCloth>::lockParticles() const
{
}

template <>
void ClothImpl<SwCloth>::unlockParticles() const
{
}

template <>
MappedRange<PxVec4> ClothImpl<SwCloth>::getCurrentParticles()
{
    return getMappedParticles(&mCloth.mCurParticles.front());
}

template <>
MappedRange<const PxVec4> ClothImpl<SwCloth>::getCurrentParticles() const
{
    return getMappedParticles(&mCloth.mCurParticles.front());
}

template <>
MappedRange<PxVec4> ClothImpl<SwCloth>::getPreviousParticles()
{
    return getMappedParticles(&mCloth.mPrevParticles.front());
}

template <>
MappedRange<const PxVec4> ClothImpl<SwCloth>::getPreviousParticles() const
{
    return getMappedParticles(&mCloth.mPrevParticles.front());
}

// Software cloth has no GPU buffers; return null pointers.
template <>
GpuParticles ClothImpl<SwCloth>::getGpuParticles()
{
    GpuParticles result = { 0, 0, 0 };
    return result;
}
+
// Replace the solver phase configuration. Phases with zero stiffness have
// no effect and are dropped; the rest are converted to the solver's internal
// representation via transform() (PhaseConfig.cpp).
template <>
void ClothImpl<SwCloth>::setPhaseConfig(Range<const PhaseConfig> configs)
{
    mCloth.mPhaseConfigs.resize(0);

    // transform phase config to use in solver
    for(; !configs.empty(); configs.popFront())
        if(configs.front().mStiffness > 0.0f)
            mCloth.mPhaseConfigs.pushBack(transform(configs.front()));

    mCloth.wakeUp();
}
+
// Replace the set of particles participating in self collision.
template <>
void ClothImpl<SwCloth>::setSelfCollisionIndices(Range<const uint32_t> indices)
{
    // SwContextLock is a no-op for the CPU factory but keeps the code path
    // uniform with the GPU implementations
    ContextLockType lock(mCloth.mFactory);
    mCloth.mSelfCollisionIndices.assign(indices.begin(), indices.end());
    mCloth.notifyChanged();
    mCloth.wakeUp();
}

template <>
uint32_t ClothImpl<SwCloth>::getNumVirtualParticles() const
{
    return uint32_t(mCloth.mNumVirtualParticles);
}
+
// Returns a writable range of per-particle accelerations, allocating the
// buffer (zero-initialized, one entry per particle) lazily on first access.
template <>
Range<PxVec4> ClothImpl<SwCloth>::getParticleAccelerations()
{
    if(mCloth.mParticleAccelerations.empty())
    {
        uint32_t n = mCloth.mCurParticles.size();
        mCloth.mParticleAccelerations.resize(n, PxVec4(0.0f));
    }

    mCloth.wakeUp();

    PxVec4* data = &mCloth.mParticleAccelerations.front();
    return Range<PxVec4>(data, data + mCloth.mParticleAccelerations.size());
}

// Drops the acceleration buffer entirely (swap with an empty vector releases
// the memory, unlike clear()).
template <>
void ClothImpl<SwCloth>::clearParticleAccelerations()
{
    Vec4fAlignedVector().swap(mCloth.mParticleAccelerations);
    mCloth.wakeUp();
}
+
// Replace the virtual particles (each a triplet of real-particle indices plus
// barycentric-style weights). Triplets are regrouped into sets intended for
// independent SIMD processing, padded to multiples of 4 with dummy triplets
// that reference the solver's padding particles.
template <>
void ClothImpl<SwCloth>::setVirtualParticles(Range<const uint32_t[4]> indices, Range<const PxVec3> weights)
{
    mCloth.mNumVirtualParticles = 0;

    // shuffle indices to form independent SIMD sets
    uint16_t numParticles = uint16_t(mCloth.mCurParticles.size());
    TripletScheduler scheduler(indices);
    scheduler.simd(numParticles, 4);

    // dummy triplet referencing the padding particles past the real ones
    Vec4us dummy(numParticles, uint16_t(numParticles + 1), uint16_t(numParticles + 2), 0);
    Vector<uint32_t>::Type::ConstIterator sIt = scheduler.mSetSizes.begin();
    Vector<uint32_t>::Type::ConstIterator sEnd = scheduler.mSetSizes.end();
    TripletScheduler::ConstTripletIter tIt = scheduler.mTriplets.begin(), tLast;
    mCloth.mVirtualParticleIndices.resize(0);
    mCloth.mVirtualParticleIndices.reserve(indices.size() + 3 * uint32_t(sEnd - sIt));
    for(; sIt != sEnd; ++sIt)
    {
        uint32_t setSize = *sIt;
        for(tLast = tIt + setSize; tIt != tLast; ++tIt, ++mCloth.mNumVirtualParticles)
            mCloth.mVirtualParticleIndices.pushBack(Vec4us(*tIt));
        // pad each set to a multiple of 4 triplets with dummies
        mCloth.mVirtualParticleIndices.resize((mCloth.mVirtualParticleIndices.size() + 3) & ~3, dummy);
    }
    // shrink-to-fit: rebuild the vector so capacity matches size
    Vector<Vec4us>::Type(mCloth.mVirtualParticleIndices.begin(), mCloth.mVirtualParticleIndices.end())
        .swap(mCloth.mVirtualParticleIndices);

    // precompute 1/dot(w,w), stored in the w component alongside the weights
    Vec4fAlignedVector().swap(mCloth.mVirtualParticleWeights);
    mCloth.mVirtualParticleWeights.reserve(weights.size());
    for(; !weights.empty(); weights.popFront())
    {
        PxVec3 w = reinterpret_cast<const PxVec3&>(weights.front());
        float scale = 1 / w.magnitudeSquared();
        mCloth.mVirtualParticleWeights.pushBack(PxVec4(w.x, w.y, w.z, scale));
    }

    mCloth.notifyChanged();
}
+
#if APEX_UE4
// UE4 path: forward one simulation step to the statically registered solver
// callback, passing the task object stored on the cloth.
template <>
void ClothImpl<SwCloth>::simulate(float dt)
{
    (*SwCloth::sSimulationFunction)(mCloth.mSimulationTask, dt);
}
#endif
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h
new file mode 100644
index 00000000..3d0569af
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCloth.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Cloth.h"
+#include "Range.h"
+#include "MovingAverage.h"
+#include "PhaseConfig.h"
+#include "IndexPair.h"
+#include "Vec4T.h"
+#include "Array.h"
+#include "PxTransform.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
+class SwFabric;
+class SwFactory;
+#if APEX_UE4
+class SwCloth;
+#endif
+
+typedef AlignedVector<PxVec4, 16>::Type Vec4fAlignedVector;
+
+struct SwConstraints
+{
+ void pop()
+ {
+ if(!mTarget.empty())
+ {
+ mStart.swap(mTarget);
+ mTarget.resize(0);
+ }
+ }
+
+ Vec4fAlignedVector mStart;
+ Vec4fAlignedVector mTarget;
+};
+
// Software (CPU) cloth instance: bulk simulation state shared with the SW
// solver. Intentionally a plain data holder — members are public and mutated
// directly by ClothImpl<SwCloth> and the solver.
class SwCloth
{
    SwCloth& operator=(const SwCloth&); // not implemented
    // no-op lock: the CPU implementation needs no context locking, but the
    // shared ClothImpl code expects a ContextLockType
    struct SwContextLock
    {
        SwContextLock(const SwFactory&)
        {
        }
    };

  public:
    typedef SwFactory FactoryType;
    typedef SwFabric FabricType;
    typedef SwContextLock ContextLockType;

    typedef Vec4fAlignedVector& MappedVec4fVectorType;
    typedef Vector<IndexPair>::Type& MappedIndexVectorType;

    SwCloth(SwFactory&, SwFabric&, Range<const PxVec4>);
    SwCloth(SwFactory&, const SwCloth&);
    ~SwCloth(); // not virtual on purpose

  public:
    // asleep once enough consecutive sleep tests passed
    bool isSleeping() const
    {
        return mSleepPassCounter >= mSleepAfterCount;
    }
    // resets the pass counter so the cloth keeps simulating
    void wakeUp()
    {
        mSleepPassCounter = 0;
    }

    // no-op here; GPU implementations use this to invalidate device copies
    void notifyChanged()
    {
    }

    void setParticleBounds(const float*);

    Range<PxVec4> push(SwConstraints&);
    static void clear(SwConstraints&);

    static Range<const PxVec3> clampTriangleCount(Range<const PxVec3>, uint32_t);

  public:
    SwFactory& mFactory;
    SwFabric& mFabric;

    bool mClothCostDirty;

    // current and previous-iteration particle positions
    Vec4fAlignedVector mCurParticles;
    Vec4fAlignedVector mPrevParticles;

    // axis-aligned particle bounds, stored as center + half extent
    PxVec3 mParticleBoundsCenter;
    PxVec3 mParticleBoundsHalfExtent;

    PxVec3 mGravity;
    PxVec3 mLogDamping;
    PxVec3 mLinearLogDrag;
    PxVec3 mAngularLogDrag;
    PxVec3 mLinearInertia;
    PxVec3 mAngularInertia;
    PxVec3 mCentrifugalInertia;
    float mSolverFrequency;
    float mStiffnessFrequency;

    PxTransform mTargetMotion;
    PxTransform mCurrentMotion;
    PxVec3 mLinearVelocity;
    PxVec3 mAngularVelocity;

    float mPrevIterDt;
    MovingAverage mIterDtAvg;

    Vector<PhaseConfig>::Type mPhaseConfigs; // transformed!

    // tether constraints stuff
    float mTetherConstraintLogStiffness;
    float mTetherConstraintScale;

    // motion constraints stuff
    SwConstraints mMotionConstraints;
    float mMotionConstraintScale;
    float mMotionConstraintBias;
    float mMotionConstraintLogStiffness;

    // separation constraints stuff
    SwConstraints mSeparationConstraints;

    // particle acceleration stuff
    Vec4fAlignedVector mParticleAccelerations;

    // collision stuff
    Vector<IndexPair>::Type mCapsuleIndices;
    Vec4fAlignedVector mStartCollisionSpheres;
    Vec4fAlignedVector mTargetCollisionSpheres;
    Vector<uint32_t>::Type mConvexMasks;
    Vec4fAlignedVector mStartCollisionPlanes;
    Vec4fAlignedVector mTargetCollisionPlanes;
    Vector<PxVec3>::Type mStartCollisionTriangles;
    Vector<PxVec3>::Type mTargetCollisionTriangles;
    bool mEnableContinuousCollision;
    float mCollisionMassScale;
    float mFriction;

    // virtual particles
    Vector<Vec4us>::Type mVirtualParticleIndices;
    Vec4fAlignedVector mVirtualParticleWeights;
    uint32_t mNumVirtualParticles;

    // self collision
    float mSelfCollisionDistance;
    float mSelfCollisionLogStiffness;

    Vector<uint32_t>::Type mSelfCollisionIndices;

    Vec4fAlignedVector mRestPositions;

    // sleeping
    uint32_t mSleepTestInterval; // how often to test for movement
    uint32_t mSleepAfterCount;   // number of tests to pass before sleep
    float mSleepThreshold;       // max movement delta to pass test
    uint32_t mSleepPassCounter;  // how many tests passed
    uint32_t mSleepTestCounter;  // how many iterations since tested

    // unused for CPU simulation
    bool mIsAllowedHalfPrecisionSolver;

#if APEX_UE4
    void* mSimulationTask;
    static void (*const sSimulationFunction)(void*, float);
#endif

    void* mUserData;

} PX_ALIGN_SUFFIX(16);
+
+} // namespace cloth
+
+// bounds = lower[3], upper[3]
+inline void cloth::SwCloth::setParticleBounds(const float* bounds)
+{
+ for(uint32_t i = 0; i < 3; ++i)
+ {
+ mParticleBoundsCenter[i] = (bounds[3 + i] + bounds[i]) * 0.5f;
+ mParticleBoundsHalfExtent[i] = (bounds[3 + i] - bounds[i]) * 0.5f;
+ }
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.cpp
new file mode 100644
index 00000000..bc09612f
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwClothData.h"
+#include "SwCloth.h"
+#include "SwFabric.h"
+#include "Simd4f.h"
+#include "PsUtilities.h"
+
+using namespace nvidia;
+
// Snapshot raw pointers/counts into the cloth and fabric state for the solver
// (SwClothData is POD; none of these pointers own memory). Must be rebuilt
// whenever the cloth's vectors may have reallocated.
cloth::SwClothData::SwClothData(SwCloth& cloth, const SwFabric& fabric)
{
    mNumParticles = uint32_t(cloth.mCurParticles.size());
    mCurParticles = array(cloth.mCurParticles.front());
    mPrevParticles = array(cloth.mPrevParticles.front());

    // convert center/half-extent bounds to lower[3], upper[3]
    const float* center = array(cloth.mParticleBoundsCenter);
    const float* extent = array(cloth.mParticleBoundsHalfExtent);
    for(uint32_t i = 0; i < 3; ++i)
    {
        mCurBounds[i] = center[i] - extent[i];
        mCurBounds[i + 3] = center[i] + extent[i];
    }

    // avoid reading uninitialized data from mPrevBounds, even though it's
    // never used (note: the original comment said mCurBounds)
    mPrevBounds[0] = 0.0f;

    mConfigBegin = cloth.mPhaseConfigs.empty() ? 0 : &cloth.mPhaseConfigs.front();
    mConfigEnd = mConfigBegin + cloth.mPhaseConfigs.size();

    mPhases = &fabric.mPhases.front();
    mNumPhases = uint32_t(fabric.mPhases.size());

    mSets = &fabric.mSets.front();
    mNumSets = uint32_t(fabric.mSets.size());

    mRestvalues = &fabric.mRestvalues.front();
    mNumRestvalues = uint32_t(fabric.mRestvalues.size());

    mIndices = &fabric.mIndices.front();
    mNumIndices = uint32_t(fabric.mIndices.size());

    // converts per-second log2 stiffness to a per-iteration fraction below:
    // 1 - exp(logStiffness * dt * freq * ln2) == 1 - 2^(logStiffness * dt * freq)
    float stiffnessExponent = cloth.mStiffnessFrequency * cloth.mPrevIterDt * 0.69314718055994531f; // logf(2.0f);

    mTethers = fabric.mTethers.begin();
    mNumTethers = uint32_t(fabric.mTethers.size());
    mTetherConstraintStiffness = 1.0f - exp(stiffnessExponent * cloth.mTetherConstraintLogStiffness);
    mTetherConstraintScale = cloth.mTetherConstraintScale * fabric.mTetherLengthScale;

    mStartMotionConstraints = cloth.mMotionConstraints.mStart.size() ? array(cloth.mMotionConstraints.mStart.front()) : 0;
    mTargetMotionConstraints =
        !cloth.mMotionConstraints.mTarget.empty() ? array(cloth.mMotionConstraints.mTarget.front()) : 0;
    mMotionConstraintStiffness = 1.0f - exp(stiffnessExponent * cloth.mMotionConstraintLogStiffness);

    mStartSeparationConstraints =
        cloth.mSeparationConstraints.mStart.size() ? array(cloth.mSeparationConstraints.mStart.front()) : 0;
    mTargetSeparationConstraints =
        !cloth.mSeparationConstraints.mTarget.empty() ? array(cloth.mSeparationConstraints.mTarget.front()) : 0;

    mParticleAccelerations = cloth.mParticleAccelerations.size() ? array(cloth.mParticleAccelerations.front()) : 0;

    // collision shapes: when no target is set, the start pointer doubles as
    // the target (static shapes, no interpolation)
    mStartCollisionSpheres = cloth.mStartCollisionSpheres.empty() ? 0 : array(cloth.mStartCollisionSpheres.front());
    mTargetCollisionSpheres =
        cloth.mTargetCollisionSpheres.empty() ? mStartCollisionSpheres : array(cloth.mTargetCollisionSpheres.front());
    mNumSpheres = uint32_t(cloth.mStartCollisionSpheres.size());

    mCapsuleIndices = cloth.mCapsuleIndices.empty() ? 0 : &cloth.mCapsuleIndices.front();
    mNumCapsules = uint32_t(cloth.mCapsuleIndices.size());

    mStartCollisionPlanes = cloth.mStartCollisionPlanes.empty() ? 0 : array(cloth.mStartCollisionPlanes.front());
    mTargetCollisionPlanes =
        cloth.mTargetCollisionPlanes.empty() ? mStartCollisionPlanes : array(cloth.mTargetCollisionPlanes.front());
    mNumPlanes = uint32_t(cloth.mStartCollisionPlanes.size());

    mConvexMasks = cloth.mConvexMasks.empty() ? 0 : &cloth.mConvexMasks.front();
    mNumConvexes = uint32_t(cloth.mConvexMasks.size());

    mStartCollisionTriangles = cloth.mStartCollisionTriangles.empty() ? 0 : array(cloth.mStartCollisionTriangles.front());
    mTargetCollisionTriangles = cloth.mTargetCollisionTriangles.empty() ? mStartCollisionTriangles
                                                                        : array(cloth.mTargetCollisionTriangles.front());
    mNumTriangles = uint32_t(cloth.mStartCollisionTriangles.size()) / 3;

    mVirtualParticlesBegin = cloth.mVirtualParticleIndices.empty() ? 0 : array(cloth.mVirtualParticleIndices.front());
    mVirtualParticlesEnd = mVirtualParticlesBegin + 4 * cloth.mVirtualParticleIndices.size();
    mVirtualParticleWeights = cloth.mVirtualParticleWeights.empty() ? 0 : array(cloth.mVirtualParticleWeights.front());
    mNumVirtualParticleWeights = uint32_t(cloth.mVirtualParticleWeights.size());

    mEnableContinuousCollision = cloth.mEnableContinuousCollision;
    mCollisionMassScale = cloth.mCollisionMassScale;
    mFrictionScale = cloth.mFriction;

    mSelfCollisionDistance = cloth.mSelfCollisionDistance;
    mSelfCollisionStiffness = 1.0f - exp(stiffnessExponent * cloth.mSelfCollisionLogStiffness);

    // with no explicit index list, all particles participate in self collision
    mSelfCollisionIndices = cloth.mSelfCollisionIndices.empty() ? 0 : cloth.mSelfCollisionIndices.begin();
    mNumSelfCollisionIndices = mSelfCollisionIndices ? cloth.mSelfCollisionIndices.size() : mNumParticles;

    mRestPositions = cloth.mRestPositions.size() ? array(cloth.mRestPositions.front()) : 0;

    mSleepPassCounter = cloth.mSleepPassCounter;
    mSleepTestCounter = cloth.mSleepTestCounter;
}
+
// Write the values the solver mutated during simulation back to the cloth.
void cloth::SwClothData::reconcile(SwCloth& cloth) const
{
    cloth.setParticleBounds(mCurBounds);
    cloth.mSleepTestCounter = mSleepTestCounter;
    cloth.mSleepPassCounter = mSleepPassCounter;
}
+
// Sanity-check the snapshot; must run after the constructor because the
// asserts read the pointer/count members initialized there.
void cloth::SwClothData::verify() const
{
    // every capsule index must refer to an existing collision sphere
    // NOTE(review): '&(mCapsuleIndices + mNumCapsules)->first' forms a member
    // address off the one-past-the-end element — works on the supported
    // compilers but is technically out of contract; confirm acceptable.
    PX_ASSERT(!mNumCapsules ||
              mNumSpheres > *nvidia::maxElement(&mCapsuleIndices->first, &(mCapsuleIndices + mNumCapsules)->first));

    // convex masks may only reference existing collision planes
    PX_ASSERT(!mNumConvexes || (1u << mNumPlanes) - 1 >= *nvidia::maxElement(mConvexMasks, mConvexMasks + mNumConvexes));
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.h
new file mode 100644
index 00000000..3aaa6a2b
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwClothData.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Px.h"
+#include "Types.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+class SwCloth;
+class SwFabric;
+struct PhaseConfig;
+struct IndexPair;
+struct SwTether;
+
// reference to cloth instance bulk data (POD)
// Non-owning pointer/count snapshot built from SwCloth + SwFabric (see
// SwClothData.cpp); consumed by the SW solver, then written back via
// reconcile(). Target pointers fall back to the corresponding start pointer
// when no target data was supplied.
struct SwClothData
{
    SwClothData(SwCloth&, const SwFabric&);
    void reconcile(SwCloth&) const;
    void verify() const;

    // particle data
    uint32_t mNumParticles;
    float* mCurParticles;
    float* mPrevParticles;

    float mCurBounds[6]; // lower[3], upper[3]
    float mPrevBounds[6];
    float mPadding; // write as simd

    // distance constraints
    const PhaseConfig* mConfigBegin;
    const PhaseConfig* mConfigEnd;

    const uint32_t* mPhases;
    uint32_t mNumPhases;

    const uint32_t* mSets;
    uint32_t mNumSets;

    const float* mRestvalues;
    uint32_t mNumRestvalues;

    const uint16_t* mIndices;
    uint32_t mNumIndices;

    const SwTether* mTethers;
    uint32_t mNumTethers;
    float mTetherConstraintStiffness;
    float mTetherConstraintScale;

    // motion constraint data
    const float* mStartMotionConstraints;
    const float* mTargetMotionConstraints;
    float mMotionConstraintStiffness;

    // separation constraint data
    const float* mStartSeparationConstraints;
    const float* mTargetSeparationConstraints;

    // particle acceleration data
    const float* mParticleAccelerations;

    // collision stuff
    const float* mStartCollisionSpheres;
    const float* mTargetCollisionSpheres;
    uint32_t mNumSpheres;

    const IndexPair* mCapsuleIndices;
    uint32_t mNumCapsules;

    const float* mStartCollisionPlanes;
    const float* mTargetCollisionPlanes;
    uint32_t mNumPlanes;

    const uint32_t* mConvexMasks;
    uint32_t mNumConvexes;

    const float* mStartCollisionTriangles;
    const float* mTargetCollisionTriangles;
    uint32_t mNumTriangles;

    const uint16_t* mVirtualParticlesBegin;
    const uint16_t* mVirtualParticlesEnd;

    const float* mVirtualParticleWeights;
    uint32_t mNumVirtualParticleWeights;

    bool mEnableContinuousCollision;
    float mFrictionScale;
    float mCollisionMassScale;

    float mSelfCollisionDistance;
    float mSelfCollisionStiffness;

    // zero indices means all particles self-collide (see constructor)
    uint32_t mNumSelfCollisionIndices;
    const uint32_t* mSelfCollisionIndices;

    float* mRestPositions;

    // sleep data
    uint32_t mSleepPassCounter;
    uint32_t mSleepTestCounter;

} PX_ALIGN_SUFFIX(16);
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.cpp
new file mode 100644
index 00000000..581d276b
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.cpp
@@ -0,0 +1,1927 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwCollision.h"
+#include "SwCloth.h"
+#include "SwClothData.h"
+#include "IterationState.h"
+#include "BoundingBox.h"
+#include "PointInterpolator.h"
+#include "SwCollisionHelpers.h"
+#include "PxAssert.h"
+#include <string.h> // for memset
+
+using namespace nvidia;
+
+// the particle trajectory needs to penetrate more than 0.2 * radius to trigger continuous collision
+template <typename Simd4f>
+const Simd4f cloth::SwCollision<Simd4f>::sSkeletonWidth = simd4f(sqr(1 - 0.2f) - 1);
+
+#if NVMATH_SSE2
+const Simd4i cloth::Gather<Simd4i>::sIntSignBit = simd4i(_sign);
+const Simd4i cloth::Gather<Simd4i>::sSignedMask = sIntSignBit | simd4i(0x7);
+#elif NVMATH_NEON
+const Simd4i cloth::Gather<Simd4i>::sPack = simd4i(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c);
+const Simd4i cloth::Gather<Simd4i>::sOffset = simd4i(0x03020100);
+const Simd4i cloth::Gather<Simd4i>::sShift = simd4i(detail::IntType<2>());
+const Simd4i cloth::Gather<Simd4i>::sMask = simd4i(detail::IntType<7>());
+#endif
+
+namespace
+{
+// Factory type producing lane-replicated / per-lane Simd4f constants at namespace scope.
+typedef Simd4fFactory<detail::FourTuple> Simd4fConstant;
+
+const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON);
+const Simd4fConstant sMax = simd4f(FLT_MAX);
+// per-lane select masks (all bits set in the named lane only)
+const Simd4fConstant sMaskX = simd4f(simd4i(~0, 0, 0, 0));
+const Simd4fConstant sMaskZ = simd4f(simd4i(0, 0, ~0, 0));
+const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0));
+const Simd4fConstant sZero = simd4f(0.0f);
+const Simd4fConstant sOne = simd4f(1.0f);
+const Simd4fConstant sNegOne = simd4f(-1.0f);
+const Simd4fConstant sHalf = simd4f(0.5f);
+const Simd4fConstant sOneXYZ = simd4f(1.0f, 1.0f, 1.0f, 0.0f);
+// strict upper bound for grid coordinates: just below sGridSize so intFloor stays in [0, 7]
+const Simd4fConstant sGridLength = simd4f(8 - 1e-3f); // sGridSize
+const Simd4fConstant sGridExpand = simd4f(1e-4f);
+// -FLT_MAX in xyz, 0 in w; used both as a lower clamp and (negated) as an upper sentinel
+const Simd4fConstant sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+
+#if PX_PROFILE || PX_DEBUG
+// Sums the four lanes and rounds to the nearest integer (the +0.5f performs the rounding).
+// Only used for profiling/debug statistics.
+template <typename Simd4f>
+uint32_t horizontalSum(const Simd4f& x)
+{
+	const float* p = array(x);
+	return uint32_t(0.5f + p[0] + p[1] + p[2] + p[3]);
+}
+#endif
+
+// 7 elements are written to ptr!
+// Lower bound goes to ptr[0..3], upper bound to ptr[3..6]; ptr[3] is written twice
+// (upper overwrites the lower's 4th lane), so callers must store in this order.
+template <typename Simd4f>
+void storeBounds(float* ptr, const cloth::BoundingBox<Simd4f>& bounds)
+{
+	store(ptr, bounds.mLower);
+	store(ptr + 3, bounds.mUpper);
+}
+}
+
+// Collision sphere, laid out as 4 floats (xyz center + radius) so it can be
+// loaded as one aligned Simd4f with the radius in the w lane.
+struct cloth::SphereData
+{
+	PxVec3 center;
+	float radius;
+};
+
+// Precomputed capsule (tapered cone between two spheres) collision data,
+// built by generateCones() from a pair of SphereData entries.
+struct cloth::ConeData
+{
+	PxVec3 center;
+	float radius; // cone radius at center
+	PxVec3 axis;
+	float slope; // tan(alpha)
+
+	float sqrCosine; // cos^2(alpha)
+	float halfLength;
+
+	// bit masks identifying the capsule's end sphere(s) in the 32-bit sphere mask
+	uint32_t firstMask;
+	uint32_t bothMask;
+};
+
+// Precomputed collision triangle, built by generateTriangles(). Stores the base
+// vertex, the two edges with their dot products, the unit normal, and reciprocal
+// terms (det/denom/inv-square-lengths) used for closest-point computation.
+struct cloth::TriangleData
+{
+	PxVec3 base;
+	float edge0DotEdge1;
+
+	PxVec3 edge0;
+	float edge0SqrLength;
+
+	PxVec3 edge1;
+	float edge1SqrLength;
+
+	PxVec3 normal;
+	float padding;
+
+	// reciprocals written together by one aligned 4-float store in generateTriangles()
+	float det;
+	float denom;
+
+	float edge0InvSqrLength;
+	float edge1InvSqrLength;
+};
+
+namespace nvidia
+{
+namespace cloth
+{
+// Grows bbox to enclose every sphere in [sIt, sEnd). Each sphere is loaded as
+// one Simd4f (center.xyz, radius in w) and the radius is splatted to all lanes
+// to inflate the box on each axis.
+template <typename Simd4f>
+BoundingBox<Simd4f> expandBounds(const BoundingBox<Simd4f>& bbox, const SphereData* sIt, const SphereData* sEnd)
+{
+	BoundingBox<Simd4f> result = bbox;
+	for(; sIt != sEnd; ++sIt)
+	{
+		Simd4f p = loadAligned(array(sIt->center));
+		Simd4f r = splat<3>(p);
+		result.mLower = min(result.mLower, p - r);
+		result.mUpper = max(result.mUpper, p + r);
+	}
+	return result;
+}
+}
+}
+
+namespace
+{
+// Copies count spheres from src into the aligned destination array, clamping
+// the radius (w lane) to be non-negative via the sMinusFloatMaxXYZ lower bound.
+template <typename Simd4f, typename SrcIterator>
+void generateSpheres(Simd4f* dIt, const SrcIterator& src, uint32_t count)
+{
+	// have to copy out iterator to ensure alignment is maintained
+	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
+		*dIt = max(sMinusFloatMaxXYZ, *sIt); // clamp radius to 0
+}
+
+// Builds one ConeData per capsule from its two end spheres: center/axis are the
+// midpoint/half-extent of the segment, slope is the radius taper per unit length.
+// Degenerate capsules (one sphere containing the other, sqrConeLength <= 0) are
+// disabled by zeroing the reciprocal lengths, which yields radius == 0.
+void generateCones(cloth::ConeData* dst, const cloth::SphereData* sourceSpheres, const cloth::IndexPair* capsuleIndices,
+                   uint32_t numCones)
+{
+	cloth::ConeData* cIt = dst;
+	for(const cloth::IndexPair* iIt = capsuleIndices, *iEnd = iIt + numCones; iIt != iEnd; ++iIt, ++cIt)
+	{
+		// SphereData is layout-compatible with PxVec4 (xyz center, w radius)
+		PxVec4 first = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->first]);
+		PxVec4 second = reinterpret_cast<const PxVec4&>(sourceSpheres[iIt->second]);
+
+		PxVec4 center = (second + first) * 0.5f;
+		PxVec4 axis = (second - first) * 0.5f;
+
+		float sqrAxisLength = axis.x * axis.x + axis.y * axis.y + axis.z * axis.z;
+		float sqrConeLength = sqrAxisLength - sqr(axis.w);
+
+		// NOTE: computed before the degenerate test below; may transiently be
+		// NaN/inf for degenerate capsules, then overwritten with 0.
+		float invAxisLength = 1 / sqrtf(sqrAxisLength);
+		float invConeLength = 1 / sqrtf(sqrConeLength);
+
+		if(sqrConeLength <= 0.0f)
+			invAxisLength = invConeLength = 0.0f;
+
+		float axisLength = sqrAxisLength * invAxisLength;
+		float slope = axis.w * invConeLength;
+
+		cIt->center = PxVec3(center.x, center.y, center.z);
+		cIt->radius = (axis.w + first.w) * invConeLength * axisLength;
+		cIt->axis = PxVec3(axis.x, axis.y, axis.z) * invAxisLength;
+		cIt->slope = slope;
+
+		cIt->sqrCosine = 1.0f - sqr(axis.w * invAxisLength);
+		cIt->halfLength = axisLength;
+
+		// one bit per sphere index; assumes sphere indices < 32 — matches the
+		// 32-bit grid masks used throughout this file
+		uint32_t firstMask = 0x1u << iIt->first;
+		cIt->firstMask = firstMask;
+		cIt->bothMask = firstMask | 0x1u << iIt->second;
+	}
+}
+
+// Straight copy of count plane equations into the aligned destination array.
+template <typename Simd4f, typename SrcIterator>
+void generatePlanes(Simd4f* dIt, const SrcIterator& src, uint32_t count)
+{
+	// have to copy out iterator to ensure alignment is maintained
+	for(SrcIterator sIt = src; 0 < count--; ++sIt, ++dIt)
+		*dIt = *sIt;
+}
+
+// Consumes 3 vertices per triangle from src and fills one TriangleData each:
+// edges, unit normal, and the reciprocal block (det, denom, inv edge lengths)
+// computed in one recipT and written by one aligned store.
+template <typename Simd4f, typename SrcIterator>
+void generateTriangles(cloth::TriangleData* dIt, const SrcIterator& src, uint32_t count)
+{
+	// have to copy out iterator to ensure alignment is maintained
+	for(SrcIterator sIt = src; 0 < count--; ++dIt)
+	{
+		Simd4f p0 = *sIt;
+		++sIt;
+		Simd4f p1 = *sIt;
+		++sIt;
+		Simd4f p2 = *sIt;
+		++sIt;
+
+		Simd4f edge0 = p1 - p0;
+		Simd4f edge1 = p2 - p0;
+		Simd4f normal = cross3(edge0, edge1);
+
+		Simd4f edge0SqrLength = dot3(edge0, edge0);
+		Simd4f edge1SqrLength = dot3(edge1, edge1);
+		Simd4f edge0DotEdge1 = dot3(edge0, edge1);
+		Simd4f normalInvLength = rsqrt(dot3(normal, normal));
+
+		Simd4f det = edge0SqrLength * edge1SqrLength - edge0DotEdge1 * edge0DotEdge1;
+		Simd4f denom = edge0SqrLength + edge1SqrLength - edge0DotEdge1 - edge0DotEdge1;
+
+		// there are definitely faster ways...
+		// pack (det, denom, edge0SqrLength, edge1SqrLength) into one vector so a
+		// single recipT<1> produces all four reciprocals
+		Simd4f aux = select(sMaskX, det, denom);
+		aux = select(sMaskZ, edge0SqrLength, aux);
+		aux = select(sMaskW, edge1SqrLength, aux);
+
+		// each store packs a vec3 plus the related scalar into the w slot
+		storeAligned(&dIt->base.x, select(sMaskW, edge0DotEdge1, p0));
+		storeAligned(&dIt->edge0.x, select(sMaskW, edge0SqrLength, edge0));
+		storeAligned(&dIt->edge1.x, select(sMaskW, edge1SqrLength, edge1));
+		storeAligned(&dIt->normal.x, normal * normalInvLength);
+		storeAligned(&dIt->det, recipT<1>(aux));
+	}
+}
+
+} // namespace
+
+// Starts with null sphere/cone arrays; allocate() fills them in.
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::CollisionData::CollisionData()
+: mSpheres(0), mCones(0)
+{
+}
+
+// Allocates the current-frame collision shape buffers. When continuous
+// collision or friction is enabled, also allocates and pre-fills the
+// previous-frame buffers from the start-of-frame sphere positions, since both
+// features need last-iteration shape data.
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::SwCollision(SwClothData& clothData, SwKernelAllocator& alloc, profile::PxProfileZone* profiler)
+: mClothData(clothData), mAllocator(alloc), mProfiler(profiler)
+{
+	allocate(mCurData);
+
+	if(mClothData.mEnableContinuousCollision || mClothData.mFrictionScale > 0.0f)
+	{
+		allocate(mPrevData);
+
+		generateSpheres(reinterpret_cast<Simd4f*>(mPrevData.mSpheres),
+		                reinterpret_cast<const Simd4f*>(clothData.mStartCollisionSpheres), clothData.mNumSpheres);
+
+		generateCones(mPrevData.mCones, mPrevData.mSpheres, clothData.mCapsuleIndices, clothData.mNumCapsules);
+	}
+}
+
+// Releases both frame buffers; deallocate() is safe on the (possibly null)
+// mPrevData pointers.
+template <typename Simd4f>
+cloth::SwCollision<Simd4f>::~SwCollision()
+{
+	deallocate(mCurData);
+	deallocate(mPrevData);
+}
+
+// Runs one solver-iteration collision pass: discrete convex and triangle
+// collision first, then bounds update, then sphere/capsule collision through
+// the grid acceleration structure built by buildAcceleration().
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::operator()(const IterationState<Simd4f>& state)
+{
+	mNumCollisions = 0;
+
+	collideConvexes(state); // discrete convex collision, no friction
+	collideTriangles(state); // discrete triangle collision, no friction
+
+	computeBounds();
+
+	// nothing below applies without collision spheres
+	if(!mClothData.mNumSpheres)
+		return;
+
+	bool lastIteration = state.mRemainingIterations == 1;
+
+	const Simd4f* targetSpheres = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionSpheres);
+
+	// generate sphere and cone collision data
+	if(!lastIteration)
+	{
+		// interpolate spheres
+		LerpIterator<Simd4f, const Simd4f*> pIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionSpheres),
+		                                          targetSpheres, state.getCurrentAlpha());
+		generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), pIter, mClothData.mNumSpheres);
+	}
+	else
+	{
+		// otherwise use the target spheres directly
+		generateSpheres(reinterpret_cast<Simd4f*>(mCurData.mSpheres), targetSpheres, mClothData.mNumSpheres);
+	}
+
+	// generate cones even if test below fails because
+	// continuous collision might need it in next iteration
+	generateCones(mCurData.mCones, mCurData.mSpheres, mClothData.mCapsuleIndices, mClothData.mNumCapsules);
+
+	// buildAcceleration() returns false when spheres and particles don't overlap
+	if(buildAcceleration())
+	{
+		if(mClothData.mEnableContinuousCollision)
+			collideContinuousParticles();
+
+		// collapse the per-axis right/left grids into single overlap masks
+		mergeAcceleration((uint32_t*)mSphereGrid);
+		mergeAcceleration((uint32_t*)mConeGrid);
+
+		if(!mClothData.mEnableContinuousCollision)
+			collideParticles();
+
+		collideVirtualParticles();
+	}
+
+	// keep this iteration's shapes as "previous" for CCD/friction next time
+	if(mPrevData.mSpheres)
+		nvidia::swap(mCurData, mPrevData);
+}
+
+// Upper bound on scratch memory needed per frame: triangle data and plane data
+// occupy the same scratch space at different times, so only the maximum counts.
+template <typename Simd4f>
+size_t cloth::SwCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth)
+{
+	size_t numTriangles = cloth.mStartCollisionTriangles.size();
+	size_t numPlanes = cloth.mStartCollisionPlanes.size();
+
+	const size_t kTriangleDataSize = sizeof(TriangleData) * numTriangles;
+	const size_t kPlaneDataSize = sizeof(PxVec4) * numPlanes * 2; // start + target planes
+
+	return PxMax(kTriangleDataSize, kPlaneDataSize);
+}
+
+// Persistent memory for sphere/cone buffers; the factor 2 accounts for the
+// current- and previous-frame copies (mCurData/mPrevData).
+template <typename Simd4f>
+size_t cloth::SwCollision<Simd4f>::estimatePersistentMemory(const SwCloth& cloth)
+{
+	size_t numCapsules = cloth.mCapsuleIndices.size();
+	size_t numSpheres = cloth.mStartCollisionSpheres.size();
+
+	size_t sphereDataSize = sizeof(SphereData) * numSpheres * 2;
+	size_t coneDataSize = sizeof(ConeData) * numCapsules * 2;
+
+	return sphereDataSize + coneDataSize;
+}
+
+// Allocates one frame's worth of sphere and cone buffers from the kernel allocator.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::allocate(CollisionData& data)
+{
+	data.mSpheres = static_cast<SphereData*>(mAllocator.allocate(sizeof(SphereData) * mClothData.mNumSpheres));
+
+	data.mCones = static_cast<ConeData*>(mAllocator.allocate(sizeof(ConeData) * mClothData.mNumCapsules));
+}
+
+// Returns a frame's buffers to the kernel allocator (pointers may be null for
+// a never-allocated mPrevData).
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::deallocate(const CollisionData& data)
+{
+	mAllocator.deallocate(data.mSpheres);
+	mAllocator.deallocate(data.mCones);
+}
+
+// Computes the particle bounding box for this iteration, restores each
+// particle's w component (inverse mass; see select below) from the previous
+// positions, and rotates the stored current bounds into the previous slot.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::computeBounds()
+{
+#if PX_PROFILE
+	ProfileZone zone("cloth::SwSolverKernel::computeBounds", mProfiler);
+#endif
+
+	Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+	Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f* curEnd = curIt + mClothData.mNumParticles;
+	// (FLT_MAX, FLT_MAX, FLT_MAX, 0): only the w lane of a particle can exceed it
+	Simd4f floatMaxXYZ = -(Simd4f)sMinusFloatMaxXYZ;
+
+	Simd4f lower = simd4f(FLT_MAX), upper = -lower;
+	for(; curIt < curEnd; ++curIt, ++prevIt)
+	{
+		Simd4f current = *curIt;
+		lower = min(lower, current);
+		upper = max(upper, current);
+		// if(current.w > 0) current.w = previous.w
+		*curIt = select(current > floatMaxXYZ, *prevIt, current);
+	}
+
+	BoundingBox<Simd4f> curBounds;
+	curBounds.mLower = lower;
+	curBounds.mUpper = upper;
+
+	// don't change this order, storeBounds writes 7 floats
+	BoundingBox<Simd4f> prevBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
+	storeBounds(mClothData.mCurBounds, curBounds);
+	storeBounds(mClothData.mPrevBounds, prevBounds);
+}
+
+namespace
+{
+// Per-lane: all-ones where (left & ~right) == 0, i.e. where every set bit of
+// left is also set in right; zero elsewhere.
+template <typename Simd4i>
+Simd4i andNotIsZero(const Simd4i& left, const Simd4i& right)
+{
+	return simdi::operator==(left & ~right, simd4i(_0));
+}
+}
+
+// build per-axis mask arrays of spheres on the right/left of grid cell
+// For each sphere (one bit per sphere, so at most 32), OR its bit into every
+// cell at-or-right-of its lower cell (first 3*sGridSize words) and every cell
+// at-or-left-of its upper cell (next 3*sGridSize words), per axis. ANDing both
+// halves later (mergeAcceleration) yields cells the sphere actually overlaps.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::buildSphereAcceleration(const SphereData* sIt)
+{
+	static const int maxIndex = sGridSize - 1;
+
+	const SphereData* sEnd = sIt + mClothData.mNumSpheres;
+	for(uint32_t mask = 0x1; sIt != sEnd; ++sIt, mask <<= 1)
+	{
+		Simd4f sphere = loadAligned(array(sIt->center));
+		Simd4f radius = splat<3>(sphere);
+
+		// sphere extents in grid coordinates, clamped to [0, sGridLength)
+		Simd4i first = intFloor(max((sphere - radius) * mGridScale + mGridBias, sZero));
+		Simd4i last = intFloor(min((sphere + radius) * mGridScale + mGridBias, sGridLength));
+
+		const int* firstIdx = simdi::array(first);
+		const int* lastIdx = simdi::array(last);
+
+		uint32_t* firstIt = (uint32_t*)mSphereGrid;
+		uint32_t* lastIt = firstIt + 3 * sGridSize;
+
+		for(uint32_t i = 0; i < 3; ++i, firstIt += sGridSize, lastIt += sGridSize)
+		{
+			for(int j = firstIdx[i]; j <= maxIndex; ++j)
+				firstIt[j] |= mask;
+
+			for(int j = lastIdx[i]; j >= 0; --j)
+				lastIt[j] |= mask;
+		}
+	}
+}
+
+// generate cone masks from sphere masks
+// A cone occupies every grid cell touched by either of its end spheres
+// (bothMask), so its bit is ORed wherever those sphere bits appear.
+// Degenerate cones (radius == 0, see generateCones) are skipped entirely.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::buildConeAcceleration()
+{
+	const ConeData* coneIt = mCurData.mCones;
+	const ConeData* coneEnd = coneIt + mClothData.mNumCapsules;
+	for(uint32_t coneMask = 0x1; coneIt != coneEnd; ++coneIt, coneMask <<= 1)
+	{
+		if(coneIt->radius == 0.0f)
+			continue;
+
+		uint32_t spheresMask = coneIt->bothMask;
+
+		uint32_t* sphereIt = (uint32_t*)mSphereGrid;
+		uint32_t* sphereEnd = sphereIt + 6 * sGridSize;
+		uint32_t* gridIt = (uint32_t*)mConeGrid;
+		for(; sphereIt != sphereEnd; ++sphereIt, ++gridIt)
+			if(*sphereIt & spheresMask)
+				*gridIt |= coneMask;
+	}
+}
+
+// convert right/left mask arrays into single overlap array
+// ANDs the second half (3*sGridSize "left-of" words) into the first half
+// ("right-of" words) in place, leaving per-cell overlap masks in the first half.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::mergeAcceleration(uint32_t* firstIt)
+{
+	uint32_t* firstEnd = firstIt + 3 * sGridSize;
+	uint32_t* lastIt = firstEnd;
+	for(; firstIt != firstEnd; ++firstIt, ++lastIt)
+		*firstIt &= *lastIt;
+}
+
+// build mask of spheres/cones touching a regular grid along each axis
+// Computes the sphere/particle overlap region, derives the world-to-grid
+// transform (mGridScale/mGridBias), and fills the sphere and cone grids.
+// Returns false when spheres and particles don't overlap at all, in which
+// case sphere/capsule collision can be skipped this iteration.
+template <typename Simd4f>
+bool cloth::SwCollision<Simd4f>::buildAcceleration()
+{
+	// determine sphere bbox
+	BoundingBox<Simd4f> sphereBounds =
+	    expandBounds(emptyBounds<Simd4f>(), mCurData.mSpheres, mCurData.mSpheres + mClothData.mNumSpheres);
+	BoundingBox<Simd4f> particleBounds = loadBounds<Simd4f>(mClothData.mCurBounds);
+	if(mClothData.mEnableContinuousCollision)
+	{
+		// CCD sweeps span both frames, so include previous-frame shapes/particles
+		sphereBounds = expandBounds(sphereBounds, mPrevData.mSpheres, mPrevData.mSpheres + mClothData.mNumSpheres);
+		particleBounds = expandBounds(particleBounds, loadBounds<Simd4f>(mClothData.mPrevBounds));
+	}
+
+	BoundingBox<Simd4f> bounds = intersectBounds(sphereBounds, particleBounds);
+	Simd4f edgeLength = (bounds.mUpper - bounds.mLower) & ~(Simd4f)sMaskW;
+	// negative edge length on any axis means the boxes don't intersect
+	if(!allGreaterEqual(edgeLength, simd4f(_0)))
+		return false;
+
+	// calculate an expanded bounds to account for numerical inaccuracy
+	const Simd4f expandedLower = bounds.mLower - abs(bounds.mLower) * sGridExpand;
+	const Simd4f expandedUpper = bounds.mUpper + abs(bounds.mUpper) * sGridExpand;
+	const Simd4f expandedEdgeLength = max(expandedUpper - expandedLower, sEpsilon);
+
+	// make grid minimal thickness and strict upper bound of spheres
+	mGridScale = sGridLength * recipT<1>(expandedEdgeLength);
+	mGridBias = -expandedLower * mGridScale;
+	array(mGridBias)[3] = 1.0f; // needed for collideVirtualParticles()
+
+	PX_ASSERT(allTrue(((bounds.mLower * mGridScale + mGridBias) >= simd4f(0.0f)) | sMaskW));
+	PX_ASSERT(allTrue(((bounds.mUpper * mGridScale + mGridBias) < simd4f(8.0f)) | sMaskW));
+
+	memset(mSphereGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
+	if(mClothData.mEnableContinuousCollision)
+		buildSphereAcceleration(mPrevData.mSpheres);
+	buildSphereAcceleration(mCurData.mSpheres);
+
+	memset(mConeGrid, 0, sizeof(uint32_t) * 6 * (sGridSize));
+	buildConeAcceleration();
+
+	return true;
+}
+
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#else
+#define FORCE_INLINE inline __attribute__((always_inline))
+#endif
+
+// Member-wise copy of the cone and sphere candidate masks.
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
+operator=(const ShapeMask& right)
+{
+	mCones = right.mCones;
+	mSpheres = right.mSpheres;
+	return *this;
+}
+
+// Intersects two candidate sets: a shape survives only if present in both masks.
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask& cloth::SwCollision<Simd4f>::ShapeMask::
+operator&=(const ShapeMask& right)
+{
+	mCones = mCones & right.mCones;
+	mSpheres = mSpheres & right.mSpheres;
+	return *this;
+}
+
+// Gathers the sphere/cone masks of the grid cells indexed by the floored
+// (grid-space) position, one lane per particle.
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f& position, const Simd4i* __restrict sphereGrid,
+                                         const Simd4i* __restrict coneGrid)
+{
+	Gather<Simd4i> gather(intFloor(position));
+
+	ShapeMask result;
+	result.mCones = gather(coneGrid);
+	result.mSpheres = gather(sphereGrid);
+	return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors
+// Transforms the 4 particles' x/y/z into grid space and intersects the
+// per-axis cell masks (grid stride is 2 Simd4i per axis).
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict positions) const
+{
+	Simd4f posX = positions[0] * splat<0>(mGridScale) + splat<0>(mGridBias);
+	Simd4f posY = positions[1] * splat<1>(mGridScale) + splat<1>(mGridBias);
+	Simd4f posZ = positions[2] * splat<2>(mGridScale) + splat<2>(mGridBias);
+
+	ShapeMask result = getShapeMask(posX, mSphereGrid, mConeGrid);
+	result &= getShapeMask(posY, mSphereGrid + 2, mConeGrid + 2);
+	result &= getShapeMask(posZ, mSphereGrid + 4, mConeGrid + 4);
+
+	return result;
+}
+
+// lookup acceleration structure and return mask of potential intersectors
+// Continuous-collision variant: covers the whole prev->cur motion segment by
+// looking up the max corner in the "right-of" grid half and the min corner in
+// the "left-of" half (offset +6..+10), both clamped to valid cell range.
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::ShapeMask
+cloth::SwCollision<Simd4f>::getShapeMask(const Simd4f* __restrict prevPos, const Simd4f* __restrict curPos) const
+{
+	Simd4f scaleX = splat<0>(mGridScale);
+	Simd4f scaleY = splat<1>(mGridScale);
+	Simd4f scaleZ = splat<2>(mGridScale);
+
+	Simd4f biasX = splat<0>(mGridBias);
+	Simd4f biasY = splat<1>(mGridBias);
+	Simd4f biasZ = splat<2>(mGridBias);
+
+	Simd4f prevX = prevPos[0] * scaleX + biasX;
+	Simd4f prevY = prevPos[1] * scaleY + biasY;
+	Simd4f prevZ = prevPos[2] * scaleZ + biasZ;
+
+	Simd4f curX = curPos[0] * scaleX + biasX;
+	Simd4f curY = curPos[1] * scaleY + biasY;
+	Simd4f curZ = curPos[2] * scaleZ + biasZ;
+
+	Simd4f maxX = min(max(prevX, curX), sGridLength);
+	Simd4f maxY = min(max(prevY, curY), sGridLength);
+	Simd4f maxZ = min(max(prevZ, curZ), sGridLength);
+
+	ShapeMask result = getShapeMask(maxX, mSphereGrid, mConeGrid);
+	result &= getShapeMask(maxY, mSphereGrid + 2, mConeGrid + 2);
+	result &= getShapeMask(maxZ, mSphereGrid + 4, mConeGrid + 4);
+
+	Simd4f zero = simd4f(_0);
+	Simd4f minX = max(min(prevX, curX), zero);
+	Simd4f minY = max(min(prevY, curY), zero);
+	Simd4f minZ = max(min(prevZ, curZ), zero);
+
+	result &= getShapeMask(minX, mSphereGrid + 6, mConeGrid + 6);
+	result &= getShapeMask(minY, mSphereGrid + 8, mConeGrid + 8);
+	result &= getShapeMask(minZ, mSphereGrid + 10, mConeGrid + 10);
+
+	return result;
+}
+
+// Accumulates per-particle (4 lanes) collision displacement, contact velocity
+// (for friction), and collision counts across all shapes a particle touches.
+// mNumCollisions starts at epsilon, presumably so a later divide-by-count
+// cannot divide by zero — TODO confirm against the consuming code.
+template <typename Simd4f>
+struct cloth::SwCollision<Simd4f>::ImpulseAccumulator
+{
+	ImpulseAccumulator()
+	: mDeltaX(simd4f(_0))
+	, mDeltaY(mDeltaX)
+	, mDeltaZ(mDeltaX)
+	, mVelX(mDeltaX)
+	, mVelY(mDeltaX)
+	, mVelZ(mDeltaX)
+	, mNumCollisions(sEpsilon)
+	{
+	}
+
+	// Adds scale * (x,y,z) for lanes selected by mask and bumps their counts.
+	void add(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
+	{
+		// v == v fails only for NaN, so these assert the masked lanes are NaN-free
+		PX_ASSERT(allTrue((mask & x) == (mask & x)));
+		PX_ASSERT(allTrue((mask & y) == (mask & y)));
+		PX_ASSERT(allTrue((mask & z) == (mask & z)));
+		PX_ASSERT(allTrue((mask & scale) == (mask & scale)));
+
+		Simd4f maskedScale = scale & mask;
+		mDeltaX = mDeltaX + x * maskedScale;
+		mDeltaY = mDeltaY + y * maskedScale;
+		mDeltaZ = mDeltaZ + z * maskedScale;
+		mNumCollisions = mNumCollisions + (simd4f(_1) & mask);
+	}
+
+	// Adds the shape's velocity for masked lanes; used for friction.
+	void addVelocity(const Simd4f& vx, const Simd4f& vy, const Simd4f& vz, const Simd4f& mask)
+	{
+		PX_ASSERT(allTrue((mask & vx) == (mask & vx)));
+		PX_ASSERT(allTrue((mask & vy) == (mask & vy)));
+		PX_ASSERT(allTrue((mask & vz) == (mask & vz)));
+
+		mVelX = mVelX + (vx & mask);
+		mVelY = mVelY + (vy & mask);
+		mVelZ = mVelZ + (vz & mask);
+	}
+
+	// Same as add() but with negated direction (push-out from spheres).
+	void subtract(const Simd4f& x, const Simd4f& y, const Simd4f& z, const Simd4f& scale, const Simd4f& mask)
+	{
+		PX_ASSERT(allTrue((mask & x) == (mask & x)));
+		PX_ASSERT(allTrue((mask & y) == (mask & y)));
+		PX_ASSERT(allTrue((mask & z) == (mask & z)));
+		PX_ASSERT(allTrue((mask & scale) == (mask & scale)));
+
+		Simd4f maskedScale = scale & mask;
+		mDeltaX = mDeltaX - x * maskedScale;
+		mDeltaY = mDeltaY - y * maskedScale;
+		mDeltaZ = mDeltaZ - z * maskedScale;
+		mNumCollisions = mNumCollisions + (simd4f(_1) & mask);
+	}
+
+	Simd4f mDeltaX, mDeltaY, mDeltaZ;
+	Simd4f mVelX, mVelY, mVelZ;
+	Simd4f mNumCollisions;
+};
+
+// Discrete sphere collision for 4 particles (SoA x/y/z in positions[0..2]).
+// Iterates the set bits of the combined sphere mask; for each sphere, lanes
+// with penetration (distance < radius) get a push-out impulse proportional to
+// the (negative) 1 - radius/distance scale.
+template <typename Simd4f>
+FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* positions,
+                                                             ImpulseAccumulator& accum) const
+{
+	const float* __restrict spherePtr = array(mCurData.mSpheres->center);
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	// OR the 4 lanes so each sphere is processed once for all 4 particles
+	Simd4i mask4 = horizontalOr(sphereMask);
+	uint32_t mask = uint32_t(simdi::array(mask4)[0]);
+	while(mask)
+	{
+		// extract lowest set bit -> byte offset of that sphere's data
+		uint32_t test = mask - 1;
+		uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
+		mask = mask & test;
+
+		Simd4f sphere = loadAligned(spherePtr, offset);
+
+		Simd4f deltaX = positions[0] - splat<0>(sphere);
+		Simd4f deltaY = positions[1] - splat<1>(sphere);
+		Simd4f deltaZ = positions[2] - splat<2>(sphere);
+
+		// epsilon keeps rsqrt finite when a particle sits exactly at the center
+		Simd4f sqrDistance = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+		Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * splat<3>(sphere);
+
+		// contactMask: lanes where the particle is inside the sphere
+		Simd4f contactMask;
+		if(!anyGreater(simd4f(_0), negativeScale, contactMask))
+			continue;
+
+		accum.subtract(deltaX, deltaY, deltaZ, negativeScale, contactMask);
+
+		if(frictionEnabled)
+		{
+			// load previous sphere pos
+			const float* __restrict prevSpherePtr = array(mPrevData.mSpheres->center);
+
+			Simd4f prevSphere = loadAligned(prevSpherePtr, offset);
+			Simd4f velocity = sphere - prevSphere;
+
+			accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
+		}
+	}
+}
+
+// Discrete cone (capsule side surface) collision for 4 particles. Also prunes
+// the sphere candidate mask: spheres capped by a cone are only collided when
+// the particle lies beyond the corresponding cone end (left/right of axis),
+// and the pruned mask is returned for the subsequent collideSpheres() call.
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict positions, ImpulseAccumulator& accum) const
+{
+	const float* __restrict centerPtr = array(mCurData.mCones->center);
+	const float* __restrict axisPtr = array(mCurData.mCones->axis);
+	const float* __restrict auxiliaryPtr = &mCurData.mCones->sqrCosine;
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	ShapeMask shapeMask = getShapeMask(positions);
+	Simd4i mask4 = horizontalOr(shapeMask.mCones);
+	uint32_t mask = uint32_t(simdi::array(mask4)[0]);
+	while(mask)
+	{
+		// extract lowest set bit -> index/offset of that cone's data
+		uint32_t test = mask - 1;
+		uint32_t coneIndex = findBitSet(mask & ~test);
+		uint32_t offset = coneIndex * sizeof(ConeData);
+		mask = mask & test;
+
+		// per-lane cull: lanes whose own mask doesn't contain this cone's bit
+		Simd4i test4 = simdi::operator-(mask4, simd4i(_1));
+		Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
+		mask4 = mask4 & test4;
+
+		Simd4f center = loadAligned(centerPtr, offset);
+
+		Simd4f deltaX = positions[0] - splat<0>(center);
+		Simd4f deltaY = positions[1] - splat<1>(center);
+		Simd4f deltaZ = positions[2] - splat<2>(center);
+
+		Simd4f axis = loadAligned(axisPtr, offset);
+
+		Simd4f axisX = splat<0>(axis);
+		Simd4f axisY = splat<1>(axis);
+		Simd4f axisZ = splat<2>(axis);
+		Simd4f slope = splat<3>(axis);
+
+		// dot = signed distance along the cone axis from the center
+		Simd4f dot = deltaX * axisX + deltaY * axisY + deltaZ * axisZ;
+		Simd4f radius = dot * slope + splat<3>(center);
+
+		// set radius to zero if cone is culled
+		radius = max(radius, sZero) & ~culled;
+
+		// squared distance from the cone axis (full distance minus axial part)
+		Simd4f sqrDistance = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ - dot * dot;
+
+		// auxiliary holds (sqrCosine, halfLength, firstMask, bothMask) as raw bits
+		Simd4i auxiliary = simd4i((Simd4f)loadAligned(auxiliaryPtr, offset));
+		Simd4i bothMask = splat<3>(auxiliary);
+
+		Simd4f contactMask;
+		if(!anyGreater(radius * radius, sqrDistance, contactMask))
+		{
+			// cone only culled when spheres culled, ok to clear those too
+			shapeMask.mSpheres = shapeMask.mSpheres & ~bothMask;
+			continue;
+		}
+
+		// clamp to a small positive epsilon to avoid numerical error
+		// making sqrDistance negative when point lies on the cone axis
+		sqrDistance = max(sqrDistance, sEpsilon);
+
+		Simd4f invDistance = rsqrt(sqrDistance);
+		// axial coordinate of the closest point on the cone surface
+		Simd4f base = dot + slope * sqrDistance * invDistance;
+
+		// force left/rightMask to false if not inside cone
+		base = base & contactMask;
+
+		// lanes beyond either cone end fall through to sphere collision instead
+		Simd4f halfLength = splat<1>(simd4f(auxiliary));
+		Simd4i leftMask = simd4i(base < -halfLength);
+		Simd4i rightMask = simd4i(base > halfLength);
+
+		// we use both mask because of the early out above.
+		Simd4i firstMask = splat<2>(auxiliary);
+		Simd4i secondMask = firstMask ^ bothMask;
+		shapeMask.mSpheres = shapeMask.mSpheres & ~(firstMask & ~leftMask);
+		shapeMask.mSpheres = shapeMask.mSpheres & ~(secondMask & ~rightMask);
+
+		// radial component of the particle-to-axis vector
+		deltaX = deltaX - base * axisX;
+		deltaY = deltaY - base * axisY;
+		deltaZ = deltaZ - base * axisZ;
+
+		Simd4f sqrCosine = splat<0>(simd4f(auxiliary));
+		Simd4f scale = radius * invDistance * sqrCosine - sqrCosine;
+
+		contactMask = contactMask & ~simd4f(leftMask | rightMask);
+
+		if(!anyTrue(contactMask))
+			continue;
+
+		accum.add(deltaX, deltaY, deltaZ, scale, contactMask);
+
+		if(frictionEnabled)
+		{
+			uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
+			uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;
+
+			float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
+			float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);
+
+			// todo: could pre-compute sphere velocities or it might be
+			// faster to compute cur/prev sphere positions directly
+			Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
+			Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));
+
+			Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
+			Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData));
+
+			Simd4f v0 = s0p1 - s0p0;
+			Simd4f v1 = s1p1 - s1p0;
+			Simd4f vd = v1 - v0;
+
+			// dot is in the range -1 to 1, scale and bias to 0 to 1
+			dot = dot * sHalf + sHalf;
+
+			// interpolate velocity at contact points
+			Simd4f vx = splat<0>(v0) + dot * splat<0>(vd);
+			Simd4f vy = splat<1>(v0) + dot * splat<1>(vd);
+			Simd4f vz = splat<2>(v0) + dot * splat<2>(vd);
+
+			accum.addVelocity(vx, vy, vz, contactMask);
+		}
+	}
+
+	// remaining sphere candidates for the follow-up collideSpheres() pass
+	return shapeMask.mSpheres;
+}
+
+// Continuous (swept) sphere collision for 4 particles: solves the quadratic
+// for the time of impact of the particle trajectory against the moving,
+// radius-interpolated sphere, rewinds penetrating particles to the impact
+// point (writing curPos back), then applies the discrete push-out on top.
+template <typename Simd4f>
+FORCE_INLINE void cloth::SwCollision<Simd4f>::collideSpheres(const Simd4i& sphereMask, const Simd4f* __restrict prevPos,
+                                                             Simd4f* __restrict curPos, ImpulseAccumulator& accum) const
+{
+	const float* __restrict prevSpheres = array(mPrevData.mSpheres->center);
+	const float* __restrict curSpheres = array(mCurData.mSpheres->center);
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	Simd4i mask4 = horizontalOr(sphereMask);
+	uint32_t mask = uint32_t(simdi::array(mask4)[0]);
+	while(mask)
+	{
+		// extract lowest set bit -> byte offset of that sphere's data
+		uint32_t test = mask - 1;
+		uint32_t offset = findBitSet(mask & ~test) * sizeof(SphereData);
+		mask = mask & test;
+
+		// particle positions relative to the sphere at both ends of the step
+		Simd4f prevSphere = loadAligned(prevSpheres, offset);
+		Simd4f prevX = prevPos[0] - splat<0>(prevSphere);
+		Simd4f prevY = prevPos[1] - splat<1>(prevSphere);
+		Simd4f prevZ = prevPos[2] - splat<2>(prevSphere);
+		Simd4f prevRadius = splat<3>(prevSphere);
+
+		Simd4f curSphere = loadAligned(curSpheres, offset);
+		Simd4f curX = curPos[0] - splat<0>(curSphere);
+		Simd4f curY = curPos[1] - splat<1>(curSphere);
+		Simd4f curZ = curPos[2] - splat<2>(curSphere);
+		Simd4f curRadius = splat<3>(curSphere);
+
+		Simd4f sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ;
+
+		// quadratic coefficients of |lerp(prev,cur,t)|^2 - lerp(r0,r1,t)^2 = 0
+		Simd4f dotPrevPrev = prevX * prevX + prevY * prevY + prevZ * prevZ - prevRadius * prevRadius;
+		Simd4f dotPrevCur = prevX * curX + prevY * curY + prevZ * curZ - prevRadius * curRadius;
+		Simd4f dotCurCur = sqrDistance - curRadius * curRadius;
+
+		Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+		Simd4f sqrtD = sqrt(discriminant);
+		Simd4f halfB = dotPrevCur - dotPrevPrev;
+		Simd4f minusA = dotPrevCur - dotCurCur + halfB;
+
+		// time of impact or 0 if prevPos inside sphere
+		Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD);
+		Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD);
+
+		// skip continuous collision if the (un-clamped) particle
+		// trajectory only touches the outer skin of the cone.
+		Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+		collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
+
+		// a is negative when one sphere is contained in the other,
+		// which is already handled by discrete collision.
+		collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon);
+
+		if(!allEqual(collisionMask, simd4f(_0)))
+		{
+			Simd4f deltaX = prevX - curX;
+			Simd4f deltaY = prevY - curY;
+			Simd4f deltaZ = prevZ - curZ;
+
+			Simd4f oneMinusToi = (simd4f(_1) - toi) & collisionMask;
+
+			// reduce ccd impulse if (clamped) particle trajectory stays in sphere skin,
+			// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+			Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > sEpsilon);
+			oneMinusToi = oneMinusToi * recip(sOne - minusK);
+
+			// rewind the relative position toward prev by the remaining fraction
+			curX = curX + deltaX * oneMinusToi;
+			curY = curY + deltaY * oneMinusToi;
+			curZ = curZ + deltaZ * oneMinusToi;
+
+			// write the clamped absolute position back to the particle array
+			curPos[0] = splat<0>(curSphere) + curX;
+			curPos[1] = splat<1>(curSphere) + curY;
+			curPos[2] = splat<2>(curSphere) + curZ;
+
+			sqrDistance = sEpsilon + curX * curX + curY * curY + curZ * curZ;
+		}
+
+		// discrete push-out on the (possibly rewound) position, as in the
+		// discrete collideSpheres() overload
+		Simd4f negativeScale = simd4f(_1) - rsqrt(sqrDistance) * curRadius;
+
+		Simd4f contactMask;
+		if(!anyGreater(simd4f(_0), negativeScale, contactMask))
+			continue;
+
+		accum.subtract(curX, curY, curZ, negativeScale, contactMask);
+
+		if(frictionEnabled)
+		{
+			Simd4f velocity = curSphere - prevSphere;
+			accum.addVelocity(splat<0>(velocity), splat<1>(velocity), splat<2>(velocity), contactMask);
+		}
+	}
+}
+
+// Continuous (CCD) + discrete collision of 4 particles (SoA layout: x,y,z in
+// curPos[0..2], inverse mass in curPos[3]) against all cone (tapered-capsule)
+// shapes. prevPos holds the particle positions at the start of the iteration
+// and defines the motion segment used for continuous collision; curPos is
+// updated in place when a CCD impact is resolved, while discrete contact
+// impulses are accumulated in 'accum'. Returns the mask of capsule-end
+// spheres that still need to be processed by the sphere collision pass.
+template <typename Simd4f>
+FORCE_INLINE typename cloth::SwCollision<Simd4f>::Simd4i
+cloth::SwCollision<Simd4f>::collideCones(const Simd4f* __restrict prevPos, Simd4f* __restrict curPos,
+                                         ImpulseAccumulator& accum) const
+{
+	const float* __restrict prevCenterPtr = array(mPrevData.mCones->center);
+	const float* __restrict prevAxisPtr = array(mPrevData.mCones->axis);
+	const float* __restrict prevAuxiliaryPtr = &mPrevData.mCones->sqrCosine;
+
+	const float* __restrict curCenterPtr = array(mCurData.mCones->center);
+	const float* __restrict curAxisPtr = array(mCurData.mCones->axis);
+	const float* __restrict curAuxiliaryPtr = &mCurData.mCones->sqrCosine;
+
+	bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+
+	// one bit per cone that potentially overlaps any of the 4 particles
+	ShapeMask shapeMask = getShapeMask(prevPos, curPos);
+	Simd4i mask4 = horizontalOr(shapeMask.mCones);
+	uint32_t mask = uint32_t(simdi::array(mask4)[0]);
+	while(mask)
+	{
+		// extract and clear the lowest set bit -> index of the next cone to test
+		uint32_t test = mask - 1;
+		uint32_t coneIndex = findBitSet(mask & ~test);
+		uint32_t offset = coneIndex * sizeof(ConeData);
+		mask = mask & test;
+
+		// per-lane mask of particles for which this cone was culled by the
+		// shape mask (their cone radius is forced to zero below)
+		Simd4i test4 = simdi::operator-(mask4, simd4i(_1));
+		Simd4f culled = simd4f(andNotIsZero(shapeMask.mCones, test4));
+		mask4 = mask4 & test4;
+
+		Simd4f prevCenter = loadAligned(prevCenterPtr, offset);
+		Simd4f prevAxis = loadAligned(prevAxisPtr, offset);
+		Simd4f prevAxisX = splat<0>(prevAxis);
+		Simd4f prevAxisY = splat<1>(prevAxis);
+		Simd4f prevAxisZ = splat<2>(prevAxis);
+		Simd4f prevSlope = splat<3>(prevAxis);
+
+		// particle position relative to the cone center; (T,U,V) is its cross
+		// product with the axis, whose squared length measures the (scaled)
+		// distance from the axis
+		Simd4f prevX = prevPos[0] - splat<0>(prevCenter);
+		Simd4f prevY = prevPos[1] - splat<1>(prevCenter);
+		Simd4f prevZ = prevPos[2] - splat<2>(prevCenter);
+		Simd4f prevT = prevY * prevAxisZ - prevZ * prevAxisY;
+		Simd4f prevU = prevZ * prevAxisX - prevX * prevAxisZ;
+		Simd4f prevV = prevX * prevAxisY - prevY * prevAxisX;
+		Simd4f prevDot = prevX * prevAxisX + prevY * prevAxisY + prevZ * prevAxisZ;
+		Simd4f prevRadius = prevDot * prevSlope + splat<3>(prevCenter);
+
+		Simd4f curCenter = loadAligned(curCenterPtr, offset);
+		Simd4f curAxis = loadAligned(curAxisPtr, offset);
+		Simd4f curAxisX = splat<0>(curAxis);
+		Simd4f curAxisY = splat<1>(curAxis);
+		Simd4f curAxisZ = splat<2>(curAxis);
+		Simd4f curSlope = splat<3>(curAxis);
+		Simd4i curAuxiliary = simd4i((Simd4f)loadAligned(curAuxiliaryPtr, offset));
+
+		Simd4f curX = curPos[0] - splat<0>(curCenter);
+		Simd4f curY = curPos[1] - splat<1>(curCenter);
+		Simd4f curZ = curPos[2] - splat<2>(curCenter);
+		Simd4f curT = curY * curAxisZ - curZ * curAxisY;
+		Simd4f curU = curZ * curAxisX - curX * curAxisZ;
+		Simd4f curV = curX * curAxisY - curY * curAxisX;
+		Simd4f curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+		Simd4f curRadius = curDot * curSlope + splat<3>(curCenter);
+
+		Simd4f curSqrDistance = sEpsilon + curT * curT + curU * curU + curV * curV;
+
+		// set radius to zero if cone is culled
+		prevRadius = max(prevRadius, simd4f(_0)) & ~culled;
+		curRadius = max(curRadius, simd4f(_0)) & ~culled;
+
+		// quadratic in time-of-impact: distance-to-axis crosses the
+		// (interpolated) cone radius along the prev->cur segment
+		Simd4f dotPrevPrev = prevT * prevT + prevU * prevU + prevV * prevV - prevRadius * prevRadius;
+		Simd4f dotPrevCur = prevT * curT + prevU * curU + prevV * curV - prevRadius * curRadius;
+		Simd4f dotCurCur = curSqrDistance - curRadius * curRadius;
+
+		Simd4f discriminant = dotPrevCur * dotPrevCur - dotCurCur * dotPrevPrev;
+		Simd4f sqrtD = sqrt(discriminant);
+		Simd4f halfB = dotPrevCur - dotPrevPrev;
+		Simd4f minusA = dotPrevCur - dotCurCur + halfB;
+
+		// time of impact or 0 if prevPos inside cone
+		Simd4f toi = recip(minusA) * min(simd4f(_0), halfB + sqrtD);
+		Simd4f collisionMask = (toi < simd4f(_1)) & (halfB < sqrtD);
+
+		// skip continuous collision if the (un-clamped) particle
+		// trajectory only touches the outer skin of the cone.
+		Simd4f rMin = prevRadius + halfB * minusA * (curRadius - prevRadius);
+		collisionMask = collisionMask & (discriminant > minusA * rMin * rMin * sSkeletonWidth);
+
+		// a is negative when one cone is contained in the other,
+		// which is already handled by discrete collision.
+		collisionMask = collisionMask & (minusA < -(Simd4f)sEpsilon);
+
+		// test if any particle hits infinite cone (and 0<time of impact<1)
+		if(!allEqual(collisionMask, simd4f(_0)))
+		{
+			Simd4f deltaX = prevX - curX;
+			Simd4f deltaY = prevY - curY;
+			Simd4f deltaZ = prevZ - curZ;
+
+			// interpolate delta at toi
+			Simd4f posX = prevX - deltaX * toi;
+			Simd4f posY = prevY - deltaY * toi;
+			Simd4f posZ = prevZ - deltaZ * toi;
+
+			// auxiliary lane 1 presumably holds the cone half-length used to
+			// scale the axis -- TODO(review): confirm against ConeData packing
+			Simd4f curScaledAxis = curAxis * splat<1>(simd4f(curAuxiliary));
+			Simd4i prevAuxiliary = simd4i((Simd4f)loadAligned(prevAuxiliaryPtr, offset));
+			Simd4f deltaScaledAxis = curScaledAxis - prevAxis * splat<1>(simd4f(prevAuxiliary));
+
+			Simd4f oneMinusToi = simd4f(_1) - toi;
+
+			// interpolate axis at toi
+			Simd4f axisX = splat<0>(curScaledAxis) - splat<0>(deltaScaledAxis) * oneMinusToi;
+			Simd4f axisY = splat<1>(curScaledAxis) - splat<1>(deltaScaledAxis) * oneMinusToi;
+			Simd4f axisZ = splat<2>(curScaledAxis) - splat<2>(deltaScaledAxis) * oneMinusToi;
+			Simd4f slope = (prevSlope * oneMinusToi + curSlope * toi);
+
+			Simd4f sqrHalfLength = axisX * axisX + axisY * axisY + axisZ * axisZ;
+			Simd4f invHalfLength = rsqrt(sqrHalfLength);
+			Simd4f dot = (posX * axisX + posY * axisY + posZ * axisZ) * invHalfLength;
+
+			// guard the rsqrt against zero distance (mask result to 0)
+			Simd4f sqrDistance = posX * posX + posY * posY + posZ * posZ - dot * dot;
+			Simd4f invDistance = rsqrt(sqrDistance) & (sqrDistance > simd4f(_0));
+
+			Simd4f base = dot + slope * sqrDistance * invDistance;
+			Simd4f scale = base * invHalfLength & collisionMask;
+
+			// |scale| < 1: impact point projects inside the cone segment
+			Simd4f cullMask = (abs(scale) < simd4f(_1)) & collisionMask;
+
+			// test if any impact position is in cone section
+			if(!allEqual(cullMask, simd4f(_0)))
+			{
+				deltaX = deltaX + splat<0>(deltaScaledAxis) * scale;
+				deltaY = deltaY + splat<1>(deltaScaledAxis) * scale;
+				deltaZ = deltaZ + splat<2>(deltaScaledAxis) * scale;
+
+				oneMinusToi = oneMinusToi & cullMask;
+
+				// reduce ccd impulse if (clamped) particle trajectory stays in cone skin,
+				// i.e. scale by exp2(-k) or 1/(1+k) with k = (tmin - toi) / (1 - toi)
+				// oneMinusToi = oneMinusToi * recip(sOne - sqrtD * recip(minusA * oneMinusToi));
+				Simd4f minusK = sqrtD * recip(minusA * oneMinusToi) & (oneMinusToi > sEpsilon);
+				oneMinusToi = oneMinusToi * recip(sOne - minusK);
+
+				// move particle to the impact position (in cone-local coords)
+				curX = curX + deltaX * oneMinusToi;
+				curY = curY + deltaY * oneMinusToi;
+				curZ = curZ + deltaZ * oneMinusToi;
+
+				// recompute discrete-collision terms from the moved position
+				curDot = curX * curAxisX + curY * curAxisY + curZ * curAxisZ;
+				curRadius = curDot * curSlope + splat<3>(curCenter);
+				curRadius = max(curRadius, simd4f(_0)) & ~culled;
+				curSqrDistance = curX * curX + curY * curY + curZ * curZ - curDot * curDot;
+
+				curPos[0] = splat<0>(curCenter) + curX;
+				curPos[1] = splat<1>(curCenter) + curY;
+				curPos[2] = splat<2>(curCenter) + curZ;
+			}
+		}
+
+		// curPos inside cone (discrete collision)
+		Simd4f contactMask;
+		int anyContact = anyGreater(curRadius * curRadius, curSqrDistance, contactMask);
+
+		// auxiliary lanes 2/3 presumably pack the sphere-mask bits of the two
+		// capsule-end spheres -- TODO(review): confirm against ConeData packing
+		Simd4i bothMask = splat<3>(curAuxiliary);
+
+		// instead of culling continuous collision for ~collisionMask, and discrete
+		// collision for ~contactMask, disable both if ~collisionMask & ~contactMask
+		Simd4i cullMask = bothMask & ~simd4i(collisionMask | contactMask);
+		shapeMask.mSpheres = shapeMask.mSpheres & ~cullMask;
+
+		if(!anyContact)
+			continue;
+
+		Simd4f invDistance = rsqrt(curSqrDistance) & (curSqrDistance > sZero);
+		Simd4f base = curDot + curSlope * curSqrDistance * invDistance;
+
+		// classify contact against the left/right end of the cone segment
+		Simd4f halfLength = splat<1>(simd4f(curAuxiliary));
+		Simd4i leftMask = simd4i(base < -halfLength);
+		Simd4i rightMask = simd4i(base > halfLength);
+
+		// can only skip continuous sphere collision if post-ccd position
+		// is on cone side *and* particle had cone-ccd collision.
+		Simd4i firstMask = splat<2>(curAuxiliary);
+		Simd4i secondMask = firstMask ^ bothMask;
+		cullMask = (firstMask & ~leftMask) | (secondMask & ~rightMask);
+		shapeMask.mSpheres = shapeMask.mSpheres & ~(cullMask & simd4i(collisionMask));
+
+		// push-out direction: from axis towards the particle
+		Simd4f deltaX = curX - base * curAxisX;
+		Simd4f deltaY = curY - base * curAxisY;
+		Simd4f deltaZ = curZ - base * curAxisZ;
+
+		Simd4f sqrCosine = splat<0>(simd4f(curAuxiliary));
+		Simd4f scale = curRadius * invDistance * sqrCosine - sqrCosine;
+
+		// contacts beyond either cap are handled by the sphere pass instead
+		contactMask = contactMask & ~simd4f(leftMask | rightMask);
+
+		if(!anyTrue(contactMask))
+			continue;
+
+		accum.add(deltaX, deltaY, deltaZ, scale, contactMask);
+
+		if(frictionEnabled)
+		{
+			uint32_t s0 = mClothData.mCapsuleIndices[coneIndex].first;
+			uint32_t s1 = mClothData.mCapsuleIndices[coneIndex].second;
+
+			float* prevSpheres = reinterpret_cast<float*>(mPrevData.mSpheres);
+			float* curSpheres = reinterpret_cast<float*>(mCurData.mSpheres);
+
+			// todo: could pre-compute sphere velocities or it might be
+			// faster to compute cur/prev sphere positions directly
+			Simd4f s0p0 = loadAligned(prevSpheres, s0 * sizeof(SphereData));
+			Simd4f s0p1 = loadAligned(curSpheres, s0 * sizeof(SphereData));
+
+			Simd4f s1p0 = loadAligned(prevSpheres, s1 * sizeof(SphereData));
+			Simd4f s1p1 = loadAligned(curSpheres, s1 * sizeof(SphereData));
+
+			// per-frame displacement of each capsule end sphere
+			Simd4f v0 = s0p1 - s0p0;
+			Simd4f v1 = s1p1 - s1p0;
+			Simd4f vd = v1 - v0;
+
+			// dot is in the range -1 to 1, scale and bias to 0 to 1
+			curDot = curDot * sHalf + sHalf;
+
+			// interpolate velocity at contact points
+			Simd4f vx = splat<0>(v0) + curDot * splat<0>(vd);
+			Simd4f vy = splat<1>(v0) + curDot * splat<1>(vd);
+			Simd4f vz = splat<2>(v0) + curDot * splat<2>(vd);
+
+			accum.addVelocity(vx, vy, vz, contactMask);
+		}
+	}
+
+	return shapeMask.mSpheres;
+}
+
+namespace
+{
+
+// Computes a Coulomb-style friction impulse for 4 particles at once.
+// (deltaX/Y/Z): accumulated collision push-out vectors (define the contact
+// normal and, via their magnitude, the normal impulse strength).
+// (velX/Y/Z): accumulated collider velocities at the contact points.
+// curPos/prevPos: SoA particle positions; their difference is the particle
+// velocity over the step. 'scale' is 1/numCollisions (the accumulators hold
+// sums over all contacts), 'coefficient' the friction scale, 'mask' the lanes
+// that actually collided. Result is written to impulse[0..2] (x,y,z).
+template <typename Simd4f>
+PX_INLINE void calculateFrictionImpulse(const Simd4f& deltaX, const Simd4f& deltaY, const Simd4f& deltaZ,
+                                        const Simd4f& velX, const Simd4f& velY, const Simd4f& velZ,
+                                        const Simd4f* curPos, const Simd4f* prevPos, const Simd4f& scale,
+                                        const Simd4f& coefficient, const Simd4f& mask, Simd4f* impulse)
+{
+	// calculate collision normal
+	Simd4f deltaSq = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+
+	// epsilon keeps rsqrt finite for lanes with no collision
+	Simd4f rcpDelta = rsqrt(deltaSq + sEpsilon);
+
+	Simd4f nx = deltaX * rcpDelta;
+	Simd4f ny = deltaY * rcpDelta;
+	Simd4f nz = deltaZ * rcpDelta;
+
+	// calculate relative velocity scaled by number of collisions
+	Simd4f rvx = curPos[0] - prevPos[0] - velX * scale;
+	Simd4f rvy = curPos[1] - prevPos[1] - velY * scale;
+	Simd4f rvz = curPos[2] - prevPos[2] - velZ * scale;
+
+	// calculate magnitude of relative normal velocity
+	Simd4f rvn = rvx * nx + rvy * ny + rvz * nz;
+
+	// calculate relative tangential velocity
+	Simd4f rvtx = rvx - rvn * nx;
+	Simd4f rvty = rvy - rvn * ny;
+	Simd4f rvtz = rvz - rvn * nz;
+
+	// calculate magnitude of vt
+	Simd4f rcpVt = rsqrt(rvtx * rvtx + rvty * rvty + rvtz * rvtz + sEpsilon);
+
+	// magnitude of friction impulse (cannot be greater than -vt)
+	Simd4f j = max(-coefficient * deltaSq * rcpDelta * rcpVt, sNegOne) & mask;
+
+	impulse[0] = rvtx * j;
+	impulse[1] = rvty * j;
+	impulse[2] = rvtz * j;
+}
+
+} // anonymous namespace
+
+// Discrete collision pass over all regular particles. Processes 4 particles
+// per iteration (16 floats = 4 x PxVec4), transposing to SoA, colliding
+// against cones then spheres, and applying the averaged push-out impulse.
+// Optionally applies friction (by moving the previous positions) and
+// collision-based mass scaling (by scaling the inverse mass stored in w).
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideParticles()
+{
+	const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+	const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	Simd4f curPos[4];
+	Simd4f prevPos[4];
+
+	float* __restrict prevIt = mClothData.mPrevParticles;
+	float* __restrict pIt = mClothData.mCurParticles;
+	float* __restrict pEnd = pIt + mClothData.mNumParticles * 4;
+	for(; pIt < pEnd; pIt += 16, prevIt += 16)
+	{
+		// load 4 particles (AoS) and transpose to SoA: x,y,z,invMass vectors
+		curPos[0] = loadAligned(pIt, 0);
+		curPos[1] = loadAligned(pIt, 16);
+		curPos[2] = loadAligned(pIt, 32);
+		curPos[3] = loadAligned(pIt, 48);
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+		// cones cull the end-spheres they already handled via sphereMask
+		ImpulseAccumulator accum;
+		Simd4i sphereMask = collideCones(curPos, accum);
+		collideSpheres(sphereMask, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, sEpsilon, mask))
+			continue;
+
+		// impulses are sums over all contacts; average them
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		if(frictionEnabled)
+		{
+			prevPos[0] = loadAligned(prevIt, 0);
+			prevPos[1] = loadAligned(prevIt, 16);
+			prevPos[2] = loadAligned(prevIt, 32);
+			prevPos[3] = loadAligned(prevIt, 48);
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+			Simd4f frictionImpulse[3];
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			// friction is applied by shifting the previous position, which
+			// changes the implicit velocity of the integrator
+			prevPos[0] = prevPos[0] - frictionImpulse[0];
+			prevPos[1] = prevPos[1] - frictionImpulse[1];
+			prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+			storeAligned(prevIt, 0, prevPos[0]);
+			storeAligned(prevIt, 16, prevPos[1]);
+			storeAligned(prevIt, 32, prevPos[2]);
+			storeAligned(prevIt, 48, prevPos[3]);
+		}
+
+		if(massScalingEnabled)
+		{
+			// calculate the inverse mass scale based on the collision impulse magnitude
+			Simd4f dSq = invNumCollisions * invNumCollisions *
+			             (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+			Simd4f scale = recip(sOne + massScale * dSq);
+
+			// scale invmass
+			curPos[3] = select(mask, curPos[3] * scale, curPos[3]);
+		}
+
+		// apply the averaged collision impulse to the positions
+		curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+		curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+		curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+		// transpose back to AoS and store
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+		storeAligned(pIt, 0, curPos[0]);
+		storeAligned(pIt, 16, curPos[1]);
+		storeAligned(pIt, 32, curPos[2]);
+		storeAligned(pIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+}
+
+// Discrete collision pass over virtual particles. Each virtual particle is a
+// barycentric combination of 3 real particles (weights w, with 1/dot(w,w)
+// packed in w's w-component). Processes 4 virtual particles per iteration;
+// each consumes 4 uint16 indices (3 particles + 1 weight index), i.e. 16
+// indices per loop. Collision impulses are redistributed back to the 3 source
+// particles proportionally to the (normalized) weights.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideVirtualParticles()
+{
+	const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+	const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	Simd4f curPos[3];
+
+	const float* __restrict weights = mClothData.mVirtualParticleWeights;
+	float* __restrict particles = mClothData.mCurParticles;
+	float* __restrict prevParticles = mClothData.mPrevParticles;
+
+	// move dummy particles outside of collision range
+	Simd4f* __restrict dummy = mClothData.mNumParticles + reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+	Simd4f invGridScale = recip(mGridScale) & (mGridScale > sEpsilon);
+	dummy[0] = dummy[1] = dummy[2] = invGridScale * mGridBias - invGridScale;
+
+	const uint16_t* __restrict vpIt = mClothData.mVirtualParticlesBegin;
+	const uint16_t* __restrict vpEnd = mClothData.mVirtualParticlesEnd;
+	for(; vpIt != vpEnd; vpIt += 16)
+	{
+		// load 12 particles and 4 weights
+		Simd4f p0v0 = loadAligned(particles, vpIt[0] * sizeof(PxVec4));
+		Simd4f p0v1 = loadAligned(particles, vpIt[1] * sizeof(PxVec4));
+		Simd4f p0v2 = loadAligned(particles, vpIt[2] * sizeof(PxVec4));
+		Simd4f w0 = loadAligned(weights, vpIt[3] * sizeof(PxVec4));
+
+		Simd4f p1v0 = loadAligned(particles, vpIt[4] * sizeof(PxVec4));
+		Simd4f p1v1 = loadAligned(particles, vpIt[5] * sizeof(PxVec4));
+		Simd4f p1v2 = loadAligned(particles, vpIt[6] * sizeof(PxVec4));
+		Simd4f w1 = loadAligned(weights, vpIt[7] * sizeof(PxVec4));
+
+		Simd4f p2v0 = loadAligned(particles, vpIt[8] * sizeof(PxVec4));
+		Simd4f p2v1 = loadAligned(particles, vpIt[9] * sizeof(PxVec4));
+		Simd4f p2v2 = loadAligned(particles, vpIt[10] * sizeof(PxVec4));
+		Simd4f w2 = loadAligned(weights, vpIt[11] * sizeof(PxVec4));
+
+		// NOTE(review): v1/v0 load order is swapped relative to the pattern
+		// above; the loads are independent so this is harmless
+		Simd4f p3v1 = loadAligned(particles, vpIt[13] * sizeof(PxVec4));
+		Simd4f p3v0 = loadAligned(particles, vpIt[12] * sizeof(PxVec4));
+		Simd4f p3v2 = loadAligned(particles, vpIt[14] * sizeof(PxVec4));
+		Simd4f w3 = loadAligned(weights, vpIt[15] * sizeof(PxVec4));
+
+		// interpolate particles and transpose
+		Simd4f px = p0v0 * splat<0>(w0) + p0v1 * splat<1>(w0) + p0v2 * splat<2>(w0);
+		Simd4f py = p1v0 * splat<0>(w1) + p1v1 * splat<1>(w1) + p1v2 * splat<2>(w1);
+		Simd4f pz = p2v0 * splat<0>(w2) + p2v1 * splat<1>(w2) + p2v2 * splat<2>(w2);
+		Simd4f pw = p3v0 * splat<0>(w3) + p3v1 * splat<1>(w3) + p3v2 * splat<2>(w3);
+		transpose(px, py, pz, pw);
+
+		curPos[0] = px;
+		curPos[1] = py;
+		curPos[2] = pz;
+
+		ImpulseAccumulator accum;
+		Simd4i sphereMask = collideCones(curPos, accum);
+		collideSpheres(sphereMask, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, sEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		// displacement and transpose back
+		Simd4f d0 = accum.mDeltaX * invNumCollisions;
+		Simd4f d1 = accum.mDeltaY * invNumCollisions;
+		Simd4f d2 = accum.mDeltaZ * invNumCollisions;
+		Simd4f d3 = sZero;
+		transpose(d0, d1, d2, d3);
+
+		// scale weights by 1/dot(w,w)
+		Simd4f rw0 = w0 * splat<3>(w0);
+		Simd4f rw1 = w1 * splat<3>(w1);
+		Simd4f rw2 = w2 * splat<3>(w2);
+		Simd4f rw3 = w3 * splat<3>(w3);
+
+		if(frictionEnabled)
+		{
+			Simd4f q0v0 = loadAligned(prevParticles, vpIt[0] * sizeof(PxVec4));
+			Simd4f q0v1 = loadAligned(prevParticles, vpIt[1] * sizeof(PxVec4));
+			Simd4f q0v2 = loadAligned(prevParticles, vpIt[2] * sizeof(PxVec4));
+
+			Simd4f q1v0 = loadAligned(prevParticles, vpIt[4] * sizeof(PxVec4));
+			Simd4f q1v1 = loadAligned(prevParticles, vpIt[5] * sizeof(PxVec4));
+			Simd4f q1v2 = loadAligned(prevParticles, vpIt[6] * sizeof(PxVec4));
+
+			Simd4f q2v0 = loadAligned(prevParticles, vpIt[8] * sizeof(PxVec4));
+			Simd4f q2v1 = loadAligned(prevParticles, vpIt[9] * sizeof(PxVec4));
+			Simd4f q2v2 = loadAligned(prevParticles, vpIt[10] * sizeof(PxVec4));
+
+			Simd4f q3v0 = loadAligned(prevParticles, vpIt[12] * sizeof(PxVec4));
+			Simd4f q3v1 = loadAligned(prevParticles, vpIt[13] * sizeof(PxVec4));
+			Simd4f q3v2 = loadAligned(prevParticles, vpIt[14] * sizeof(PxVec4));
+
+			// calculate previous interpolated positions
+			Simd4f qx = q0v0 * splat<0>(w0) + q0v1 * splat<1>(w0) + q0v2 * splat<2>(w0);
+			Simd4f qy = q1v0 * splat<0>(w1) + q1v1 * splat<1>(w1) + q1v2 * splat<2>(w1);
+			Simd4f qz = q2v0 * splat<0>(w2) + q2v1 * splat<1>(w2) + q2v2 * splat<2>(w2);
+			Simd4f qw = q3v0 * splat<0>(w3) + q3v1 * splat<1>(w3) + q3v2 * splat<2>(w3);
+			transpose(qx, qy, qz, qw);
+
+			Simd4f prevPos[3] = { qx, qy, qz };
+			Simd4f frictionImpulse[4];
+			frictionImpulse[3] = sZero;
+
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			transpose(frictionImpulse[0], frictionImpulse[1], frictionImpulse[2], frictionImpulse[3]);
+
+			// distribute the friction impulse to the 3 source particles of
+			// each virtual particle, weighted by the normalized weights
+			q0v0 = q0v0 - (splat<0>(rw0) * frictionImpulse[0]);
+			q0v1 = q0v1 - (splat<1>(rw0) * frictionImpulse[0]);
+			q0v2 = q0v2 - (splat<2>(rw0) * frictionImpulse[0]);
+
+			q1v0 = q1v0 - (splat<0>(rw1) * frictionImpulse[1]);
+			q1v1 = q1v1 - (splat<1>(rw1) * frictionImpulse[1]);
+			q1v2 = q1v2 - (splat<2>(rw1) * frictionImpulse[1]);
+
+			q2v0 = q2v0 - (splat<0>(rw2) * frictionImpulse[2]);
+			q2v1 = q2v1 - (splat<1>(rw2) * frictionImpulse[2]);
+			q2v2 = q2v2 - (splat<2>(rw2) * frictionImpulse[2]);
+
+			q3v0 = q3v0 - (splat<0>(rw3) * frictionImpulse[3]);
+			q3v1 = q3v1 - (splat<1>(rw3) * frictionImpulse[3]);
+			q3v2 = q3v2 - (splat<2>(rw3) * frictionImpulse[3]);
+
+			// write back prev particles
+			storeAligned(prevParticles, vpIt[0] * sizeof(PxVec4), q0v0);
+			storeAligned(prevParticles, vpIt[1] * sizeof(PxVec4), q0v1);
+			storeAligned(prevParticles, vpIt[2] * sizeof(PxVec4), q0v2);
+
+			storeAligned(prevParticles, vpIt[4] * sizeof(PxVec4), q1v0);
+			storeAligned(prevParticles, vpIt[5] * sizeof(PxVec4), q1v1);
+			storeAligned(prevParticles, vpIt[6] * sizeof(PxVec4), q1v2);
+
+			storeAligned(prevParticles, vpIt[8] * sizeof(PxVec4), q2v0);
+			storeAligned(prevParticles, vpIt[9] * sizeof(PxVec4), q2v1);
+			storeAligned(prevParticles, vpIt[10] * sizeof(PxVec4), q2v2);
+
+			storeAligned(prevParticles, vpIt[12] * sizeof(PxVec4), q3v0);
+			storeAligned(prevParticles, vpIt[13] * sizeof(PxVec4), q3v1);
+			storeAligned(prevParticles, vpIt[14] * sizeof(PxVec4), q3v2);
+		}
+
+		if(massScalingEnabled)
+		{
+			// calculate the inverse mass scale based on the collision impulse
+			Simd4f dSq = invNumCollisions * invNumCollisions *
+			             (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+			Simd4f weightScale = recip(sOne + massScale * dSq);
+
+			// blend the per-source-particle scale towards 1 by the weight
+			weightScale = weightScale - sOne;
+			Simd4f s0 = sOne + splat<0>(weightScale) * (w0 & splat<0>(mask));
+			Simd4f s1 = sOne + splat<1>(weightScale) * (w1 & splat<1>(mask));
+			Simd4f s2 = sOne + splat<2>(weightScale) * (w2 & splat<2>(mask));
+			Simd4f s3 = sOne + splat<3>(weightScale) * (w3 & splat<3>(mask));
+
+			// scale only the w (inverse mass) component of each particle
+			p0v0 = p0v0 * (sOneXYZ | (splat<0>(s0) & sMaskW));
+			p0v1 = p0v1 * (sOneXYZ | (splat<1>(s0) & sMaskW));
+			p0v2 = p0v2 * (sOneXYZ | (splat<2>(s0) & sMaskW));
+
+			p1v0 = p1v0 * (sOneXYZ | (splat<0>(s1) & sMaskW));
+			p1v1 = p1v1 * (sOneXYZ | (splat<1>(s1) & sMaskW));
+			p1v2 = p1v2 * (sOneXYZ | (splat<2>(s1) & sMaskW));
+
+			p2v0 = p2v0 * (sOneXYZ | (splat<0>(s2) & sMaskW));
+			p2v1 = p2v1 * (sOneXYZ | (splat<1>(s2) & sMaskW));
+			p2v2 = p2v2 * (sOneXYZ | (splat<2>(s2) & sMaskW));
+
+			p3v0 = p3v0 * (sOneXYZ | (splat<0>(s3) & sMaskW));
+			p3v1 = p3v1 * (sOneXYZ | (splat<1>(s3) & sMaskW));
+			p3v2 = p3v2 * (sOneXYZ | (splat<2>(s3) & sMaskW));
+		}
+
+		// apply the averaged collision displacement to the source particles
+		p0v0 = p0v0 + (splat<0>(rw0) * d0);
+		p0v1 = p0v1 + (splat<1>(rw0) * d0);
+		p0v2 = p0v2 + (splat<2>(rw0) * d0);
+
+		p1v0 = p1v0 + (splat<0>(rw1) * d1);
+		p1v1 = p1v1 + (splat<1>(rw1) * d1);
+		p1v2 = p1v2 + (splat<2>(rw1) * d1);
+
+		p2v0 = p2v0 + (splat<0>(rw2) * d2);
+		p2v1 = p2v1 + (splat<1>(rw2) * d2);
+		p2v2 = p2v2 + (splat<2>(rw2) * d2);
+
+		p3v0 = p3v0 + (splat<0>(rw3) * d3);
+		p3v1 = p3v1 + (splat<1>(rw3) * d3);
+		p3v2 = p3v2 + (splat<2>(rw3) * d3);
+
+		// write back particles
+		storeAligned(particles, vpIt[0] * sizeof(PxVec4), p0v0);
+		storeAligned(particles, vpIt[1] * sizeof(PxVec4), p0v1);
+		storeAligned(particles, vpIt[2] * sizeof(PxVec4), p0v2);
+
+		storeAligned(particles, vpIt[4] * sizeof(PxVec4), p1v0);
+		storeAligned(particles, vpIt[5] * sizeof(PxVec4), p1v1);
+		storeAligned(particles, vpIt[6] * sizeof(PxVec4), p1v2);
+
+		storeAligned(particles, vpIt[8] * sizeof(PxVec4), p2v0);
+		storeAligned(particles, vpIt[9] * sizeof(PxVec4), p2v1);
+		storeAligned(particles, vpIt[10] * sizeof(PxVec4), p2v2);
+
+		storeAligned(particles, vpIt[12] * sizeof(PxVec4), p3v0);
+		storeAligned(particles, vpIt[13] * sizeof(PxVec4), p3v1);
+		storeAligned(particles, vpIt[14] * sizeof(PxVec4), p3v2);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+}
+
+// Continuous collision pass over all regular particles: same structure as
+// collideParticles(), but also loads the previous positions and uses the
+// continuous (prevPos, curPos) overloads of collideCones/collideSpheres so
+// that fast-moving particles cannot tunnel through the capsule shapes.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideContinuousParticles()
+{
+	Simd4f curPos[4];
+	Simd4f prevPos[4];
+
+	const bool massScalingEnabled = mClothData.mCollisionMassScale > 0.0f;
+	const Simd4f massScale = simd4f(mClothData.mCollisionMassScale);
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	float* __restrict prevIt = mClothData.mPrevParticles;
+	float* __restrict curIt = mClothData.mCurParticles;
+	float* __restrict curEnd = curIt + mClothData.mNumParticles * 4;
+
+	// 16 floats per step = 4 particles x 4 components (AoS)
+	for(; curIt < curEnd; curIt += 16, prevIt += 16)
+	{
+		prevPos[0] = loadAligned(prevIt, 0);
+		prevPos[1] = loadAligned(prevIt, 16);
+		prevPos[2] = loadAligned(prevIt, 32);
+		prevPos[3] = loadAligned(prevIt, 48);
+		transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+		curPos[0] = loadAligned(curIt, 0);
+		curPos[1] = loadAligned(curIt, 16);
+		curPos[2] = loadAligned(curIt, 32);
+		curPos[3] = loadAligned(curIt, 48);
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+		// continuous overloads; cones return the end-spheres left to test
+		ImpulseAccumulator accum;
+		Simd4i sphereMask = collideCones(prevPos, curPos, accum);
+		collideSpheres(sphereMask, prevPos, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, sEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		if(frictionEnabled)
+		{
+			Simd4f frictionImpulse[3];
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			// friction alters the implicit velocity via the previous position
+			prevPos[0] = prevPos[0] - frictionImpulse[0];
+			prevPos[1] = prevPos[1] - frictionImpulse[1];
+			prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+			storeAligned(prevIt, 0, prevPos[0]);
+			storeAligned(prevIt, 16, prevPos[1]);
+			storeAligned(prevIt, 32, prevPos[2]);
+			storeAligned(prevIt, 48, prevPos[3]);
+		}
+
+		if(massScalingEnabled)
+		{
+			// calculate the inverse mass scale based on the collision impulse magnitude
+			Simd4f dSq = invNumCollisions * invNumCollisions *
+			             (accum.mDeltaX * accum.mDeltaX + accum.mDeltaY * accum.mDeltaY + accum.mDeltaZ * accum.mDeltaZ);
+
+			Simd4f weightScale = recip(sOne + massScale * dSq);
+
+			// scale invmass
+			curPos[3] = select(mask, curPos[3] * weightScale, curPos[3]);
+		}
+
+		// apply averaged impulse and write back in AoS layout
+		curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+		curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+		curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+		storeAligned(curIt, 0, curPos[0]);
+		storeAligned(curIt, 16, curPos[1]);
+		storeAligned(curIt, 32, curPos[2]);
+		storeAligned(curIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+}
+
+// Collides all particles against the convex (intersection-of-halfspaces)
+// collision shapes. Allocates a scratch buffer of 2*mNumPlanes Simd4f: the
+// first half holds the (interpolated) plane equations, the second half is
+// used by the per-particle overload below as a plane-distance scratch area.
+// Planes are interpolated towards their targets except on the last solver
+// iteration, where the targets are used directly.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideConvexes(const IterationState<Simd4f>& state)
+{
+	if(!mClothData.mNumConvexes)
+		return;
+
+	// times 2 for plane equation result buffer
+	Simd4f* planes = static_cast<Simd4f*>(mAllocator.allocate(sizeof(Simd4f) * mClothData.mNumPlanes * 2));
+
+	const Simd4f* targetPlanes = reinterpret_cast<const Simd4f*>(mClothData.mTargetCollisionPlanes);
+
+	// generate plane collision data
+	if(state.mRemainingIterations != 1)
+	{
+		// interpolate planes
+		LerpIterator<Simd4f, const Simd4f*> planeIter(reinterpret_cast<const Simd4f*>(mClothData.mStartCollisionPlanes),
+		                                              targetPlanes, state.getCurrentAlpha());
+
+		// todo: normalize plane equations
+		generatePlanes(planes, planeIter, mClothData.mNumPlanes);
+	}
+	else
+	{
+		// otherwise use the target planes directly
+		generatePlanes(planes, targetPlanes, mClothData.mNumPlanes);
+	}
+
+	Simd4f curPos[4], prevPos[4];
+
+	const bool frictionEnabled = mClothData.mFrictionScale > 0.0f;
+	const Simd4f frictionScale = simd4f(mClothData.mFrictionScale);
+
+	// iterate 4 particles at a time (16 floats), AoS -> SoA transpose
+	float* __restrict curIt = mClothData.mCurParticles;
+	float* __restrict curEnd = curIt + mClothData.mNumParticles * 4;
+	float* __restrict prevIt = mClothData.mPrevParticles;
+	for(; curIt < curEnd; curIt += 16, prevIt += 16)
+	{
+		curPos[0] = loadAligned(curIt, 0);
+		curPos[1] = loadAligned(curIt, 16);
+		curPos[2] = loadAligned(curIt, 32);
+		curPos[3] = loadAligned(curIt, 48);
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+
+		ImpulseAccumulator accum;
+		collideConvexes(planes, curPos, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, sEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		if(frictionEnabled)
+		{
+			prevPos[0] = loadAligned(prevIt, 0);
+			prevPos[1] = loadAligned(prevIt, 16);
+			prevPos[2] = loadAligned(prevIt, 32);
+			prevPos[3] = loadAligned(prevIt, 48);
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+
+			Simd4f frictionImpulse[3];
+			calculateFrictionImpulse(accum.mDeltaX, accum.mDeltaY, accum.mDeltaZ, accum.mVelX, accum.mVelY, accum.mVelZ,
+			                         curPos, prevPos, invNumCollisions, frictionScale, mask, frictionImpulse);
+
+			// friction is applied through the previous position
+			prevPos[0] = prevPos[0] - frictionImpulse[0];
+			prevPos[1] = prevPos[1] - frictionImpulse[1];
+			prevPos[2] = prevPos[2] - frictionImpulse[2];
+
+			transpose(prevPos[0], prevPos[1], prevPos[2], prevPos[3]);
+			storeAligned(prevIt, 0, prevPos[0]);
+			storeAligned(prevIt, 16, prevPos[1]);
+			storeAligned(prevIt, 32, prevPos[2]);
+			storeAligned(prevIt, 48, prevPos[3]);
+		}
+
+		// apply averaged impulse and write back
+		curPos[0] = curPos[0] + accum.mDeltaX * invNumCollisions;
+		curPos[1] = curPos[1] + accum.mDeltaY * invNumCollisions;
+		curPos[2] = curPos[2] + accum.mDeltaZ * invNumCollisions;
+
+		transpose(curPos[0], curPos[1], curPos[2], curPos[3]);
+		storeAligned(curIt, 0, curPos[0]);
+		storeAligned(curIt, 16, curPos[1]);
+		storeAligned(curIt, 32, curPos[2]);
+		storeAligned(curIt, 48, curPos[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+
+	mAllocator.deallocate(planes);
+}
+
+// Collides the 4 particles in curPos against all convex shapes. A convex is
+// defined in mConvexMasks as a bitmask of plane indices; a particle is
+// inside the convex iff its signed distance to *every* one of those planes
+// is negative. Distances are written to the scratch area after the planes
+// (see the allocation of 2*mNumPlanes in the caller). For each penetrated
+// convex, the particle is pushed out along the least-penetrating (largest,
+// i.e. closest-to-zero signed distance) plane.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideConvexes(const Simd4f* __restrict planes, Simd4f* __restrict curPos,
+                                                 ImpulseAccumulator& accum)
+{
+	// result: bit i set (per lane) iff the particle is below plane i
+	Simd4i result = simd4i(_0);
+	Simd4i mask4 = simd4i(_1);
+
+	// evaluate all plane equations, caching distances in the scratch half
+	const Simd4f* __restrict pIt, *pEnd = planes + mClothData.mNumPlanes;
+	Simd4f* __restrict dIt = const_cast<Simd4f*>(pEnd);
+	for(pIt = planes; pIt != pEnd; ++pIt, ++dIt)
+	{
+		*dIt = splat<3>(*pIt) + curPos[2] * splat<2>(*pIt) + curPos[1] * splat<1>(*pIt) + curPos[0] * splat<0>(*pIt);
+		result = result | (mask4 & simd4i(*dIt < simd4f(_0)));
+		mask4 = mask4 << 1; // todo: shift by Simd4i on consoles
+	}
+
+	// no particle is below any plane -> no convex can contain it
+	if(simdi::allEqual(result, simd4i(_0)))
+		return;
+
+	const uint32_t* __restrict cIt = mClothData.mConvexMasks;
+	const uint32_t* __restrict cEnd = cIt + mClothData.mNumConvexes;
+	for(; cIt != cEnd; ++cIt)
+	{
+		// particle is inside the convex iff below all of the convex's planes;
+		// anyEqual also leaves the per-lane containment mask in mask4
+		uint32_t mask = *cIt;
+		mask4 = simd4i(int(mask));
+		if(!simdi::anyEqual(mask4 & result, mask4, mask4))
+			continue;
+
+		// start from the convex's lowest-indexed plane ...
+		uint32_t test = mask - 1;
+		uint32_t planeIndex = findBitSet(mask & ~test);
+		Simd4f plane = planes[planeIndex];
+		Simd4f planeX = splat<0>(plane);
+		Simd4f planeY = splat<1>(plane);
+		Simd4f planeZ = splat<2>(plane);
+		Simd4f planeD = pEnd[planeIndex];
+		// ... and keep the plane with the largest (least negative) distance,
+		// i.e. the one the particle penetrates the least
+		while(mask &= test)
+		{
+			test = mask - 1;
+			planeIndex = findBitSet(mask & ~test);
+			plane = planes[planeIndex];
+			Simd4f dist = pEnd[planeIndex];
+			Simd4f closer = dist > planeD;
+			planeX = select(closer, splat<0>(plane), planeX);
+			planeY = select(closer, splat<1>(plane), planeY);
+			planeZ = select(closer, splat<2>(plane), planeZ);
+			planeD = max(dist, planeD);
+		}
+
+		accum.subtract(planeX, planeY, planeZ, planeD, simd4f(mask4));
+	}
+}
+
+// Collides all particles against the triangle-mesh collision shapes.
+// Precomputes per-triangle data (base, edges, normal, determinants) into a
+// scratch buffer, interpolating the triangle vertices towards their targets
+// except on the last solver iteration. Then processes 4 particles at a time
+// and applies the averaged push-out impulse from the closest triangle.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideTriangles(const IterationState<Simd4f>& state)
+{
+	if(!mClothData.mNumTriangles)
+		return;
+
+	TriangleData* triangles =
+	    static_cast<TriangleData*>(mAllocator.allocate(sizeof(TriangleData) * mClothData.mNumTriangles));
+
+	UnalignedIterator<Simd4f, 3> targetTriangles(mClothData.mTargetCollisionTriangles);
+
+	// generate triangle collision data
+	if(state.mRemainingIterations != 1)
+	{
+		// interpolate triangles
+		LerpIterator<Simd4f, UnalignedIterator<Simd4f, 3> > triangleIter(mClothData.mStartCollisionTriangles,
+		                                                                 targetTriangles, state.getCurrentAlpha());
+
+		generateTriangles<Simd4f>(triangles, triangleIter, mClothData.mNumTriangles);
+	}
+	else
+	{
+		// otherwise use the target triangles directly
+		generateTriangles<Simd4f>(triangles, targetTriangles, mClothData.mNumTriangles);
+	}
+
+	Simd4f positions[4];
+
+	// iterate 4 particles at a time (16 floats), AoS -> SoA transpose
+	float* __restrict pIt = mClothData.mCurParticles;
+	float* __restrict pEnd = pIt + mClothData.mNumParticles * 4;
+	for(; pIt < pEnd; pIt += 16)
+	{
+		positions[0] = loadAligned(pIt, 0);
+		positions[1] = loadAligned(pIt, 16);
+		positions[2] = loadAligned(pIt, 32);
+		positions[3] = loadAligned(pIt, 48);
+		transpose(positions[0], positions[1], positions[2], positions[3]);
+
+		ImpulseAccumulator accum;
+		collideTriangles(triangles, positions, accum);
+
+		Simd4f mask;
+		if(!anyGreater(accum.mNumCollisions, sEpsilon, mask))
+			continue;
+
+		Simd4f invNumCollisions = recip(accum.mNumCollisions);
+
+		// apply averaged impulse and write back in AoS layout
+		positions[0] = positions[0] + accum.mDeltaX * invNumCollisions;
+		positions[1] = positions[1] + accum.mDeltaY * invNumCollisions;
+		positions[2] = positions[2] + accum.mDeltaZ * invNumCollisions;
+
+		transpose(positions[0], positions[1], positions[2], positions[3]);
+		storeAligned(pIt, 0, positions[0]);
+		storeAligned(pIt, 16, positions[1]);
+		storeAligned(pIt, 32, positions[2]);
+		storeAligned(pIt, 48, positions[3]);
+
+#if PX_PROFILE || PX_DEBUG
+		mNumCollisions += horizontalSum(accum.mNumCollisions);
+#endif
+	}
+
+	mAllocator.deallocate(triangles);
+}
+
+// Collides the 4 particles in curPos against all triangles: for each
+// particle, finds the closest point on each triangle (clamped barycentric
+// coordinates s,t computed from the precomputed edge dot products), tracks
+// the overall closest triangle, and if the particle lies behind that
+// triangle's plane (normalD < 0), pushes it out along the normal.
+template <typename Simd4f>
+void cloth::SwCollision<Simd4f>::collideTriangles(const TriangleData* __restrict triangles, Simd4f* __restrict curPos,
+                                                  ImpulseAccumulator& accum)
+{
+	// closest-triangle candidate per lane
+	Simd4f normalX, normalY, normalZ, normalD;
+	normalX = normalY = normalZ = normalD = simd4f(_0);
+	Simd4f minSqrLength = sMax;
+
+	const TriangleData* __restrict tIt, *tEnd = triangles + mClothData.mNumTriangles;
+	for(tIt = triangles; tIt != tEnd; ++tIt)
+	{
+		Simd4f base = loadAligned(&tIt->base.x);
+		Simd4f edge0 = loadAligned(&tIt->edge0.x);
+		Simd4f edge1 = loadAligned(&tIt->edge1.x);
+		Simd4f normal = loadAligned(&tIt->normal.x);
+		Simd4f aux = loadAligned(&tIt->det);
+
+		// particle position relative to the triangle base vertex
+		Simd4f dx = curPos[0] - splat<0>(base);
+		Simd4f dy = curPos[1] - splat<1>(base);
+		Simd4f dz = curPos[2] - splat<2>(base);
+
+		Simd4f e0x = splat<0>(edge0);
+		Simd4f e0y = splat<1>(edge0);
+		Simd4f e0z = splat<2>(edge0);
+
+		Simd4f e1x = splat<0>(edge1);
+		Simd4f e1y = splat<1>(edge1);
+		Simd4f e1z = splat<2>(edge1);
+
+		Simd4f nx = splat<0>(normal);
+		Simd4f ny = splat<1>(normal);
+		Simd4f nz = splat<2>(normal);
+
+		Simd4f deltaDotEdge0 = dx * e0x + dy * e0y + dz * e0z;
+		Simd4f deltaDotEdge1 = dx * e1x + dy * e1y + dz * e1z;
+		Simd4f deltaDotNormal = dx * nx + dy * ny + dz * nz;
+
+		// precomputed dot products packed in the shapes' w components
+		Simd4f edge0DotEdge1 = splat<3>(base);
+		Simd4f edge0SqrLength = splat<3>(edge0);
+		Simd4f edge1SqrLength = splat<3>(edge1);
+
+		// un-normalized barycentric coordinates of the projected point
+		Simd4f s = edge1SqrLength * deltaDotEdge0 - edge0DotEdge1 * deltaDotEdge1;
+		Simd4f t = edge0SqrLength * deltaDotEdge1 - edge0DotEdge0 * deltaDotEdge0;
+
+		Simd4f sPositive = s > simd4f(_0);
+		Simd4f tPositive = t > simd4f(_0);
+
+		Simd4f det = splat<0>(aux);
+
+		// if a coordinate is negative, fall back to projection onto the edge
+		s = select(tPositive, s * det, deltaDotEdge0 * splat<2>(aux));
+		t = select(sPositive, t * det, deltaDotEdge1 * splat<3>(aux));
+
+		// clamp to the diagonal edge (s + t <= 1)
+		Simd4f clamp = simd4f(_1) < s + t;
+		Simd4f numerator = edge1SqrLength - edge0DotEdge1 + deltaDotEdge0 - deltaDotEdge1;
+
+		s = select(clamp, numerator * splat<1>(aux), s);
+
+		s = max(simd4f(_0), min(simd4f(_1), s));
+		t = max(simd4f(_0), min(simd4f(_1) - s, t));
+
+		// vector from the closest point on the triangle to the particle
+		dx = dx - e0x * s - e1x * t;
+		dy = dy - e0y * s - e1y * t;
+		dz = dz - e0z * s - e1z * t;
+
+		Simd4f sqrLength = dx * dx + dy * dy + dz * dz;
+
+		// slightly increase distance for colliding triangles
+		Simd4f slack = (simd4f(_0) > deltaDotNormal) & simd4f(1e-4f);
+		sqrLength = sqrLength + sqrLength * slack;
+
+		// keep the triangle with the smallest distance per lane
+		Simd4f mask = sqrLength < minSqrLength;
+
+		normalX = select(mask, nx, normalX);
+		normalY = select(mask, ny, normalY);
+		normalZ = select(mask, nz, normalZ);
+		normalD = select(mask, deltaDotNormal, normalD);
+
+		minSqrLength = min(sqrLength, minSqrLength);
+	}
+
+	// only particles behind their closest triangle's plane get pushed out
+	Simd4f mask;
+	if(!anyGreater(simd4f(_0), normalD, mask))
+		return;
+
+	accum.subtract(normalX, normalY, normalZ, normalD, mask);
+}
+
+// explicit template instantiation for the math backends enabled at build
+// time (SIMD and/or scalar fallback)
+#if NVMATH_SIMD
+template class cloth::SwCollision<Simd4f>;
+#endif
+#if NVMATH_SCALAR
+template class cloth::SwCollision<Scalar4f>;
+#endif
+/*
+namespace
+{
+ using namespace cloth;
+
+ int test()
+ {
+ Simd4f vertices[] = {
+ simd4f(0.0f, 0.0f, 0.0f, 0.0f),
+ simd4f(0.1f, 0.0f, 0.0f, 0.0f),
+ simd4f(0.0f, 0.1f, 0.0f, 0.0f)
+ };
+ TriangleData triangle;
+ generateTriangles<Simd4f>(&triangle, &*vertices, 1);
+
+ char buffer[1000];
+ SwKernelAllocator alloc(buffer, 1000);
+
+ SwClothData* cloth = static_cast<SwClothData*>(malloc(sizeof(SwClothData)));
+ memset(cloth, 0, sizeof(SwClothData));
+ cloth->mNumTriangles = 1;
+
+ SwCollision<Simd4f> collision(*cloth, alloc);
+ SwCollision<Simd4f>::ImpulseAccumulator accum;
+
+ Simd4f particles[4] = {};
+ for(float y=-0.1f; y < 0.0f; y += 0.2f)
+ {
+ for(float x=-0.1f; x < 0.0f; x += 0.2f)
+ {
+ particles[0] = simd4f(x);
+ particles[1] = simd4f(y);
+ particles[2] = simd4f(-1.0f);
+
+ collision.collideTriangles(&triangle, particles, accum);
+ }
+ }
+
+ return 0;
+ }
+
+ static int blah = test();
+}
+*/
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h
new file mode 100644
index 00000000..bf5f3177
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollision.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "StackAllocator.h"
+#include "Simd4i.h"
+
+#if PX_PROFILE
+#include "PxProfileEventSender.h"
+#include "PxProfileZone.h"
+#else
+namespace physx
+{
+namespace profile
+{
+ class PxProfileZone;
+}
+}
+#endif
+
+namespace nvidia
+{
+namespace cloth
+{
+
+#if PX_PROFILE
+
// RAII profiling scope: emits a start event on construction and the matching
// stop event on destruction. The object's own address is used as the event
// context id, so nested/concurrent zones stay distinguishable.
struct ProfileZone
{
	// name: profiler event name (resolved to an event id once, here);
	// profiler: may be NULL, in which case this zone is a no-op.
	ProfileZone(const char* name, profile::PxProfileZone* profiler)
	: mSender(profiler), mEventId(profiler ? profiler->getEventIdForName(name) : uint16_t(-1))
	{
		if(mSender)
			mSender->startEvent(mEventId, (uint64_t)intptr_t(this));
	}

	~ProfileZone()
	{
		if(mSender)
			mSender->stopEvent(mEventId, (uint64_t)intptr_t(this));
	}

	// Attach an integer payload (e.g. a counter) to this event instance.
	void setValue(int64_t value) const
	{
		if(mSender)
			mSender->eventValue(mEventId, (uint64_t)intptr_t(this), value);
	}

	profile::PxProfileEventSender* mSender; // NULL disables all event calls
	uint16_t mEventId;                      // id from getEventIdForName(), or uint16_t(-1) when disabled
};
+
+#else // PX_PROFILE
+
// No-op stand-in compiled when profiling is disabled; keeps call sites
// identical so no #ifdefs are needed in the kernels.
struct ProfileZone
{
	ProfileZone(const char*, profile::PxProfileZone*)
	{
	}
	void setValue(int64_t) const
	{
	}
};
+
+#endif // PX_PROFILE
+
// forward declarations (definitions live in their own headers/translation units)
class SwCloth;
struct SwClothData;
template <typename>
struct IterationState;
struct IndexPair;
struct SphereData;
struct ConeData;
struct TriangleData;

// 16-byte-aligned stack allocator used for per-iteration scratch memory
typedef StackAllocator<16> SwKernelAllocator;

/**
   Collision handler for SwSolver.
   Templated on the SIMD vector type (Simd4f or Scalar4f) so the same code
   serves the vectorized and scalar builds; explicit instantiations are emitted
   in SwCollision.cpp.
 */
template <typename Simd4f>
class SwCollision
{
	typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;

  public:
	// Per-particle bit masks selecting which cones/spheres a particle may hit.
	struct ShapeMask
	{
		Simd4i mCones;
		Simd4i mSpheres;

		ShapeMask& operator=(const ShapeMask&);
		ShapeMask& operator&=(const ShapeMask&);
	};

	// Collision shape arrays for one frame (previous/current pose).
	struct CollisionData
	{
		CollisionData();
		SphereData* mSpheres;
		ConeData* mCones;
	};

	struct ImpulseAccumulator; // defined in SwCollision.cpp

  public:
	SwCollision(SwClothData& clothData, SwKernelAllocator& alloc, profile::PxProfileZone* profiler);
	~SwCollision();

	// run one collision pass for the given solver iteration
	void operator()(const IterationState<Simd4f>& state);

	static size_t estimateTemporaryMemory(const SwCloth& cloth);
	static size_t estimatePersistentMemory(const SwCloth& cloth);

  private:
	SwCollision& operator=(const SwCollision&); // not implemented

	void allocate(CollisionData&);
	void deallocate(const CollisionData&);

	void computeBounds();

	// build/merge the grid-based shape acceleration structure
	void buildSphereAcceleration(const SphereData*);
	void buildConeAcceleration();
	static void mergeAcceleration(uint32_t*);
	bool buildAcceleration();

	static ShapeMask getShapeMask(const Simd4f&, const Simd4i*, const Simd4i*);
	ShapeMask getShapeMask(const Simd4f*) const;
	ShapeMask getShapeMask(const Simd4f*, const Simd4f*) const;

	// narrow-phase tests; overloads without a positions output operate on the
	// current particle buffer only
	void collideSpheres(const Simd4i&, const Simd4f*, ImpulseAccumulator&) const;
	Simd4i collideCones(const Simd4f*, ImpulseAccumulator&) const;

	void collideSpheres(const Simd4i&, const Simd4f*, Simd4f*, ImpulseAccumulator&) const;
	Simd4i collideCones(const Simd4f*, Simd4f*, ImpulseAccumulator&) const;

	void collideParticles();
	void collideVirtualParticles();
	void collideContinuousParticles();

	void collideConvexes(const IterationState<Simd4f>&);
	void collideConvexes(const Simd4f*, Simd4f*, ImpulseAccumulator&);

	void collideTriangles(const IterationState<Simd4f>&);
	void collideTriangles(const TriangleData*, Simd4f*, ImpulseAccumulator&);

  public:
	// acceleration structure
	static const uint32_t sGridSize = 8;
	Simd4i mSphereGrid[6 * sGridSize / 4];
	Simd4i mConeGrid[6 * sGridSize / 4];
	Simd4f mGridScale, mGridBias; // world -> grid-cell transform

	CollisionData mPrevData; // shapes at previous frame pose
	CollisionData mCurData;  // shapes at current frame pose

	SwClothData& mClothData;
	SwKernelAllocator& mAllocator;

	uint32_t mNumCollisions;

	profile::PxProfileZone* mProfiler;

	static const Simd4f sSkeletonWidth;
};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h
new file mode 100644
index 00000000..5e098922
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwCollisionHelpers.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Simd4i.h"
+
+// platform specific helpers
+
+namespace nvidia
+{
+namespace cloth
+{
+
// Returns the position of a set bit in 'mask'; implemented in the
// platform-specific header included at the bottom of this file.
inline uint32_t findBitSet(uint32_t mask);

// intFloor(-1.0f) returns -2 on SSE and NEON!
inline Simd4i intFloor(const Simd4f& v);

// OR of all four lanes replicated to every lane (platform-specific impl)
inline Simd4i horizontalOr(Simd4i mask);

// 4-lane gather helper; only the Simd4i specialization below exists.
template <typename>
struct Gather;

#if NVMATH_SIMD
// Constructed from four lane indices; operator() reads four elements from the
// given table. Member layout differs per ISA (implementations in sse2/ and
// neon/ SwCollisionHelpers.h) — exact semantics are defined there.
template <>
struct Gather<Simd4i>
{
	inline Gather(const Simd4i& index);
	inline Simd4i operator()(const Simd4i*) const;

#if NVMATH_SSE2
	Simd4i mSelectQ, mSelectD, mSelectW;
	static const Simd4i sIntSignBit;
	static const Simd4i sSignedMask;
#elif NVMATH_NEON
	Simd4i mPermute;
	static const Simd4i sPack;
	static const Simd4i sOffset;
	static const Simd4i sShift;
	static const Simd4i sMask;
#endif
	Simd4i mOutOfRange; // mask of lanes whose index was out of range
};
#endif
+
+} // namespace cloth
+} // namespace nvidia
+
+#if NVMATH_SSE2
+#include "sse2/SwCollisionHelpers.h"
+#elif NVMATH_NEON
+#include "neon/SwCollisionHelpers.h"
+#endif
+
+#if NVMATH_SCALAR
+#include "scalar/SwCollisionHelpers.h"
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp
new file mode 100644
index 00000000..0d527dbf
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "PxAssert.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "PsSort.h"
+#include "limits.h" // for USHRT_MAX
+
+#include "PsUtilities.h"
+
+using namespace nvidia;
+using namespace physx::shdfnd;
+
+cloth::SwTether::SwTether(uint16_t anchor, float length) : mAnchor(anchor), mLength(length)
+{
+}
+
// Builds the CPU-side fabric: copies phases/sets/restvalues/indices, pads each
// constraint set up to the SIMD width, converts 32-bit particle indices to
// 16-bit, stores tether constraints, and registers itself with the factory.
cloth::SwFabric::SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases,
                          Range<const uint32_t> sets, Range<const float> restvalues, Range<const uint32_t> indices,
                          Range<const uint32_t> anchors, Range<const float> tetherLengths, uint32_t id)
: mFactory(factory), mNumParticles(numParticles), mTetherLengthScale(1.0f), mId(id)
{
	// should no longer be prefixed with 0
	PX_ASSERT(sets.front() != 0);

#if PX_WINDOWS_FAMILY
	const uint32_t kSimdWidth = 8; // avx
#else
	const uint32_t kSimdWidth = 4;
#endif

	// consistency check
	PX_ASSERT(sets.back() == restvalues.size());
	PX_ASSERT(restvalues.size() * 2 == indices.size());
	PX_ASSERT(mNumParticles > *maxElement(indices.begin(), indices.end()));
	// padded dummy indices (below) must still fit in uint16_t
	PX_ASSERT(mNumParticles + kSimdWidth - 1 <= USHRT_MAX);

	mPhases.assign(phases.begin(), phases.end());
	mSets.reserve(sets.size() + 1);
	mSets.pushBack(0); // prefix with 0

	// remember the unpadded count so the getters/extractors can report the
	// original topology
	mOriginalNumRestvalues = uint32_t(restvalues.size());

	// pad indices for SIMD
	const uint32_t* iBegin = indices.begin(), *iIt = iBegin;
	const float* rBegin = restvalues.begin(), *rIt = rBegin;
	const uint32_t* sIt, *sEnd = sets.end();
	for(sIt = sets.begin(); sIt != sEnd; ++sIt)
	{
		const float* rEnd = rBegin + *sIt;
		const uint32_t* iEnd = iBegin + *sIt * 2;
		uint32_t numConstraints = uint32_t(rEnd - rIt);

		for(; rIt != rEnd; ++rIt)
			mRestvalues.pushBack(*rIt);

		for(; iIt != iEnd; ++iIt)
			mIndices.pushBack(uint16_t(*iIt));

		// add dummy constraints until the set size is a multiple of kSimdWidth;
		// dummies use a -FLT_MAX restvalue and particle indices >= mNumParticles
		// (extractFabricData filters them back out by that index test)
		for(; numConstraints &= kSimdWidth - 1; ++numConstraints)
		{
			mRestvalues.pushBack(-FLT_MAX);
			uint32_t index = mNumParticles + numConstraints - 1;
			mIndices.pushBack(uint16_t(index));
			mIndices.pushBack(uint16_t(index));
		}

		mSets.pushBack(uint32_t(mRestvalues.size()));
	}

	// trim overallocations
	RestvalueContainer(mRestvalues.begin(), mRestvalues.end()).swap(mRestvalues);
	Vector<uint16_t>::Type(mIndices.begin(), mIndices.end()).swap(mIndices);

	// tethers
	PX_ASSERT(anchors.size() == tetherLengths.size());

	// pad to allow for direct 16 byte (unaligned) loads
	mTethers.reserve(anchors.size() + 2);
	for(; !anchors.empty(); anchors.popFront(), tetherLengths.popFront())
		mTethers.pushBack(SwTether(uint16_t(anchors.front()), tetherLengths.front()));

	// register with factory (unregistered again in the destructor)
	mFactory.mFabrics.pushBack(this);
}
+
+cloth::SwFabric::~SwFabric()
+{
+ Vector<SwFabric*>::Type::Iterator fIt = mFactory.mFabrics.find(this);
+ PX_ASSERT(fIt != mFactory.mFabrics.end());
+ mFactory.mFabrics.replaceWithLast(fIt);
+}
+
// Factory that created this fabric (and holds it in its registry).
cloth::Factory& cloth::SwFabric::getFactory() const
{
	return mFactory;
}

uint32_t cloth::SwFabric::getNumPhases() const
{
	return uint32_t(mPhases.size());
}

// Number of rest values as originally supplied, i.e. excluding the SIMD
// padding entries added by the constructor.
uint32_t cloth::SwFabric::getNumRestvalues() const
{
	return mOriginalNumRestvalues;
}

// mSets carries a 0 prefix, so the set count is one less than its size.
uint32_t cloth::SwFabric::getNumSets() const
{
	return uint32_t(mSets.size() - 1);
}

// Two particle indices per constraint (again excluding SIMD padding).
uint32_t cloth::SwFabric::getNumIndices() const
{
	return 2 * mOriginalNumRestvalues;
}

uint32_t cloth::SwFabric::getNumParticles() const
{
	return mNumParticles;
}

uint32_t cloth::SwFabric::getNumTethers() const
{
	return uint32_t(mTethers.size());
}
+
+void cloth::SwFabric::scaleRestvalues(float scale)
+{
+ RestvalueContainer::Iterator rIt, rEnd = mRestvalues.end();
+ for(rIt = mRestvalues.begin(); rIt != rEnd; ++rIt)
+ *rIt *= scale;
+}
+
+void cloth::SwFabric::scaleTetherLengths(float scale)
+{
+ mTetherLengthScale *= scale;
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.h
new file mode 100644
index 00000000..e2081866
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFabric.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Allocator.h"
+#include "Fabric.h"
+#include "Types.h"
+#include "Range.h"
+#include "PxVec4.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
+class SwFactory;
+
// One tether constraint: 16-bit anchor particle index plus rest length.
// Kept at 8 bytes so the tether array can be streamed with unaligned loads.
struct SwTether
{
	SwTether(uint16_t, float);
	uint16_t mAnchor; // index of the anchor particle
	float mLength;    // unscaled rest length (see mTetherLengthScale)
};
+
// CPU implementation of Fabric: the immutable constraint topology
// (phases, sets, rest values, index pairs, tethers) shared between cloths.
class SwFabric : public UserAllocated, public Fabric
{
  public:
#if PX_WINDOWS_FAMILY
	typedef AlignedVector<float, 32>::Type RestvalueContainer; // avx
#else
	typedef AlignedVector<float, 16>::Type RestvalueContainer;
#endif

	SwFabric(SwFactory& factory, uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
	         Range<const float> restvalues, Range<const uint32_t> indices, Range<const uint32_t> anchors,
	         Range<const float> tetherLengths, uint32_t id);

	SwFabric& operator=(const SwFabric&); // declared only: copy-assignment is disabled

	virtual ~SwFabric();

	virtual Factory& getFactory() const;

	virtual uint32_t getNumPhases() const;
	virtual uint32_t getNumRestvalues() const;

	virtual uint32_t getNumSets() const;
	virtual uint32_t getNumIndices() const;

	virtual uint32_t getNumParticles() const;

	virtual uint32_t getNumTethers() const;

	virtual void scaleRestvalues(float);
	virtual void scaleTetherLengths(float);

  public:
	SwFactory& mFactory; // owning factory; registers/unregisters this fabric

	uint32_t mNumParticles;

	Vector<uint32_t>::Type mPhases; // index of set to use
	Vector<uint32_t>::Type mSets;   // offset of first restvalue, with 0 prefix

	RestvalueContainer mRestvalues;  // rest values (edge length), SIMD-padded
	Vector<uint16_t>::Type mIndices; // particle index pairs

	Vector<SwTether>::Type mTethers;
	float mTetherLengthScale; // deferred scale applied when lengths are read

	uint32_t mId;

	uint32_t mOriginalNumRestvalues; // count before SIMD padding

} PX_ALIGN_SUFFIX(16);
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.cpp
new file mode 100644
index 00000000..9955156d
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwFactory.h"
+#include "SwFabric.h"
+#include "SwCloth.h"
+#include "SwSolver.h"
+#include "ClothImpl.h"
+#include <string.h> // for memcpy
+#include "PsIntrinsics.h"
+
+using namespace nvidia;
+using namespace nvidia;
+
+namespace nvidia
+{
+namespace cloth
+{
+// defined in Factory.cpp
+uint32_t getNextFabricId();
+}
+}
+
// CPU factory: tags the base class with the CPU platform id.
cloth::SwFactory::SwFactory() : Factory(CPU)
{
}

cloth::SwFactory::~SwFactory()
{
}
+
// Create a CPU fabric from the given topology. Ownership: the fabric
// registers itself with this factory (see SwFabric constructor) and is
// released by the caller via the Fabric interface.
cloth::Fabric* cloth::SwFactory::createFabric(uint32_t numParticles, Range<const uint32_t> phases,
                                              Range<const uint32_t> sets, Range<const float> restvalues,
                                              Range<const uint32_t> indices, Range<const uint32_t> anchors,
                                              Range<const float> tetherLengths)
{
	return new SwFabric(*this, numParticles, phases, sets, restvalues, indices, anchors, tetherLengths,
	                    getNextFabricId());
}
+
// Create a CPU cloth instance over 'fabric', initialized with 'particles'.
cloth::Cloth* cloth::SwFactory::createCloth(Range<const PxVec4> particles, Fabric& fabric)
{
	return new SwClothImpl(*this, fabric, particles);
}
+
// Create the CPU solver. Returns NULL in the PhysXGpu build, where SwSolver
// is not compiled in.
cloth::Solver* cloth::SwFactory::createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr)
{
#ifdef PX_PHYSX_GPU_EXPORTS
	// SwSolver not defined in PhysXGpu project
	PX_UNUSED(profiler);
	PX_UNUSED(taskMgr);
	return 0;
#else
	return new SwSolver(profiler, taskMgr);
#endif
}
+
+cloth::Cloth* cloth::SwFactory::clone(const Cloth& cloth)
+{
+ if(cloth.getFactory().getPlatform() != Factory::CPU)
+ return cloth.clone(*this); // forward to CuCloth
+
+ // copy construct
+ return new SwClothImpl(*this, static_cast<const SwClothImpl&>(cloth));
+}
+
// Copy fabric topology back into caller-provided ranges. Empty ranges are
// skipped; non-empty ranges must match the fabric's (unpadded) sizes, which is
// asserted. The SIMD padding constraints added by the SwFabric constructor
// reference dummy particle indices >= mNumParticles and are filtered out here,
// so the output matches the data originally passed to createFabric().
void cloth::SwFactory::extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
                                         Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
                                         Range<float> tetherLengths) const
{
	const SwFabric& swFabric = static_cast<const SwFabric&>(fabric);

	PX_ASSERT(phases.empty() || phases.size() == swFabric.getNumPhases());
	PX_ASSERT(restvalues.empty() || restvalues.size() == swFabric.getNumRestvalues());
	PX_ASSERT(sets.empty() || sets.size() == swFabric.getNumSets());
	PX_ASSERT(indices.empty() || indices.size() == swFabric.getNumIndices());
	PX_ASSERT(anchors.empty() || anchors.size() == swFabric.getNumTethers());
	PX_ASSERT(tetherLengths.empty() || tetherLengths.size() == swFabric.getNumTethers());

	for(uint32_t i = 0; !phases.empty(); ++i, phases.popFront())
		phases.front() = swFabric.mPhases[i];

	const uint32_t* sEnd = swFabric.mSets.end(), *sIt;
	const float* rBegin = swFabric.mRestvalues.begin(), *rIt = rBegin;
	const uint16_t* iIt = swFabric.mIndices.begin();

	uint32_t* sDst = sets.begin();
	float* rDst = restvalues.begin();
	uint32_t* iDst = indices.begin();

	// walk the padded per-set data, dropping padding constraints;
	// the pre-increment skips the 0-prefix entry of mSets
	uint32_t numConstraints = 0;
	for(sIt = swFabric.mSets.begin(); ++sIt != sEnd;)
	{
		const float* rEnd = rBegin + *sIt;
		for(; rIt != rEnd; ++rIt)
		{
			uint16_t i0 = *iIt++;
			uint16_t i1 = *iIt++;

			// padding constraint (dummy particle index) -> skip
			if(PxMax(i0, i1) >= swFabric.mNumParticles)
				continue;

			if(!restvalues.empty())
				*rDst++ = *rIt;

			if(!indices.empty())
			{
				*iDst++ = i0;
				*iDst++ = i1;
			}

			++numConstraints;
		}

		// sets output is cumulative, matching the createFabric() input format
		if(!sets.empty())
			*sDst++ = numConstraints;
	}

	for(uint32_t i = 0; !anchors.empty(); ++i, anchors.popFront())
		anchors.front() = swFabric.mTethers[i].mAnchor;

	// apply the deferred scale accumulated by scaleTetherLengths()
	for(uint32_t i = 0; !tetherLengths.empty(); ++i, tetherLengths.popFront())
		tetherLengths.front() = swFabric.mTethers[i].mLength * swFabric.mTetherLengthScale;
}
+
// Copy the cloth's collision shapes (start-of-frame pose) into the given
// ranges. Empty ranges are skipped; non-empty ranges must match the stored
// sizes (asserted). Raw memcpy is used; each pair of source/dest element
// types is layout-compatible (e.g. IndexPair = two uint32_t).
void cloth::SwFactory::extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
                                            Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const
{
	PX_ASSERT(&cloth.getFactory() == this);

	const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;

	PX_ASSERT(spheres.empty() || spheres.size() == swCloth.mStartCollisionSpheres.size());
	PX_ASSERT(capsules.empty() || capsules.size() == swCloth.mCapsuleIndices.size() * 2);
	PX_ASSERT(planes.empty() || planes.size() == swCloth.mStartCollisionPlanes.size());
	PX_ASSERT(convexes.empty() || convexes.size() == swCloth.mConvexMasks.size());
	PX_ASSERT(triangles.empty() || triangles.size() == swCloth.mStartCollisionTriangles.size());

	if(!swCloth.mStartCollisionSpheres.empty() && !spheres.empty())
		memcpy(spheres.begin(), &swCloth.mStartCollisionSpheres.front(),
		       swCloth.mStartCollisionSpheres.size() * sizeof(PxVec4));

	if(!swCloth.mCapsuleIndices.empty() && !capsules.empty())
		memcpy(capsules.begin(), &swCloth.mCapsuleIndices.front(), swCloth.mCapsuleIndices.size() * sizeof(IndexPair));

	if(!swCloth.mStartCollisionPlanes.empty() && !planes.empty())
		memcpy(planes.begin(), &swCloth.mStartCollisionPlanes.front(),
		       swCloth.mStartCollisionPlanes.size() * sizeof(PxVec4));

	if(!swCloth.mConvexMasks.empty() && !convexes.empty())
		memcpy(convexes.begin(), &swCloth.mConvexMasks.front(), swCloth.mConvexMasks.size() * sizeof(uint32_t));

	if(!swCloth.mStartCollisionTriangles.empty() && !triangles.empty())
		memcpy(triangles.begin(), &swCloth.mStartCollisionTriangles.front(),
		       swCloth.mStartCollisionTriangles.size() * sizeof(PxVec3));
}
+
+void cloth::SwFactory::extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+
+ Vec4fAlignedVector const& srcConstraints = !swCloth.mMotionConstraints.mTarget.empty()
+ ? swCloth.mMotionConstraints.mTarget
+ : swCloth.mMotionConstraints.mStart;
+
+ if(!srcConstraints.empty())
+ {
+ // make sure dest array is big enough
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4));
+ }
+}
+
+void cloth::SwFactory::extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const
+{
+ PX_ASSERT(&cloth.getFactory() == this);
+
+ const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
+
+ Vec4fAlignedVector const& srcConstraints = !swCloth.mSeparationConstraints.mTarget.empty()
+ ? swCloth.mSeparationConstraints.mTarget
+ : swCloth.mSeparationConstraints.mStart;
+
+ if(!srcConstraints.empty())
+ {
+ // make sure dest array is big enough
+ PX_ASSERT(destConstraints.size() == srcConstraints.size());
+
+ memcpy(destConstraints.begin(), &srcConstraints.front(), srcConstraints.size() * sizeof(PxVec4));
+ }
+}
+
// Copy the cloth's per-particle accelerations into destAccelerations;
// no-op when the cloth has none.
void cloth::SwFactory::extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const
{
	PX_ASSERT(&cloth.getFactory() == this);

	const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;

	if(!swCloth.mParticleAccelerations.empty())
	{
		// make sure dest array is big enough
		PX_ASSERT(destAccelerations.size() == swCloth.mParticleAccelerations.size());

		memcpy(destAccelerations.begin(), &swCloth.mParticleAccelerations.front(),
		       swCloth.mParticleAccelerations.size() * sizeof(PxVec4));
	}
}
+
// Copy virtual-particle data out of the cloth:
// - weights: stored as vec4, written out as vec3 (w dropped)
// - indices: stored as Vec4us (16-bit), widened to uint32_t quadruples;
//   entries whose first index is >= the particle count are internal dummies
//   and are skipped.
// A range whose size does not match is left untouched (and asserted against).
void cloth::SwFactory::extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> indices, Range<PxVec3> weights) const
{
	PX_ASSERT(this == &cloth.getFactory());

	const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;

	uint32_t numIndices = cloth.getNumVirtualParticles();
	uint32_t numWeights = cloth.getNumVirtualParticleWeights();

	PX_ASSERT(indices.size() == numIndices || indices.empty());
	PX_ASSERT(weights.size() == numWeights || weights.empty());

	if(weights.size() == numWeights)
	{
		PxVec3* wDestIt = reinterpret_cast<PxVec3*>(weights.begin());

		// convert weights from vec4 to vec3
		cloth::Vec4fAlignedVector::ConstIterator wIt = swCloth.mVirtualParticleWeights.begin();
		cloth::Vec4fAlignedVector::ConstIterator wEnd = wIt + numWeights;

		for(; wIt != wEnd; ++wIt, ++wDestIt)
			*wDestIt = PxVec3(wIt->x, wIt->y, wIt->z);

		PX_ASSERT(wDestIt == weights.end());
	}
	if(indices.size() == numIndices)
	{
		// convert indices
		Vec4u* iDestIt = reinterpret_cast<Vec4u*>(indices.begin());
		Vector<Vec4us>::Type::ConstIterator iIt = swCloth.mVirtualParticleIndices.begin();
		Vector<Vec4us>::Type::ConstIterator iEnd = swCloth.mVirtualParticleIndices.end();

		uint32_t numParticles = uint32_t(swCloth.mCurParticles.size());

		for(; iIt != iEnd; ++iIt)
		{
			// skip dummy indices
			if(iIt->x < numParticles)
				// byte offset to element index
				*iDestIt++ = Vec4u(*iIt);
		}

		// NOTE(review): forms the end address by dereferencing the
		// one-past-last slot (*iDestIt); relies on array() taking its
		// argument by reference so no actual read occurs — verify.
		PX_ASSERT(&array(*iDestIt) == indices.end());
	}
}
+
// Copy the cloth's self-collision particle indices into destIndices,
// which must match the stored size (asserted).
void cloth::SwFactory::extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const
{
	const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
	PX_ASSERT(destIndices.size() == swCloth.mSelfCollisionIndices.size());
	intrinsics::memCopy(destIndices.begin(), swCloth.mSelfCollisionIndices.begin(), destIndices.size() * sizeof(uint32_t));
}
+
// Copy the cloth's rest positions into destRestPositions,
// which must match the stored size (asserted).
void cloth::SwFactory::extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const
{
	const SwCloth& swCloth = static_cast<const SwClothImpl&>(cloth).mCloth;
	PX_ASSERT(destRestPositions.size() == swCloth.mRestPositions.size());
	intrinsics::memCopy(destRestPositions.begin(), swCloth.mRestPositions.begin(), destRestPositions.size() * sizeof(PxVec4));
}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h
new file mode 100644
index 00000000..a078add0
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwFactory.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Factory.h"
+#include "Allocator.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
+class SwFabric;
+class SwCloth;
+template <typename>
+class ClothImpl;
+
// CPU implementation of Factory: creates SwFabric/SwCloth/SwSolver objects
// and provides the data-extraction entry points. Keeps a registry of all
// fabrics it created (fabrics add/remove themselves in ctor/dtor).
class SwFactory : public UserAllocated, public Factory
{
  public:
	typedef SwFabric FabricType;
	typedef ClothImpl<SwCloth> ImplType;

	SwFactory();
	virtual ~SwFactory();

	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
	                             Range<const float> restvalues, Range<const uint32_t> indices,
	                             Range<const uint32_t> anchors, Range<const float> tetherLengths);

	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);

	virtual Solver* createSolver(profile::PxProfileZone*, PxTaskManager*);

	// clone a cloth into this factory, converting from other platforms if needed
	virtual Cloth* clone(const Cloth& cloth);

	// extraction API: empty ranges are skipped, non-empty ranges must match the
	// stored sizes (asserted in the implementations)
	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
	                               Range<float> tetherLengths) const;

	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;

	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;

	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;

	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
	                                     Range<PxVec3> destWeights) const;

	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;

	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;

  public:
	Vector<SwFabric*>::Type mFabrics; // all live fabrics created by this factory
};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.cpp
new file mode 100644
index 00000000..c2c924cf
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.cpp
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwInterCollision.h"
+#include "PsIntrinsics.h"
+#include "SwCollision.h" //temp fix, needed by SwCollisionHelper implementations
+#include "Simd4f.h"
+#include "SwCollisionHelpers.h"
+#include "BoundingBox.h"
+#include "PsSort.h"
+#include "PsIntrinsics.h"
+
+#pragma warning(disable:4127)
+
+using namespace nvidia;
+
+namespace
+{
+typedef Simd4fFactory<detail::FourTuple> Simd4fConstant;
+
+const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
+const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0));
+const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON);
+const Simd4fConstant sZeroW = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+
+// returns sorted indices, output needs to be at least 2*(last-first)+1024
+// Buffer layout: out[0..n) and out[n..2n) are ping-pong index arrays,
+// out[2n..2n+1024) holds four 256-entry histograms (one per key byte).
+// After the call out[0..n) holds the sorted permutation; the histogram
+// area then contains the end offsets of the final pass (callers snoop it).
+void radixSort(const uint32_t* first, const uint32_t* last, uint32_t* out)
+{
+ uint32_t n = uint32_t(last - first);
+
+ uint32_t* buffer = out + 2 * n;
+ uint32_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 };
+
+ intrinsics::memZero(buffer, 1024 * sizeof(uint32_t));
+
+ // build 4 histograms in one pass (one per key byte)
+ for(const uint32_t* __restrict it = first; it != last; ++it)
+ {
+ uint32_t key = *it;
+ ++histograms[0][0xff & key];
+ ++histograms[1][0xff & (key >> 8)];
+ ++histograms[2][0xff & (key >> 16)];
+ ++histograms[3][key >> 24];
+ }
+
+ // convert histograms to offset tables in-place
+ // (exclusive prefix sum: histograms[p][b] becomes the start of bucket b)
+ uint32_t sums[4] = {};
+ for(uint32_t i = 0; i < 256; ++i)
+ {
+ uint32_t temp0 = histograms[0][i] + sums[0];
+ histograms[0][i] = sums[0], sums[0] = temp0;
+
+ uint32_t temp1 = histograms[1][i] + sums[1];
+ histograms[1][i] = sums[1], sums[1] = temp1;
+
+ uint32_t temp2 = histograms[2][i] + sums[2];
+ histograms[2][i] = sums[2], sums[2] = temp2;
+
+ uint32_t temp3 = histograms[3][i] + sums[3];
+ histograms[3][i] = sums[3], sums[3] = temp3;
+ }
+
+ PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n);
+
+#if PX_DEBUG
+ memset(out, 0xff, 2 * n * sizeof(uint32_t));
+#endif
+
+ // sort 8 bits per pass
+
+ uint32_t* __restrict indices[] = { out, out + n };
+
+ for(uint32_t i = 0; i != n; ++i)
+ indices[1][histograms[0][0xff & first[i]]++] = i;
+
+ // note: the comma conditions below load indices[...][n] once before the
+ // loop exits; those slots (out[2n] resp. out[n+n]) lie inside the required
+ // 2*n+1024 allocation, so the stray read is benign
+ for(uint32_t i = 0, index; index = indices[1][i], i != n; ++i)
+ indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index;
+
+ for(uint32_t i = 0, index; index = indices[0][i], i != n; ++i)
+ indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index;
+
+ for(uint32_t i = 0, index; index = indices[1][i], i != n; ++i)
+ indices[0][histograms[3][first[index] >> 24]++] = index;
+}
+
// Returns the index (0, 1 or 2) of the largest of the first three
// lanes of edgeLength.
template <typename Simd4f>
uint32_t longestAxis(const Simd4f& edgeLength)
{
	const float* e = array(edgeLength);

	// pick the larger of x/y first, then compare against z
	const uint32_t axis = e[0] > e[1] ? 0u : 1u;
	return e[axis] > e[2] ? axis : 2u;
}
+}
+
+// Constructs an inter-cloth collision context over n cloth instances.
+// colDist: collision radius between particles of different cloths
+// stiffness: scale applied to the collision response delta
+// iterations: number of broad+narrow phase passes run by operator()
+// filter: callback deciding whether a cloth pair may collide (must be non-null)
+template <typename Simd4f>
+cloth::SwInterCollision<Simd4f>::SwInterCollision(const cloth::SwInterCollisionData* instances, uint32_t n, float colDist,
+ float stiffness, uint32_t iterations, InterCollisionFilter filter,
+ cloth::SwKernelAllocator& alloc, profile::PxProfileZone* zone)
+: mInstances(instances)
+, mNumInstances(n)
+, mClothIndices(NULL)
+, mParticleIndices(NULL)
+, mNumParticles(0)
+, mTotalParticles(0)
+, mFilter(filter)
+, mAllocator(alloc)
+, mProfiler(zone)
+{
+ PX_ASSERT(mFilter);
+
+ // w component kept at 0 so the distance applies to xyz only
+ mCollisionDistance = simd4f(colDist, colDist, colDist, 0.0f);
+ mCollisionSquareDistance = mCollisionDistance * mCollisionDistance;
+ mStiffness = simd4f(stiffness);
+ mNumIterations = iterations;
+
+ // total particle count across all instances (used to size index buffers)
+ for(uint32_t i = 0; i < n; ++i)
+ mTotalParticles += instances[i].mNumParticles;
+}
+
+template <typename Simd4f>
+cloth::SwInterCollision<Simd4f>::~SwInterCollision()
+{
+ // nothing to release: all scratch memory is allocated and freed in operator()
+}
+
+namespace
+{
+// multiple x by m leaving w component of x intact
+template <typename Simd4f>
+PX_INLINE Simd4f transform(const Simd4f m[4], const Simd4f& x)
+{
+ const Simd4f a = m[3] + splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2];
+ return select(sMaskXYZ, a, x);
+}
+
+// rotate x by m leaving w component intact
+template <typename Simd4f>
+PX_INLINE Simd4f rotate(const Simd4f m[4], const Simd4f& x)
+{
+ const Simd4f a = splat<0>(x) * m[0] + splat<1>(x) * m[1] + splat<2>(x) * m[2];
+ return select(sMaskXYZ, a, x);
+}
+
+template <typename Simd4f>
+struct ClothSorter
+{
+ typedef cloth::BoundingBox<Simd4f> BoundingBox;
+
+ ClothSorter(BoundingBox* bounds, uint32_t n, uint32_t axis) : mBounds(bounds), mNumBounds(n), mAxis(axis)
+ {
+ }
+
+ bool operator()(uint32_t i, uint32_t j) const
+ {
+ PX_ASSERT(i < mNumBounds);
+ PX_ASSERT(j < mNumBounds);
+
+ return array(mBounds[i].mLower)[mAxis] < array(mBounds[j].mLower)[mAxis];
+ }
+
+ BoundingBox* mBounds;
+ uint32_t mNumBounds;
+ uint32_t mAxis;
+};
+
+// for the given cloth array this function calculates the set of particles
+// which potentially interact, the potential colliders are returned with their
+// cloth index and particle index in clothIndices and particleIndices, the
+// function returns the number of potential colliders
+//
+// Side effects: culled particles are transformed to world space in-place
+// (mParticles), and their previous positions (mPrevParticles) are replaced
+// by the rotated, impulse-scaled position delta; operator() transforms both
+// back afterwards. overlapMasks[c] receives a bit per overlapping cloth,
+// which limits the scheme to 32 cloth instances per mask word.
+template <typename Simd4f>
+uint32_t calculatePotentialColliders(const cloth::SwInterCollisionData* cBegin, const cloth::SwInterCollisionData* cEnd,
+ Simd4f colDist, uint16_t* clothIndices, uint32_t* particleIndices,
+ cloth::BoundingBox<Simd4f>& bounds, uint32_t* overlapMasks,
+ cloth::InterCollisionFilter filter, cloth::SwKernelAllocator& allocator)
+{
+ using namespace cloth;
+
+ typedef BoundingBox<Simd4f> BoundingBox;
+
+ uint32_t numParticles = 0;
+ const uint32_t numCloths = uint32_t(cEnd - cBegin);
+
+ // bounds of each cloth objects in world space
+ BoundingBox* const clothBounds = (BoundingBox*)(allocator.allocate(numCloths * sizeof(BoundingBox)));
+ BoundingBox* const overlapBounds = (BoundingBox*)(allocator.allocate(numCloths * sizeof(BoundingBox)));
+
+ // union of all cloth world bounds
+ BoundingBox totalClothBounds = emptyBounds<Simd4f>();
+
+ uint32_t* sortedIndices = (uint32_t*)allocator.allocate(numCloths * sizeof(uint32_t));
+
+ for(uint32_t i = 0; i < numCloths; ++i)
+ {
+ const SwInterCollisionData& c = cBegin[i];
+
+ // expand local bounds by the collision distance and transform to world space
+ PxBounds3 lcBounds = PxBounds3::centerExtents(c.mBoundsCenter, c.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
+ PX_ASSERT(!lcBounds.isEmpty());
+ PxBounds3 cWorld = PxBounds3::transformFast(c.mGlobalPose, lcBounds);
+
+ BoundingBox cBounds = {(Simd4f)simd4f(cWorld.minimum.x, cWorld.minimum.y, cWorld.minimum.z, 0.0f),
+ (Simd4f)simd4f(cWorld.maximum.x, cWorld.maximum.y, cWorld.maximum.z, 0.0f) };
+
+ sortedIndices[i] = i;
+ clothBounds[i] = cBounds;
+
+ totalClothBounds = expandBounds(totalClothBounds, cBounds);
+ }
+
+ // sort indices by their minimum extent on the longest axis
+ const uint32_t sweepAxis = longestAxis(totalClothBounds.mUpper - totalClothBounds.mLower);
+
+ ClothSorter<Simd4f> predicate(clothBounds, numCloths, sweepAxis);
+ nvidia::sort(sortedIndices, numCloths, predicate);
+
+ for(uint32_t i = 0; i < numCloths; ++i)
+ {
+ PX_ASSERT(sortedIndices[i] < numCloths);
+
+ const SwInterCollisionData& a = cBegin[sortedIndices[i]];
+
+ // local bounds
+ const Simd4f aCenter = load(reinterpret_cast<const float*>(&a.mBoundsCenter));
+ const Simd4f aHalfExtent = load(reinterpret_cast<const float*>(&a.mBoundsHalfExtent)) + colDist;
+ const BoundingBox aBounds = { aCenter - aHalfExtent, aCenter + aHalfExtent };
+
+ const PxMat44 aToWorld(a.mGlobalPose);
+ const PxTransform aToLocal(a.mGlobalPose.getInverse());
+
+ const float axisMin = array(clothBounds[sortedIndices[i]].mLower)[sweepAxis];
+ const float axisMax = array(clothBounds[sortedIndices[i]].mUpper)[sweepAxis];
+
+ uint32_t overlapMask = 0;
+ uint32_t numOverlaps = 0;
+
+ // scan back to find first intersecting bounding box
+ uint32_t startIndex = i;
+ while(startIndex > 0 && array(clothBounds[sortedIndices[startIndex]].mUpper)[sweepAxis] > axisMin)
+ --startIndex;
+
+ // compute all overlapping bounds
+ for(uint32_t j = startIndex; j < numCloths; ++j)
+ {
+ // ignore self-collision (same position in sorted order == same cloth)
+ if(i == j)
+ continue;
+
+ // early out if no more cloths along axis intersect us
+ if(array(clothBounds[sortedIndices[j]].mLower)[sweepAxis] > axisMax)
+ break;
+
+ const SwInterCollisionData& b = cBegin[sortedIndices[j]];
+
+ // check if collision between these shapes is filtered
+ if(!filter(a.mUserData, b.mUserData))
+ continue;
+
+ // set mask bit for this cloth (one bit per cloth, max 32 cloths)
+ overlapMask |= 1 << sortedIndices[j];
+
+ // transform bounds from b local space to local space of a
+ PxBounds3 lcBounds =
+ PxBounds3::centerExtents(b.mBoundsCenter, b.mBoundsHalfExtent + PxVec3(array(colDist)[0]));
+ PX_ASSERT(!lcBounds.isEmpty());
+ PxBounds3 bLocal = PxBounds3::transformFast(aToLocal * b.mGlobalPose, lcBounds);
+
+ BoundingBox bBounds = {(Simd4f)simd4f(bLocal.minimum.x, bLocal.minimum.y, bLocal.minimum.z, 0.0f),
+ (Simd4f)simd4f(bLocal.maximum.x, bLocal.maximum.y, bLocal.maximum.z, 0.0f) };
+
+ BoundingBox iBounds = intersectBounds(aBounds, bBounds);
+
+ // setup bounding box w to make point containment test cheaper
+ iBounds.mLower = (iBounds.mLower & sMaskXYZ) | ((Simd4f)sMaskW & simd4f(-FLT_MAX));
+ iBounds.mUpper = (iBounds.mUpper & sMaskXYZ) | ((Simd4f)sMaskW & simd4f(FLT_MAX));
+
+ if(!isEmptyBounds(iBounds))
+ overlapBounds[numOverlaps++] = iBounds;
+ }
+
+ //----------------------------------------------------------------
+ // cull all particles to overlapping bounds and transform particles to world space
+
+ const uint32_t clothIndex = sortedIndices[i];
+ overlapMasks[clothIndex] = overlapMask;
+
+ Simd4f* pBegin = reinterpret_cast<Simd4f*>(a.mParticles);
+ Simd4f* qBegin = reinterpret_cast<Simd4f*>(a.mPrevParticles);
+
+ const Simd4f xform[4] = { load(reinterpret_cast<const float*>(&aToWorld.column0)),
+ load(reinterpret_cast<const float*>(&aToWorld.column1)),
+ load(reinterpret_cast<const float*>(&aToWorld.column2)),
+ load(reinterpret_cast<const float*>(&aToWorld.column3)) };
+
+ // impulses are divided by the per-cloth scale here and re-multiplied
+ // when transforming back (see operator())
+ Simd4f impulseInvScale = recip(Simd4f(simd4f(cBegin[clothIndex].mImpulseScale)));
+
+ for(uint32_t k = 0; k < a.mNumParticles; ++k)
+ {
+ // mIndices selects a subset of particles when non-null
+ Simd4f* pIt = a.mIndices ? pBegin + a.mIndices[k] : pBegin + k;
+ Simd4f* qIt = a.mIndices ? qBegin + a.mIndices[k] : qBegin + k;
+
+ const Simd4f p = *pIt;
+
+ for(const BoundingBox* oIt = overlapBounds, *oEnd = overlapBounds + numOverlaps; oIt != oEnd; ++oIt)
+ {
+ // point in box test
+ if(anyGreater(oIt->mLower, p) != 0)
+ continue;
+ if(anyGreater(p, oIt->mUpper) != 0)
+ continue;
+
+ // transform particle to world space in-place
+ // (will be transformed back after collision)
+ *pIt = transform(xform, p);
+
+ Simd4f impulse = (p - *qIt) * impulseInvScale;
+ *qIt = rotate(xform, impulse);
+
+ // update world bounds
+ bounds = expandBounds(bounds, pIt, pIt + 1);
+
+ // add particle to output arrays
+ clothIndices[numParticles] = uint16_t(clothIndex);
+ particleIndices[numParticles] = uint32_t(pIt - pBegin);
+
+ // output each particle only once
+ ++numParticles;
+ break;
+ }
+ }
+ }
+
+ // LIFO deallocation order required by the stack allocator
+ allocator.deallocate(sortedIndices);
+ allocator.deallocate(overlapBounds);
+ allocator.deallocate(clothBounds);
+
+ return numParticles;
+}
+}
+
+template <typename Simd4f>
+PX_INLINE Simd4f& cloth::SwInterCollision<Simd4f>::getParticle(uint32_t index)
+{
+ PX_ASSERT(index < mNumParticles);
+
+ uint16_t clothIndex = mClothIndices[index];
+ uint32_t particleIndex = mParticleIndices[index];
+
+ PX_ASSERT(clothIndex < mNumInstances);
+
+ return reinterpret_cast<Simd4f&>(mInstances[clothIndex].mParticles[particleIndex]);
+}
+
+template <typename Simd4f>
+void cloth::SwInterCollision<Simd4f>::operator()()
+{
+ mNumTests = mNumCollisions = 0;
+
+ mClothIndices = static_cast<uint16_t*>(mAllocator.allocate(sizeof(uint16_t) * mTotalParticles));
+ mParticleIndices = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t) * mTotalParticles));
+ mOverlapMasks = static_cast<uint32_t*>(mAllocator.allocate(sizeof(uint32_t*) * mNumInstances));
+
+ for(uint32_t k = 0; k < mNumIterations; ++k)
+ {
+ // world bounds of particles
+ BoundingBox<Simd4f> bounds = emptyBounds<Simd4f>();
+
+ // calculate potentially colliding set
+ {
+#if PX_PROFILE
+ ProfileZone zone("cloth::SwInterCollision::BroadPhase", mProfiler);
+#endif
+
+ mNumParticles =
+ calculatePotentialColliders(mInstances, mInstances + mNumInstances, mCollisionDistance, mClothIndices,
+ mParticleIndices, bounds, mOverlapMasks, mFilter, mAllocator);
+ }
+
+ // collide
+ if(mNumParticles)
+ {
+#if PX_PROFILE
+ ProfileZone zone("cloth::SwInterCollision::Collide", mProfiler);
+#endif
+
+ Simd4f lowerBound = bounds.mLower;
+ Simd4f edgeLength = max(bounds.mUpper - lowerBound, sEpsilon);
+
+ // sweep along longest axis
+ uint32_t sweepAxis = longestAxis(edgeLength);
+ uint32_t hashAxis0 = (sweepAxis + 1) % 3;
+ uint32_t hashAxis1 = (sweepAxis + 2) % 3;
+
+ // reserve 0, 127, and 65535 for sentinel
+ Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength);
+ array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533;
+
+ Simd4f one = simd4f(_1);
+ Simd4f gridSize = simd4f(254.0f);
+ array(gridSize)[sweepAxis] = 65534.0f;
+
+ Simd4f gridScale = recipT<1>(cellSize);
+ Simd4f gridBias = -lowerBound * gridScale + simd4f(_1);
+
+ void* buffer = mAllocator.allocate(getBufferSize(mNumParticles));
+
+ uint32_t* __restrict sortedIndices = reinterpret_cast<uint32_t*>(buffer);
+ uint32_t* __restrict sortedKeys = sortedIndices + mNumParticles;
+ uint32_t* __restrict keys = PxMax(sortedKeys + mNumParticles, sortedIndices + 2 * mNumParticles + 1024);
+
+ typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;
+
+ // create keys
+ for(uint32_t i = 0; i < mNumParticles; ++i)
+ {
+ // grid coordinate
+ Simd4f indexf = getParticle(i) * gridScale + gridBias;
+
+ // need to clamp index because shape collision potentially
+ // pushes particles outside of their original bounds
+ Simd4i indexi = intFloor(max(one, min(indexf, gridSize)));
+
+ const int32_t* ptr = simdi::array(indexi);
+ keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24));
+ }
+
+ // compute sorted keys indices
+ radixSort(keys, keys + mNumParticles, sortedIndices);
+
+ // snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel)
+ uint32_t firstColumnSize = sortedIndices[2 * mNumParticles + 769];
+
+ // sort keys
+ for(uint32_t i = 0; i < mNumParticles; ++i)
+ sortedKeys[i] = keys[sortedIndices[i]];
+ sortedKeys[mNumParticles] = uint32_t(-1); // sentinel
+
+ // calculate the number of buckets we need to search forward
+ const Simd4i data = intFloor(gridScale * mCollisionDistance);
+ uint32_t collisionDistance = uint32_t(2 + simdi::array(data)[sweepAxis]);
+
+ // collide particles
+ collideParticles(sortedKeys, firstColumnSize, sortedIndices, mNumParticles, collisionDistance);
+
+ mAllocator.deallocate(buffer);
+ }
+
+ /*
+ // verify against brute force (disable collision response when testing)
+ uint32_t numCollisions = mNumCollisions;
+ mNumCollisions = 0;
+
+ for(uint32_t i = 0; i < mNumParticles; ++i)
+ for(uint32_t j = i+1; j < mNumParticles; ++j)
+ if (mOverlapMasks[mClothIndices[i]] & (1 << mClothIndices[j]))
+ collideParticles(getParticle(i), getParticle(j));
+
+ static uint32_t iter = 0; ++iter;
+ if(numCollisions != mNumCollisions)
+ printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions);
+ */
+
+ // transform back to local space
+ {
+#if PX_PROFILE
+ ProfileZone zone("cloth::SwInterCollision::PostTransform", mProfiler);
+#endif
+ Simd4f toLocal[4], impulseScale;
+ uint16_t lastCloth = uint16_t(0xffff);
+
+ for(uint32_t i = 0; i < mNumParticles; ++i)
+ {
+ uint16_t clothIndex = mClothIndices[i];
+ const SwInterCollisionData* instance = mInstances + clothIndex;
+
+ // todo: could pre-compute these inverses
+ if(clothIndex != lastCloth)
+ {
+ const PxMat44 xform(instance->mGlobalPose.getInverse());
+
+ toLocal[0] = load(reinterpret_cast<const float*>(&xform.column0));
+ toLocal[1] = load(reinterpret_cast<const float*>(&xform.column1));
+ toLocal[2] = load(reinterpret_cast<const float*>(&xform.column2));
+ toLocal[3] = load(reinterpret_cast<const float*>(&xform.column3));
+
+ impulseScale = simd4f(instance->mImpulseScale);
+
+ lastCloth = mClothIndices[i];
+ }
+
+ uint32_t particleIndex = mParticleIndices[i];
+ Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]);
+ Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]);
+
+ particle = transform(toLocal, particle);
+ // avoid w becoming negative due to numerical inaccuracies
+ impulse = max(sZeroW, particle - rotate(toLocal, Simd4f(impulse * impulseScale)));
+ }
+ }
+ }
+
+ mAllocator.deallocate(mOverlapMasks);
+ mAllocator.deallocate(mParticleIndices);
+ mAllocator.deallocate(mClothIndices);
+}
+
+template <typename Simd4f>
+size_t cloth::SwInterCollision<Simd4f>::estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n)
+{
+ // count total particles
+ uint32_t numParticles = 0;
+ for(uint32_t i = 0; i < n; ++i)
+ numParticles += cloths[i].mNumParticles;
+
+ uint32_t boundsSize = 2 * n * sizeof(BoundingBox<Simd4f>) + n * sizeof(uint32_t);
+ uint32_t clothIndicesSize = numParticles * sizeof(uint16_t);
+ uint32_t particleIndicesSize = numParticles * sizeof(uint32_t);
+ uint32_t masksSize = n * sizeof(uint32_t);
+
+ return boundsSize + clothIndicesSize + particleIndicesSize + masksSize + getBufferSize(numParticles);
+}
+
+template <typename Simd4f>
+size_t cloth::SwInterCollision<Simd4f>::getBufferSize(uint32_t numParticles)
+{
+ uint32_t keysSize = numParticles * sizeof(uint32_t);
+ uint32_t indicesSize = numParticles * sizeof(uint32_t);
+ uint32_t histogramSize = 1024 * sizeof(uint32_t);
+
+ return keysSize + indicesSize + PxMax(indicesSize + histogramSize, keysSize);
+}
+
+// Tests one candidate (index into the broad-phase arrays) against the
+// current particle (mParticle/mImpulse) and, on contact, applies a
+// symmetric position and impulse correction to both particles.
+template <typename Simd4f>
+void cloth::SwInterCollision<Simd4f>::collideParticle(uint32_t index)
+{
+ uint16_t clothIndex = mClothIndices[index];
+
+ // skip cloths whose bounds don't overlap the current particle's cloth
+ // (or that were filtered out); this also skips same-cloth pairs since a
+ // cloth's own bit is never set in its overlap mask
+ if((1 << clothIndex) & ~mClothMask)
+ return;
+
+ const SwInterCollisionData* instance = mInstances + clothIndex;
+
+ uint32_t particleIndex = mParticleIndices[index];
+ Simd4f& particle = reinterpret_cast<Simd4f&>(instance->mParticles[particleIndex]);
+
+ Simd4f diff = particle - mParticle;
+ Simd4f distSqr = dot3(diff, diff);
+
+#if PX_DEBUG
+ ++mNumTests;
+#endif
+
+ // early out when the pair is farther apart than the collision distance
+ if(allGreater(distSqr, mCollisionSquareDistance))
+ return;
+
+ // w components hold the particles' weights (presumably inverse mass,
+ // w == 0 pinning a particle -- verify against SwClothData)
+ Simd4f w0 = splat<3>(mParticle);
+ Simd4f w1 = splat<3>(particle);
+
+ // push the pair apart to the collision distance along their difference,
+ // scaled by stiffness and distributed according to the weights
+ // NOTE(review): distSqr == 0 would make rsqrt non-finite; presumably
+ // coincident particles cannot reach this point -- verify
+ Simd4f ratio = mCollisionDistance * rsqrtT<1>(distSqr);
+ Simd4f scale = mStiffness * recipT<1>(sEpsilon + w0 + w1);
+ Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ;
+
+ mParticle = mParticle + delta * w0;
+ particle = particle - delta * w1;
+
+ // apply the matching correction to the stored impulses
+ Simd4f& impulse = reinterpret_cast<Simd4f&>(instance->mPrevParticles[particleIndex]);
+
+ mImpulse = mImpulse + delta * w0;
+ impulse = impulse - delta * w1;
+
+#if PX_DEBUG || PX_PROFILE
+ ++mNumCollisions;
+#endif
+}
+
+// Narrow phase: visits particles in sorted key order and collides each one
+// against candidates in its own hash cell and four forward neighbor cell
+// columns. Key layout (built in operator()): bits 0-15 sweep-axis cell,
+// bits 16-23 first hash axis, bits 24-31 second hash axis. keys is
+// terminated by a 0xffffffff sentinel so all forward scans stop.
+template <typename Simd4f>
+void cloth::SwInterCollision<Simd4f>::collideParticles(const uint32_t* keys, uint32_t firstColumnSize,
+ const uint32_t* indices, uint32_t numParticles,
+ uint32_t collisionDistance)
+{
+ // mask of the sweep-axis bits of a key
+ const uint32_t bucketMask = uint16_t(-1);
+
+ // key offsets of the current and the four forward neighbor cell columns
+ // in the 2D hash grid: (0,0), (+1,0), (-1,+1), (0,+1), (+1,+1)
+ const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };
+
+ const uint32_t* __restrict kFirst[5];
+ const uint32_t* __restrict kLast[5];
+
+ {
+ // optimization: scan forward iterator starting points once instead of 9 times
+ const uint32_t* __restrict kIt = keys;
+
+ uint32_t key = *kIt;
+ uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
+ uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);
+
+ kFirst[0] = kIt;
+ while(*kIt < lastKey)
+ ++kIt;
+ kLast[0] = kIt;
+
+ for(uint32_t k = 1; k < 5; ++k)
+ {
+ for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;)
+ ++kIt;
+ kFirst[k] = kIt;
+
+ for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;)
+ ++kIt;
+ kLast[k] = kIt;
+
+ // jump forward once to second column
+ // (firstColumnSize was snooped from the radix sort histogram)
+ kIt = keys + firstColumnSize;
+ firstColumnSize = 0;
+ }
+ }
+
+ const uint32_t* __restrict iIt = indices;
+ const uint32_t* __restrict iEnd = indices + numParticles;
+
+ const uint32_t* __restrict jIt;
+ const uint32_t* __restrict jEnd;
+
+ for(; iIt != iEnd; ++iIt, ++kFirst[0])
+ {
+ // load current particle once outside of inner loop
+ uint32_t index = *iIt;
+ PX_ASSERT(index < mNumParticles);
+ mClothIndex = mClothIndices[index];
+ PX_ASSERT(mClothIndex < mNumInstances);
+ mClothMask = mOverlapMasks[mClothIndex];
+
+ const SwInterCollisionData* instance = mInstances + mClothIndex;
+
+ mParticleIndex = mParticleIndices[index];
+ mParticle = reinterpret_cast<const Simd4f&>(instance->mParticles[mParticleIndex]);
+ mImpulse = reinterpret_cast<const Simd4f&>(instance->mPrevParticles[mParticleIndex]);
+
+ uint32_t key = *kFirst[0];
+
+ // range of keys we need to check against for this particle
+ uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
+ uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);
+
+ // scan forward end point
+ while(*kLast[0] < lastKey)
+ ++kLast[0];
+
+ // process potential colliders of same cell
+ // (start at iIt + 1 so each pair is visited only once)
+ jEnd = indices + (kLast[0] - keys);
+ for(jIt = iIt + 1; jIt != jEnd; ++jIt)
+ collideParticle(*jIt);
+
+ // process neighbor cells
+ for(uint32_t k = 1; k < 5; ++k)
+ {
+ // scan forward start point
+ for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;)
+ ++kFirst[k];
+
+ // scan forward end point
+ for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;)
+ ++kLast[k];
+
+ // process potential colliders
+ jEnd = indices + (kLast[k] - keys);
+ for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt)
+ collideParticle(*jIt);
+ }
+
+ // write back particle and impulse
+ // (collideParticle accumulated corrections into mParticle/mImpulse)
+ reinterpret_cast<Simd4f&>(instance->mParticles[mParticleIndex]) = mParticle;
+ reinterpret_cast<Simd4f&>(instance->mPrevParticles[mParticleIndex]) = mImpulse;
+ }
+}
+
+// explicit template instantiation
+#if NVMATH_SIMD
+template class cloth::SwInterCollision<Simd4f>;
+#endif
+#if NVMATH_SCALAR
+template class cloth::SwInterCollision<Scalar4f>;
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h
new file mode 100644
index 00000000..ffc62eb1
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwInterCollision.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+#include "StackAllocator.h"
+
+#include "Simd4i.h"
+
+#include "PxMat44.h"
+#include "PxTransform.h"
+#include "PxBounds3.h"
+
+namespace physx
+{
+ namespace profile
+ {
+ class PxProfileZone;
+ }
+}
+
+namespace nvidia
+{
+namespace cloth
+{
+
+class SwCloth;
+struct SwClothData;
+
+typedef StackAllocator<16> SwKernelAllocator;
+
+typedef bool (*InterCollisionFilter)(void* cloth0, void* cloth1);
+
+// Per-cloth view of the data inter-collision operates on: particle arrays,
+// an optional index subset, world pose and local-space bounds.
+struct SwInterCollisionData
+{
+ SwInterCollisionData()
+ {
+ // intentionally leaves all members uninitialized
+ }
+ SwInterCollisionData(PxVec4* particles, PxVec4* prevParticles, uint32_t numParticles, uint32_t* indices,
+ const PxTransform& globalPose, const PxVec3& boundsCenter, const PxVec3& boundsHalfExtents,
+ float impulseScale, void* userData)
+ : mParticles(particles)
+ , mPrevParticles(prevParticles)
+ , mNumParticles(numParticles)
+ , mIndices(indices)
+ , mGlobalPose(globalPose)
+ , mBoundsCenter(boundsCenter)
+ , mBoundsHalfExtent(boundsHalfExtents)
+ , mImpulseScale(impulseScale)
+ , mUserData(userData)
+ {
+ }
+
+ PxVec4* mParticles; // current positions; w used as collision weight
+ PxVec4* mPrevParticles; // previous positions / impulse storage
+ uint32_t mNumParticles; // number of entries considered for collision
+ uint32_t* mIndices; // optional particle index subset (NULL = all)
+ PxTransform mGlobalPose; // cloth local-to-world transform
+ PxVec3 mBoundsCenter; // local-space bounds center
+ PxVec3 mBoundsHalfExtent; // local-space bounds half extent
+ float mImpulseScale; // scale between position delta and stored impulse
+ void* mUserData; // passed to the InterCollisionFilter callback
+};
+
+// CPU implementation of inter-cloth (cloth-vs-cloth) particle collision.
+// Construct with the cloth set, then invoke operator() once per step;
+// temporary memory is taken from the supplied stack allocator (size it
+// with estimateTemporaryMemory).
+template <typename Simd4f>
+class SwInterCollision
+{
+
+ public:
+ SwInterCollision(const SwInterCollisionData* cloths, uint32_t n, float colDist, float stiffness, uint32_t iterations,
+ InterCollisionFilter filter, cloth::SwKernelAllocator& alloc, nvidia::profile::PxProfileZone* zone);
+
+ ~SwInterCollision();
+
+ // runs the collision iterations over all cloth instances
+ void operator()();
+
+ // conservative scratch-memory requirement for a call to operator()
+ static size_t estimateTemporaryMemory(SwInterCollisionData* cloths, uint32_t n);
+
+ private:
+ SwInterCollision& operator=(const SwInterCollision&); // not implemented
+
+ static size_t getBufferSize(uint32_t);
+
+ void collideParticles(const uint32_t* keys, uint32_t firstColumnSize, const uint32_t* sortedIndices,
+ uint32_t numParticles, uint32_t collisionDistance);
+
+ Simd4f& getParticle(uint32_t index);
+
+ // better wrap these in a struct
+ // collides mParticle/mImpulse against the particle at the given index
+ void collideParticle(uint32_t index);
+
+ // state of the particle currently being processed (see collideParticles)
+ Simd4f mParticle;
+ Simd4f mImpulse;
+
+ Simd4f mCollisionDistance; // xyz collision radius, w = 0
+ Simd4f mCollisionSquareDistance;
+ Simd4f mStiffness;
+
+ uint16_t mClothIndex; // cloth of the current particle
+ uint32_t mClothMask; // overlap mask of the current particle's cloth
+ uint32_t mParticleIndex;
+
+ uint32_t mNumIterations;
+
+ const SwInterCollisionData* mInstances;
+ uint32_t mNumInstances;
+
+ // broad-phase output: per potential collider, owning cloth and particle index
+ uint16_t* mClothIndices;
+ uint32_t* mParticleIndices;
+ uint32_t mNumParticles;
+ uint32_t* mOverlapMasks; // per cloth: bit per overlapping cloth
+
+ uint32_t mTotalParticles; // sum of particle counts over all instances
+
+ InterCollisionFilter mFilter; // pair filter callback, must be non-null
+
+ SwKernelAllocator& mAllocator;
+
+ profile::PxProfileZone* mProfiler;
+
+ public:
+ // statistics, only incremented in PX_DEBUG / PX_PROFILE builds
+ mutable uint32_t mNumTests;
+ mutable uint32_t mNumCollisions;
+};
+
+} // namespace cloth
+
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp
new file mode 100644
index 00000000..939543f4
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwSelfCollision.h"
+#include "SwCloth.h"
+#include "SwClothData.h"
+#include "PsIntrinsics.h"
+#include "SwCollision.h" //temp fix, needed by SwCollisionHelper implementaitons
+#include "Simd4f.h"
+#include "SwCollisionHelpers.h"
+
+#pragma warning(disable:4127)
+
+using namespace nvidia;
+using namespace nvidia;
+
+namespace
+{
// compile-time SIMD constants shared by the kernels in this file
typedef Simd4fFactory<detail::FourTuple> Simd4fConstant;

const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0)); // selects xyz lanes, zeroes w
const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON);           // guards divisions against zero
+
+// returns sorted indices, output needs to be at least 2*(last-first)+1024
+void radixSort(const uint32_t* first, const uint32_t* last, uint16_t* out)
+{
+ uint16_t n = uint16_t(last - first);
+
+ uint16_t* buffer = out + 2 * n;
+ uint16_t* __restrict histograms[] = { buffer, buffer + 256, buffer + 512, buffer + 768 };
+
+ intrinsics::memZero(buffer, 1024 * sizeof(uint16_t));
+
+ // build 3 histograms in one pass
+ for(const uint32_t* __restrict it = first; it != last; ++it)
+ {
+ uint32_t key = *it;
+ ++histograms[0][0xff & key];
+ ++histograms[1][0xff & (key >> 8)];
+ ++histograms[2][0xff & (key >> 16)];
+ ++histograms[3][key >> 24];
+ }
+
+ // convert histograms to offset tables in-place
+ uint16_t sums[4] = {};
+ for(uint32_t i = 0; i < 256; ++i)
+ {
+ uint16_t temp0 = uint16_t(histograms[0][i] + sums[0]);
+ histograms[0][i] = sums[0], sums[0] = temp0;
+
+ uint16_t temp1 = uint16_t(histograms[1][i] + sums[1]);
+ histograms[1][i] = sums[1], sums[1] = temp1;
+
+ uint16_t temp2 = uint16_t(histograms[2][i] + sums[2]);
+ histograms[2][i] = sums[2], sums[2] = temp2;
+
+ uint16_t temp3 = uint16_t(histograms[3][i] + sums[3]);
+ histograms[3][i] = sums[3], sums[3] = temp3;
+ }
+
+ PX_ASSERT(sums[0] == n && sums[1] == n && sums[2] == n && sums[3] == n);
+
+#if PX_DEBUG
+ memset(out, 0xff, 2 * n * sizeof(uint16_t));
+#endif
+
+ // sort 8 bits per pass
+
+ uint16_t* __restrict indices[] = { out, out + n };
+
+ for(uint16_t i = 0; i != n; ++i)
+ indices[1][histograms[0][0xff & first[i]]++] = i;
+
+ for(uint16_t i = 0, index; index = indices[1][i], i != n; ++i)
+ indices[0][histograms[1][0xff & (first[index] >> 8)]++] = index;
+
+ for(uint16_t i = 0, index; index = indices[0][i], i != n; ++i)
+ indices[1][histograms[2][0xff & (first[index] >> 16)]++] = index;
+
+ for(uint16_t i = 0, index; index = indices[1][i], i != n; ++i)
+ indices[0][histograms[3][first[index] >> 24]++] = index;
+}
+
+template <typename Simd4f>
+uint32_t longestAxis(const Simd4f& edgeLength)
+{
+ const float* e = array(edgeLength);
+
+ if(e[0] > e[1])
+ return uint32_t(e[0] > e[2] ? 0 : 2);
+ else
+ return uint32_t(e[1] > e[2] ? 1 : 2);
+}
+
+bool isSelfCollisionEnabled(const cloth::SwClothData& cloth)
+{
+ return PxMin(cloth.mSelfCollisionDistance, cloth.mSelfCollisionStiffness) > 0.0f;
+}
+
+bool isSelfCollisionEnabled(const cloth::SwCloth& cloth)
+{
+ return PxMin(cloth.mSelfCollisionDistance, -cloth.mSelfCollisionLogStiffness) > 0.0f;
+}
+
// Rounds x up to the next even number (identity for even x).
inline uint32_t align2(uint32_t x)
{
	return x + (x & 1u);
}
+
+} // anonymous namespace
+
+template <typename Simd4f>
+cloth::SwSelfCollision<Simd4f>::SwSelfCollision(cloth::SwClothData& clothData, cloth::SwKernelAllocator& alloc)
+: mClothData(clothData), mAllocator(alloc)
+{
+ mCollisionDistance = simd4f(mClothData.mSelfCollisionDistance);
+ mCollisionSquareDistance = mCollisionDistance * mCollisionDistance;
+ mStiffness = (Simd4f)sMaskXYZ & simd4f(mClothData.mSelfCollisionStiffness);
+}
+
template <typename Simd4f>
cloth::SwSelfCollision<Simd4f>::~SwSelfCollision()
{
	// nothing to release: scratch memory is owned by the caller-supplied allocator
}
+
// Runs one self-collision pass: hashes all participating particles into a
// virtual grid (fine resolution along the longest AABB axis, coarse on the
// other two), radix-sorts the resulting keys, then collides each particle
// against candidates from nearby grid cells.
template <typename Simd4f>
void cloth::SwSelfCollision<Simd4f>::operator()()
{
	mNumTests = mNumCollisions = 0;

	if(!isSelfCollisionEnabled(mClothData))
		return;

	Simd4f lowerBound = load(mClothData.mCurBounds);
	// keep the extent strictly positive so the grid scale below stays finite
	Simd4f edgeLength = max(load(mClothData.mCurBounds + 3) - lowerBound, sEpsilon);

	// sweep along longest axis
	uint32_t sweepAxis = longestAxis(edgeLength);
	uint32_t hashAxis0 = (sweepAxis + 1) % 3;
	uint32_t hashAxis1 = (sweepAxis + 2) % 3;

	// reserve 0, 127, and 65535 for sentinel
	Simd4f cellSize = max(mCollisionDistance, simd4f(1.0f / 253) * edgeLength);
	array(cellSize)[sweepAxis] = array(edgeLength)[sweepAxis] / 65533;

	Simd4f one = simd4f(_1);
	Simd4f gridSize = simd4f(254.0f);
	array(gridSize)[sweepAxis] = 65534.0f;

	// world space -> grid coordinates transform
	Simd4f gridScale = recipT<1>(cellSize);
	Simd4f gridBias = -lowerBound * gridScale + simd4f(_1);

	uint32_t numIndices = mClothData.mNumSelfCollisionIndices;
	void* buffer = mAllocator.allocate(getBufferSize(numIndices));

	// buffer layout: keys | sorted indices (2-aligned) | sorted keys / radix scratch
	const uint32_t* __restrict indices = mClothData.mSelfCollisionIndices;
	uint32_t* __restrict keys = reinterpret_cast<uint32_t*>(buffer);
	uint16_t* __restrict sortedIndices = reinterpret_cast<uint16_t*>(keys + numIndices);
	uint32_t* __restrict sortedKeys = reinterpret_cast<uint32_t*>(sortedIndices + align2(numIndices));

	const Simd4f* particles = reinterpret_cast<const Simd4f*>(mClothData.mCurParticles);

	// create keys: 16 bits of sweep-axis cell in the low half, 8 bits per hash axis above
	for(uint32_t i = 0; i < numIndices; ++i)
	{
		// a NULL index array means every particle participates
		uint32_t index = indices ? indices[i] : i;

		// grid coordinate
		Simd4f keyf = particles[index] * gridScale + gridBias;

		// need to clamp index because shape collision potentially
		// pushes particles outside of their original bounds
		Simd4i keyi = intFloor(max(one, min(keyf, gridSize)));

		const int32_t* ptr = simdi::array(keyi);
		keys[i] = uint32_t(ptr[sweepAxis] | (ptr[hashAxis0] << 16) | (ptr[hashAxis1] << 24));
	}

	// compute sorted keys indices
	radixSort(keys, keys + numIndices, sortedIndices);

	// snoop histogram: offset of first index with 8 msb > 1 (0 is sentinel)
	// (reads radixSort's scratch tables that live past the 2*n index area)
	uint16_t firstColumnSize = sortedIndices[2 * numIndices + 769];

	// sort keys
	for(uint32_t i = 0; i < numIndices; ++i)
		sortedKeys[i] = keys[sortedIndices[i]];
	sortedKeys[numIndices] = uint32_t(-1); // sentinel

	if(indices)
	{
		// sort indices (into no-longer-needed keys array)
		const uint16_t* __restrict permutation = sortedIndices;
		sortedIndices = reinterpret_cast<uint16_t*>(keys);
		for(uint32_t i = 0; i < numIndices; ++i)
			sortedIndices[i] = uint16_t(indices[permutation[i]]);
	}

	// calculate the number of buckets we need to search forward
	const Simd4i data = intFloor(gridScale * mCollisionDistance);
	uint32_t collisionDistance = 2 + (uint32_t)simdi::array(data)[sweepAxis];

	// collide particles (rest-position filtering only when rest data exists)
	if(mClothData.mRestPositions)
		collideParticles<true>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance);
	else
		collideParticles<false>(sortedKeys, firstColumnSize, sortedIndices, collisionDistance);

	mAllocator.deallocate(buffer);

	// verify against brute force (disable collision response when testing)
	/*
	uint32_t numCollisions = mNumCollisions;
	mNumCollisions = 0;

	Simd4f* qarticles = reinterpret_cast<
	Simd4f*>(mClothData.mCurParticles);
	for(uint32_t i = 0; i < numIndices; ++i)
	{
	    uint32_t indexI = indices ? indices[i] : i;
	    for(uint32_t j = i+1; j < numIndices; ++j)
	    {
	        uint32_t indexJ = indices ? indices[j] : j;
	        collideParticles(qarticles[indexI], qarticles[indexJ]);
	    }
	}

	static uint32_t iter = 0; ++iter;
	if(numCollisions != mNumCollisions)
	    printf("%u: %u != %u\n", iter, numCollisions, mNumCollisions);
	*/
}
+
+template <typename Simd4f>
+size_t cloth::SwSelfCollision<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth)
+{
+ uint32_t numIndices =
+ cloth.mSelfCollisionIndices.empty() ? cloth.mCurParticles.size() : cloth.mSelfCollisionIndices.size();
+ return isSelfCollisionEnabled(cloth) ? getBufferSize(numIndices) : 0;
+}
+
+template <typename Simd4f>
+size_t cloth::SwSelfCollision<Simd4f>::getBufferSize(uint32_t numIndices)
+{
+ uint32_t keysSize = numIndices * sizeof(uint32_t);
+ uint32_t indicesSize = align2(numIndices) * sizeof(uint16_t);
+ uint32_t radixSize = (numIndices + 1024) * sizeof(uint16_t);
+ return keysSize + indicesSize + PxMax(radixSize, keysSize + uint32_t(sizeof(uint32_t)));
+}
+
// Collides one particle pair: if the particles are closer than the collision
// distance (and, when useRestParticles is set, were not already that close
// in the rest configuration), pushes them apart along their difference
// vector, each displaced in proportion to its w component.
template <typename Simd4f>
template <bool useRestParticles>
void cloth::SwSelfCollision<Simd4f>::collideParticles(Simd4f& pos0, Simd4f& pos1, const Simd4f& pos0rest,
                                                      const Simd4f& pos1rest)
{
	Simd4f diff = pos1 - pos0;
	Simd4f distSqr = dot3(diff, diff);

#if PX_DEBUG
	++mNumTests;
#endif

	// early out when further apart than the collision distance
	if(allGreater(distSqr, mCollisionSquareDistance))
		return;

	if(useRestParticles)
	{
		// calculate distance in rest configuration, if less than collision
		// distance then ignore collision between particles in deformed config
		Simd4f restDiff = pos1rest - pos0rest;
		Simd4f restDistSqr = dot3(restDiff, restDiff);

		if(allGreater(mCollisionSquareDistance, restDistSqr))
			return;
	}

	// per-particle weights (w components; presumably inverse masses - the
	// displacement of each particle is proportional to its own w)
	Simd4f w0 = splat<3>(pos0);
	Simd4f w1 = splat<3>(pos1);

	// separation impulse scaled by stiffness, normalized by the weight sum
	// (sEpsilon guards against division by zero), masked to xyz only
	Simd4f ratio = mCollisionDistance * rsqrt(distSqr);
	Simd4f scale = mStiffness * recip(sEpsilon + w0 + w1);
	Simd4f delta = (scale * (diff - diff * ratio)) & sMaskXYZ;

	pos0 = pos0 + delta * w0;
	pos1 = pos1 - delta * w1;

#if PX_DEBUG || PX_PROFILE
	++mNumCollisions;
#endif
}
+
// Collides every particle against candidates from its own grid cell and the
// four forward neighbor cell columns of the 2D hash grid. 'keys' are sorted
// and terminated by a ~0 sentinel; 'indices' maps sorted order back to
// particle indices; 'collisionDistance' is the search radius measured in
// sweep-axis buckets (the low 16 bits of a key).
template <typename Simd4f>
template <bool useRestParticles>
void cloth::SwSelfCollision<Simd4f>::collideParticles(const uint32_t* keys, uint16_t firstColumnSize,
                                                      const uint16_t* indices, uint32_t collisionDistance)
{
	Simd4f* __restrict particles = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
	// without rest positions, alias the current particles (branch is compiled out)
	Simd4f* __restrict restParticles =
	    useRestParticles ? reinterpret_cast<Simd4f*>(mClothData.mRestPositions) : particles;

	// low 16 bits of a key hold the sweep-axis bucket
	const uint32_t bucketMask = uint16_t(-1);

	// key deltas selecting the current cell plus four neighbor columns
	// (the 0x00ff0000 entry wraps the hash0 byte down by one with carry)
	const uint32_t keyOffsets[] = { 0, 0x00010000, 0x00ff0000, 0x01000000, 0x01010000 };

	const uint32_t* __restrict kFirst[5];
	const uint32_t* __restrict kLast[5];

	{
		// optimization: scan forward iterator starting points once instead of 9 times
		const uint32_t* __restrict kIt = keys;

		uint32_t key = *kIt;
		uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
		uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);

		kFirst[0] = kIt;
		while(*kIt < lastKey)
			++kIt;
		kLast[0] = kIt;

		for(uint32_t k = 1; k < 5; ++k)
		{
			for(uint32_t n = firstKey + keyOffsets[k]; *kIt < n;)
				++kIt;
			kFirst[k] = kIt;

			for(uint32_t n = lastKey + keyOffsets[k]; *kIt < n;)
				++kIt;
			kLast[k] = kIt;

			// jump forward once to second column
			kIt = keys + firstColumnSize;
			firstColumnSize = 0;
		}
	}

	const uint16_t* __restrict iIt = indices;
	const uint16_t* __restrict iEnd = indices + mClothData.mNumSelfCollisionIndices;

	const uint16_t* __restrict jIt;
	const uint16_t* __restrict jEnd;

	// iterate particles in sorted-key order; all five scan ranges only ever
	// advance, so each key is visited a bounded number of times overall
	for(; iIt != iEnd; ++iIt, ++kFirst[0])
	{
		PX_ASSERT(*iIt < mClothData.mNumParticles);

		// load current particle once outside of inner loop
		Simd4f particle = particles[*iIt];
		Simd4f restParticle = restParticles[*iIt];

		uint32_t key = *kFirst[0];

		// range of keys we need to check against for this particle
		uint32_t firstKey = key - PxMin(collisionDistance, key & bucketMask);
		uint32_t lastKey = PxMin(key + collisionDistance, key | bucketMask);

		// scan forward end point
		while(*kLast[0] < lastKey)
			++kLast[0];

		// process potential colliders of same cell
		jEnd = indices + (kLast[0] - keys);
		for(jIt = iIt + 1; jIt != jEnd; ++jIt)
			collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);

		// process neighbor cells
		for(uint32_t k = 1; k < 5; ++k)
		{
			// scan forward start point
			for(uint32_t n = firstKey + keyOffsets[k]; *kFirst[k] < n;)
				++kFirst[k];

			// scan forward end point
			for(uint32_t n = lastKey + keyOffsets[k]; *kLast[k] < n;)
				++kLast[k];

			// process potential colliders
			jEnd = indices + (kLast[k] - keys);
			for(jIt = indices + (kFirst[k] - keys); jIt != jEnd; ++jIt)
				collideParticles<useRestParticles>(particle, particles[*jIt], restParticle, restParticles[*jIt]);
		}

		// store current particle
		particles[*iIt] = particle;
	}
}
+
// explicit template instantiation: build the kernel for whichever math
// backends (SIMD and/or scalar) the NVMATH_* configuration enables
#if NVMATH_SIMD
template class cloth::SwSelfCollision<Simd4f>;
#endif
#if NVMATH_SCALAR
template class cloth::SwSelfCollision<Scalar4f>;
#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h
new file mode 100644
index 00000000..fa023e56
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSelfCollision.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "StackAllocator.h"
+#include "Simd4i.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
class SwCloth;
struct SwClothData;

// 16-byte aligned stack allocator used for per-frame kernel scratch memory
typedef StackAllocator<16> SwKernelAllocator;

// CPU self-collision kernel: pushes apart particles of the *same* cloth
// that come closer than the cloth's self-collision distance.
template <typename Simd4f>
class SwSelfCollision
{
	typedef typename Simd4fToSimd4i<Simd4f>::Type Simd4i;

  public:
	SwSelfCollision(SwClothData& clothData, SwKernelAllocator& alloc);
	~SwSelfCollision();

	// runs one self-collision pass over the cloth's particles
	void operator()();

	// conservative size of the scratch memory operator() will request
	static size_t estimateTemporaryMemory(const SwCloth&);

  private:
	SwSelfCollision& operator=(const SwSelfCollision&); // not implemented
	static size_t getBufferSize(uint32_t);

	// collides a single particle pair (optionally filtered by rest distance)
	template <bool useRestParticles>
	void collideParticles(Simd4f&, Simd4f&, const Simd4f&, const Simd4f&);

	// collides all particles using the sorted key/index arrays
	template <bool useRestParticles>
	void collideParticles(const uint32_t*, uint16_t, const uint16_t*, uint32_t);

	Simd4f mCollisionDistance;
	Simd4f mCollisionSquareDistance;
	Simd4f mStiffness;

	SwClothData& mClothData;
	SwKernelAllocator& mAllocator;

  public:
	// statistics, updated in debug/profile builds only
	mutable uint32_t mNumTests;
	mutable uint32_t mNumCollisions;
};
+
+} // namespace cloth
+
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp
new file mode 100644
index 00000000..35cb1bde
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwSolver.h"
+#include "SwCloth.h"
+#include "ClothImpl.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "SwClothData.h"
+#include "SwSolverKernel.h"
+#include "SwInterCollision.h"
+#include "IterationState.h"
+#include "PxCpuDispatcher.h"
+#include "PxProfileZone.h"
+#include "PsFPU.h"
+#include "PsSort.h"
+
namespace nvidia
{
namespace cloth
{
// NEON-optimized solver kernel, defined in a platform-specific translation unit
bool neonSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&, profile::PxProfileZone*);
}
}

// select the math backend used by this translation unit
#if NVMATH_SIMD
typedef Simd4f Simd4fType;
#else
typedef Scalar4f Simd4fType;
#endif
+
+using namespace nvidia;
+
// Creates the CPU cloth solver. 'profiler' may be NULL (profiling disabled);
// 'taskMgr' is currently unused by the CPU implementation.
cloth::SwSolver::SwSolver(nvidia::profile::PxProfileZone* profiler, PxTaskManager* taskMgr)
: mProfiler(profiler)
, mSimulateEventId(mProfiler ? mProfiler->getEventIdForName("cloth::SwSolver::simulate") : uint16_t(-1))
#if APEX_UE4
, mDt(0.0f)
#endif
, mInterCollisionDistance(0.0f)
, mInterCollisionStiffness(1.0f)
, mInterCollisionIterations(1)
, mInterCollisionScratchMem(NULL)
, mInterCollisionScratchMemSize(0)
{
	// the start/end tasks fan the per-cloth simulation tasks out and back in
	mStartSimulationTask.mSolver = this;
	mEndSimulationTask.mSolver = this;

	PX_UNUSED(taskMgr);
}
+
// All cloths must have been removed via removeCloth() before destruction.
cloth::SwSolver::~SwSolver()
{
	if(mInterCollisionScratchMem)
		PX_FREE(mInterCollisionScratchMem);

	PX_ASSERT(mCpuClothSimulationTasks.empty());
}
+
+namespace
+{
+template <typename T>
+bool clothSizeGreater(const T& t0, const T& t1)
+{
+#if APEX_UE4
+ return t0->mCloth->mCurParticles.size() > t1->mCloth->mCurParticles.size();
+#else
+ return t0.mCloth->mCurParticles.size() > t1.mCloth->mCurParticles.size();
+#endif
+}
+
+template <typename T>
+void sortTasks(nvidia::Array<T, nvidia::NonTrackingAllocator>& tasks)
+{
+ nvidia::sort(tasks.begin(), tasks.size(), &clothSizeGreater<T>);
+}
+}
+
// Registers a cloth with this solver; it takes part in the next simulate()
// call. The task array is kept sorted by cloth size.
void cloth::SwSolver::addCloth(Cloth* cloth)
{
	SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;

#if APEX_UE4
	mCpuClothSimulationTasks.pushBack(new CpuClothSimulationTask(swCloth, *this));
#else
	mCpuClothSimulationTasks.pushBack(CpuClothSimulationTask(swCloth, mEndSimulationTask));
#endif

	sortTasks(mCpuClothSimulationTasks);
}
+
// Unregisters a cloth; no-op when the cloth was never added. Releases the
// task's scratch memory and re-sorts the remaining tasks.
void cloth::SwSolver::removeCloth(Cloth* cloth)
{
	SwCloth& swCloth = static_cast<SwClothImpl&>(*cloth).mCloth;

	CpuClothSimulationTaskVector::Iterator tIt = mCpuClothSimulationTasks.begin();
	CpuClothSimulationTaskVector::Iterator tEnd = mCpuClothSimulationTasks.end();

	// linear search for the task owning this cloth
	while (tIt != tEnd &&
#if APEX_UE4
	       (*tIt)->mCloth != &swCloth
#else
	       tIt->mCloth != &swCloth
#endif
	       )
		++tIt;

	if(tIt != tEnd)
	{
#if APEX_UE4
		delete *tIt;
#else
		deallocate(tIt->mScratchMemory);
#endif
		// replaceWithLast does not preserve order, hence the re-sort
		mCpuClothSimulationTasks.replaceWithLast(tIt);
		sortTasks(mCpuClothSimulationTasks);
	}
}
+
// Schedules one simulation step and returns the task the caller must run;
// 'continuation' executes after the step completes. When there is nothing
// to simulate, the continuation itself is returned (with a reference added
// on the caller's behalf).
PxBaseTask& cloth::SwSolver::simulate(float dt, PxBaseTask& continuation)
{
	if (mCpuClothSimulationTasks.empty()
#if APEX_UE4
	    || dt == 0.0f
#endif
	    )
	{
		continuation.addReference();
		return continuation;
	}

	// chain: start task -> per-cloth tasks -> end task -> continuation
	mEndSimulationTask.setContinuation(&continuation);
#if APEX_UE4
	mDt = dt;
#else
	mEndSimulationTask.mDt = dt;
#endif

	mStartSimulationTask.setContinuation(&mEndSimulationTask);

	// drop the reference setContinuation added so the end task becomes
	// runnable once all per-cloth tasks have released theirs
	mEndSimulationTask.removeReference();

	return mStartSimulationTask;
}
+
// Collides particles of different cloths against each other. Runs serially
// from the end-of-frame task; no-op unless both a positive inter-collision
// distance and at least one iteration are configured.
void cloth::SwSolver::interCollision()
{
	if(!mInterCollisionIterations || mInterCollisionDistance == 0.0f)
		return;

	float elasticity = 1.0f;

	// rebuild cloth instance array
	mInterCollisionInstances.resize(0);
	for(uint32_t i = 0; i < mCpuClothSimulationTasks.size(); ++i)
	{
#if APEX_UE4
		SwCloth* c = mCpuClothSimulationTasks[i]->mCloth;
		float invNumIterations = mCpuClothSimulationTasks[i]->mInvNumIterations;
#else
		SwCloth* c = mCpuClothSimulationTasks[i].mCloth;
		float invNumIterations = mCpuClothSimulationTasks[i].mInvNumIterations;
#endif

		// impulse scale is divided across the cloth's solver iterations;
		// without explicit self-collision indices all particles participate
		mInterCollisionInstances.pushBack(SwInterCollisionData(
		    c->mCurParticles.begin(), c->mPrevParticles.begin(),
		    c->mSelfCollisionIndices.empty() ? c->mCurParticles.size() : c->mSelfCollisionIndices.size(),
		    c->mSelfCollisionIndices.empty() ? NULL : &c->mSelfCollisionIndices[0], c->mTargetMotion,
		    c->mParticleBoundsCenter, c->mParticleBoundsHalfExtent, elasticity * invNumIterations, c->mUserData));
	}

	const uint32_t requiredTempMemorySize = uint32_t(SwInterCollision<Simd4fType>::estimateTemporaryMemory(
	    &mInterCollisionInstances[0], mInterCollisionInstances.size()));

	// realloc temp memory if necessary (grow-only; freed in the destructor)
	if(mInterCollisionScratchMemSize < requiredTempMemorySize)
	{
		if(mInterCollisionScratchMem)
			PX_FREE(mInterCollisionScratchMem);

		mInterCollisionScratchMem = PX_ALLOC(requiredTempMemorySize, "cloth::SwSolver::mInterCollisionScratchMem");
		mInterCollisionScratchMemSize = requiredTempMemorySize;
	}

	SwKernelAllocator allocator(mInterCollisionScratchMem, mInterCollisionScratchMemSize);

	// run inter-collision
	SwInterCollision<Simd4fType> collider(mInterCollisionInstances.begin(), mInterCollisionInstances.size(),
	                                      mInterCollisionDistance, mInterCollisionStiffness, mInterCollisionIterations,
	                                      mInterCollisionFilter, allocator, mProfiler);

	collider();
}
+
+void cloth::SwSolver::beginFrame() const
+{
+ if(mProfiler)
+ mProfiler->startEvent(mSimulateEventId, uint64_t(intptr_t(this)), uint32_t(intptr_t(this)));
+}
+
+void cloth::SwSolver::endFrame() const
+{
+ if(mProfiler)
+ mProfiler->stopEvent(mSimulateEventId, uint64_t(intptr_t(this)), uint32_t(intptr_t(this)));
+}
+
#if APEX_UE4
// Static trampoline stored in SwCloth::sSimulationFunction: forwards the
// timestep to the cloth's own simulation task. NULL task is ignored.
void cloth::SwSolver::simulate(void* task, float dt)
{
	if (task)
		static_cast<cloth::SwSolver::CpuClothSimulationTask*>(task)->simulate(dt);
}
#endif
+
// Fan-out task: schedules one CpuClothSimulationTask per awake cloth with
// this task's continuation (the end task) as theirs. Sleeping cloths are
// skipped entirely.
void cloth::SwSolver::StartSimulationTask::runInternal()
{
	mSolver->beginFrame();

	CpuClothSimulationTaskVector::Iterator tIt = mSolver->mCpuClothSimulationTasks.begin();
	CpuClothSimulationTaskVector::Iterator tEnd = mSolver->mCpuClothSimulationTasks.end();

	for(; tIt != tEnd; ++tIt)
	{
#if APEX_UE4
		if (!(*tIt)->mCloth->isSleeping())
		{
			(*tIt)->setContinuation(mCont);
			// removeReference makes the task runnable
			(*tIt)->removeReference();
		}
#else
		if(!tIt->mCloth->isSleeping())
		{
			tIt->setContinuation(mCont);
			tIt->removeReference();
		}
#endif
	}
}
+
// Task name reported to schedulers/profilers.
const char* cloth::SwSolver::StartSimulationTask::getName() const
{
	return "cloth.SwSolver.startSimulation";
}
+
// Fan-in task: runs after all per-cloth tasks complete; performs serial
// inter-cloth collision, then closes the profiler frame.
void cloth::SwSolver::EndSimulationTask::runInternal()
{
	mSolver->interCollision();
	mSolver->endFrame();
}
+
// Task name reported to schedulers/profilers.
const char* cloth::SwSolver::EndSimulationTask::getName() const
{
	return "cloth.SwSolver.endSimulation";
}
+
#if !APEX_UE4
// Binds the task to its cloth and end-of-frame continuation; scratch memory
// is allocated lazily on first run.
cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, EndSimulationTask& continuation)
: mCloth(&cloth), mContinuation(&continuation), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f)
{
}
#endif
+
#if APEX_UE4
// Binds the task to its cloth and solver, and registers itself on the cloth
// so SwCloth::sSimulationFunction can trigger it.
cloth::SwSolver::CpuClothSimulationTask::CpuClothSimulationTask(SwCloth& cloth, SwSolver& solver)
	: mCloth(&cloth), mSolver(&solver), mScratchMemorySize(0), mScratchMemory(0), mInvNumIterations(0.0f)
{
	mCloth->mSimulationTask = this;
}
+
cloth::SwSolver::CpuClothSimulationTask::~CpuClothSimulationTask()
{
	// release lazily-grown scratch memory and unhook from the cloth
	deallocate(mScratchMemory);
	mCloth->mSimulationTask = NULL;
}
+
// Task entry point: simulates with the timestep captured by SwSolver::simulate().
void cloth::SwSolver::CpuClothSimulationTask::runInternal()
{
	simulate(mSolver->mDt);
}
+
+
// Simulates one cloth for timestep 'dt' and releases the task when done.
void cloth::SwSolver::CpuClothSimulationTask::simulate(float dt)
{
	// check if we need to reallocate the temp memory buffer
	// (number of shapes may have changed)
	uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth));

	if (mScratchMemorySize < requiredTempMemorySize)
	{
		deallocate(mScratchMemory);

		mScratchMemory = allocate(requiredTempMemorySize);
		mScratchMemorySize = requiredTempMemorySize;
	}

	// derives the iteration count (and its reciprocal) from cloth settings
	IterationStateFactory factory(*mCloth, dt);
	mInvNumIterations = factory.mInvNumIterations;

	// NOTE(review): presumably normalizes FPU/SIMD control state for
	// consistent math - confirm against PsFPU.h
	nvidia::SIMDGuard simdGuard;

	SwClothData data(*mCloth, mCloth->mFabric);
	SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize));
	nvidia::profile::PxProfileZone* profileZone = mSolver->mProfiler;

	// construct kernel functor and execute
#if PX_ANDROID
	// if(!neonSolverKernel(cloth, data, allocator, factory, profileZone))
#endif
	SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory, profileZone)();

	data.reconcile(*mCloth); // update cloth

	// consume per-frame inputs (constraints, shape targets)
	release();
}
+
#else

// Task entry point: simulates one cloth for the timestep stored on the
// EndSimulationTask continuation by SwSolver::simulate().
void cloth::SwSolver::CpuClothSimulationTask::runInternal()
{
	// check if we need to reallocate the temp memory buffer
	// (number of shapes may have changed)
	uint32_t requiredTempMemorySize = uint32_t(SwSolverKernel<Simd4fType>::estimateTemporaryMemory(*mCloth));

	if(mScratchMemorySize < requiredTempMemorySize)
	{
		deallocate(mScratchMemory);

		mScratchMemory = allocate(requiredTempMemorySize);
		mScratchMemorySize = requiredTempMemorySize;
	}

	// zero timestep: nothing to integrate this frame
	if(mContinuation->mDt == 0.0f)
		return;

	IterationStateFactory factory(*mCloth, mContinuation->mDt);
	mInvNumIterations = factory.mInvNumIterations;

	// NOTE(review): presumably normalizes FPU/SIMD control state for
	// consistent math - confirm against PsFPU.h
	nvidia::SIMDGuard simdGuard;

	SwClothData data(*mCloth, mCloth->mFabric);
	SwKernelAllocator allocator(mScratchMemory, uint32_t(mScratchMemorySize));
	nvidia::profile::PxProfileZone* profileZone = mContinuation->mSolver->mProfiler;

	// construct kernel functor and execute
#if PX_ANDROID
	// if(!neonSolverKernel(cloth, data, allocator, factory, profileZone))
#endif
	SwSolverKernel<Simd4fType>(*mCloth, data, allocator, factory, profileZone)();

	data.reconcile(*mCloth); // update cloth
}
#endif
+
// Task name reported to schedulers/profilers.
const char* cloth::SwSolver::CpuClothSimulationTask::getName() const
{
	return "cloth.SwSolver.cpuClothSimulation";
}
+
// Completion hook: consumes the per-frame inputs of the cloth and, in the
// task-graph build, releases the reference held on the end-of-frame task.
void cloth::SwSolver::CpuClothSimulationTask::release()
{
	// drop the constraint sets consumed this frame
	mCloth->mMotionConstraints.pop();
	mCloth->mSeparationConstraints.pop();

	// promote interpolation target shapes to start shapes for the next frame
	if (!mCloth->mTargetCollisionSpheres.empty())
	{
		swap(mCloth->mStartCollisionSpheres, mCloth->mTargetCollisionSpheres);
		mCloth->mTargetCollisionSpheres.resize(0);
	}

	if (!mCloth->mTargetCollisionPlanes.empty())
	{
		swap(mCloth->mStartCollisionPlanes, mCloth->mTargetCollisionPlanes);
		mCloth->mTargetCollisionPlanes.resize(0);
	}

	if (!mCloth->mTargetCollisionTriangles.empty())
	{
		swap(mCloth->mStartCollisionTriangles, mCloth->mTargetCollisionTriangles);
		mCloth->mTargetCollisionTriangles.resize(0);
	}
#if !APEX_UE4
	mContinuation->removeReference();
#endif
}
+
+#if APEX_UE4
+void(*const cloth::SwCloth::sSimulationFunction)(void*, float) = &cloth::SwSolver::simulate;
+#endif \ No newline at end of file
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h
new file mode 100644
index 00000000..472a5dba
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolver.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Solver.h"
+#include "Allocator.h"
+#include "SwInterCollision.h"
+#include "CmTask.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
class SwCloth;
class SwFactory;

/// CPU/SSE based cloth solver.
/// Per frame: a start task fans out one simulation task per awake cloth;
/// an end task waits for them, runs inter-cloth collision, and then the
/// caller-provided continuation.
class SwSolver : public UserAllocated, public Solver
{
	// fan-out task scheduling the per-cloth simulation tasks
	struct StartSimulationTask : public Cm::Task
	{
		using PxLightCpuTask::mRefCount;
		using PxLightCpuTask::mTm;

		virtual void runInternal();
		virtual const char* getName() const;

		SwSolver* mSolver;
	};

	// fan-in task: inter-cloth collision + profiler frame end
	struct EndSimulationTask : public Cm::Task
	{
		using PxLightCpuTask::mRefCount;

		virtual void runInternal();
		virtual const char* getName() const;

		SwSolver* mSolver;
#if !APEX_UE4
		float mDt; // timestep of the current frame, set by simulate()
#endif
	};

	// simulates a single cloth; owns that cloth's lazily-grown scratch buffer
	struct CpuClothSimulationTask : public Cm::Task
	{
#if APEX_UE4
		void* operator new(size_t n){ return allocate(n); }
		void operator delete(void* ptr) { return deallocate(ptr); }

		CpuClothSimulationTask(SwCloth&, SwSolver&);
		~CpuClothSimulationTask();

		void simulate(float dt);

		SwSolver* mSolver;
#else
		CpuClothSimulationTask(SwCloth&, EndSimulationTask&);

		EndSimulationTask* mContinuation;
#endif
		virtual void runInternal();
		virtual const char* getName() const;
		virtual void release();

		SwCloth* mCloth;

		uint32_t mScratchMemorySize;
		void* mScratchMemory;
		float mInvNumIterations; // 1/iterations of the last simulated frame
	};

  public:
	SwSolver(nvidia::profile::PxProfileZone*, PxTaskManager*);
	virtual ~SwSolver();

	virtual void addCloth(Cloth*);
	virtual void removeCloth(Cloth*);

	// schedules one step; returns the task the caller must run
	virtual PxBaseTask& simulate(float dt, PxBaseTask&);

	virtual void setInterCollisionDistance(float distance)
	{
		mInterCollisionDistance = distance;
	}
	virtual float getInterCollisionDistance() const
	{
		return mInterCollisionDistance;
	}

	virtual void setInterCollisionStiffness(float stiffness)
	{
		mInterCollisionStiffness = stiffness;
	}
	virtual float getInterCollisionStiffness() const
	{
		return mInterCollisionStiffness;
	}

	virtual void setInterCollisionNbIterations(uint32_t nbIterations)
	{
		mInterCollisionIterations = nbIterations;
	}
	virtual uint32_t getInterCollisionNbIterations() const
	{
		return mInterCollisionIterations;
	}

	virtual void setInterCollisionFilter(InterCollisionFilter filter)
	{
		mInterCollisionFilter = filter;
	}

	// not supported by the CPU solver
	virtual uint32_t getNumSharedPositions( const Cloth* ) const
	{
		return uint32_t(-1);
	}

	// the CPU solver has no asynchronous failure mode
	virtual bool hasError() const
	{
		return false;
	}

#if APEX_UE4
	// trampoline used by SwCloth::sSimulationFunction
	static void simulate(void*, float);
#endif

  private:
	void beginFrame() const;
	void endFrame() const;

	void interCollision();

  private:
	StartSimulationTask mStartSimulationTask;

#if APEX_UE4
	typedef Vector<CpuClothSimulationTask*>::Type CpuClothSimulationTaskVector;
	float mDt; // timestep of the current frame, set by simulate()
#else
	typedef Vector<CpuClothSimulationTask>::Type CpuClothSimulationTaskVector;
#endif

	// one task per registered cloth, kept sorted largest-cloth-first
	CpuClothSimulationTaskVector mCpuClothSimulationTasks;

	EndSimulationTask mEndSimulationTask;

	profile::PxProfileZone* mProfiler;
	uint16_t mSimulateEventId;

	float mInterCollisionDistance;
	float mInterCollisionStiffness;
	uint32_t mInterCollisionIterations;
	InterCollisionFilter mInterCollisionFilter;

	// grow-only scratch buffer for inter-collision, freed in the destructor
	void* mInterCollisionScratchMem;
	uint32_t mInterCollisionScratchMemSize;
	nvidia::Array<SwInterCollisionData> mInterCollisionInstances;

};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp
new file mode 100644
index 00000000..29f3fdc3
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.cpp
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "SwSolverKernel.h"
+#include "SwCloth.h"
+#include "SwClothData.h"
+#include "SwFabric.h"
+#include "SwFactory.h"
+#include "PointInterpolator.h"
+#include "BoundingBox.h"
+#include "Simd4i.h"
+
+#if defined(_MSC_VER) && _MSC_VER >= 1600 && PX_WINDOWS_FAMILY
+#define PX_AVX 1
+
+// AVX kernels are compiled in a separate translation unit
+// (SwSolveConstraints.cpp) so this file does not itself need /arch:AVX.
+namespace avx
+{
+// defined in SwSolveConstraints.cpp
+
+void initialize();
+
+template <bool, uint32_t>
+void solveConstraints(float* __restrict, const float* __restrict, const float* __restrict, const uint16_t* __restrict,
+ const __m128&);
+}
+
+namespace
+{
+// Runtime CPU feature probe. Returns 0 (no usable AVX), 1 (AVX), 2 (AVX+FMA3).
+uint32_t getAvxSupport()
+{
+// Checking for AVX requires 3 things:
+// 1) CPUID indicates that the OS uses XSAVE and XRSTORE
+// 2) CPUID indicates support for AVX
+// 3) XGETBV indicates registers are saved and restored on context switch
+
+#if _MSC_FULL_VER < 160040219 || !defined(_XCR_XFEATURE_ENABLED_MASK)
+ // need at least VC10 SP1 and compile on at least Win7 SP1
+ return 0;
+#else
+ int cpuInfo[4];
+ __cpuid(cpuInfo, 1);
+ int avxFlags = 3 << 27; // checking 1) and 2) above
+ if((cpuInfo[2] & avxFlags) != avxFlags)
+ return 0; // xgetbv not enabled or no AVX support
+
+ if((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) != 0x6)
+ return 0; // OS does not save YMM registers
+
+ // one-time setup of the AVX kernels (safe: runs during static init below)
+ avx::initialize();
+
+#if _MSC_VER < 1700
+ return 1;
+#else
+ int fmaFlags = 1 << 12;
+ if((cpuInfo[2] & fmaFlags) != fmaFlags)
+ return 1; // no FMA3 support
+
+ /* only using fma at the moment, don't lock out AMD's piledriver by requiring avx2
+ __cpuid(cpuInfo, 7);
+ int avx2Flags = 1 << 5;
+ if((cpuInfo[1] & avx2Flags) != avx2Flags)
+ return 1; // no AVX2 support
+ */
+
+ return 2;
+#endif // _MSC_VER
+#endif // _MSC_FULL_VER
+}
+
+// evaluated once at static-initialization time; read by solveFabric()
+const uint32_t sAvxSupport = getAvxSupport(); // 0: no AVX, 1: AVX, 2: AVX+FMA
+}
+#endif
+
+using namespace nvidia;
+
+namespace
+{
+/* simd constants */
+
+typedef Simd4fFactory<detail::FourTuple> Simd4fConstant;
+
+// lane select masks: sMaskW keeps only the w lane, sMaskXY keeps x and y, etc.
+const Simd4fConstant sMaskW = simd4f(simd4i(0, 0, 0, ~0));
+const Simd4fConstant sMaskXY = simd4f(simd4i(~0, ~0, 0, 0));
+const Simd4fConstant sMaskXYZ = simd4f(simd4i(~0, ~0, ~0, 0));
+const Simd4fConstant sMaskYZW = simd4f(simd4i(0, ~0, ~0, ~0));
+const Simd4fConstant sEpsilon = simd4f(FLT_EPSILON);
+// multiplying by (-1,-1,-1,1) negates xyz while leaving w unchanged
+const Simd4fConstant sMinusOneXYZOneW = simd4f(-1.0f, -1.0f, -1.0f, 1.0f);
+// comparison sentinels: used so a lane-wise '>' yields a mask that only
+// depends on one interesting lane (the others compare against 0 or +/-FLT_MAX)
+const Simd4fConstant sFloatMaxW = simd4f(0.0f, 0.0f, 0.0f, FLT_MAX);
+const Simd4fConstant sMinusFloatMaxXYZ = simd4f(-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f);
+
+/* static worker functions */
+
+/**
+ This function performs explicit Euler integration based on position, where
+ x_next = x_cur + (x_cur - x_prev) * dt_cur/dt_prev * damping + g * dt * dt
+ The g * dt * dt term is folded into accelIt.
+ Non-turning variant: 'scale' is the combined dt-ratio/damping factor applied
+ uniformly to the position delta. Particle w lanes hold the inverse mass.
+ */
+
+template <typename Simd4f, typename AccelerationIterator>
+void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt, Simd4f scale,
+ const AccelerationIterator& aIt, const Simd4f& prevBias)
+{
+ // local copy to avoid LHS
+ AccelerationIterator accelIt(aIt);
+
+ for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt)
+ {
+ Simd4f current = *curIt;
+ Simd4f previous = *prevIt;
+ // if(current.w == 0) current.w = previous.w
+ current = select(current > sMinusFloatMaxXYZ, current, previous);
+ // finiteMass: xyz lanes all-ones iff previous.w (inverse mass) > 0,
+ // w lane always clear -> masks leave the w lane untouched below
+ Simd4f finiteMass = splat<3>(previous) > sFloatMaxW;
+ Simd4f delta = (current - previous) * scale + *accelIt;
+ *curIt = current + (delta & finiteMass);
+ // new previous = current position (keeping previous.w) plus prevBias
+ *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass);
+ }
+}
+
+// Turning variant of the integrator: instead of a single scale factor, full
+// 3x3 matrices (prevMatrix/curMatrix, from IterationStateFactory) are applied
+// to the previous and current positions, presumably to account for the
+// rotating local frame -- see IterationState for how they are built.
+template <typename Simd4f, typename AccelerationIterator>
+void integrateParticles(Simd4f* __restrict curIt, Simd4f* __restrict curEnd, Simd4f* __restrict prevIt,
+ const Simd4f (&prevMatrix)[3], const Simd4f (&curMatrix)[3], const AccelerationIterator& aIt,
+ const Simd4f& prevBias)
+{
+ // local copy to avoid LHS
+ AccelerationIterator accelIt(aIt);
+
+ for(; curIt != curEnd; ++curIt, ++prevIt, ++accelIt)
+ {
+ Simd4f current = *curIt;
+ Simd4f previous = *prevIt;
+ // if(current.w == 0) current.w = previous.w
+ current = select(current > sMinusFloatMaxXYZ, current, previous);
+ // xyz lanes set iff inverse mass > 0; w lane clear (see non-turning overload)
+ Simd4f finiteMass = splat<3>(previous) > sFloatMaxW;
+ // curMatrix*current + prevMatrix*previous + accel
+ Simd4f delta = cloth::transform(curMatrix, cloth::transform(prevMatrix, *accelIt, previous), current);
+ *curIt = current + (delta & finiteMass);
+ *prevIt = select(sMaskW, previous, current) + (prevBias & finiteMass);
+ }
+}
+
+// Pulls particles back towards their motion-constraint spheres (center in
+// xyz, radius in w). Four particles are handled per loop iteration; the four
+// deltas are transposed so the squared-length test runs on all lanes at once.
+// scaleBiasStiffness packs (radius scale, radius bias, unused, stiffness).
+template <typename Simd4f, typename ConstraintIterator>
+void constrainMotion(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres,
+ Simd4f scaleBiasStiffness)
+{
+ Simd4f scale = splat<0>(scaleBiasStiffness);
+ Simd4f bias = splat<1>(scaleBiasStiffness);
+ Simd4f stiffness = splat<3>(scaleBiasStiffness);
+
+ // local copy of iterator to maintain alignment
+ ConstraintIterator sphIt = spheres;
+
+ for(; curIt < curEnd; curIt += 4)
+ {
+ // todo: use msub where available
+ Simd4f curPos0 = curIt[0];
+ Simd4f curPos1 = curIt[1];
+ Simd4f curPos2 = curIt[2];
+ Simd4f curPos3 = curIt[3];
+
+ // delta = sphere center - particle position (w lane: scaled radius input)
+ Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0);
+ ++sphIt;
+ Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1);
+ ++sphIt;
+ Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2);
+ ++sphIt;
+ Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3);
+ ++sphIt;
+
+ // transpose to struct-of-arrays: deltaX/Y/Z hold components, deltaW radii
+ Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3;
+ transpose(deltaX, deltaY, deltaZ, deltaW);
+
+ Simd4f sqrLength = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+ Simd4f radius = max(simd4f(_0), deltaW * scale + bias);
+
+ // slack > 0 means the particle lies outside its constraint sphere
+ Simd4f slack = simd4f(_1) - radius * rsqrt(sqrLength);
+
+ // if slack <= 0.0f then we don't want to affect particle
+ // and can skip if all particles are unaffected
+ Simd4f isPositive;
+ if(anyGreater(slack, simd4f(_0), isPositive))
+ {
+ // set invMass to zero if radius is zero
+ curPos0 = curPos0 & (splat<0>(radius) > sMinusFloatMaxXYZ);
+ curPos1 = curPos1 & (splat<1>(radius) > sMinusFloatMaxXYZ);
+ curPos2 = curPos2 & (splat<2>(radius) > sMinusFloatMaxXYZ);
+ // radius >= 0, so the xyz compare lanes are always true here and only
+ // radius.w matters -- no splat<3> needed for the last particle
+ curPos3 = curPos3 & ((radius) > sMinusFloatMaxXYZ);
+
+ slack = slack * stiffness & isPositive;
+
+ curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack);
+ curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack);
+ curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack);
+ curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack);
+ }
+ }
+}
+
+// Mirror image of constrainMotion: pushes particles outward so they end up at
+// least the constraint distance (delta.w) away from the sphere center. A
+// correction is applied only where slack is negative (particle inside sphere),
+// and here it is applied at full strength (no stiffness factor).
+template <typename Simd4f, typename ConstraintIterator>
+void constrainSeparation(Simd4f* __restrict curIt, const Simd4f* __restrict curEnd, const ConstraintIterator& spheres)
+{
+ // local copy of iterator to maintain alignment
+ ConstraintIterator sphIt = spheres;
+
+ for(; curIt < curEnd; curIt += 4)
+ {
+ // todo: use msub where available
+ Simd4f curPos0 = curIt[0];
+ Simd4f curPos1 = curIt[1];
+ Simd4f curPos2 = curIt[2];
+ Simd4f curPos3 = curIt[3];
+
+ Simd4f delta0 = *sphIt - (sMaskXYZ & curPos0);
+ ++sphIt;
+ Simd4f delta1 = *sphIt - (sMaskXYZ & curPos1);
+ ++sphIt;
+ Simd4f delta2 = *sphIt - (sMaskXYZ & curPos2);
+ ++sphIt;
+ Simd4f delta3 = *sphIt - (sMaskXYZ & curPos3);
+ ++sphIt;
+
+ // transpose to struct-of-arrays layout (deltaW = constraint distances)
+ Simd4f deltaX = delta0, deltaY = delta1, deltaZ = delta2, deltaW = delta3;
+ transpose(deltaX, deltaY, deltaZ, deltaW);
+
+ Simd4f sqrLength = sEpsilon + deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+
+ // rsqrtT<1>: reciprocal sqrt (with one refinement step -- TODO confirm
+ // against the Simd4f implementation)
+ Simd4f slack = simd4f(_1) - deltaW * rsqrtT<1>(sqrLength);
+
+ // if slack >= 0.0f then we don't want to affect particle
+ // and can skip if all particles are unaffected
+ Simd4f isNegative;
+ if(anyGreater(simd4f(_0), slack, isNegative))
+ {
+ slack = slack & isNegative;
+
+ curIt[0] = curPos0 + (delta0 & sMaskXYZ) * splat<0>(slack);
+ curIt[1] = curPos1 + (delta1 & sMaskXYZ) * splat<1>(slack);
+ curIt[2] = curPos2 + (delta2 & sMaskXYZ) * splat<2>(slack);
+ curIt[3] = curPos3 + (delta3 & sMaskXYZ) * splat<3>(slack);
+ }
+ }
+}
+
+/**
+ traditional gauss-seidel internal constraint solver
+ Solves 4 distance constraints (edges) per loop iteration. iIt holds particle
+ index pairs (i,j) per edge, pre-scaled below to byte offsets into the PxVec4
+ particle array; rIt holds the 4 rest lengths. 'stiffness' packs
+ (stiffness, multiplier, compressionLimit, stretchLimit) -- the last three are
+ only read when useMultiplier is set.
+ */
+template <bool useMultiplier, typename Simd4f>
+void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
+ const uint16_t* __restrict iIt, Simd4f stiffness)
+{
+ Simd4f stretchLimit, compressionLimit, multiplier;
+ if(useMultiplier)
+ {
+ stretchLimit = splat<3>(stiffness);
+ compressionLimit = splat<2>(stiffness);
+ multiplier = splat<1>(stiffness);
+ }
+ stiffness = splat<0>(stiffness);
+
+ for(; rIt != rEnd; rIt += 4, iIt += 8)
+ {
+ // particle indices converted to byte offsets for loadAligned/storeAligned
+ uint32_t p0i = iIt[0] * sizeof(PxVec4);
+ uint32_t p0j = iIt[1] * sizeof(PxVec4);
+ uint32_t p1i = iIt[2] * sizeof(PxVec4);
+ uint32_t p1j = iIt[3] * sizeof(PxVec4);
+ uint32_t p2i = iIt[4] * sizeof(PxVec4);
+ uint32_t p2j = iIt[5] * sizeof(PxVec4);
+ uint32_t p3i = iIt[6] * sizeof(PxVec4);
+ uint32_t p3j = iIt[7] * sizeof(PxVec4);
+
+ Simd4f v0i = loadAligned(posIt, p0i);
+ Simd4f v0j = loadAligned(posIt, p0j);
+ Simd4f v1i = loadAligned(posIt, p1i);
+ Simd4f v1j = loadAligned(posIt, p1j);
+ Simd4f v2i = loadAligned(posIt, p2i);
+ Simd4f v2j = loadAligned(posIt, p2j);
+ Simd4f v3i = loadAligned(posIt, p3i);
+ Simd4f v3j = loadAligned(posIt, p3j);
+
+ // hnij: xyz = edge vector (vj - vi), w = sum of inverse masses (wi + wj)
+ Simd4f h0ij = v0j + v0i * sMinusOneXYZOneW;
+ Simd4f h1ij = v1j + v1i * sMinusOneXYZOneW;
+ Simd4f h2ij = v2j + v2i * sMinusOneXYZOneW;
+ Simd4f h3ij = v3j + v3i * sMinusOneXYZOneW;
+
+ // transpose: hxij/hyij/hzij = edge components, vwij = inverse-mass sums
+ Simd4f hxij = h0ij, hyij = h1ij, hzij = h2ij, vwij = h3ij;
+ transpose(hxij, hyij, hzij, vwij);
+
+ Simd4f rij = loadAligned(rIt);
+ Simd4f e2ij = sEpsilon + hxij * hxij + hyij * hyij + hzij * hzij;
+ // relative edge error, clamped to 0 where rest length is ~zero
+ Simd4f erij = (simd4f(_1) - rij * rsqrt(e2ij)) & (rij > sEpsilon); // add parentheses for wiiu
+
+ if(useMultiplier)
+ {
+ erij = erij - multiplier * max(compressionLimit, min(erij, stretchLimit));
+ }
+ // per-edge correction factor, normalized by combined inverse mass
+ Simd4f exij = erij * stiffness * recip(sEpsilon + vwij);
+
+ h0ij = h0ij * splat<0>(exij) & sMaskXYZ;
+ h1ij = h1ij * splat<1>(exij) & sMaskXYZ;
+ h2ij = h2ij * splat<2>(exij) & sMaskXYZ;
+ h3ij = h3ij * splat<3>(exij) & sMaskXYZ;
+
+ // distribute correction proportionally to each endpoint's inverse mass (w)
+ storeAligned(posIt, p0i, v0i + h0ij * splat<3>(v0i));
+ storeAligned(posIt, p0j, v0j - h0ij * splat<3>(v0j));
+ storeAligned(posIt, p1i, v1i + h1ij * splat<3>(v1i));
+ storeAligned(posIt, p1j, v1j - h1ij * splat<3>(v1j));
+ storeAligned(posIt, p2i, v2i + h2ij * splat<3>(v2i));
+ storeAligned(posIt, p2j, v2j - h2ij * splat<3>(v2j));
+ storeAligned(posIt, p3i, v3i + h3ij * splat<3>(v3i));
+ storeAligned(posIt, p3j, v3j - h3ij * splat<3>(v3j));
+ }
+}
+
+#if PX_WINDOWS_FAMILY
+#include "sse2/SwSolveConstraints.h"
+#endif
+
+// calculates upper bound of all position deltas
+// (component-wise max of |cur - prev| over all particles; the w lane is
+// cleared so the inverse-mass channel never contributes)
+template <typename Simd4f>
+Simd4f calculateMaxDelta(const Simd4f* prevIt, const Simd4f* curIt, const Simd4f* curEnd)
+{
+ Simd4f maxDelta(simd4f(_0));
+ for(; curIt < curEnd; ++curIt, ++prevIt)
+ maxDelta = max(maxDelta, abs(*curIt - *prevIt));
+
+ return maxDelta & sMaskXYZ;
+}
+
+} // anonymous namespace
+
+// Binds the kernel to one cloth instance for a single solve: caches
+// references to the cloth, its flattened data and the frame allocator,
+// builds the collision/self-collision helpers, and lets the factory
+// precompute the per-iteration state (dt, matrices, biases).
+template <typename Simd4f>
+cloth::SwSolverKernel<Simd4f>::SwSolverKernel(SwCloth const& cloth, SwClothData& clothData, SwKernelAllocator& allocator,
+ IterationStateFactory& factory, profile::PxProfileZone* profiler)
+: mCloth(cloth)
+, mClothData(clothData)
+, mAllocator(allocator)
+, mCollision(clothData, allocator, profiler)
+, mSelfCollision(clothData, allocator)
+, mState(factory.create<Simd4f>(cloth))
+, mProfiler(profiler)
+{
+ // sanity-check the aggregated cloth data before simulating
+ mClothData.verify();
+}
+
+// Function-call operator: entry point used by the solver task; runs the
+// whole simulation step (all remaining iterations) for this cloth.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::operator()()
+{
+ simulateCloth();
+}
+
+// Conservative upper bound on the scratch memory one solve needs:
+// persistent collision data plus the larger of the two transient
+// (collision vs. self-collision) requirements, padded for allocator overhead.
+template <typename Simd4f>
+size_t cloth::SwSolverKernel<Simd4f>::estimateTemporaryMemory(const SwCloth& cloth)
+{
+ size_t collisionTempMemory = SwCollision<Simd4f>::estimateTemporaryMemory(cloth);
+ size_t selfCollisionTempMemory = SwSelfCollision<Simd4f>::estimateTemporaryMemory(cloth);
+
+ // temp buffers are not live at the same time, so take the max, not the sum
+ size_t tempMemory = PxMax(collisionTempMemory, selfCollisionTempMemory);
+ size_t persistentMemory = SwCollision<Simd4f>::estimatePersistentMemory(cloth);
+
+ // account for any allocator overhead (this could be exposed in the allocator)
+ size_t maxAllocs = 32;
+ size_t maxPerAllocationOverhead = 32;
+ size_t maxAllocatorOverhead = maxAllocs * maxPerAllocationOverhead;
+
+ return maxAllocatorOverhead + persistentMemory + tempMemory;
+}
+
+// Dispatches to the appropriate free-function integrator: the non-turning
+// path only needs a uniform scale term (mPrevMatrix[0]), while the turning
+// path applies the full previous/current 3x3 matrices from IterationState.
+template <typename Simd4f>
+template <typename AccelerationIterator>
+void cloth::SwSolverKernel<Simd4f>::integrateParticles(AccelerationIterator& accelIt, const Simd4f& prevBias)
+{
+ Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+ Simd4f* curEnd = curIt + mClothData.mNumParticles;
+ Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+
+ if(!mState.mIsTurning)
+ ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix[0], accelIt, prevBias);
+ else
+ ::integrateParticles(curIt, curEnd, prevIt, mState.mPrevMatrix, mState.mCurMatrix, accelIt, prevBias);
+}
+
+// Integrates all particles for one iteration, choosing the acceleration
+// source: a constant bias when no per-particle accelerations were set,
+// otherwise a scale-and-bias iterator over the user-supplied accelerations.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::integrateParticles()
+{
+ ProfileZone zone("cloth::SwSolverKernel::integrateParticles", mProfiler);
+
+ const Simd4f* startAccelIt = reinterpret_cast<const Simd4f*>(mClothData.mParticleAccelerations);
+
+ // dt^2 (todo: should this be the smoothed dt used for gravity?)
+ // masked to xyz so accelerations never touch the inverse-mass (w) lane
+ const Simd4f sqrIterDt = simd4f(sqr(mState.mIterDt)) & (Simd4f)sMaskXYZ;
+
+ if(!startAccelIt)
+ {
+ // no per-particle accelerations, use a constant
+ ConstantIterator<Simd4f> accelIt(mState.mCurBias);
+ integrateParticles(accelIt, mState.mPrevBias);
+ }
+ else
+ {
+ // iterator implicitly scales by dt^2 and adds gravity
+ ScaleBiasIterator<Simd4f, const Simd4f*> accelIt(startAccelIt, sqrIterDt, mState.mCurBias);
+ integrateParticles(accelIt, mState.mPrevBias);
+ }
+
+ zone.setValue(mState.mIsTurning);
+}
+
+// Applies tether (long-range attachment) constraints: each particle is pulled
+// towards its anchors whenever its distance exceeds scaled tether length.
+// Tethers are stored particle-major with stride numParticles (asserted below),
+// so tIt += numParticles walks the tethers of one particle.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::constrainTether()
+{
+ if(0.0f == mClothData.mTetherConstraintStiffness || !mClothData.mNumTethers)
+ return;
+
+#if PX_PROFILE
+ ProfileZone zone("cloth::SwSolverKernel::solveTethers", mProfiler);
+#endif
+
+ uint32_t numParticles = mClothData.mNumParticles;
+ uint32_t numTethers = mClothData.mNumTethers;
+ PX_ASSERT(0 == numTethers % numParticles);
+
+ float* __restrict curIt = mClothData.mCurParticles;
+ const float* __restrict curFirst = curIt;
+ const float* __restrict curEnd = curIt + 4 * numParticles;
+
+ typedef const SwTether* __restrict TetherIter;
+ TetherIter tFirst = mClothData.mTethers;
+ TetherIter tEnd = tFirst + numTethers;
+
+ // stiffness divided by tethers-per-particle (numTethers/numParticles) so the
+ // summed per-anchor offsets average out; masked to xyz to keep w untouched
+ Simd4f stiffness = (Simd4f)sMaskXYZ & simd4f(numParticles * mClothData.mTetherConstraintStiffness / numTethers);
+ Simd4f scale = simd4f(mClothData.mTetherConstraintScale);
+
+ for(; curIt != curEnd; curIt += 4, ++tFirst)
+ {
+ Simd4f position = loadAligned(curIt);
+ Simd4f offset = simd4f(_0);
+
+ // accumulate corrections from every anchor tethered to this particle
+ for(TetherIter tIt = tFirst; tIt < tEnd; tIt += numParticles)
+ {
+ PX_ASSERT(tIt->mAnchor < numParticles);
+ Simd4f anchor = loadAligned(curFirst, tIt->mAnchor * sizeof(PxVec4));
+ Simd4f delta = anchor - position;
+ Simd4f sqrLength = sEpsilon + dot3(delta, delta);
+
+ Simd4f tetherLength = load(&tIt->mLength);
+ tetherLength = splat<0>(tetherLength);
+
+ Simd4f radius = tetherLength * scale;
+ // slack > 0 only when the particle is farther than the tether radius
+ Simd4f slack = simd4f(_1) - radius * rsqrt(sqrLength);
+
+ offset = offset + delta * max(slack, simd4f(_0));
+ }
+
+ storeAligned(curIt, position + offset * stiffness);
+ }
+}
+
+// Runs the edge (distance) constraint solver over every configured phase,
+// dispatching to the widest compiled kernel (AVX+FMA > AVX > SSE/scalar).
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::solveFabric()
+{
+ ProfileZone zone("cloth::SwSolverKernel::solveFabric", mProfiler);
+
+ float* pIt = mClothData.mCurParticles;
+
+ const PhaseConfig* cIt = mClothData.mConfigBegin;
+ const PhaseConfig* cEnd = mClothData.mConfigEnd;
+
+ const uint32_t* pBegin = mClothData.mPhases;
+ const float* rBegin = mClothData.mRestvalues;
+
+ const uint32_t* sBegin = mClothData.mSets;
+ const uint16_t* iBegin = mClothData.mIndices;
+
+ uint32_t totalConstraints = 0;
+
+ Simd4f stiffnessExponent = simd4f(mCloth.mStiffnessFrequency * mState.mIterDt);
+
+ for(; cIt != cEnd; ++cIt)
+ {
+ // sIt[0]/sIt[1] delimit this phase's constraint range; two indices per edge
+ const uint32_t* sIt = sBegin + pBegin[cIt->mPhaseIndex];
+ const float* rIt = rBegin + sIt[0];
+ const float* rEnd = rBegin + sIt[1];
+ const uint16_t* iIt = iBegin + sIt[0] * 2;
+
+ totalConstraints += uint32_t(rEnd - rIt);
+
+ // (stiffness, multiplier, compressionLimit, stretchLimit)
+ Simd4f config = load(&cIt->mStiffness);
+ // stiffness specified as fraction of constraint error per-millisecond
+ Simd4f scaledConfig = simd4f(_1) - simdf::exp2(config * stiffnessExponent);
+ Simd4f stiffness = select(sMaskXY, scaledConfig, config);
+
+ // true when multiplier/limit lanes are all zero -> cheaper kernel variant
+ int neutralMultiplier = allEqual(sMaskYZW & stiffness, simd4f(_0));
+
+#if PX_AVX
+ // note: case 2 intentionally falls through to case 1 when the FMA
+ // kernels were not compiled in (_MSC_VER < 1700 removes the break)
+ switch(sAvxSupport)
+ {
+ case 2:
+#if _MSC_VER >= 1700
+ neutralMultiplier ? avx::solveConstraints<false, 2>(pIt, rIt, rEnd, iIt, stiffness)
+ : avx::solveConstraints<true, 2>(pIt, rIt, rEnd, iIt, stiffness);
+ break;
+#endif
+ case 1:
+ neutralMultiplier ? avx::solveConstraints<false, 1>(pIt, rIt, rEnd, iIt, stiffness)
+ : avx::solveConstraints<true, 1>(pIt, rIt, rEnd, iIt, stiffness);
+ break;
+ default:
+#endif
+ neutralMultiplier ? solveConstraints<false>(pIt, rIt, rEnd, iIt, stiffness)
+ : solveConstraints<true>(pIt, rIt, rEnd, iIt, stiffness);
+#if PX_AVX
+ break;
+ }
+#endif
+ }
+
+ zone.setValue(totalConstraints);
+}
+
+// Applies motion constraints, choosing the constraint-sphere source:
+// start spheres only, target spheres (on the last iteration), or a
+// start->target interpolation for intermediate iterations.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::constrainMotion()
+{
+ if(!mClothData.mStartMotionConstraints)
+ return;
+
+#if PX_PROFILE
+ ProfileZone zone("cloth::SwSolverKernel::constrainMotion", mProfiler);
+#endif
+
+ Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+ Simd4f* curEnd = curIt + mClothData.mNumParticles;
+
+ const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartMotionConstraints);
+ const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetMotionConstraints);
+
+ // pack (scale, bias, -, stiffness) for the worker function
+ Simd4f scaleBias = load(&mCloth.mMotionConstraintScale);
+ Simd4f stiffness = simd4f(mClothData.mMotionConstraintStiffness);
+ Simd4f scaleBiasStiffness = select(sMaskXYZ, scaleBias, stiffness);
+
+ if(!mClothData.mTargetMotionConstraints)
+ // no interpolation, use the start positions
+ return ::constrainMotion(curIt, curEnd, startIt, scaleBiasStiffness);
+
+ if(mState.mRemainingIterations == 1)
+ // use the target positions on last iteration
+ return ::constrainMotion(curIt, curEnd, targetIt, scaleBiasStiffness);
+
+ // otherwise use an interpolating iterator
+ LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha());
+ ::constrainMotion(curIt, curEnd, interpolator, scaleBiasStiffness);
+}
+
+// Applies separation constraints with the same start/target/interpolated
+// source selection as constrainMotion() above.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::constrainSeparation()
+{
+ if(!mClothData.mStartSeparationConstraints)
+ return;
+
+#if PX_PROFILE
+ ProfileZone zone("cloth::SwSolverKernel::constrainSeparation", mProfiler);
+#endif
+
+ Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+ Simd4f* curEnd = curIt + mClothData.mNumParticles;
+
+ const Simd4f* startIt = reinterpret_cast<const Simd4f*>(mClothData.mStartSeparationConstraints);
+ const Simd4f* targetIt = reinterpret_cast<const Simd4f*>(mClothData.mTargetSeparationConstraints);
+
+ if(!mClothData.mTargetSeparationConstraints)
+ // no interpolation, use the start positions
+ return ::constrainSeparation(curIt, curEnd, startIt);
+
+ if(mState.mRemainingIterations == 1)
+ // use the target positions on last iteration
+ return ::constrainSeparation(curIt, curEnd, targetIt);
+
+ // otherwise use an interpolating iterator
+ LerpIterator<Simd4f, const Simd4f*> interpolator(startIt, targetIt, mState.getCurrentAlpha());
+ ::constrainSeparation(curIt, curEnd, interpolator);
+}
+
+// Delegates character collision to the SwCollision helper and records the
+// resulting collision count in the profile zone.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::collideParticles()
+{
+ ProfileZone zone("cloth::SwSolverKernel::collideParticles", mProfiler);
+
+ mCollision(mState);
+
+ zone.setValue(mCollision.mNumCollisions);
+}
+
+// Delegates cloth self collision to the SwSelfCollision helper and records
+// the resulting collision count in the profile zone.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::selfCollideParticles()
+{
+ ProfileZone zone("cloth::SwSolverKernel::selfCollideParticles", mProfiler);
+
+ mSelfCollision();
+
+ zone.setValue(mSelfCollision.mNumCollisions);
+}
+
+// Sleep bookkeeping: every mSleepTestInterval "milliseconds" of simulated
+// time (the counter advances by iterDt * 1000 per iteration), compare the
+// largest particle displacement against the sleep threshold. The pass
+// counter climbs while the cloth stays below threshold and resets otherwise;
+// the caller reads it to decide when the cloth may sleep.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::updateSleepState()
+{
+ ProfileZone zone("cloth::SwSolverKernel::updateSleepState", mProfiler);
+
+ mClothData.mSleepTestCounter += PxMax(1u, uint32_t(mState.mIterDt * 1000));
+ if(mClothData.mSleepTestCounter >= mCloth.mSleepTestInterval)
+ {
+ const Simd4f* prevIt = reinterpret_cast<Simd4f*>(mClothData.mPrevParticles);
+ const Simd4f* curIt = reinterpret_cast<Simd4f*>(mClothData.mCurParticles);
+ const Simd4f* curEnd = curIt + mClothData.mNumParticles;
+
+ // calculate max particle delta since last iteration
+ Simd4f maxDelta = calculateMaxDelta(prevIt, curIt, curEnd);
+
+ ++mClothData.mSleepPassCounter;
+ // threshold scales with dt so it is effectively a velocity limit
+ Simd4f threshold = simd4f(mCloth.mSleepThreshold * mState.mIterDt);
+ if(anyGreaterEqual(maxDelta, threshold))
+ mClothData.mSleepPassCounter = 0;
+
+ mClothData.mSleepTestCounter -= mCloth.mSleepTestInterval;
+ }
+
+ zone.setValue(mClothData.mSleepPassCounter);
+}
+
+// One full solver iteration: integrate, then apply each constraint family in
+// a fixed order, collide, and update the sleep state.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::iterateCloth()
+{
+ // note on invMass (stored in current/previous positions.w):
+ // integrateParticles()
+ // - if(current.w == 0) current.w = previous.w
+ // constrainMotion()
+ // - if(constraint.radius <= 0) current.w = 0
+ // computeBounds()
+ // - if(current.w > 0) current.w = previous.w
+ // collideParticles()
+ // - if(collides) current.w *= 1/massScale
+ // after simulate()
+ // - previous.w: original invMass as set by user
+ // - current.w: zeroed by motion constraints and mass-scaled by collision
+
+ // integrate positions
+ integrateParticles();
+
+ // motion constraints
+ constrainMotion();
+
+ // solve tether constraints
+ constrainTether();
+
+ // solve edge constraints
+ solveFabric();
+
+ // separation constraints
+ constrainSeparation();
+
+ // perform character collision
+ collideParticles();
+
+ // perform self collision
+ selfCollideParticles();
+
+ // test wake / sleep conditions
+ updateSleepState();
+}
+
+// Top-level loop: run iterations until IterationState says none remain;
+// mState.update() advances dt-dependent terms between iterations.
+template <typename Simd4f>
+void cloth::SwSolverKernel<Simd4f>::simulateCloth()
+{
+ while(mState.mRemainingIterations)
+ {
+ iterateCloth();
+ mState.update();
+ }
+}
+
+// explicit template instantiation
+// one kernel per math backend enabled at build time (SIMD and/or scalar)
+#if NVMATH_SIMD
+template class cloth::SwSolverKernel<Simd4f>;
+#endif
+#if NVMATH_SCALAR
+template class cloth::SwSolverKernel<Scalar4f>;
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h
new file mode 100644
index 00000000..26b45a88
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/SwSolverKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "IterationState.h"
+#include "SwCollision.h"
+#include "SwSelfCollision.h"
+
+namespace nvidia
+{
+namespace cloth
+{
+
+class SwCloth;
+struct SwClothData;
+
+// CPU cloth solver kernel for a single cloth, templated on the SIMD vector
+// type (Simd4f or Scalar4f). Construct it with the cloth's data and call
+// operator() to run every remaining solver iteration of the current frame.
+template <typename Simd4f>
+class SwSolverKernel
+{
+ public:
+ SwSolverKernel(SwCloth const&, SwClothData&, SwKernelAllocator&, IterationStateFactory&, nvidia::profile::PxProfileZone*);
+
+ // runs the full simulation step (see simulateCloth in the .cpp)
+ void operator()();
+
+ // returns a conservative estimate of the
+ // total memory requirements during a solve
+ static size_t estimateTemporaryMemory(const SwCloth& c);
+
+ private:
+ // the individual stages of one solver iteration, in call order
+ void integrateParticles();
+ void constrainTether();
+ void solveFabric();
+ void constrainMotion();
+ void constrainSeparation();
+ void collideParticles();
+ void selfCollideParticles();
+ void updateSleepState();
+
+ void iterateCloth();
+ void simulateCloth();
+
+ SwCloth const& mCloth;
+ SwClothData& mClothData;
+ SwKernelAllocator& mAllocator;
+
+ SwCollision<Simd4f> mCollision;
+ SwSelfCollision<Simd4f> mSelfCollision;
+ IterationState<Simd4f> mState;
+
+ profile::PxProfileZone* mProfiler;
+
+ private:
+ // non-copyable (holds references)
+ SwSolverKernel<Simd4f>& operator=(const SwSolverKernel<Simd4f>&);
+ // shared integrator body; the acceleration source is supplied as an iterator
+ template <typename AccelerationIterator>
+ void integrateParticles(AccelerationIterator& accelIt, const Simd4f&);
+};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp
new file mode 100644
index 00000000..d077624e
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.cpp
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#include "TripletScheduler.h"
+#include "PxMath.h"
+#include "PsFPU.h"
+#include "PxMat33.h"
+#include "PsVecMath.h"
+#include "PsUtilities.h"
+
+using namespace nvidia;
+using namespace physx::shdfnd::aos;
+
+// Copies the caller's triplet range (4 uint32 per entry: three particle
+// indices plus padding) into the internal Vec4u vector.
+cloth::TripletScheduler::TripletScheduler(Range<const uint32_t[4]> triplets)
+: mTriplets(reinterpret_cast<const Vec4u*>(triplets.begin()), reinterpret_cast<const Vec4u*>(triplets.end()))
+{
+}
+
+// SSE version
+// Greedily partitions the triplets into sets of up to simdWidth entries such
+// that no two triplets in a set share a particle (mark[] records which set a
+// particle was last used in). Triplets are reordered in place and the size of
+// each finished set is appended to mSetSizes.
+void cloth::TripletScheduler::simd(uint32_t numParticles, uint32_t simdWidth)
+{
+ if(mTriplets.empty())
+ return;
+
+ // mark[p] == setIndex means particle p is already used by the current set
+ Vector<uint32_t>::Type mark(numParticles, uint32_t(-1));
+
+ uint32_t setIndex = 0, setSize = 0;
+ for(TripletIter tIt = mTriplets.begin(), tEnd = mTriplets.end(); tIt != tEnd; ++setIndex)
+ {
+ TripletIter tLast = tIt + PxMin(simdWidth, uint32_t(tEnd - tIt));
+ TripletIter tSwap = tEnd;
+
+ for(; tIt != tLast && tIt != tSwap; ++tIt, ++setSize)
+ {
+ // swap from tail until independent triplet found
+ while((mark[tIt->x] == setIndex || mark[tIt->y] == setIndex || mark[tIt->z] == setIndex) && tIt != --tSwap)
+ swap(*tIt, *tSwap);
+
+ if(tIt == tSwap)
+ break; // no independent triplet found
+
+ // mark vertices to be used in simdIndex
+ mark[tIt->x] = setIndex;
+ mark[tIt->y] = setIndex;
+ mark[tIt->z] = setIndex;
+ }
+
+ if(tIt == tSwap) // remaining triplets depend on current set
+ {
+ if(setSize > simdWidth) // trim set to multiple of simdWidth
+ {
+ uint32_t overflow = setSize % simdWidth;
+ setSize -= overflow;
+ tIt -= overflow;
+ }
+ mSetSizes.pushBack(setSize);
+ setSize = 0;
+ }
+ }
+}
+
+namespace
+{
+// Bookkeeping for one triplet batch during warp():
+// mMark caches the index of the last triplet whose neighbors touched this
+// set (collision test), while mNumConflicts[c][b] counts, per triplet
+// component c, how many triplets in the current warp map to "bank" b
+// (index & warpMask); mNumReplays[c] tracks the worst such count -- the
+// shared-memory replay cost of the batch on the GPU.
+struct TripletSet
+{
+ TripletSet() : mMark(0xFFFFFFFF)
+ {
+ mNumReplays[0] = mNumReplays[1] = mNumReplays[2] = 1;
+ memset(mNumConflicts[0], 0, 32);
+ memset(mNumConflicts[1], 0, 32);
+ memset(mNumConflicts[2], 0, 32);
+ }
+
+ uint32_t mMark; // triplet index
+ uint8_t mNumReplays[3];
+ uint8_t mNumConflicts[3][32];
+};
+
+/*
+struct GreaterSum
+{
+ typedef cloth::Vector<uint32_t>::Type Container;
+
+ GreaterSum(const Container& cont)
+ : mContainer(cont)
+ {}
+
+ bool operator()(const cloth::Vec4u& a, const cloth::Vec4u& b) const
+ {
+ return mContainer[a.x] + mContainer[a.y] + mContainer[a.z]
+ > mContainer[b.x] + mContainer[b.y] + mContainer[b.z];
+ }
+
+ const Container& mContainer;
+};
+*/
+
+// calculate the inclusive prefix sum, equivalent of std::partial_sum
+// (safe for dest == first; warp() relies on this for in-place computation)
+template <typename T>
+void prefixSum(const T* first, const T* last, T* dest)
+{
+ if(first == last)
+ return;
+ else
+ {
+ *(dest++) = *(first++);
+
+ for(; first != last; ++first, ++dest)
+ *dest = *(dest - 1) + *first;
+ }
+}
+}
+
+// CUDA version
+// Colors the triplets into batches suitable for GPU warps: no two triplets in
+// a batch may share a particle, and among the valid batches the one with the
+// fewest estimated shared-memory bank conflicts (replays) is chosen. Finally
+// the triplets are reordered so each batch is contiguous, with batch sizes
+// recorded in mSetSizes.
+void cloth::TripletScheduler::warp(uint32_t numParticles, uint32_t warpWidth)
+{
+ // PX_ASSERT(warpWidth == 32 || warpWidth == 16);
+
+ if(mTriplets.empty())
+ return;
+
+ TripletIter tIt, tEnd = mTriplets.end();
+ uint32_t tripletIndex;
+
+ // count number of triplets per particle
+ Vector<uint32_t>::Type adjacentCount(numParticles + 1, uint32_t(0));
+ for(tIt = mTriplets.begin(); tIt != tEnd; ++tIt)
+ for(int i = 0; i < 3; ++i)
+ ++adjacentCount[(*tIt)[i]];
+
+ /* neither of those were really improving number of batches:
+ // run simd version to pre-sort particles
+ simd(numParticles, blockWidth); mSetSizes.resize(0);
+ // sort according to triplet degree (estimated by sum of adjacentCount)
+ std::sort(mTriplets.begin(), tEnd, GreaterSum(adjacentCount));
+ */
+
+ // a particle shared by k triplets forces at least k distinct sets
+ uint32_t maxTripletCount = *maxElement(adjacentCount.begin(), adjacentCount.end());
+
+ // compute in place prefix sum (inclusive)
+ prefixSum(adjacentCount.begin(), adjacentCount.end(), adjacentCount.begin());
+
+ // initialize adjacencies (for each particle, collect touching triplets)
+ // also converts partial sum in adjacentCount from inclusive to exclusive
+ Vector<uint32_t>::Type adjacencies(adjacentCount.back());
+ for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
+ for(int i = 0; i < 3; ++i)
+ adjacencies[--adjacentCount[(*tIt)[i]]] = tripletIndex;
+
+ uint32_t warpMask = warpWidth - 1;
+
+ uint32_t numSets = maxTripletCount; // start with minimum number of sets
+ Vector<TripletSet>::Type sets(numSets);
+ Vector<uint32_t>::Type setIndices(mTriplets.size(), uint32_t(-1));
+ mSetSizes.resize(numSets);
+
+ // color triplets (assign to sets)
+ Vector<uint32_t>::Type::ConstIterator aBegin = adjacencies.begin(), aIt, aEnd;
+ for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++tripletIndex)
+ {
+ // mark sets of adjacent triplets
+ // (any already-assigned triplet sharing a particle blocks its set)
+ for(int i = 0; i < 3; ++i)
+ {
+ uint32_t particleIndex = (*tIt)[i];
+ aIt = aBegin + adjacentCount[particleIndex];
+ aEnd = aBegin + adjacentCount[particleIndex + 1];
+ for(uint32_t setIndex; aIt != aEnd; ++aIt)
+ if(numSets > (setIndex = setIndices[*aIt]))
+ sets[setIndex].mMark = tripletIndex;
+ }
+
+ // find valid set with smallest number of bank conflicts
+ uint32_t bestIndex = numSets;
+ uint32_t minReplays = 4;
+ for(uint32_t setIndex = 0; setIndex < numSets && minReplays; ++setIndex)
+ {
+ const TripletSet& set = sets[setIndex];
+
+ if(set.mMark == tripletIndex)
+ continue; // triplet collision
+
+ // count components that would raise this set's worst-case replay count
+ uint32_t numReplays = 0;
+ for(uint32_t i = 0; i < 3; ++i)
+ numReplays += set.mNumReplays[i] == set.mNumConflicts[i][warpMask & (*tIt)[i]];
+
+ if(minReplays > numReplays)
+ minReplays = numReplays, bestIndex = setIndex;
+ }
+
+ // add new set if none found
+ if(bestIndex == numSets)
+ {
+ sets.pushBack(TripletSet());
+ mSetSizes.pushBack(0);
+ ++numSets;
+ }
+
+ // increment bank conflicts or reset if warp filled
+ TripletSet& set = sets[bestIndex];
+ if(++mSetSizes[bestIndex] & warpMask)
+ for(uint32_t i = 0; i < 3; ++i)
+ set.mNumReplays[i] = PxMax(set.mNumReplays[i], ++set.mNumConflicts[i][warpMask & (*tIt)[i]]);
+ else
+ set = TripletSet();
+
+ setIndices[tripletIndex] = bestIndex;
+ }
+
+ // reorder triplets
+ // setOffsets starts as the inclusive prefix sum of set sizes; the
+ // pre-decrement below fills each set's slot back-to-front
+ Vector<uint32_t>::Type setOffsets(mSetSizes.size());
+ prefixSum(mSetSizes.begin(), mSetSizes.end(), setOffsets.begin());
+
+ Vector<Vec4u>::Type triplets(mTriplets.size());
+ Vector<uint32_t>::Type::ConstIterator iIt = setIndices.begin();
+ for(tIt = mTriplets.begin(), tripletIndex = 0; tIt != tEnd; ++tIt, ++iIt)
+ triplets[--setOffsets[*iIt]] = *tIt;
+
+ mTriplets.swap(triplets);
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h
new file mode 100644
index 00000000..836c9784
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/TripletScheduler.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+#include "Range.h"
+#include "Allocator.h"
+#include "Vec4T.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
// Groups distance-constraint "triplets" (stored as index quadruples) into
// sets so that a SIMD/GPU solver can process one set in parallel.  The
// scheduling pass reorders mTriplets in place, grouped by set, and records
// per-set counts in mSetSizes.
struct TripletScheduler
{
	typedef Vector<Vec4u>::Type::ConstIterator ConstTripletIter;
	typedef Vector<Vec4u>::Type::Iterator TripletIter;

	// Builds mTriplets from the given range; each element carries 3 particle
	// indices (the 4th component's use is implementation-defined -- see .cpp).
	TripletScheduler(Range<const uint32_t[4]>);
	// Schedule for a CPU SIMD solver of the given width.
	// NOTE(review): presumably groups triplets so no two in a set share a
	// particle -- confirm against the .cpp.
	void simd(uint32_t numParticles, uint32_t simdWidth);
	// Schedule for a GPU warp of the given width; the implementation tracks
	// per-bank conflict counts (warpMask indexing) to minimize replays.
	void warp(uint32_t numParticles, uint32_t warpWidth);

	Vector<Vec4u>::Type mTriplets;    // triplets, reordered grouped by set
	Vector<uint32_t>::Type mSetSizes; // number of triplets in each set
};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h
new file mode 100644
index 00000000..c82b9629
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/Vec4T.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Types.h"
+
+namespace nvidia
+{
+
+namespace cloth
+{
+
// Minimal POD-style 4-component vector used by the cloth cooker/solver.
template <typename T>
struct Vec4T
{
	// Default constructor deliberately leaves all members uninitialized.
	Vec4T()
	{
	}

	Vec4T(T a, T b, T c, T d) : x(a), y(b), z(c), w(d)
	{
	}

	// Element-wise converting constructor from a vector of another scalar type.
	template <typename S>
	Vec4T(const Vec4T<S>& other) : x(T(other.x)), y(T(other.y)), z(T(other.z)), w(T(other.w))
	{
	}

	// Indexed access; relies on x, y, z, w being laid out contiguously.
	template <typename Index>
	T& operator[](Index i)
	{
		return reinterpret_cast<T*>(this)[i];
	}

	template <typename Index>
	const T& operator[](Index i) const
	{
		return reinterpret_cast<const T*>(this)[i];
	}

	T x, y, z, w;
};

// Component-wise scale.
template <typename T>
Vec4T<T> operator*(const Vec4T<T>& vec, T scalar)
{
	Vec4T<T> result(vec);
	result.x *= scalar;
	result.y *= scalar;
	result.z *= scalar;
	result.w *= scalar;
	return result;
}

// Component-wise division by a scalar.
template <typename T>
Vec4T<T> operator/(const Vec4T<T>& vec, T scalar)
{
	Vec4T<T> result(vec);
	result.x /= scalar;
	result.y /= scalar;
	result.z /= scalar;
	result.w /= scalar;
	return result;
}

// View a vector as an array of four T (for loops over components).
template <typename T>
T (&array(Vec4T<T>& vec))[4]
{
	return reinterpret_cast<T(&)[4]>(vec);
}

template <typename T>
const T (&array(const Vec4T<T>& vec))[4]
{
	return reinterpret_cast<const T(&)[4]>(vec);
}

typedef Vec4T<uint32_t> Vec4u;
typedef Vec4T<uint16_t> Vec4us;
+
+} // namespace cloth
+
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp
new file mode 100644
index 00000000..b9a6ab35
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/avx/SwSolveConstraints.cpp
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#include <intrin.h>
+#pragma warning(pop)
+
+#pragma warning(disable : 4127) // conditional expression is constant
+
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int32 uint32_t;
+
+namespace avx
+{
// Shared SIMD constants used by the solveConstraints() variants in this file.
// NOTE(review): sMaskYZW is not referenced in the code visible here --
// presumably used by another solver path; confirm before removing.
__m128 sMaskYZW;
__m256 sOne, sEpsilon, sMinusOneXYZOneW, sMaskXY;

// Populates the constants above.  They are not statically initialized, so
// this must run before any solver in this namespace is called.
void initialize()
{
	sMaskYZW = _mm_castsi128_ps(_mm_setr_epi32(0, ~0, ~0, ~0)); // zero lane x, keep y/z/w
	sOne = _mm256_set1_ps(1.0f);
	sEpsilon = _mm256_set1_ps(1.192092896e-07f); // FLT_EPSILON
	sMinusOneXYZOneW = _mm256_setr_ps(-1.0f, -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, -1.0f, 1.0f); // negate xyz, keep w
	sMaskXY = _mm256_castsi256_ps(_mm256_setr_epi32(~0, ~0, 0, 0, ~0, ~0, 0, 0)); // select lanes x,y of each 128-bit half
}
+
// Fused multiply-add helpers; the non-type template parameter selects the
// instruction set (1 = AVX, 2 = AVX2/FMA), matching solveConstraints' 'avx'
// parameter.  Generic versions emulate with separate mul + add.
template <uint32_t>
__m256 fmadd_ps(__m256 a, __m256 b, __m256 c)
{
	return _mm256_add_ps(_mm256_mul_ps(a, b), c); // a * b + c
}
template <uint32_t>
__m256 fnmadd_ps(__m256 a, __m256 b, __m256 c)
{
	return _mm256_sub_ps(c, _mm256_mul_ps(a, b)); // c - a * b
}
#if _MSC_VER >= 1700
// VS2012+ compilers: specializations for instruction-set level 2 use the
// real fused FMA3 instructions.
template <>
__m256 fmadd_ps<2>(__m256 a, __m256 b, __m256 c)
{
	return _mm256_fmadd_ps(a, b, c);
}
template <>
__m256 fnmadd_ps<2>(__m256 a, __m256 b, __m256 c)
{
	return _mm256_fnmadd_ps(a, b, c);
}
#endif
+
// roughly same perf as SSE2 intrinsics, the asm version below is about 10% faster
//
// Solves 8 distance constraints per loop iteration on interleaved particle data.
//  - posIt: particle array, 4 floats per particle (xyz position + w; w scales
//    the position update below -- presumably the inverse mass, TODO(review): confirm).
//  - [rIt, rEnd): one rest value per constraint, 8 consumed per iteration.
//  - iIt: 16 uint16_t particle indices per iteration, forming 8 (i, j) pairs.
//  - stiffnessRef packs per-phase scalars in its lanes: x = stiffness,
//    y = multiplier, z = compression limit, w = stretch limit
//    (only lane x is used when useMultiplier is false).
// The 'avx' template parameter picks emulated (1) vs fused (2) multiply-add,
// see fmadd_ps/fnmadd_ps above.
template <bool useMultiplier, uint32_t avx>
void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
                      const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	__m256 stiffness, stretchLimit, compressionLimit, multiplier;

	if(useMultiplier)
	{
		// broadcast the packed __m128 and splat each lane into its own register
		stiffness = _mm256_broadcast_ps(&stiffnessRef);
		stretchLimit = _mm256_permute_ps(stiffness, 0xff);
		compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
		multiplier = _mm256_permute_ps(stiffness, 0x55);
		stiffness = _mm256_permute_ps(stiffness, 0x00);
	}
	else
	{
		stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
	}

	for(; rIt < rEnd; rIt += 8, iIt += 16)
	{
		// first four (i, j) pairs; each particle is 4 floats wide
		float* p0i = posIt + iIt[0] * 4;
		float* p4i = posIt + iIt[8] * 4;
		float* p0j = posIt + iIt[1] * 4;
		float* p4j = posIt + iIt[9] * 4;
		float* p1i = posIt + iIt[2] * 4;
		float* p5i = posIt + iIt[10] * 4;
		float* p1j = posIt + iIt[3] * 4;
		float* p5j = posIt + iIt[11] * 4;

		__m128 v0i = _mm_load_ps(p0i);
		__m128 v4i = _mm_load_ps(p4i);
		__m128 v0j = _mm_load_ps(p0j);
		__m128 v4j = _mm_load_ps(p4j);
		__m128 v1i = _mm_load_ps(p1i);
		__m128 v5i = _mm_load_ps(p5i);
		__m128 v1j = _mm_load_ps(p1j);
		__m128 v5j = _mm_load_ps(p5j);

		// pack two 128-bit particles into each 256-bit register
		__m256 v04i = _mm256_insertf128_ps(_mm256_castps128_ps256(v0i), v4i, 1);
		__m256 v04j = _mm256_insertf128_ps(_mm256_castps128_ps256(v0j), v4j, 1);
		__m256 v15i = _mm256_insertf128_ps(_mm256_castps128_ps256(v1i), v5i, 1);
		__m256 v15j = _mm256_insertf128_ps(_mm256_castps128_ps256(v1j), v5j, 1);

		// hij = pj - pi in xyz, wi + wj in w (sMinusOneXYZOneW negates i's xyz)
		__m256 h04ij = fmadd_ps<avx>(sMinusOneXYZOneW, v04i, v04j);
		__m256 h15ij = fmadd_ps<avx>(sMinusOneXYZOneW, v15i, v15j);

		// remaining four (i, j) pairs
		float* p2i = posIt + iIt[4] * 4;
		float* p6i = posIt + iIt[12] * 4;
		float* p2j = posIt + iIt[5] * 4;
		float* p6j = posIt + iIt[13] * 4;
		float* p3i = posIt + iIt[6] * 4;
		float* p7i = posIt + iIt[14] * 4;
		float* p3j = posIt + iIt[7] * 4;
		float* p7j = posIt + iIt[15] * 4;

		__m128 v2i = _mm_load_ps(p2i);
		__m128 v6i = _mm_load_ps(p6i);
		__m128 v2j = _mm_load_ps(p2j);
		__m128 v6j = _mm_load_ps(p6j);
		__m128 v3i = _mm_load_ps(p3i);
		__m128 v7i = _mm_load_ps(p7i);
		__m128 v3j = _mm_load_ps(p3j);
		__m128 v7j = _mm_load_ps(p7j);

		__m256 v26i = _mm256_insertf128_ps(_mm256_castps128_ps256(v2i), v6i, 1);
		__m256 v26j = _mm256_insertf128_ps(_mm256_castps128_ps256(v2j), v6j, 1);
		__m256 v37i = _mm256_insertf128_ps(_mm256_castps128_ps256(v3i), v7i, 1);
		__m256 v37j = _mm256_insertf128_ps(_mm256_castps128_ps256(v3j), v7j, 1);

		__m256 h26ij = fmadd_ps<avx>(sMinusOneXYZOneW, v26i, v26j);
		__m256 h37ij = fmadd_ps<avx>(sMinusOneXYZOneW, v37i, v37j);

		// 8x4 transpose: collect x, y, z components and the summed w of all 8 edges
		__m256 a = _mm256_unpacklo_ps(h04ij, h26ij);
		__m256 b = _mm256_unpackhi_ps(h04ij, h26ij);
		__m256 c = _mm256_unpacklo_ps(h15ij, h37ij);
		__m256 d = _mm256_unpackhi_ps(h15ij, h37ij);

		__m256 hxij = _mm256_unpacklo_ps(a, c);
		__m256 hyij = _mm256_unpackhi_ps(a, c);
		__m256 hzij = _mm256_unpacklo_ps(b, d);
		__m256 vwij = _mm256_unpackhi_ps(b, d);

		// squared edge lengths, biased by epsilon so rsqrt stays finite
		__m256 e2ij = fmadd_ps<avx>(hxij, hxij, fmadd_ps<avx>(hyij, hyij, fmadd_ps<avx>(hzij, hzij, sEpsilon)));

		__m256 rij = _mm256_load_ps(rIt);
		// constraints with rest value <= epsilon are masked out (no-ops)
		__m256 mask = _mm256_cmp_ps(rij, sEpsilon, _CMP_GT_OQ);
		// erij = 1 - rest / length (rsqrt approximation), zeroed where masked
		__m256 erij = _mm256_and_ps(fnmadd_ps<avx>(rij, _mm256_rsqrt_ps(e2ij), sOne), mask);

		if(useMultiplier)
		{
			// soften the response outside [compressionLimit, stretchLimit]
			erij = fnmadd_ps<avx>(multiplier, _mm256_max_ps(compressionLimit, _mm256_min_ps(erij, stretchLimit)), erij);
		}

		// per-edge scale: stiffness * error / (wi + wj), rcp approximation
		__m256 exij = _mm256_mul_ps(erij, _mm256_mul_ps(stiffness, _mm256_rcp_ps(_mm256_add_ps(sEpsilon, vwij))));

		// replace these two instructions with _mm_maskstore_ps below?
		// split the 8 scalars so each permute below can address its pair
		__m256 exlo = _mm256_and_ps(sMaskXY, exij);
		__m256 exhi = _mm256_andnot_ps(sMaskXY, exij);

		// apply the correction: i moves by +f * wi along hij, j by -f * wj
		// (permute 0xff broadcasts each particle's w component)
		__m256 f04ij = _mm256_mul_ps(h04ij, _mm256_permute_ps(exlo, 0xc0));
		__m256 u04i = fmadd_ps<avx>(f04ij, _mm256_permute_ps(v04i, 0xff), v04i);
		__m256 u04j = fnmadd_ps<avx>(f04ij, _mm256_permute_ps(v04j, 0xff), v04j);

		_mm_store_ps(p0i, _mm256_extractf128_ps(u04i, 0));
		_mm_store_ps(p0j, _mm256_extractf128_ps(u04j, 0));
		_mm_store_ps(p4i, _mm256_extractf128_ps(u04i, 1));
		_mm_store_ps(p4j, _mm256_extractf128_ps(u04j, 1));

		__m256 f15ij = _mm256_mul_ps(h15ij, _mm256_permute_ps(exlo, 0xd5));
		__m256 u15i = fmadd_ps<avx>(f15ij, _mm256_permute_ps(v15i, 0xff), v15i);
		__m256 u15j = fnmadd_ps<avx>(f15ij, _mm256_permute_ps(v15j, 0xff), v15j);

		_mm_store_ps(p1i, _mm256_extractf128_ps(u15i, 0));
		_mm_store_ps(p1j, _mm256_extractf128_ps(u15j, 0));
		_mm_store_ps(p5i, _mm256_extractf128_ps(u15i, 1));
		_mm_store_ps(p5j, _mm256_extractf128_ps(u15j, 1));

		__m256 f26ij = _mm256_mul_ps(h26ij, _mm256_permute_ps(exhi, 0x2a));
		__m256 u26i = fmadd_ps<avx>(f26ij, _mm256_permute_ps(v26i, 0xff), v26i);
		__m256 u26j = fnmadd_ps<avx>(f26ij, _mm256_permute_ps(v26j, 0xff), v26j);

		_mm_store_ps(p2i, _mm256_extractf128_ps(u26i, 0));
		_mm_store_ps(p2j, _mm256_extractf128_ps(u26j, 0));
		_mm_store_ps(p6i, _mm256_extractf128_ps(u26i, 1));
		_mm_store_ps(p6j, _mm256_extractf128_ps(u26j, 1));

		__m256 f37ij = _mm256_mul_ps(h37ij, _mm256_permute_ps(exhi, 0x3f));
		__m256 u37i = fmadd_ps<avx>(f37ij, _mm256_permute_ps(v37i, 0xff), v37i);
		__m256 u37j = fnmadd_ps<avx>(f37ij, _mm256_permute_ps(v37j, 0xff), v37j);

		_mm_store_ps(p3i, _mm256_extractf128_ps(u37i, 0));
		_mm_store_ps(p3j, _mm256_extractf128_ps(u37j, 0));
		_mm_store_ps(p7i, _mm256_extractf128_ps(u37i, 1));
		_mm_store_ps(p7j, _mm256_extractf128_ps(u37j, 1));
	}

	// clear upper ymm state to avoid AVX/SSE transition penalties in callers
	_mm256_zeroupper();
}
+
+#ifdef _M_IX86
+
+// clang-format:disable
+
+/* full template specializations of above functions in assembler */
+
// AVX without useMultiplier
//
// Hand-scheduled 32-bit MSVC inline-assembly specialization of the intrinsics
// template above (x86 builds only, see the enclosing #ifdef _M_IX86); same
// algorithm, about 10% faster per the comment on the template.
// NOTE(review): any change to the intrinsics path must be mirrored here.
template <>
void solveConstraints<false, 1>(float* __restrict posIt, const float* __restrict rIt,
                                const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	__m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);

	__m256 vtmp[8], htmp[4]; // spill space: loaded particle positions / edge vectors
	float* ptmp[16];         // byte offsets of the 16 particles (index << 4), reused for the stores

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
		movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
		movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
		movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
		movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp   ], ymm0 // h04ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
		movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
		movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
		movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
		movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

		vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

		vunpcklps ymm1, ymm0, ymm2 // a
		vunpckhps ymm3, ymm0, ymm2 // b
		vunpcklps ymm5, ymm4, ymm6 // c
		vunpckhps ymm7, ymm4, ymm6 // d

		vunpcklps ymm0, ymm1, ymm5 // hxij
		vunpckhps ymm2, ymm1, ymm5 // hyij
		vunpcklps ymm4, ymm3, ymm7 // hzij
		vunpckhps ymm6, ymm3, ymm7 // vwij

		vmovaps ymm7, sEpsilon
		vmovaps ymm5, sOne
		vmovaps ymm3, stiffness
		vmovaps ymm1, YMMWORD PTR [edx] // rij

		vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
		vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
		vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4

		vcmpgt_oqps ymm2, ymm1, ymm7 // mask
		vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
		vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
		vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

		vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

		vmovaps ymm7, sMaskXY
		vandps ymm7, ymm7, ymm6 // exlo
		vxorps ymm6, ymm6, ymm7 // exhi

		vmovaps ymm4, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm0, YMMWORD PTR [vtmp   ] // v04i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

		vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
		mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
		mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
		mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

		vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
		vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

		vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
		mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
		mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
		mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

		vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
		vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
		vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

		vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
		mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
		mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
		mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

		vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
		vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
		vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

		vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
		mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
		mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
		mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

		add eax, 32
		add edx, 32

		cmp edx, esi
		jb forBegin
forEnd:
	}

	// clear upper ymm state to avoid AVX/SSE transition penalties in callers
	_mm256_zeroupper();
}
+
// AVX with useMultiplier
//
// Identical to solveConstraints<false, 1> above except for the stiffness
// setup (all four packed lanes are splatted) and the clamp/multiplier block
// in the middle; see the intrinsics template for the algorithm.
// NOTE(review): must be kept in sync with the intrinsics path.
template <>
void solveConstraints<true, 1>(float* __restrict posIt, const float* __restrict rIt,
                               const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
{
	// lanes of stiffnessRef: x = stiffness, y = multiplier,
	// z = compression limit, w = stretch limit
	__m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
	__m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
	__m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
	__m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
	stiffness = _mm256_permute_ps(stiffness, 0x00);

	__m256 vtmp[8], htmp[4]; // spill space: loaded particle positions / edge vectors
	float* ptmp[16];         // byte offsets of the 16 particles (index << 4), reused for the stores

	__asm
	{
		mov edx, rIt
		mov esi, rEnd

		cmp edx, esi
		jae forEnd

		mov eax, iIt
		mov ecx, posIt

forBegin:
		movzx edi, WORD PTR [eax   ] __asm shl edi, 4 __asm mov [ptmp   ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
		movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
		movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
		movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
		movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
		movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
		movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
		movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp    ], ymm0 // v04i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm0, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp   ], ymm0 // h04ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm4, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+32], ymm4 // h15ij

		movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
		movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
		movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
		movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
		movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
		movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
		movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
		movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j

		vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
		vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
		vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
		vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j

		vmovaps ymm7, sMinusOneXYZOneW
		vmulps ymm2, ymm2, ymm7 __asm vaddps ymm2, ymm0, ymm2 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
		vmulps ymm6, ymm6, ymm7 __asm vaddps ymm6, ymm4, ymm6 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij

		vmovaps ymm0, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij

		vunpcklps ymm1, ymm0, ymm2 // a
		vunpckhps ymm3, ymm0, ymm2 // b
		vunpcklps ymm5, ymm4, ymm6 // c
		vunpckhps ymm7, ymm4, ymm6 // d

		vunpcklps ymm0, ymm1, ymm5 // hxij
		vunpckhps ymm2, ymm1, ymm5 // hyij
		vunpcklps ymm4, ymm3, ymm7 // hzij
		vunpckhps ymm6, ymm3, ymm7 // vwij

		vmovaps ymm7, sEpsilon
		vmovaps ymm5, sOne
		vmovaps ymm3, stiffness
		vmovaps ymm1, YMMWORD PTR [edx] // rij

		vmulps ymm0, ymm0, ymm0 __asm vaddps ymm0, ymm0, ymm7 // e2ij
		vmulps ymm2, ymm2, ymm2 __asm vaddps ymm0, ymm0, ymm2
		vmulps ymm4, ymm4, ymm4 __asm vaddps ymm0, ymm0, ymm4

		vcmpgt_oqps ymm2, ymm1, ymm7 // mask
		vrsqrtps ymm0, ymm0 __asm vmulps ymm0, ymm0, ymm1 // erij
		vsubps ymm5, ymm5, ymm0 __asm vandps ymm5, ymm5, ymm2
		vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6

		vmovaps ymm0, stretchLimit // multiplier block: erij -= multiplier * clamp(erij)
		vmovaps ymm1, compressionLimit
		vmovaps ymm2, multiplier
		vminps ymm0, ymm0, ymm5
		vmaxps ymm1, ymm1, ymm0
		vmulps ymm2, ymm2, ymm1
		vsubps ymm5, ymm5, ymm2

		vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij

		vmovaps ymm7, sMaskXY
		vandps ymm7, ymm7, ymm6 // exlo
		vxorps ymm6, ymm6, ymm7 // exhi

		vmovaps ymm4, YMMWORD PTR [htmp   ] // h04ij
		vmovaps ymm0, YMMWORD PTR [vtmp   ] // v04i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j

		vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u04i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u04j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp   ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
		mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
		mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
		mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j

		vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
		vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
		vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j

		vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u15i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u15j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
		mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
		mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
		mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j

		vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
		vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
		vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j

		vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u26i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u26j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
		mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
		mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
		mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j

		vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
		vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
		vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j

		vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
		vpermilps ymm2, ymm0, 0xff __asm vmulps ymm2, ymm2, ymm4 __asm vsubps ymm0, ymm0, ymm2 // u37i
		vpermilps ymm3, ymm1, 0xff __asm vmulps ymm3, ymm3, ymm4 __asm vaddps ymm1, ymm1, ymm3 // u37j

		vextractf128 xmm2, ymm0, 1
		vextractf128 xmm3, ymm1, 1

		mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
		mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
		mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
		mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j

		add eax, 32
		add edx, 32

		cmp edx, esi
		jb forBegin
forEnd:
	}

	// clear upper ymm state to avoid AVX/SSE transition penalties in callers
	_mm256_zeroupper();
}
+
+#if _MSC_VER >= 1700
+// AVX2 without useMultiplier
+template <>
+void solveConstraints<false, 2>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ss((const float*)&stiffnessRef);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp ], ymm2 // h04ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vfmadd213ps ymm4, ymm4, ymm7 // e2ij
+ vfmadd213ps ymm2, ymm2, ymm4
+ vfmadd213ps ymm0, ymm0, ymm2
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
+ vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+
+// AVX2 with useMultiplier
+template <>
+void solveConstraints<true, 2>(float* __restrict posIt, const float* __restrict rIt,
+ const float* __restrict rEnd, const uint16_t* __restrict iIt, const __m128& stiffnessRef)
+{
+ __m256 stiffness = _mm256_broadcast_ps(&stiffnessRef);
+ __m256 stretchLimit = _mm256_permute_ps(stiffness, 0xff);
+ __m256 compressionLimit = _mm256_permute_ps(stiffness, 0xaa);
+ __m256 multiplier = _mm256_permute_ps(stiffness, 0x55);
+ stiffness = _mm256_permute_ps(stiffness, 0x00);
+
+ __m256 vtmp[8], htmp[4];
+ float* ptmp[16];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v0i
+ movzx edi, WORD PTR [eax+16] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v4i
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v0j
+ movzx edi, WORD PTR [eax+18] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v4j
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v1i
+ movzx edi, WORD PTR [eax+20] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v5i
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v1j
+ movzx edi, WORD PTR [eax+22] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v5j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp ], ymm0 // v04i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+ 32], ymm2 // v04j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+ 64], ymm4 // v15i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+ 96], ymm6 // v15j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp ], ymm2 // h04ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+32], ymm6 // h15ij
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+32], edi __asm vmovaps xmm0, XMMWORD PTR [edi + ecx] // v2i
+ movzx edi, WORD PTR [eax+24] __asm shl edi, 4 __asm mov [ptmp+36], edi __asm vmovaps xmm1, XMMWORD PTR [edi + ecx] // v6i
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+40], edi __asm vmovaps xmm2, XMMWORD PTR [edi + ecx] // v2j
+ movzx edi, WORD PTR [eax+26] __asm shl edi, 4 __asm mov [ptmp+44], edi __asm vmovaps xmm3, XMMWORD PTR [edi + ecx] // v6j
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+48], edi __asm vmovaps xmm4, XMMWORD PTR [edi + ecx] // v3i
+ movzx edi, WORD PTR [eax+28] __asm shl edi, 4 __asm mov [ptmp+52], edi __asm vmovaps xmm5, XMMWORD PTR [edi + ecx] // v7i
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+56], edi __asm vmovaps xmm6, XMMWORD PTR [edi + ecx] // v3j
+ movzx edi, WORD PTR [eax+30] __asm shl edi, 4 __asm mov [ptmp+60], edi __asm vmovaps xmm7, XMMWORD PTR [edi + ecx] // v7j
+
+ vinsertf128 ymm0, ymm0, xmm1, 1 __asm vmovaps YMMWORD PTR [vtmp+128], ymm0 // v26i
+ vinsertf128 ymm2, ymm2, xmm3, 1 __asm vmovaps YMMWORD PTR [vtmp+160], ymm2 // v26j
+ vinsertf128 ymm4, ymm4, xmm5, 1 __asm vmovaps YMMWORD PTR [vtmp+192], ymm4 // v37i
+ vinsertf128 ymm6, ymm6, xmm7, 1 __asm vmovaps YMMWORD PTR [vtmp+224], ymm6 // v37j
+
+ vmovaps ymm7, sMinusOneXYZOneW
+ vfmadd213ps ymm2, ymm7, ymm0 __asm vmovaps YMMWORD PTR [htmp+64], ymm2 // h26ij
+ vfmadd213ps ymm6, ymm7, ymm4 __asm vmovaps YMMWORD PTR [htmp+96], ymm6 // h37ij
+
+ vmovaps ymm0, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm4, YMMWORD PTR [htmp+32] // h15ij
+
+ vunpcklps ymm1, ymm0, ymm2 // a
+ vunpckhps ymm3, ymm0, ymm2 // b
+ vunpcklps ymm5, ymm4, ymm6 // c
+ vunpckhps ymm7, ymm4, ymm6 // d
+
+ vunpcklps ymm0, ymm1, ymm5 // hxij
+ vunpckhps ymm2, ymm1, ymm5 // hyij
+ vunpcklps ymm4, ymm3, ymm7 // hzij
+ vunpckhps ymm6, ymm3, ymm7 // vwij
+
+ vmovaps ymm7, sEpsilon
+ vmovaps ymm5, sOne
+ vmovaps ymm3, stiffness
+ vmovaps ymm1, YMMWORD PTR [edx] // rij
+
+ vfmadd213ps ymm4, ymm4, ymm7 // e2ij
+ vfmadd213ps ymm2, ymm2, ymm4
+ vfmadd213ps ymm0, ymm0, ymm2
+
+ vcmpgt_oqps ymm2, ymm1, ymm7 // mask
+ vrsqrtps ymm0, ymm0 __asm vfnmadd231ps ymm5, ymm0, ymm1 // erij
+ vandps ymm5, ymm5, ymm2
+ vaddps ymm6, ymm6, ymm7 __asm vrcpps ymm6, ymm6
+
+ vmovaps ymm0, stretchLimit // multiplier block
+ vmovaps ymm1, compressionLimit
+ vmovaps ymm2, multiplier
+ vminps ymm0, ymm0, ymm5
+ vmaxps ymm1, ymm1, ymm0
+ vfnmadd231ps ymm5, ymm1, ymm2
+
+ vmulps ymm6, ymm6, ymm3 __asm vmulps ymm6, ymm6, ymm5 // exij
+
+ vmovaps ymm7, sMaskXY
+ vandps ymm7, ymm7, ymm6 // exlo
+ vxorps ymm6, ymm6, ymm7 // exhi
+
+ vmovaps ymm4, YMMWORD PTR [htmp ] // h04ij
+ vmovaps ymm0, YMMWORD PTR [vtmp ] // v04i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 32] // v04j
+
+ vpermilps ymm5, ymm7, 0xc0 __asm vmulps ymm4, ymm4, ymm5 // f04ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u04i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u04j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp ] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v0i
+ mov edi, [ptmp+ 8] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v0j
+ mov edi, [ptmp+ 4] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v4i
+ mov edi, [ptmp+12] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v4j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 32] // h15ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+ 64] // v15i
+ vmovaps ymm1, YMMWORD PTR [vtmp+ 96] // v15j
+
+ vpermilps ymm5, ymm7, 0xd5 __asm vmulps ymm4, ymm4, ymm5 // f15ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u15i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u15j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+16] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v1i
+ mov edi, [ptmp+24] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v1j
+ mov edi, [ptmp+20] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v5i
+ mov edi, [ptmp+28] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v5j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 64] // h26ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+128] // v26i
+ vmovaps ymm1, YMMWORD PTR [vtmp+160] // v26j
+
+ vpermilps ymm5, ymm6, 0x2a __asm vmulps ymm4, ymm4, ymm5 // f26ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u26i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u26j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+32] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v2i
+ mov edi, [ptmp+40] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v2j
+ mov edi, [ptmp+36] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v6i
+ mov edi, [ptmp+44] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v6j
+
+ vmovaps ymm4, YMMWORD PTR [htmp+ 96] // h37ij
+ vmovaps ymm0, YMMWORD PTR [vtmp+192] // v37i
+ vmovaps ymm1, YMMWORD PTR [vtmp+224] // v37j
+
+ vpermilps ymm5, ymm6, 0x3f __asm vmulps ymm4, ymm4, ymm5 // f37ij
+ vpermilps ymm2, ymm0, 0xff __asm vfnmadd231ps ymm0, ymm2, ymm4 // u37i
+ vpermilps ymm3, ymm1, 0xff __asm vfmadd231ps ymm1, ymm3, ymm4 // u37j
+
+ vextractf128 xmm2, ymm0, 1
+ vextractf128 xmm3, ymm1, 1
+
+ mov edi, [ptmp+48] __asm vmovaps XMMWORD PTR [edi + ecx], xmm0 // v3i
+ mov edi, [ptmp+56] __asm vmovaps XMMWORD PTR [edi + ecx], xmm1 // v3j
+ mov edi, [ptmp+52] __asm vmovaps XMMWORD PTR [edi + ecx], xmm2 // v7i
+ mov edi, [ptmp+60] __asm vmovaps XMMWORD PTR [edi + ecx], xmm3 // v7j
+
+ add eax, 32
+ add edx, 32
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+
+ _mm256_zeroupper();
+}
+#endif // _MSC_VER >= 1700
+
+// clang-format:enable
+
+#else // _M_IX86
+
+template void solveConstraints<false, 1>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<true, 1>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<false, 2>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+template void solveConstraints<true, 2>(float* __restrict, const float* __restrict, const float* __restrict,
+ const uint16_t* __restrict, const __m128&);
+
+#endif // _M_IX86
+
+} // namespace avx
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp
new file mode 100644
index 00000000..01f1fb50
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonCollision.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwCollision.cpp"
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp
new file mode 100644
index 00000000..d272bb6d
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSelfCollision.cpp
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwSelfCollision.cpp"
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
new file mode 100644
index 00000000..068c900a
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/NeonSolverKernel.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#ifndef __ARM_NEON__
+#error This file needs to be compiled with NEON support!
+#endif
+
+#include "SwSolverKernel.cpp"
+
+#include <cpu-features.h>
+
+namespace nvidia
+{
+namespace cloth
+{
+bool neonSolverKernel(SwCloth const& cloth, SwClothData& data, SwKernelAllocator& allocator,
+ IterationStateFactory& factory, PxProfileZone* profileZone)
+{
+ return ANDROID_CPU_ARM_FEATURE_NEON & android_getCpuFeatures() &&
+ (SwSolverKernel<Simd4f>(cloth, data, allocator, factory, profileZone)(), true);
+}
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
new file mode 100644
index 00000000..0c0b884c
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4f.h
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline Simd4fFactory<const float&>::operator Simd4f() const
+{
+ return vdupq_n_f32(reinterpret_cast<const float32_t&>(v));
+}
+
+inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
+{
+ return reinterpret_cast<const Simd4f&>(v);
+}
+
+template <int i>
+inline Simd4fFactory<detail::IntType<i> >::operator Simd4f() const
+{
+ return vdupq_n_u32(i);
+}
+
+template <>
+inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
+{
+ return vdupq_n_f32(1.0f);
+}
+
+template <>
+inline Simd4fFactory<const float*>::operator Simd4f() const
+{
+ return vld1q_f32((const float32_t*)v);
+}
+
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
+{
+ return vld1q_f32((const float32_t*)v.ptr);
+}
+
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
+{
+ return vld1q_f32(reinterpret_cast<const float32_t*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression templates
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline ComplementExpr<Simd4f>::operator Simd4f() const
+{
+ return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+ return vbicq_u32(v.u4, complement.v.u4);
+}
+
+Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+ return vbicq_u32(v.u4, complement.v.u4);
+}
+
+ProductExpr::operator Simd4f() const
+{
+ return vmulq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator+(const ProductExpr& p, const Simd4f& v)
+{
+ return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const Simd4f& v, const ProductExpr& p)
+{
+ return vmlaq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator+(const ProductExpr& p0, const ProductExpr& p1)
+{
+ // cast calls operator Simd4f() which evaluates the other ProductExpr
+ return vmlaq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+Simd4f operator-(const Simd4f& v, const ProductExpr& p)
+{
+ return vmlsq_f32(v.f4, p.v0.f4, p.v1.f4);
+}
+
+Simd4f operator-(const ProductExpr& p0, const ProductExpr& p1)
+{
+ // cast calls operator Simd4f() which evaluates the other ProductExpr
+ return vmlsq_f32(static_cast<Simd4f>(p0).f4, p1.v0.f4, p1.v1.f4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+ return vceqq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+ return vcltq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+ return vcleq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+ return vcgtq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+ return vcgeq_f32(v0.f4, v1.f4);
+}
+
+ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+ return ComplementExpr<Simd4f>(v);
+}
+
+Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+ return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+ return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+ return veorq_u32(v0.u4, v1.u4);
+}
+
+Simd4f operator<<(const Simd4f& v, int shift)
+{
+ return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4f operator>>(const Simd4f& v, int shift)
+{
+ return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+Simd4f operator<<(const Simd4f& v, const Simd4f& shift)
+{
+ return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4f operator>>(const Simd4f& v, const Simd4f& shift)
+{
+ return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4f operator+(const Simd4f& v)
+{
+ return v;
+}
+
+Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+ return vaddq_f32(v0.f4, v1.f4);
+}
+
+Simd4f operator-(const Simd4f& v)
+{
+ return vnegq_f32(v.f4);
+}
+
+Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+ return vsubq_f32(v0.f4, v1.f4);
+}
+
+ProductExpr operator*(const Simd4f& v0, const Simd4f& v1)
+{
+ return ProductExpr(v0, v1);
+}
+
+Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+ return v0 * vrecpeq_f32(v1.f4); // reciprocal estimate
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4f simd4f(const Simd4i& v)
+{
+ return v.u4;
+}
+
+float (&array(Simd4f& v))[4]
+{
+ return (float(&)[4])v;
+}
+
+const float (&array(const Simd4f& v))[4]
+{
+ return (const float(&)[4])v;
+}
+
+void store(float* ptr, Simd4f const& v)
+{
+ return vst1q_f32((float32_t*)ptr, v.f4);
+}
+
+void storeAligned(float* ptr, Simd4f const& v)
+{
+ return vst1q_f32((float32_t*)ptr, v.f4);
+}
+
+void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+ return storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+template <size_t i>
+Simd4f splat(Simd4f const& v)
+{
+ return vdupq_n_f32(array(v)[i]);
+}
+
+Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+ return vbslq_f32(mask.u4, v0.f4, v1.f4);
+}
+
+Simd4f abs(const Simd4f& v)
+{
+ return vabsq_f32(v.f4);
+}
+
+Simd4f floor(const Simd4f& v)
+{
+ int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
+ return vcvtq_f32_s32(vsubq_s32(vcvtq_s32_f32(v.f4), neg));
+}
+
+Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+ return vmaxq_f32(v0.f4, v1.f4);
+}
+
+Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+ return vminq_f32(v0.f4, v1.f4);
+}
+
+Simd4f recip(const Simd4f& v)
+{
+ return recipT<0>(v);
+}
+
+template <int n>
+Simd4f recipT(const Simd4f& v)
+{
+ Simd4f recipV = vrecpeq_f32(v.f4);
+ // n+1 newton iterations because initial approximation is crude
+ for(int i = 0; i <= n; ++i)
+ recipV = vrecpsq_f32(v.f4, recipV.f4) * recipV;
+ return recipV;
+}
+
+Simd4f sqrt(const Simd4f& v)
+{
+ return v * rsqrt(v);
+}
+
+Simd4f rsqrt(const Simd4f& v)
+{
+ return rsqrtT<0>(v);
+}
+
+template <int n>
+Simd4f rsqrtT(const Simd4f& v)
+{
+ Simd4f rsqrtV = vrsqrteq_f32(v.f4);
+ // n+1 newton iterations because initial approximation is crude
+ for(int i = 0; i <= n; ++i)
+ rsqrtV = vrsqrtsq_f32(vmulq_f32(v.f4, rsqrtV.f4), rsqrtV.f4) * rsqrtV;
+ return rsqrtV;
+}
+
+Simd4f exp2(const Simd4f& v)
+{
+ // http://www.netlib.org/cephes/
+
+ Simd4f limit = simd4f(127.4999f);
+ Simd4f x = min(max(-limit, v), limit);
+
+ // separate into integer and fractional part
+
+ Simd4f fx = x + simd4f(0.5f);
+ Simd4i ix = vsubq_s32(vcvtq_s32_f32(fx.f4), vreinterpretq_s32_u32(vshrq_n_u32(fx.u4, 31)));
+ fx = x - vcvtq_f32_s32(ix.i4);
+
+ // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
+
+ Simd4f fx2 = fx * fx;
+
+ Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
+ fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
+ Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);
+
+ Simd4f exp2fx = px * recip(qx - px);
+ exp2fx = simd4f(_1) + exp2fx + exp2fx;
+
+ // exp2(ix)
+
+ Simd4f exp2ix = vreinterpretq_f32_s32(vshlq_n_s32(vaddq_s32(ix.i4, vdupq_n_s32(0x7f)), 23));
+
+ return exp2fx * exp2ix;
+}
+
+Simd4f log2(const Simd4f& v)
+{
+ Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
+ const float* ptr = array(v);
+ return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale;
+}
+
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+ Simd4f tmp = v0 * v1;
+ return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+ float32x2_t x0_y0 = vget_low_f32(v0.f4);
+ float32x2_t z0_w0 = vget_high_f32(v0.f4);
+ float32x2_t x1_y1 = vget_low_f32(v1.f4);
+ float32x2_t z1_w1 = vget_high_f32(v1.f4);
+
+ float32x2_t y1_z1 = vext_f32(x1_y1, z1_w1, 1);
+ float32x2_t y0_z0 = vext_f32(x0_y0, z0_w0, 1);
+
+ float32x2_t z0x1_w0y1 = vmul_f32(z0_w0, x1_y1);
+ float32x2_t x0y1_y0z1 = vmul_f32(x0_y0, y1_z1);
+
+ float32x2_t y2_w2 = vmls_f32(z0x1_w0y1, x0_y0, z1_w1);
+ float32x2_t z2_x2 = vmls_f32(x0y1_y0z1, y0_z0, x1_y1);
+ float32x2_t x2_y2 = vext_f32(z2_x2, y2_w2, 1);
+
+ return vcombine_f32(x2_y2, z2_x2);
+}
+
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+#if NVMATH_INLINE_ASSEMBLER
+ asm volatile("vzip.f32 %q0, %q2 \n\t"
+ "vzip.f32 %q1, %q3 \n\t"
+ "vzip.f32 %q0, %q1 \n\t"
+ "vzip.f32 %q2, %q3 \n\t"
+ : "+w"(x.f4), "+w"(y.f4), "+w"(z.f4), "+w"(w.f4));
+#else
+ float32x4x2_t v0v1 = vzipq_f32(x.f4, z.f4);
+ float32x4x2_t v2v3 = vzipq_f32(y.f4, w.f4);
+ float32x4x2_t zip0 = vzipq_f32(v0v1.val[0], v2v3.val[0]);
+ float32x4x2_t zip1 = vzipq_f32(v0v1.val[1], v2v3.val[1]);
+
+ x = zip0.val[0];
+ y = zip0.val[1];
+ z = zip1.val[0];
+ w = zip1.val[1];
+#endif
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 == v1);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 == v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 > v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 > v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 >= v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 >= v1);
+}
+
+int allTrue(const Simd4f& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+ int result;
+ asm volatile("vmovq q0, %q1 \n\t"
+ "vand.u32 d0, d0, d1 \n\t"
+ "vpmin.u32 d0, d0, d0 \n\t"
+ "vcmp.f32 s0, #0 \n\t"
+ "fmrx %0, fpscr"
+ : "=r"(result)
+ : "w"(v.f4)
+ : "q0");
+ return result >> 28 & 0x1;
+#else
+ uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+ uint16x4_t lo = vmovn_u32(v.u4);
+ uint16x8_t combined = vcombine_u16(lo, hi);
+ uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+ return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+int anyTrue(const Simd4f& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+ int result;
+ asm volatile("vmovq q0, %q1 \n\t"
+ "vorr.u32 d0, d0, d1 \n\t"
+ "vpmax.u32 d0, d0, d0 \n\t"
+ "vcmp.f32 s0, #0 \n\t"
+ "fmrx %0, fpscr"
+ : "=r"(result)
+ : "w"(v.f4)
+ : "q0");
+ return result >> 28 & 0x1;
+#else
+ uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+ uint16x4_t lo = vmovn_u32(v.u4);
+ uint16x8_t combined = vcombine_u16(lo, hi);
+ uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+ return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
new file mode 100644
index 00000000..7a566256
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/Simd4i.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline Simd4iFactory<const int&>::operator Simd4i() const
+{
+ return vdupq_n_s32(v);
+}
+
+inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
+{
+ return reinterpret_cast<const Simd4i&>(v);
+}
+
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
+{
+ return vdupq_n_u32(i);
+}
+
+template <>
+inline Simd4iFactory<const int*>::operator Simd4i() const
+{
+ return vld1q_s32(v);
+}
+
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
+{
+ return vld1q_s32(v.ptr);
+}
+
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
+{
+ return vld1q_s32(reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <>
+inline ComplementExpr<Simd4i>::operator Simd4i() const
+{
+ return vbicq_u32(vdupq_n_u32(0xffffffff), v.u4);
+}
+
+Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v)
+{
+ return vbicq_u32(v.u4, complement.v.u4);
+}
+
+Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement)
+{
+ return vbicq_u32(v.u4, complement.v.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1)
+{
+ return vceqq_u32(v0.u4, v1.u4);
+}
+
+Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1)
+{
+ return vcltq_s32(v0.i4, v1.i4);
+}
+
+Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1)
+{
+ return vcgtq_s32(v0.i4, v1.i4);
+}
+
+ComplementExpr<Simd4i> operator~(const Simd4i& v)
+{
+ return ComplementExpr<Simd4i>(v);
+}
+
+Simd4i operator&(const Simd4i& v0, const Simd4i& v1)
+{
+ return vandq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator|(const Simd4i& v0, const Simd4i& v1)
+{
+ return vorrq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator^(const Simd4i& v0, const Simd4i& v1)
+{
+ return veorq_u32(v0.u4, v1.u4);
+}
+
+Simd4i operator<<(const Simd4i& v, int shift)
+{
+ return vshlq_u32(v.u4, vdupq_n_s32(shift));
+}
+
+Simd4i operator>>(const Simd4i& v, int shift)
+{
+ return vshlq_u32(v.u4, vdupq_n_s32(-shift));
+}
+
+Simd4i operator<<(const Simd4i& v, const Simd4i& shift)
+{
+ return vshlq_u32(v.u4, shift.i4);
+}
+
+Simd4i operator>>(const Simd4i& v, const Simd4i& shift)
+{
+ return vshlq_u32(v.u4, vnegq_s32(shift.i4));
+}
+
+Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1)
+{
+ return vaddq_u32(v0.u4, v1.u4);
+}
+
+Simd4i simdi::operator-(const Simd4i& v)
+{
+ return vnegq_s32(v.i4);
+}
+
+Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1)
+{
+ return vsubq_u32(v0.u4, v1.u4);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+Simd4i simd4i(const Simd4f& v)
+{
+ return v.u4;
+}
+
+int (&simdi::array(Simd4i& v))[4]
+{
+ return (int(&)[4])v;
+}
+
+const int (&simdi::array(const Simd4i& v))[4]
+{
+ return (const int(&)[4])v;
+}
+
+void store(int* ptr, const Simd4i& v)
+{
+ return vst1q_s32(ptr, v.i4);
+}
+
+void storeAligned(int* ptr, const Simd4i& v)
+{
+ vst1q_s32(ptr, v.i4);
+}
+
+void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+ return storeAligned(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+template <size_t i>
+Simd4i splat(Simd4i const& v)
+{
+ return vdupq_n_s32(simdi::array(v)[i]);
+}
+
+Simd4i select(Simd4i const& mask, Simd4i const& v0, Simd4i const& v1)
+{
+ return vbslq_u32(mask.u4, v0.u4, v1.u4);
+}
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ return allTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return allTrue(outMask = simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ return anyTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return anyTrue(outMask = simdi::operator==(v0, v1));
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ return allTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return allTrue(outMask = simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ return anyTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return anyTrue(outMask = simdi::operator>(v0, v1));
+}
+
+int allTrue(const Simd4i& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+ int result;
+ asm volatile("vmovq q0, %q1 \n\t"
+ "vand.u32 d0, d0, d1 \n\t"
+ "vpmin.u32 d0, d0, d0 \n\t"
+ "vcmp.f32 s0, #0 \n\t"
+ "fmrx %0, fpscr"
+ : "=r"(result)
+ : "w"(v.u4)
+ : "q0");
+ return result >> 28 & 0x1;
+#else
+ uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+ uint16x4_t lo = vmovn_u32(v.u4);
+ uint16x8_t combined = vcombine_u16(lo, hi);
+ uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+ return vget_lane_u32(reduced, 0) == 0xffffffff;
+#endif
+}
+
+int anyTrue(const Simd4i& v)
+{
+#if NVMATH_INLINE_ASSEMBLER
+ int result;
+ asm volatile("vmovq q0, %q1 \n\t"
+ "vorr.u32 d0, d0, d1 \n\t"
+ "vpmax.u32 d0, d0, d0 \n\t"
+ "vcmp.f32 s0, #0 \n\t"
+ "fmrx %0, fpscr"
+ : "=r"(result)
+ : "w"(v.u4)
+ : "q0");
+ return result >> 28 & 0x1;
+#else
+ uint16x4_t hi = vget_high_u16(vreinterpretq_u16_u32(v.u4));
+ uint16x4_t lo = vmovn_u32(v.u4);
+ uint16x8_t combined = vcombine_u16(lo, hi);
+ uint32x2_t reduced = vreinterpret_u32_u8(vmovn_u16(combined));
+ return vget_lane_u32(reduced, 0) != 0x0;
+#endif
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
new file mode 100644
index 00000000..542fac08
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SimdTypes.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include <arm_neon.h>
+
// 4-lane float SIMD register for the NEON back end. The union overlays the
// float, unsigned and signed NEON vector views of the same 128 bits so the
// math code can reinterpret lanes without explicit vreinterpretq calls.
union Simd4f
{
	// Lanes start uninitialized, like a hardware register.
	Simd4f()
	{
	}
	Simd4f(const float32x4_t& v) : f4(v)
	{
	}
#ifndef _M_ARM // all *32x4_t map to the same type
	// On MSVC/ARM the three *32x4_t typedefs are the same type, so this
	// overload would be a redefinition of the one above.
	Simd4f(const uint32x4_t& v) : u4(v)
	{
	}
#endif
	float32x4_t f4;
	uint32x4_t u4;
	int32x4_t i4;
};
+
// 4-lane integer SIMD register for the NEON back end; overlays the unsigned
// and signed views of the same 128 bits.
union Simd4i
{
	// Lanes start uninitialized, like a hardware register.
	Simd4i()
	{
	}
	Simd4i(const uint32x4_t& v) : u4(v)
	{
	}
#ifndef _M_ARM // all *32x4_t map to the same type
	// See Simd4f: on MSVC/ARM this would collide with the overload above.
	Simd4i(const int32x4_t& v) : i4(v)
	{
	}
#endif
	uint32x4_t u4;
	int32x4_t i4;
};
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
new file mode 100644
index 00000000..b67f96aa
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/neon/SwCollisionHelpers.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifdef _M_ARM
+#include <arm_neon.h>
+#endif
+
namespace nvidia
{
namespace cloth
{

// Index of the highest set bit in mask. Undefined for mask == 0 (both
// vclz_u32 of 0 yielding 32 and __builtin_clz(0) make the result wrap).
uint32_t findBitSet(uint32_t mask)
{
#ifdef _M_ARM
	// MSVC/ARM has no __builtin_clz; use the NEON count-leading-zeros
	// intrinsic on a single-lane vector instead.
	__n64 t = { mask };
	return 31 - (vclz_u32(t)).n64_u32[0];
#else
	return 31 - __builtin_clz(mask);
#endif
}

// Per-lane float -> int conversion rounding toward negative infinity:
// truncate toward zero, then subtract 1 on lanes whose sign bit is set.
// NOTE(review): for exactly-integral negative inputs (e.g. -2.0f) this
// yields trunc-1 (-3), which differs from a true floor and from the scalar
// back end's intFloor - confirm callers never rely on that case.
Simd4i intFloor(const Simd4f& v)
{
	int32x4_t neg = vreinterpretq_s32_u32(vshrq_n_u32(v.u4, 31));
	return vsubq_s32(vcvtq_s32_f32(v.f4), neg);
}

// OR all four lanes together and broadcast the result to every lane.
Simd4i horizontalOr(Simd4i mask)
{
	using namespace simdi;
	uint32x2_t hi = vget_high_u32(mask.u4);
	uint32x2_t lo = vget_low_u32(mask.u4);
	uint32x2_t tmp = vorr_u32(lo, hi); // lane0|lane2, lane1|lane3
	uint32x2_t rev = vrev64_u32(tmp);  // swap the two partial results
	uint32x2_t res = vorr_u32(tmp, rev); // both lanes hold the full OR
	return vcombine_u32(res, res);
}

// Builds a byte-wise permute mask from four 32-bit lane indices (sPack,
// sOffset, sMask are class statics declared elsewhere). Out-of-range
// indices (> sMask) get their select bits forced on via the vcgtq mask, so
// the table lookup in operator() yields zero for those lanes (vtbl returns
// 0 for out-of-range byte indices).
// NOTE(review): vtbl1q_u8 is not a standard ACLE intrinsic name (aarch64
// uses vqtbl1_u8) - presumably provided by a local compatibility shim;
// confirm against the surrounding headers.
Gather<Simd4i>::Gather(const Simd4i& index)
{
#ifdef __arm64__
	using namespace simdi;
	PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
	uint8x16_t lohiIndex = reinterpret_cast<const uint8x16_t&>(index);
	byteIndex.val[0] = vtbl1q_u8(lohiIndex, byteIndex.val[0]);
	byteIndex.val[1] = vtbl1q_u8(lohiIndex, byteIndex.val[1]);
	// Scale lane indices to byte offsets (x4) and merge in the per-byte
	// offsets plus the out-of-range kill mask.
	mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
	mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
#else
	using namespace simdi;
	PX_ALIGN(16, uint8x8x2_t) byteIndex = reinterpret_cast<const uint8x8x2_t&>(sPack);
	uint8x8x2_t lohiIndex = reinterpret_cast<const uint8x8x2_t&>(index);
	byteIndex.val[0] = vtbl2_u8(lohiIndex, byteIndex.val[0]);
	byteIndex.val[1] = vtbl2_u8(lohiIndex, byteIndex.val[1]);
	mPermute = vshlq_n_u32(reinterpret_cast<const uint32x4_t&>(byteIndex), 2);
	mPermute = mPermute | sOffset | vcgtq_u32(index.u4, sMask.u4);
#endif
}

// Gathers four 32-bit elements from the table at ptr using the byte-wise
// permute mask prepared in the constructor.
Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
{
#ifdef __arm64__
	PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
	const uint8x16x2_t* table = reinterpret_cast<const uint8x16x2_t*>(ptr);
	result.val[0] = vtbl2q_u8(*table, result.val[0]);
	result.val[1] = vtbl2q_u8(*table, result.val[1]);
	return reinterpret_cast<const Simd4i&>(result);
#else
	PX_ALIGN(16, uint8x8x2_t) result = reinterpret_cast<const uint8x8x2_t&>(mPermute);
	const uint8x8x4_t* table = reinterpret_cast<const uint8x8x4_t*>(ptr);
	result.val[0] = vtbl4_u8(*table, result.val[0]);
	result.val[1] = vtbl4_u8(*table, result.val[1]);
	return reinterpret_cast<const Simd4i&>(result);
#endif
}

} // namespace cloth
} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h
new file mode 100644
index 00000000..d02d5066
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4f.h
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
// Factory specializations converting the generic Simd4fFactory wrappers
// (declared elsewhere) into the scalar back end's Scalar4f.

// Splat a single float across all four lanes.
template <>
inline Simd4fFactory<const float&>::operator Scalar4f() const
{
	return Scalar4f(v, v, v, v);
}

// Four explicit lane values, already laid out contiguously.
// (No template<> here: the FourTuple factory is presumably a full class
// specialization, so this is an ordinary member - confirm against the
// Simd4fFactory declaration.)
inline Simd4fFactory<detail::FourTuple>::operator Scalar4f() const
{
	return reinterpret_cast<const Scalar4f&>(v);
}

// Compile-time integer constant, splat as a float value.
template <int i>
inline Simd4fFactory<detail::IntType<i> >::operator Scalar4f() const
{
	float s = i;
	return Scalar4f(s, s, s, s);
}

// Sign-bit mask: goes through the int32_t constructor so the exact bit
// pattern 0x80000000 lands in every lane (not the float value).
template <>
inline Simd4fFactory<detail::IntType<0x80000000u> >::operator Scalar4f() const
{
	int32_t i = 0x80000000u;
	return Scalar4f(i, i, i, i);
}

// All-ones mask, again via the int32_t constructor to set raw bits.
template <>
inline Simd4fFactory<detail::IntType<0xffffffff> >::operator Scalar4f() const
{
	int32_t i = 0xffffffff;
	return Scalar4f(i, i, i, i);
}

// Unaligned load of four floats.
template <>
inline Simd4fFactory<const float*>::operator Scalar4f() const
{
	return Scalar4f(v[0], v[1], v[2], v[3]);
}

// Aligned load; alignment is irrelevant in the scalar back end.
template <>
inline Simd4fFactory<detail::AlignedPointer<float> >::operator Scalar4f() const
{
	return Scalar4f(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]);
}

// Load from a base pointer plus a byte offset.
template <>
inline Simd4fFactory<detail::OffsetPointer<float> >::operator Scalar4f() const
{
	const float* ptr = reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset);
	return Scalar4f(ptr[0], ptr[1], ptr[2], ptr[3]);
}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
// Materialize a stored complement: ~v on the raw 32-bit lane patterns.
template <>
inline ComplementExpr<Scalar4f>::operator Scalar4f() const
{
	return Scalar4f(~v.u4[0], ~v.u4[1], ~v.u4[2], ~v.u4[3]);
}

// Fused ~a & b: the complement expression is consumed without ever being
// materialized as a separate vector.
inline Scalar4f operator&(const ComplementExpr<Scalar4f>& complement, const Scalar4f& v)
{
	return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2],
	                v.u4[3] & ~complement.v.u4[3]);
}

// Symmetric overload: a & ~b.
inline Scalar4f operator&(const Scalar4f& v, const ComplementExpr<Scalar4f>& complement)
{
	return Scalar4f(v.u4[0] & ~complement.v.u4[0], v.u4[1] & ~complement.v.u4[1], v.u4[2] & ~complement.v.u4[2],
	                v.u4[3] & ~complement.v.u4[3]);
}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+inline Scalar4f operator==(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] == v1.f4[0], v0.f4[1] == v1.f4[1], v0.f4[2] == v1.f4[2], v0.f4[3] == v1.f4[3]);
+}
+
+inline Scalar4f operator<(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] < v1.f4[0], v0.f4[1] < v1.f4[1], v0.f4[2] < v1.f4[2], v0.f4[3] < v1.f4[3]);
+}
+
+inline Scalar4f operator<=(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] <= v1.f4[0], v0.f4[1] <= v1.f4[1], v0.f4[2] <= v1.f4[2], v0.f4[3] <= v1.f4[3]);
+}
+
+inline Scalar4f operator>(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] > v1.f4[0], v0.f4[1] > v1.f4[1], v0.f4[2] > v1.f4[2], v0.f4[3] > v1.f4[3]);
+}
+
+inline Scalar4f operator>=(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] >= v1.f4[0], v0.f4[1] >= v1.f4[1], v0.f4[2] >= v1.f4[2], v0.f4[3] >= v1.f4[3]);
+}
+
+inline ComplementExpr<Scalar4f> operator~(const Scalar4f& v)
+{
+ return ComplementExpr<Scalar4f>(v);
+}
+
+inline Scalar4f operator&(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.u4[0] & v1.u4[0], v0.u4[1] & v1.u4[1], v0.u4[2] & v1.u4[2], v0.u4[3] & v1.u4[3]);
+}
+
+inline Scalar4f operator|(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.u4[0] | v1.u4[0], v0.u4[1] | v1.u4[1], v0.u4[2] | v1.u4[2], v0.u4[3] | v1.u4[3]);
+}
+
+inline Scalar4f operator^(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.u4[0] ^ v1.u4[0], v0.u4[1] ^ v1.u4[1], v0.u4[2] ^ v1.u4[2], v0.u4[3] ^ v1.u4[3]);
+}
+
+inline Scalar4f operator<<(const Scalar4f& v, int shift)
+{
+ return Scalar4f(v.u4[0] << shift, v.u4[1] << shift, v.u4[2] << shift, v.u4[3] << shift);
+}
+
+inline Scalar4f operator>>(const Scalar4f& v, int shift)
+{
+ return Scalar4f(v.u4[0] >> shift, v.u4[1] >> shift, v.u4[2] >> shift, v.u4[3] >> shift);
+}
+
+inline Scalar4f operator+(const Scalar4f& v)
+{
+ return v;
+}
+
+inline Scalar4f operator+(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] + v1.f4[0], v0.f4[1] + v1.f4[1], v0.f4[2] + v1.f4[2], v0.f4[3] + v1.f4[3]);
+}
+
+inline Scalar4f operator-(const Scalar4f& v)
+{
+ return Scalar4f(-v.f4[0], -v.f4[1], -v.f4[2], -v.f4[3]);
+}
+
+inline Scalar4f operator-(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] - v1.f4[0], v0.f4[1] - v1.f4[1], v0.f4[2] - v1.f4[2], v0.f4[3] - v1.f4[3]);
+}
+
+inline Scalar4f operator*(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] * v1.f4[0], v0.f4[1] * v1.f4[1], v0.f4[2] * v1.f4[2], v0.f4[3] * v1.f4[3]);
+}
+
+inline Scalar4f operator/(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return Scalar4f(v0.f4[0] / v1.f4[0], v0.f4[1] / v1.f4[1], v0.f4[2] / v1.f4[2], v0.f4[3] / v1.f4[3]);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+inline Scalar4f simd4f(const Scalar4i& v)
+{
+ return v;
+}
+
+inline float (&array(Scalar4f& v))[4]
+{
+ return v.f4;
+}
+
+inline const float (&array(const Scalar4f& v))[4]
+{
+ return v.f4;
+}
+
+inline void store(float* ptr, const Scalar4f& v)
+{
+ ptr[0] = v.f4[0];
+ ptr[1] = v.f4[1];
+ ptr[2] = v.f4[2];
+ ptr[3] = v.f4[3];
+}
+
+inline void storeAligned(float* ptr, const Scalar4f& v)
+{
+ store(ptr, v);
+}
+
+inline void storeAligned(float* ptr, unsigned int offset, const Scalar4f& v)
+{
+ storeAligned(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
// Broadcast lane i of v to all four lanes.
template <size_t i>
inline Scalar4f splat(const Scalar4f& v)
{
	return Scalar4f(v.f4[i], v.f4[i], v.f4[i], v.f4[i]);
}

// Branchless per-lane blend: lanes where mask is all-ones take v0, lanes
// where it is all-zeros take v1 (mask must be a proper comparison mask).
inline Scalar4f select(const Scalar4f& mask, const Scalar4f& v0, const Scalar4f& v1)
{
	return ((v0 ^ v1) & mask) ^ v1;
}

// Per-lane absolute value.
inline Scalar4f abs(const Scalar4f& v)
{
	return Scalar4f(::fabsf(v.f4[0]), ::fabsf(v.f4[1]), ::fabsf(v.f4[2]), ::fabsf(v.f4[3]));
}

// Per-lane round toward negative infinity (result stays a float vector).
inline Scalar4f floor(const Scalar4f& v)
{
	return Scalar4f(::floorf(v.f4[0]), ::floorf(v.f4[1]), ::floorf(v.f4[2]), ::floorf(v.f4[3]));
}

// Per-lane maximum.
inline Scalar4f max(const Scalar4f& v0, const Scalar4f& v1)
{
	return Scalar4f(std::max(v0.f4[0], v1.f4[0]), std::max(v0.f4[1], v1.f4[1]), std::max(v0.f4[2], v1.f4[2]),
	                std::max(v0.f4[3], v1.f4[3]));
}

// Per-lane minimum.
inline Scalar4f min(const Scalar4f& v0, const Scalar4f& v1)
{
	return Scalar4f(std::min(v0.f4[0], v1.f4[0]), std::min(v0.f4[1], v1.f4[1]), std::min(v0.f4[2], v1.f4[2]),
	                std::min(v0.f4[3], v1.f4[3]));
}

// Per-lane reciprocal. Exact division here; the SIMD back ends use a fast
// approximation instead.
inline Scalar4f recip(const Scalar4f& v)
{
	return Scalar4f(1 / v.f4[0], 1 / v.f4[1], 1 / v.f4[2], 1 / v.f4[3]);
}

// Reciprocal with n refinement iterations in the SIMD back ends; the
// scalar result is already exact, so n is ignored.
template <int n>
inline Scalar4f recipT(const Scalar4f& v)
{
	return recip(v);
}

// Per-lane square root.
inline Scalar4f sqrt(const Scalar4f& v)
{
	return Scalar4f(::sqrtf(v.f4[0]), ::sqrtf(v.f4[1]), ::sqrtf(v.f4[2]), ::sqrtf(v.f4[3]));
}

// Per-lane reciprocal square root.
inline Scalar4f rsqrt(const Scalar4f& v)
{
	return recip(sqrt(v));
}

// As recipT: the iteration count only matters for the SIMD approximations.
template <int n>
inline Scalar4f rsqrtT(const Scalar4f& v)
{
	return rsqrt(v);
}

// Per-lane 2^x, computed as e^(x*ln2).
inline Scalar4f exp2(const Scalar4f& v)
{
	float scale = 0.69314718055994531f; // ::logf(2.0f);
	return Scalar4f(::expf(v.f4[0] * scale), ::expf(v.f4[1] * scale), ::expf(v.f4[2] * scale), ::expf(v.f4[3] * scale));
}

namespace simdf
{
// PSP2 is confused resolving about exp2, forwarding works
inline Scalar4f exp2(const Scalar4f& v)
{
	return ::exp2(v);
}
}

// Per-lane log base 2, computed as ln(x)/ln(2).
inline Scalar4f log2(const Scalar4f& v)
{
	float scale = 1.44269504088896341f; // 1/ln(2)
	return Scalar4f(::logf(v.f4[0]) * scale, ::logf(v.f4[1]) * scale, ::logf(v.f4[2]) * scale, ::logf(v.f4[3]) * scale);
}
+
// 3-component dot product; the w lanes are ignored. The scalar result goes
// through the simd4f(float) factory (declared elsewhere), which presumably
// splats it across all lanes - confirm against the factory declarations.
inline Scalar4f dot3(const Scalar4f& v0, const Scalar4f& v1)
{
	return simd4f(v0.f4[0] * v1.f4[0] + v0.f4[1] * v1.f4[1] + v0.f4[2] * v1.f4[2]);
}

// 3-component cross product; the w lane of the result is set to 0.
inline Scalar4f cross3(const Scalar4f& v0, const Scalar4f& v1)
{
	return simd4f(v0.f4[1] * v1.f4[2] - v0.f4[2] * v1.f4[1], v0.f4[2] * v1.f4[0] - v0.f4[0] * v1.f4[2],
	              v0.f4[0] * v1.f4[1] - v0.f4[1] * v1.f4[0], 0.0f);
}

// In-place 4x4 transpose, treating x, y, z, w as the four rows. The upper
// triangle is saved into temporaries first because those lanes are
// overwritten before they are consumed.
inline void transpose(Scalar4f& x, Scalar4f& y, Scalar4f& z, Scalar4f& w)
{
	float x1 = x.f4[1], x2 = x.f4[2], x3 = x.f4[3];
	float y2 = y.f4[2], y3 = y.f4[3], z3 = z.f4[3];

	x.f4[1] = y.f4[0];
	x.f4[2] = z.f4[0];
	x.f4[3] = w.f4[0];
	y.f4[0] = x1;
	y.f4[2] = z.f4[1];
	y.f4[3] = w.f4[1];
	z.f4[0] = x2;
	z.f4[1] = y2;
	z.f4[3] = w.f4[2];
	w.f4[0] = x3;
	w.f4[1] = y3;
	w.f4[2] = z3;
}
+
+inline int allEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] == v1.f4[0] && v0.f4[1] == v1.f4[1] && v0.f4[2] == v1.f4[2] && v0.f4[3] == v1.f4[3];
+}
+
+inline int allEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] == v1.f4[0] || v0.f4[1] == v1.f4[1] || v0.f4[2] == v1.f4[2] || v0.f4[3] == v1.f4[3];
+}
+
+inline int anyEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] == v1.f4[0], b1 = v0.f4[1] == v1.f4[1], b2 = v0.f4[2] == v1.f4[2], b3 = v0.f4[3] == v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+inline int allGreater(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] > v1.f4[0] && v0.f4[1] > v1.f4[1] && v0.f4[2] > v1.f4[2] && v0.f4[3] > v1.f4[3];
+}
+
+inline int allGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] > v1.f4[0] || v0.f4[1] > v1.f4[1] || v0.f4[2] > v1.f4[2] || v0.f4[3] > v1.f4[3];
+}
+
+inline int anyGreater(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] > v1.f4[0], b1 = v0.f4[1] > v1.f4[1], b2 = v0.f4[2] > v1.f4[2], b3 = v0.f4[3] > v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] >= v1.f4[0] && v0.f4[1] >= v1.f4[1] && v0.f4[2] >= v1.f4[2] && v0.f4[3] >= v1.f4[3];
+}
+
+inline int allGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1)
+{
+ return v0.f4[0] >= v1.f4[0] || v0.f4[1] >= v1.f4[1] || v0.f4[2] >= v1.f4[2] || v0.f4[3] >= v1.f4[3];
+}
+
+inline int anyGreaterEqual(const Scalar4f& v0, const Scalar4f& v1, Scalar4f& outMask)
+{
+ bool b0 = v0.f4[0] >= v1.f4[0], b1 = v0.f4[1] >= v1.f4[1], b2 = v0.f4[2] >= v1.f4[2], b3 = v0.f4[3] >= v1.f4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+inline int allTrue(const Scalar4f& v)
+{
+ return v.u4[0] & v.u4[1] & v.u4[2] & v.u4[3];
+}
+
+inline int anyTrue(const Scalar4f& v)
+{
+ return v.u4[0] | v.u4[1] | v.u4[2] | v.u4[3];
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h
new file mode 100644
index 00000000..80ac2abd
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/Simd4i.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
// Factory specializations converting the generic Simd4iFactory wrappers
// (declared elsewhere) into the scalar back end's Scalar4i.

// Splat a single int across all four lanes.
template <>
inline Simd4iFactory<const int&>::operator Scalar4i() const
{
	return Scalar4i(v, v, v, v);
}

// Four explicit lane values, already laid out contiguously.
inline Simd4iFactory<detail::FourTuple>::operator Scalar4i() const
{
	return reinterpret_cast<const Scalar4i&>(v);
}

// Compile-time integer constant, splat across all lanes.
template <int i>
inline Simd4iFactory<detail::IntType<i> >::operator Scalar4i() const
{
	return Scalar4i(i, i, i, i);
}

// Unaligned load of four ints.
template <>
inline Simd4iFactory<const int*>::operator Scalar4i() const
{
	return Scalar4i(v[0], v[1], v[2], v[3]);
}

// Aligned load; alignment is irrelevant in the scalar back end.
template <>
inline Simd4iFactory<detail::AlignedPointer<int> >::operator Scalar4i() const
{
	return Scalar4i(v.ptr[0], v.ptr[1], v.ptr[2], v.ptr[3]);
}

// Load from a base pointer plus a byte offset.
template <>
inline Simd4iFactory<detail::OffsetPointer<int> >::operator Scalar4i() const
{
	const int* ptr = reinterpret_cast<const int*>(reinterpret_cast<const char*>(v.ptr) + v.offset);
	return Scalar4i(ptr[0], ptr[1], ptr[2], ptr[3]);
}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
// Integer element-wise operators. They live in namespace simdi rather than
// at global scope because the scalar back end typedefs Scalar4i to
// Scalar4f, so the integer semantics must be selected explicitly.
namespace simdi
{

// Comparisons build all-ones/all-zeros lane masks via the bool constructor.
inline Scalar4i operator==(const Scalar4i& v0, const Scalar4i& v1)
{
	return Scalar4i(v0.i4[0] == v1.i4[0], v0.i4[1] == v1.i4[1], v0.i4[2] == v1.i4[2], v0.i4[3] == v1.i4[3]);
}

inline Scalar4i operator<(const Scalar4i& v0, const Scalar4i& v1)
{
	return Scalar4i(v0.i4[0] < v1.i4[0], v0.i4[1] < v1.i4[1], v0.i4[2] < v1.i4[2], v0.i4[3] < v1.i4[3]);
}

inline Scalar4i operator>(const Scalar4i& v0, const Scalar4i& v1)
{
	return Scalar4i(v0.i4[0] > v1.i4[0], v0.i4[1] > v1.i4[1], v0.i4[2] > v1.i4[2], v0.i4[3] > v1.i4[3]);
}

// Element-wise signed integer arithmetic.
inline Scalar4i operator+(const Scalar4i& v0, const Scalar4i& v1)
{
	return Scalar4i(v0.i4[0] + v1.i4[0], v0.i4[1] + v1.i4[1], v0.i4[2] + v1.i4[2], v0.i4[3] + v1.i4[3]);
}

inline Scalar4i operator-(const Scalar4i& v)
{
	return Scalar4i(-v.i4[0], -v.i4[1], -v.i4[2], -v.i4[3]);
}

inline Scalar4i operator-(const Scalar4i& v0, const Scalar4i& v1)
{
	return Scalar4i(v0.i4[0] - v1.i4[0], v0.i4[1] - v1.i4[1], v0.i4[2] - v1.i4[2], v0.i4[3] - v1.i4[3]);
}

} // namespace simdi
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
// Reinterpret a float vector as an integer vector. In the scalar back end
// Scalar4i is a typedef of Scalar4f, so this is just a copy.
inline Scalar4i simd4i(const Scalar4f& v)
{
	return v;
}

namespace simdi
{

// Expose the signed integer lanes as a reference to the underlying array.
inline int (&array(Scalar4i& v))[4]
{
	return v.i4;
}

inline const int (&array(const Scalar4i& v))[4]
{
	return v.i4;
}

} // namespace simdi

// Write all four integer lanes to ptr (no alignment requirement).
inline void store(int* ptr, const Scalar4i& v)
{
	ptr[0] = v.i4[0];
	ptr[1] = v.i4[1];
	ptr[2] = v.i4[2];
	ptr[3] = v.i4[3];
}

// Alignment is irrelevant in the scalar back end.
inline void storeAligned(int* ptr, const Scalar4i& v)
{
	store(ptr, v);
}

// Store at a byte offset from the base pointer.
inline void storeAligned(int* ptr, unsigned int offset, const Scalar4i& v)
{
	store(reinterpret_cast<int*>(reinterpret_cast<char*>(ptr) + offset), v);
}
+
+namespace simdi
+{
+
+inline int allEqual(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] == v1.i4[0] && v0.i4[1] == v1.i4[1] && v0.i4[2] == v1.i4[2] && v0.i4[3] == v1.i4[3];
+}
+
+inline int allEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] == v1.i4[0] || v0.i4[1] == v1.i4[1] || v0.i4[2] == v1.i4[2] || v0.i4[3] == v1.i4[3];
+}
+
+inline int anyEqual(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] == v1.i4[0], b1 = v0.i4[1] == v1.i4[1], b2 = v0.i4[2] == v1.i4[2], b3 = v0.i4[3] == v1.i4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+inline int allGreater(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] > v1.i4[0] && v0.i4[1] > v1.i4[1] && v0.i4[2] > v1.i4[2] && v0.i4[3] > v1.i4[3];
+}
+
+inline int allGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 && b1 && b2 && b3;
+}
+
+inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1)
+{
+ return v0.i4[0] > v1.i4[0] || v0.i4[1] > v1.i4[1] || v0.i4[2] > v1.i4[2] || v0.i4[3] > v1.i4[3];
+}
+
+inline int anyGreater(const Scalar4i& v0, const Scalar4i& v1, Scalar4i& outMask)
+{
+ bool b0 = v0.i4[0] > v1.i4[0], b1 = v0.i4[1] > v1.i4[1], b2 = v0.i4[2] > v1.i4[2], b3 = v0.i4[3] > v1.i4[3];
+ outMask = Scalar4f(b0, b1, b2, b3);
+ return b0 || b1 || b2 || b3;
+}
+
+} // namespace simd
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h
new file mode 100644
index 00000000..a287766c
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SimdTypes.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifdef PX_WIIU
+#pragma ghs nowarning 193 // warning #193-D: zero used for undefined preprocessing identifier
+#endif
+
+#include <algorithm>
+
+#ifdef PX_WIIU
+#pragma ghs endnowarning
+#endif
+
// Scalar (non-SIMD) stand-in for a 4-lane SIMD register. The three arrays
// alias the same 16 bytes so lanes can be viewed as float, signed or
// unsigned 32-bit integers, mirroring the reinterpretation the real SIMD
// back ends get for free.
union Scalar4f
{
	// Lanes start uninitialized, like a hardware register.
	Scalar4f()
	{
	}

	// Per-lane float initialization.
	Scalar4f(float a, float b, float c, float d)
	{
		f4[0] = a;
		f4[1] = b;
		f4[2] = c;
		f4[3] = d;
	}

	// Per-lane signed integer initialization (sets the raw lane bits).
	Scalar4f(int32_t a, int32_t b, int32_t c, int32_t d)
	{
		i4[0] = a;
		i4[1] = b;
		i4[2] = c;
		i4[3] = d;
	}

	// Per-lane unsigned integer initialization (sets the raw lane bits).
	Scalar4f(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
	{
		u4[0] = a;
		u4[1] = b;
		u4[2] = c;
		u4[3] = d;
	}

	// Comparison-style mask: true -> all-ones lane, false -> all-zeros.
	Scalar4f(bool a, bool b, bool c, bool d)
	{
		u4[0] = a ? 0xffffffffu : 0u;
		u4[1] = b ? 0xffffffffu : 0u;
		u4[2] = c ? 0xffffffffu : 0u;
		u4[3] = d ? 0xffffffffu : 0u;
	}

	// Copy through the integer view - presumably so mask bit patterns
	// (which read as NaN through f4) are transferred verbatim.
	Scalar4f(const Scalar4f& other)
	{
		for (int k = 0; k < 4; ++k)
			u4[k] = other.u4[k];
	}

	Scalar4f& operator=(const Scalar4f& other)
	{
		for (int k = 0; k < 4; ++k)
			u4[k] = other.u4[k];
		return *this;
	}

	float f4[4];    // lanes viewed as floats
	int32_t i4[4];  // lanes viewed as signed integers
	uint32_t u4[4]; // lanes viewed as unsigned integers
};

// The scalar back end uses a single storage type for float and int vectors.
typedef Scalar4f Scalar4i;
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h
new file mode 100644
index 00000000..33b35f72
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/scalar/SwCollisionHelpers.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
namespace nvidia
{
namespace cloth
{

#if !NVMATH_SIMD
// Index of the highest set bit in mask (returns 0 for mask 0 or 1).
// Scalar fallback for the intrinsic-based versions in the SIMD back ends.
uint32_t findBitSet(uint32_t mask)
{
	uint32_t result = 0;
	while(mask >>= 1)
		++result;
	return result;
}
#endif

// Per-lane floor() converted to integer lanes.
inline Scalar4i intFloor(const Scalar4f& v)
{
	return Scalar4i(int(floor(v.f4[0])), int(floor(v.f4[1])), int(floor(v.f4[2])), int(floor(v.f4[3])));
}

// OR of all four lanes in every result lane (the single-argument simd4i
// factory is declared elsewhere and presumably splats - confirm).
inline Scalar4i horizontalOr(Scalar4i mask)
{
	return simd4i(mask.i4[0] | mask.i4[1] | mask.i4[2] | mask.i4[3]);
}

// Scalar specialization of the collision-grid gather helper: looks up four
// table entries by lane index, returning 0 for out-of-range indices.
template <>
struct Gather<Scalar4i>
{
	inline Gather(const Scalar4i& index);
	inline Scalar4i operator()(const Scalar4i*) const;

	Scalar4i mIndex;      // lane indices wrapped into the grid range
	Scalar4i mOutOfRange; // all-ones where the index was in range, else 0
};

// Splits each lane index into an in-range index (masked with gridSize-1,
// which must be a power of two) and a validity mask.
// NOTE(review): qualifies SwCollision as physx::cloth although this header
// sits in nvidia::cloth - presumably a physx namespace alias exists;
// confirm against the surrounding headers.
Gather<Scalar4i>::Gather(const Scalar4i& index)
{
	uint32_t mask = physx::cloth::SwCollision<Scalar4i>::sGridSize - 1;

	mIndex.u4[0] = index.u4[0] & mask;
	mIndex.u4[1] = index.u4[1] & mask;
	mIndex.u4[2] = index.u4[2] & mask;
	mIndex.u4[3] = index.u4[3] & mask;

	// -1 wraps to the all-ones uint32_t mask for in-range lanes.
	mOutOfRange.u4[0] = index.u4[0] & ~mask ? 0 : -1;
	mOutOfRange.u4[1] = index.u4[1] & ~mask ? 0 : -1;
	mOutOfRange.u4[2] = index.u4[2] & ~mask ? 0 : -1;
	mOutOfRange.u4[3] = index.u4[3] & ~mask ? 0 : -1;
}

// Reads table[index] per lane and zeroes lanes flagged out of range.
Scalar4i Gather<Scalar4i>::operator()(const Scalar4i* ptr) const
{
	const int32_t* base = ptr->i4;
	const int32_t* index = mIndex.i4;
	const int32_t* mask = mOutOfRange.i4;
	return Scalar4i(base[index[0]] & mask[0], base[index[1]] & mask[1], base[index[2]] & mask[2],
	                base[index[3]] & mask[3]);
}

} // namespace cloth
} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h
new file mode 100644
index 00000000..3f04750f
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4f.h
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Simd4fFactory conversions: each specialization materializes a Simd4f from
+// the factory's stored payload using the cheapest matching SSE instruction.
+
+// Broadcast a single float into all four lanes.
+template <>
+inline Simd4fFactory<const float&>::operator Simd4f() const
+{
+ return _mm_set1_ps(v);
+}
+
+// Reinterpret a four-float tuple in place (no load instruction needed).
+inline Simd4fFactory<detail::FourTuple>::operator Simd4f() const
+{
+ return reinterpret_cast<const Simd4f&>(v);
+}
+
+// Compile-time zero constant.
+template <>
+inline Simd4fFactory<detail::IntType<0> >::operator Simd4f() const
+{
+ return _mm_setzero_ps();
+}
+
+// Compile-time one constant.
+template <>
+inline Simd4fFactory<detail::IntType<1> >::operator Simd4f() const
+{
+ return _mm_set1_ps(1.0f);
+}
+
+// Sign-bit mask (0x80000000 in every lane), built in the integer domain.
+template <>
+inline Simd4fFactory<detail::IntType<int(0x80000000)> >::operator Simd4f() const
+{
+ return _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+}
+
+// All-bits-set mask (true for comparison results).
+template <>
+inline Simd4fFactory<detail::IntType<int(0xffffffff)> >::operator Simd4f() const
+{
+ return _mm_castsi128_ps(_mm_set1_epi32(-1));
+}
+
+// Unaligned load of four floats.
+template <>
+inline Simd4fFactory<const float*>::operator Simd4f() const
+{
+ return _mm_loadu_ps(v);
+}
+
+// Aligned load (pointer must be 16-byte aligned).
+template <>
+inline Simd4fFactory<detail::AlignedPointer<float> >::operator Simd4f() const
+{
+ return _mm_load_ps(v.ptr);
+}
+
+// Aligned load from base pointer plus a byte offset.
+template <>
+inline Simd4fFactory<detail::OffsetPointer<float> >::operator Simd4f() const
+{
+ return _mm_load_ps(reinterpret_cast<const float*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// ~v materialized as a value: NOT via andnot against all-ones.
+template <>
+inline ComplementExpr<Simd4f>::operator Simd4f() const
+{
+ return _mm_andnot_ps(v, _mm_castsi128_ps(_mm_set1_epi32(-1)));
+}
+
+// (~a) & b fused into a single andnot instruction.
+Simd4f operator&(const ComplementExpr<Simd4f>& complement, const Simd4f& v)
+{
+ return _mm_andnot_ps(complement.v, v);
+}
+
+// a & (~b) fused into a single andnot instruction.
+Simd4f operator&(const Simd4f& v, const ComplementExpr<Simd4f>& complement)
+{
+ return _mm_andnot_ps(complement.v, v);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Lane-wise comparisons; each lane of the result is all-ones (true) or zero.
+Simd4f operator==(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmpeq_ps(v0, v1);
+}
+
+Simd4f operator<(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmplt_ps(v0, v1);
+}
+
+Simd4f operator<=(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmple_ps(v0, v1);
+}
+
+Simd4f operator>(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmpgt_ps(v0, v1);
+}
+
+Simd4f operator>=(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_cmpge_ps(v0, v1);
+}
+
+// Returns a lazy expression so ~a & b can fuse into one andnot (see above).
+ComplementExpr<Simd4f> operator~(const Simd4f& v)
+{
+ return ComplementExpr<Simd4f>(v);
+}
+
+// Bitwise ops on the raw lane bits (useful for masks from comparisons).
+Simd4f operator&(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_and_ps(v0, v1);
+}
+
+Simd4f operator|(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_or_ps(v0, v1);
+}
+
+Simd4f operator^(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_xor_ps(v0, v1);
+}
+
+// Shifts treat each lane as a 32-bit integer; >> is a logical (unsigned) shift.
+Simd4f operator<<(const Simd4f& v, int shift)
+{
+ return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(v), shift));
+}
+
+Simd4f operator>>(const Simd4f& v, int shift)
+{
+ return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(v), shift));
+}
+
+Simd4f operator+(const Simd4f& v)
+{
+ return v;
+}
+
+Simd4f operator+(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_add_ps(v0, v1);
+}
+
+// Negation via subtraction from zero (no dedicated SSE negate instruction).
+Simd4f operator-(const Simd4f& v)
+{
+ return _mm_sub_ps(_mm_setzero_ps(), v);
+}
+
+Simd4f operator-(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_sub_ps(v0, v1);
+}
+
+Simd4f operator*(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_mul_ps(v0, v1);
+}
+
+Simd4f operator/(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_div_ps(v0, v1);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Bit-level reinterpretation of an integer vector as floats (no conversion).
+Simd4f simd4f(const Simd4i& v)
+{
+ return _mm_castsi128_ps(v);
+}
+
+// Expose the four lanes as a plain float[4] reference (aliasing view).
+float (&array(Simd4f& v))[4]
+{
+ return reinterpret_cast<float(&)[4]>(v);
+}
+
+const float (&array(const Simd4f& v))[4]
+{
+ return reinterpret_cast<const float(&)[4]>(v);
+}
+
+// Unaligned store.
+void store(float* ptr, Simd4f const& v)
+{
+ _mm_storeu_ps(ptr, v);
+}
+
+// Aligned store (ptr must be 16-byte aligned).
+void storeAligned(float* ptr, Simd4f const& v)
+{
+ _mm_store_ps(ptr, v);
+}
+
+// Aligned store at a byte offset from ptr.
+void storeAligned(float* ptr, unsigned int offset, Simd4f const& v)
+{
+ _mm_store_ps(reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+// Broadcast lane i into all four lanes.
+template <size_t i>
+Simd4f splat(Simd4f const& v)
+{
+ return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
+}
+
+// Per-lane select: mask ? v0 : v1, via the xor/and/xor blend trick
+// (mask lanes must be all-ones or all-zeros, e.g. comparison results).
+Simd4f select(Simd4f const& mask, Simd4f const& v0, Simd4f const& v1)
+{
+ return _mm_xor_ps(v1, _mm_and_ps(mask, _mm_xor_ps(v1, v0)));
+}
+
+// Clear the sign bit of each lane.
+Simd4f abs(const Simd4f& v)
+{
+ return _mm_andnot_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), v);
+}
+
+// Lane-wise round toward negative infinity.
+Simd4f floor(const Simd4f& v)
+{
+ // SSE 4.1: return _mm_floor_ps(v);
+ // Truncate, then subtract 1 for negative inputs (truncation rounds them up).
+ // The correction uses the sign bit of the *float* v, not of the truncated
+ // integer: that way inputs in (-1, 0) correctly floor to -1, matching the
+ // int/frac split in exp2() below and intFloor() in sse2/SwCollisionHelpers.h.
+ // NOTE(review): exact negative integers still come out one too low (same as
+ // those siblings) -- confirm callers never rely on them.
+ return _mm_cvtepi32_ps(_mm_sub_epi32(_mm_cvttps_epi32(v), _mm_srli_epi32(_mm_castps_si128(v), 31)));
+}
+
+// Lane-wise maximum.
+Simd4f max(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_max_ps(v0, v1);
+}
+
+// Lane-wise minimum.
+Simd4f min(const Simd4f& v0, const Simd4f& v1)
+{
+ return _mm_min_ps(v0, v1);
+}
+
+// Fast approximate reciprocal (rcpps, ~12 bits of precision).
+Simd4f recip(const Simd4f& v)
+{
+ return _mm_rcp_ps(v);
+}
+
+// Reciprocal refined with n Newton-Raphson iterations:
+// r' = r * (2 - v * r), each iteration roughly doubles the precision.
+template <int n>
+Simd4f recipT(const Simd4f& v)
+{
+ Simd4f two = simd4f(2.0f);
+ Simd4f recipV = recip(v);
+ for(int i = 0; i < n; ++i)
+ recipV = recipV * (two - v * recipV);
+ return recipV;
+}
+
+// Full-precision square root.
+Simd4f sqrt(const Simd4f& v)
+{
+ return _mm_sqrt_ps(v);
+}
+
+// Fast approximate reciprocal square root (rsqrtps, ~12 bits of precision).
+Simd4f rsqrt(const Simd4f& v)
+{
+ return _mm_rsqrt_ps(v);
+}
+
+// Reciprocal square root refined with n Newton-Raphson iterations:
+// r' = r * (3/2 - v/2 * r * r).
+template <int n>
+Simd4f rsqrtT(const Simd4f& v)
+{
+ Simd4f halfV = v * simd4f(0.5f);
+ Simd4f threeHalf = simd4f(1.5f);
+ Simd4f rsqrtV = rsqrt(v);
+ for(int i = 0; i < n; ++i)
+ rsqrtV = rsqrtV * (threeHalf - halfV * rsqrtV * rsqrtV);
+ return rsqrtV;
+}
+
+// Lane-wise 2^v via the Cephes rational approximation: clamp the exponent,
+// split into integer part ix and fraction fx in [-0.5, 0.5], approximate
+// 2^fx with a Pade-style rational, and build 2^ix by stuffing ix+127 into
+// the float exponent bits.
+Simd4f exp2(const Simd4f& v)
+{
+ // http://www.netlib.org/cephes/
+
+ Simd4f limit = simd4f(127.4999f);
+ Simd4f x = min(max(-limit, v), limit);
+
+ // separate into integer and fractional part
+
+ Simd4f fx = x + simd4f(0.5f);
+ Simd4i ix = _mm_sub_epi32(_mm_cvttps_epi32(fx), _mm_srli_epi32(_mm_castps_si128(fx), 31));
+ fx = x - Simd4f(_mm_cvtepi32_ps(ix));
+
+ // exp2(fx) ~ 1 + 2*P(fx) / (Q(fx) - P(fx))
+
+ Simd4f fx2 = fx * fx;
+
+ Simd4f px = fx * (simd4f(1.51390680115615096133e+3f) +
+ fx2 * (simd4f(2.02020656693165307700e+1f) + fx2 * simd4f(2.30933477057345225087e-2f)));
+ Simd4f qx = simd4f(4.36821166879210612817e+3f) + fx2 * (simd4f(2.33184211722314911771e+2f) + fx2);
+
+ Simd4f exp2fx = px * recip(qx - px);
+ exp2fx = simd4f(_1) + exp2fx + exp2fx;
+
+ // exp2(ix)
+
+ Simd4f exp2ix = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ix, _mm_set1_epi32(0x7f)), 23));
+
+ return exp2fx * exp2ix;
+}
+
+// Lane-wise log2 via scalar logf scaled by 1/ln(2) -- slow fallback.
+Simd4f log2(const Simd4f& v)
+{
+ // todo: fast approximate implementation like exp2
+ Simd4f scale = simd4f(1.44269504088896341f); // 1/ln(2)
+ const float* ptr = array(v);
+ return simd4f(::logf(ptr[0]), ::logf(ptr[1]), ::logf(ptr[2]), ::logf(ptr[3])) * scale;
+}
+
+// 3-component dot product (w lanes ignored), result broadcast to lanes 0-2
+// via the splat sum; lane 3 of the result is not meaningful here.
+Simd4f dot3(const Simd4f& v0, const Simd4f& v1)
+{
+ Simd4f tmp = v0 * v1;
+ return splat<0>(tmp) + splat<1>(tmp) + splat<2>(tmp);
+}
+
+// 3-component cross product using two shuffles:
+// (v0 * rot(v1) - rot(v0) * v1) rotated back.
+Simd4f cross3(const Simd4f& v0, const Simd4f& v1)
+{
+ Simd4f t0 = _mm_shuffle_ps(v0, v0, 0xc9); // w z y x -> w x z y
+ Simd4f t1 = _mm_shuffle_ps(v1, v1, 0xc9);
+ Simd4f tmp = v0 * t1 - t0 * v1;
+ return _mm_shuffle_ps(tmp, tmp, 0xc9);
+}
+
+// In-place 4x4 transpose of the four row vectors.
+void transpose(Simd4f& x, Simd4f& y, Simd4f& z, Simd4f& w)
+{
+ _MM_TRANSPOSE4_PS(x, y, z, w);
+}
+
+// Horizontal comparison reductions. The two-argument forms answer the
+// question for all four lanes; the three-argument overloads additionally
+// write the per-lane comparison mask to outMask (assignment inside the call).
+
+int allEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 == v1);
+}
+
+int allEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 == v1);
+}
+
+int anyEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 == v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 > v1);
+}
+
+int allGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 > v1);
+}
+
+int anyGreater(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 > v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return allTrue(v0 >= v1);
+}
+
+int allGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return allTrue(outMask = v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1)
+{
+ return anyTrue(v0 >= v1);
+}
+
+int anyGreaterEqual(const Simd4f& v0, const Simd4f& v1, Simd4f& outMask)
+{
+ return anyTrue(outMask = v0 >= v1);
+}
+
+// Nonzero iff the sign bit of every lane is set (mask fully true).
+int allTrue(const Simd4f& v)
+{
+ return _mm_movemask_ps(v) == 0xf;
+}
+
+// Nonzero iff the sign bit of any lane is set.
+int anyTrue(const Simd4f& v)
+{
+ return _mm_movemask_ps(v);
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h
new file mode 100644
index 00000000..d4a70a02
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/Simd4i.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// factory implementation
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Simd4iFactory conversions: materialize a Simd4i from the factory payload.
+
+// Broadcast a single int into all four lanes.
+template <>
+inline Simd4iFactory<const int&>::operator Simd4i() const
+{
+ return _mm_set1_epi32(v);
+}
+
+// Reinterpret a four-int tuple in place (no load instruction needed).
+inline Simd4iFactory<detail::FourTuple>::operator Simd4i() const
+{
+ return reinterpret_cast<const Simd4i&>(v);
+}
+
+// Broadcast a compile-time constant.
+template <int i>
+inline Simd4iFactory<detail::IntType<i> >::operator Simd4i() const
+{
+ return _mm_set1_epi32(i);
+}
+
+// Zero gets the dedicated (cheaper) instruction.
+template <>
+inline Simd4iFactory<detail::IntType<0> >::operator Simd4i() const
+{
+ return _mm_setzero_si128();
+}
+
+// Unaligned load of four ints.
+template <>
+inline Simd4iFactory<const int*>::operator Simd4i() const
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(v));
+}
+
+// Aligned load (pointer must be 16-byte aligned).
+template <>
+inline Simd4iFactory<detail::AlignedPointer<int> >::operator Simd4i() const
+{
+ return _mm_load_si128(reinterpret_cast<const __m128i*>(v.ptr));
+}
+
+// Aligned load from base pointer plus a byte offset.
+template <>
+inline Simd4iFactory<detail::OffsetPointer<int> >::operator Simd4i() const
+{
+ return _mm_load_si128(reinterpret_cast<const __m128i*>(reinterpret_cast<const char*>(v.ptr) + v.offset));
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// expression template
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// ~v materialized as a value: NOT via andnot against all-ones.
+template <>
+inline ComplementExpr<Simd4i>::operator Simd4i() const
+{
+ return _mm_andnot_si128(v, _mm_set1_epi32(0xffffffff));
+}
+
+// (~a) & b fused into a single andnot instruction.
+Simd4i operator&(const ComplementExpr<Simd4i>& complement, const Simd4i& v)
+{
+ return _mm_andnot_si128(complement.v, v);
+}
+
+// a & (~b) fused into a single andnot instruction.
+Simd4i operator&(const Simd4i& v, const ComplementExpr<Simd4i>& complement)
+{
+ return _mm_andnot_si128(complement.v, v);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// operator implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Signed lane-wise comparisons, qualified into namespace simdi so they don't
+// collide with the Simd4f operators at this scope.
+Simd4i simdi::operator==(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_cmpeq_epi32(v0, v1);
+}
+
+Simd4i simdi::operator<(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_cmplt_epi32(v0, v1);
+}
+
+Simd4i simdi::operator>(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_cmpgt_epi32(v0, v1);
+}
+
+// Returns a lazy expression so ~a & b can fuse into one andnot (see above).
+ComplementExpr<Simd4i> operator~(const Simd4i& v)
+{
+ return ComplementExpr<Simd4i>(v);
+}
+
+Simd4i operator&(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_and_si128(v0, v1);
+}
+
+Simd4i operator|(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_or_si128(v0, v1);
+}
+
+Simd4i operator^(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_xor_si128(v0, v1);
+}
+
+Simd4i operator<<(const Simd4i& v, int shift)
+{
+ return _mm_slli_epi32(v, shift);
+}
+
+// Note: logical (unsigned) right shift, not arithmetic.
+Simd4i operator>>(const Simd4i& v, int shift)
+{
+ return _mm_srli_epi32(v, shift);
+}
+
+Simd4i simdi::operator+(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_add_epi32(v0, v1);
+}
+
+// Negation via subtraction from zero.
+Simd4i simdi::operator-(const Simd4i& v)
+{
+ return _mm_sub_epi32(_mm_setzero_si128(), v);
+}
+
+Simd4i simdi::operator-(const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_sub_epi32(v0, v1);
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// function implementations
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+// Bit-level reinterpretation of a float vector as ints (no conversion).
+Simd4i simd4i(const Simd4f& v)
+{
+ return _mm_castps_si128(v);
+}
+
+// Expose the four lanes as a plain int[4] reference (aliasing view).
+int (&simdi::array(Simd4i& v))[4]
+{
+ return reinterpret_cast<int(&)[4]>(v);
+}
+
+const int (&simdi::array(const Simd4i& v))[4]
+{
+ return reinterpret_cast<const int(&)[4]>(v);
+}
+
+// Unaligned store.
+void store(int* ptr, const Simd4i& v)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), v);
+}
+
+// Aligned store (ptr must be 16-byte aligned).
+void storeAligned(int* ptr, const Simd4i& v)
+{
+ _mm_store_si128(reinterpret_cast<__m128i*>(ptr), v);
+}
+
+// Aligned store at a byte offset from ptr.
+void storeAligned(int* ptr, unsigned int offset, const Simd4i& v)
+{
+ _mm_store_si128(reinterpret_cast<__m128i*>(reinterpret_cast<char*>(ptr) + offset), v);
+}
+
+// Broadcast lane i into all four lanes.
+template <size_t i>
+Simd4i splat(const Simd4i& v)
+{
+ return _mm_shuffle_epi32(v, _MM_SHUFFLE(i, i, i, i));
+}
+
+// Per-lane select: mask ? v0 : v1, via the xor/and/xor blend trick
+// (mask lanes must be all-ones or all-zeros).
+Simd4i select(const Simd4i& mask, const Simd4i& v0, const Simd4i& v1)
+{
+ return _mm_xor_si128(v1, _mm_and_si128(mask, _mm_xor_si128(v1, v0)));
+}
+
+// Horizontal comparison reductions (integer lanes); the overloads taking
+// outMask also write the per-lane comparison mask (assignment inside the call).
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ return allTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::allEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return allTrue(outMask = simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1)
+{
+ return anyTrue(simdi::operator==(v0, v1));
+}
+
+int simdi::anyEqual(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return anyTrue(outMask = simdi::operator==(v0, v1));
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ return allTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::allGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return allTrue(outMask = simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1)
+{
+ return anyTrue(simdi::operator>(v0, v1));
+}
+
+int simdi::anyGreater(const Simd4i& v0, const Simd4i& v1, Simd4i& outMask)
+{
+ return anyTrue(outMask = simdi::operator>(v0, v1));
+}
+
+// Nonzero iff every lane's sign bit is set (reuses the float movemask on the
+// reinterpreted bits).
+int allTrue(const Simd4i& v)
+{
+ return _mm_movemask_ps(_mm_castsi128_ps(v)) == 0xf;
+}
+
+// Nonzero iff any lane's sign bit is set.
+int anyTrue(const Simd4i& v)
+{
+ return _mm_movemask_ps(_mm_castsi128_ps(v));
+}
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h
new file mode 100644
index 00000000..e54edde7
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SimdTypes.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+// SSE + SSE2 (don't include intrin.h!)
+#include <emmintrin.h>
+
+#if defined(_MSC_VER)
+
+// MSVC: the raw intrinsic types can serve directly as the SIMD vocabulary types.
+typedef __m128 Simd4f;
+typedef __m128i Simd4i;
+
+#else
+
+// Other compilers: wrap the intrinsic in a thin class so Simd4f/Simd4i are
+// distinct class types (presumably needed so the operator overloads in
+// Simd4f.h/Simd4i.h don't clash with built-in vector-type operators --
+// confirm against the gcc/clang build). The implicit conversions keep the
+// wrapper transparent to intrinsics.
+struct Simd4f
+{
+ // Intentionally uninitialized, matching the raw __m128 behavior.
+ Simd4f()
+ {
+ }
+ Simd4f(__m128 x) : m128(x)
+ {
+ }
+
+ operator __m128&()
+ {
+ return m128;
+ }
+ operator const __m128&() const
+ {
+ return m128;
+ }
+
+ private:
+ __m128 m128;
+};
+
+struct Simd4i
+{
+ // Intentionally uninitialized, matching the raw __m128i behavior.
+ Simd4i()
+ {
+ }
+ Simd4i(__m128i x) : m128i(x)
+ {
+ }
+
+ operator __m128i&()
+ {
+ return m128i;
+ }
+ operator const __m128i&() const
+ {
+ return m128i;
+ }
+
+ private:
+ __m128i m128i;
+};
+
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h
new file mode 100644
index 00000000..0750fcf5
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwCollisionHelpers.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#ifdef PX_GCC_FAMILY
+#include <xmmintrin.h> // _BitScanForward
+#else
+#pragma warning(push)
+#pragma warning(disable : 4668) //'symbol' is not defined as a preprocessor macro, replacing with '0' for 'directives'
+#pragma warning(disable : 4987) // nonstandard extension used: 'throw (...)'
+#include <intrin.h> // _BitScanForward
+#pragma warning(pop)
+#endif
+
+namespace nvidia
+{
+namespace cloth
+{
+
+uint32_t findBitSet(uint32_t mask)
+{
+#if defined(_MSC_VER)
+ unsigned long result;
+ _BitScanForward(&result, unsigned long(mask));
+ return result;
+#else
+ return __builtin_ffs(mask) - 1;
+#endif
+}
+
+// Lane-wise float->int conversion rounding toward negative infinity:
+// truncate, then subtract 1 for lanes whose float sign bit is set.
+// (Exact negative integers land one too low -- consistent with the other
+// intFloor/floor implementations in this module.)
+Simd4i intFloor(const Simd4f& v)
+{
+ Simd4i i = _mm_cvttps_epi32(v);
+ return simdi::operator-(i, _mm_srli_epi32(simd4i(v), 31));
+}
+
+// OR of all four lanes, broadcast to every lane, via two shuffle+or steps.
+Simd4i horizontalOr(Simd4i mask)
+{
+ Simd4i tmp = mask | _mm_shuffle_epi32(mask, 0xb1); // w z y x -> z w x y
+ return tmp | _mm_shuffle_epi32(tmp, 0x4e); // w z y x -> y x w z
+}
+
+// Precompute selection masks from the low three index bits: bit 2 picks the
+// 4-lane half (Q), bit 1 the pair (D), bit 0 the element (W). Each
+// 'shift-left then arithmetic-shift-right 31' broadcasts one bit to a full
+// lane mask. mOutOfRange flags lanes whose index exceeds sSignedMask,
+// using the sign-bit xor trick for an unsigned compare with signed cmpgt.
+Gather<Simd4i>::Gather(const Simd4i& index)
+{
+ mSelectQ = _mm_srai_epi32(index << 29, 31);
+ mSelectD = _mm_srai_epi32(index << 30, 31);
+ mSelectW = _mm_srai_epi32(index << 31, 31);
+ mOutOfRange = simdi::operator>(index ^ sIntSignBit, sSignedMask);
+}
+
+// Gather one of the 8 ints in ptr[0..1] per lane via a binary selection tree,
+// then zero out-of-range lanes.
+Simd4i Gather<Simd4i>::operator()(const Simd4i* ptr) const
+{
+ // more efficient with _mm_shuffle_epi8 (SSSE3)
+ Simd4i lo = ptr[0], hi = ptr[1];
+ Simd4i m01 = select(mSelectW, splat<1>(lo), splat<0>(lo));
+ Simd4i m23 = select(mSelectW, splat<3>(lo), splat<2>(lo));
+ Simd4i m45 = select(mSelectW, splat<1>(hi), splat<0>(hi));
+ Simd4i m67 = select(mSelectW, splat<3>(hi), splat<2>(hi));
+ Simd4i m0123 = select(mSelectD, m23, m01);
+ Simd4i m4567 = select(mSelectD, m67, m45);
+ return select(mSelectQ, m4567, m0123) & ~mOutOfRange;
+}
+
+} // namespace cloth
+} // namespace nvidia
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h
new file mode 100644
index 00000000..382812bb
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/sse2/SwSolveConstraints.h
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma warning(push)
+#pragma warning(disable:4127) // Disable the nag warning 'conditional expression is constant'
+
+// Gauss-Seidel distance-constraint solver kernel, 4 constraints per iteration.
+// posIt: particle array, 4 floats each (xyz position + w weight -- presumably
+//        inverse mass; TODO confirm). Entries are read-modify-written in place.
+// rIt/rEnd: rest lengths, 4 per iteration.
+// iIt: particle index pairs (i,j), 8 uint16 per iteration.
+// stiffness lanes: [0]=stiffness, [1]=multiplier, [2]=compressionLimit,
+//                  [3]=stretchLimit (the last three only when useMultiplier).
+// sMinusOneXYZOneW, sEpsilon, sMaskXY are file-scope constants declared
+// outside this header section.
+template <bool useMultiplier>
+void solveConstraints(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
+ const uint16_t* __restrict iIt, __m128 stiffness)
+{
+ __m128 sOne = _mm_set1_ps(1.0f);
+
+ __m128 stretchLimit, compressionLimit, multiplier;
+ if(useMultiplier)
+ {
+ stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff);
+ compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa);
+ multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55);
+ }
+ stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);
+
+ for(; rIt != rEnd; rIt += 4, iIt += 8)
+ {
+ float* p0i = posIt + iIt[0] * 4;
+ float* p0j = posIt + iIt[1] * 4;
+ float* p1i = posIt + iIt[2] * 4;
+ float* p1j = posIt + iIt[3] * 4;
+ float* p2i = posIt + iIt[4] * 4;
+ float* p2j = posIt + iIt[5] * 4;
+ float* p3i = posIt + iIt[6] * 4;
+ float* p3j = posIt + iIt[7] * 4;
+
+ __m128 v0i = _mm_load_ps(p0i);
+ __m128 v0j = _mm_load_ps(p0j);
+ __m128 v1i = _mm_load_ps(p1i);
+ __m128 v1j = _mm_load_ps(p1j);
+ __m128 v2i = _mm_load_ps(p2i);
+ __m128 v2j = _mm_load_ps(p2j);
+ __m128 v3i = _mm_load_ps(p3i);
+ __m128 v3j = _mm_load_ps(p3j);
+
+ // h = vj - vi in xyz; w lane accumulates wi + wj (sMinusOneXYZOneW = {-1,-1,-1,+1}).
+ __m128 h0ij = _mm_add_ps(v0j, _mm_mul_ps(v0i, sMinusOneXYZOneW));
+ __m128 h1ij = _mm_add_ps(v1j, _mm_mul_ps(v1i, sMinusOneXYZOneW));
+ __m128 h2ij = _mm_add_ps(v2j, _mm_mul_ps(v2i, sMinusOneXYZOneW));
+ __m128 h3ij = _mm_add_ps(v3j, _mm_mul_ps(v3i, sMinusOneXYZOneW));
+
+ // 4x4 transpose: gather the x, y, z components and weight sums of all
+ // four constraints into separate registers.
+ __m128 a = _mm_unpacklo_ps(h0ij, h2ij);
+ __m128 b = _mm_unpackhi_ps(h0ij, h2ij);
+ __m128 c = _mm_unpacklo_ps(h1ij, h3ij);
+ __m128 d = _mm_unpackhi_ps(h1ij, h3ij);
+
+ __m128 hxij = _mm_unpacklo_ps(a, c);
+ __m128 hyij = _mm_unpackhi_ps(a, c);
+ __m128 hzij = _mm_unpacklo_ps(b, d);
+ __m128 vwij = _mm_unpackhi_ps(b, d);
+
+ // erij = (1 - restLength/|h|), zeroed where restLength <= epsilon.
+ __m128 rij = _mm_load_ps(rIt);
+ __m128 e2ij = _mm_add_ps(
+ sEpsilon, _mm_add_ps(_mm_mul_ps(hxij, hxij), _mm_add_ps(_mm_mul_ps(hyij, hyij), _mm_mul_ps(hzij, hzij))));
+ __m128 mask = _mm_cmpnle_ps(rij, sEpsilon);
+ __m128 erij = _mm_and_ps(_mm_sub_ps(sOne, _mm_mul_ps(rij, _mm_rsqrt_ps(e2ij))), mask);
+
+ if(useMultiplier)
+ {
+ // Soften the error inside [compressionLimit, stretchLimit].
+ erij = _mm_sub_ps(erij, _mm_mul_ps(multiplier, _mm_max_ps(compressionLimit, _mm_min_ps(erij, stretchLimit))));
+ }
+ // exij = erij * stiffness / (wi + wj): shared correction per constraint.
+ __m128 exij = _mm_mul_ps(erij, _mm_mul_ps(stiffness, _mm_rcp_ps(_mm_add_ps(sEpsilon, vwij))));
+
+ // Split the four corrections so each can be splatted to its constraint.
+ __m128 exlo = _mm_and_ps(sMaskXY, exij);
+ __m128 exhi = _mm_andnot_ps(sMaskXY, exij);
+
+ __m128 f0ij = _mm_mul_ps(h0ij, _mm_shuffle_ps(exlo, exlo, 0xc0));
+ __m128 f1ij = _mm_mul_ps(h1ij, _mm_shuffle_ps(exlo, exlo, 0xd5));
+ __m128 f2ij = _mm_mul_ps(h2ij, _mm_shuffle_ps(exhi, exhi, 0x2a));
+ __m128 f3ij = _mm_mul_ps(h3ij, _mm_shuffle_ps(exhi, exhi, 0x3f));
+
+ // Move each endpoint along h, scaled by the correction and its own weight
+ // (w lane, splatted via shuffle 0xff): i moves toward j, j toward i.
+ __m128 u0i = _mm_add_ps(v0i, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0i, v0i, 0xff)));
+ __m128 u0j = _mm_sub_ps(v0j, _mm_mul_ps(f0ij, _mm_shuffle_ps(v0j, v0j, 0xff)));
+ __m128 u1i = _mm_add_ps(v1i, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1i, v1i, 0xff)));
+ __m128 u1j = _mm_sub_ps(v1j, _mm_mul_ps(f1ij, _mm_shuffle_ps(v1j, v1j, 0xff)));
+ __m128 u2i = _mm_add_ps(v2i, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2i, v2i, 0xff)));
+ __m128 u2j = _mm_sub_ps(v2j, _mm_mul_ps(f2ij, _mm_shuffle_ps(v2j, v2j, 0xff)));
+ __m128 u3i = _mm_add_ps(v3i, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3i, v3i, 0xff)));
+ __m128 u3j = _mm_sub_ps(v3j, _mm_mul_ps(f3ij, _mm_shuffle_ps(v3j, v3j, 0xff)));
+
+ _mm_store_ps(p0i, u0i);
+ _mm_store_ps(p0j, u0j);
+ _mm_store_ps(p1i, u1i);
+ _mm_store_ps(p1j, u1j);
+ _mm_store_ps(p2i, u2i);
+ _mm_store_ps(p2j, u2j);
+ _mm_store_ps(p3i, u3i);
+ _mm_store_ps(p3j, u3j);
+ }
+}
+
+#if PX_X86
+
+// clang-format:disable
+
+// asm blocks in static condition blocks don't get removed, specialize
+// 32-bit MSVC inline-asm port of solveConstraints<false> (no multiplier path).
+// NOTE(review): h is computed with the opposite sign to the intrinsic version
+// (vi - vj instead of vj - vi), so the final updates subtract for i and add
+// for j -- the net particle motion is identical.
+// asm blocks in static condition blocks don't get removed, specialize
+template <>
+void solveConstraints<false>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
+ const uint16_t* __restrict iIt, __m128 stiffness)
+{
+ __m128 sOne = _mm_set1_ps(1.0f);
+ stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);
+
+ __m128 htmp[4];
+ float* ptmp[8];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ // Load indices (x16 for the byte stride), stash them, load the particles.
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */
+
+ movaps xmm7, sMinusOneXYZOneW
+ mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */
+ mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */
+
+ mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
+ mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */
+
+ // 4x4 transpose of the h vectors into component registers.
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+
+ unpcklps xmm0, xmm2 /* a */
+ unpckhps xmm4, xmm2 /* b */
+ unpcklps xmm1, xmm3 /* c */
+ unpckhps xmm5, xmm3 /* d */
+
+ movaps xmm2, xmm0
+ movaps xmm6, xmm4
+
+ unpcklps xmm0, xmm1 /* hxij */
+ unpckhps xmm2, xmm1 /* hyij */
+ unpcklps xmm4, xmm5 /* hzij */
+ unpckhps xmm6, xmm5 /* vwij */
+
+ movaps xmm7, sEpsilon
+ movaps xmm5, sOne
+ movaps xmm3, stiffness
+ movaps xmm1, XMMWORD PTR [edx] /* rij */
+
+ mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
+ mulps xmm2, xmm2 __asm addps xmm0, xmm2
+ mulps xmm4, xmm4 __asm addps xmm0, xmm4
+
+ rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */
+ cmpnleps xmm1, xmm7 /* mask */
+ subps xmm5, xmm0 __asm andps xmm5, xmm1
+ addps xmm6, xmm7 __asm rcpps xmm6, xmm6
+
+ mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */
+
+ movaps xmm7, sMaskXY
+ andps xmm7, xmm6 /* exlo */
+ xorps xmm6, xmm7 /* exhi */
+
+ movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */
+ movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */
+ movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */
+ movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */
+
+ pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */
+ pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */
+ pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */
+ pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */
+
+ // Reload each endpoint, apply the weighted correction, store back.
+ mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ add eax, 16
+ add edx, 16
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+}
+
+// 32-bit MSVC inline-asm port of solveConstraints<true> (with the
+// stretch/compression multiplier). Identical to the <false> specialization
+// except for the "multiplier block" clamping erij before exij is formed.
+// NOTE(review): as in the <false> path, h carries the opposite sign to the
+// intrinsic version, so i subtracts and j adds -- net motion identical.
+template <>
+void solveConstraints<true>(float* __restrict posIt, const float* __restrict rIt, const float* __restrict rEnd,
+ const uint16_t* __restrict iIt, __m128 stiffness)
+{
+ __m128 sOne = _mm_set1_ps(1.0f);
+ __m128 stretchLimit = _mm_shuffle_ps(stiffness, stiffness, 0xff);
+ __m128 compressionLimit = _mm_shuffle_ps(stiffness, stiffness, 0xaa);
+ __m128 multiplier = _mm_shuffle_ps(stiffness, stiffness, 0x55);
+ stiffness = _mm_shuffle_ps(stiffness, stiffness, 0x00);
+
+ __m128 htmp[4];
+ float* ptmp[8];
+
+ __asm
+ {
+ mov edx, rIt
+ mov esi, rEnd
+
+ cmp edx, esi
+ jae forEnd
+
+ mov eax, iIt
+ mov ecx, posIt
+
+forBegin:
+ movzx edi, WORD PTR [eax ] __asm shl edi, 4 __asm mov [ptmp ], edi __asm movaps xmm0, XMMWORD PTR [edi + ecx] /* v0i */
+ movzx edi, WORD PTR [eax+ 2] __asm shl edi, 4 __asm mov [ptmp+ 4], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v0j */
+ movzx edi, WORD PTR [eax+ 4] __asm shl edi, 4 __asm mov [ptmp+ 8], edi __asm movaps xmm1, XMMWORD PTR [edi + ecx] /* v1i */
+ movzx edi, WORD PTR [eax+ 6] __asm shl edi, 4 __asm mov [ptmp+12], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v1j */
+
+ movaps xmm7, sMinusOneXYZOneW
+ mulps xmm2, xmm7 __asm addps xmm0, xmm2 __asm movaps XMMWORD PTR [htmp ], xmm0 /* h0ij */
+ mulps xmm3, xmm7 __asm addps xmm1, xmm3 __asm movaps XMMWORD PTR [htmp+16], xmm1 /* h1ij */
+
+ movzx edi, WORD PTR [eax+ 8] __asm shl edi, 4 __asm mov [ptmp+16], edi __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
+ movzx edi, WORD PTR [eax+10] __asm shl edi, 4 __asm mov [ptmp+20], edi __asm movaps xmm2, XMMWORD PTR [edi + ecx] /* v2j */
+ movzx edi, WORD PTR [eax+12] __asm shl edi, 4 __asm mov [ptmp+24], edi __asm movaps xmm5, XMMWORD PTR [edi + ecx] /* v3i */
+ movzx edi, WORD PTR [eax+14] __asm shl edi, 4 __asm mov [ptmp+28], edi __asm movaps xmm3, XMMWORD PTR [edi + ecx] /* v3j */
+
+ mulps xmm2, xmm7 __asm addps xmm2, xmm4 __asm movaps XMMWORD PTR [htmp+32], xmm2 /* h2ij */
+ mulps xmm3, xmm7 __asm addps xmm3, xmm5 __asm movaps XMMWORD PTR [htmp+48], xmm3 /* h3ij */
+
+ movaps xmm4, xmm0
+ movaps xmm5, xmm1
+
+ unpcklps xmm0, xmm2 /* a */
+ unpckhps xmm4, xmm2 /* b */
+ unpcklps xmm1, xmm3 /* c */
+ unpckhps xmm5, xmm3 /* d */
+
+ movaps xmm2, xmm0
+ movaps xmm6, xmm4
+
+ unpcklps xmm0, xmm1 /* hxij */
+ unpckhps xmm2, xmm1 /* hyij */
+ unpcklps xmm4, xmm5 /* hzij */
+ unpckhps xmm6, xmm5 /* vwij */
+
+ movaps xmm7, sEpsilon
+ movaps xmm5, sOne
+ movaps xmm3, stiffness
+ movaps xmm1, XMMWORD PTR [edx] /* rij */
+
+ mulps xmm0, xmm0 __asm addps xmm0, xmm7 /* e2ij */
+ mulps xmm2, xmm2 __asm addps xmm0, xmm2
+ mulps xmm4, xmm4 __asm addps xmm0, xmm4
+
+ rsqrtps xmm0, xmm0 __asm mulps xmm0, xmm1 /* erij */
+ cmpnleps xmm1, xmm7 /* mask */
+ subps xmm5, xmm0 __asm andps xmm5, xmm1
+ addps xmm6, xmm7 __asm rcpps xmm6, xmm6
+
+ // erij -= multiplier * clamp(erij, compressionLimit, stretchLimit)
+ movaps xmm0, stretchLimit /* multiplier block */
+ movaps xmm1, compressionLimit
+ movaps xmm2, multiplier
+ minps xmm0, xmm5
+ maxps xmm1, xmm0
+ mulps xmm2, xmm1
+ subps xmm5, xmm2
+
+ mulps xmm6, xmm3 __asm mulps xmm6, xmm5 /* exij */
+
+ movaps xmm7, sMaskXY
+ andps xmm7, xmm6 /* exlo */
+ xorps xmm6, xmm7 /* exhi */
+
+ movaps xmm0, XMMWORD PTR [htmp ] /* h0ij */
+ movaps xmm1, XMMWORD PTR [htmp+16] /* h1ij */
+ movaps xmm2, XMMWORD PTR [htmp+32] /* h2ij */
+ movaps xmm3, XMMWORD PTR [htmp+48] /* h3ij */
+
+ pshufd xmm5, xmm7, 0xc0 __asm mulps xmm0, xmm5 /* f0ij */
+ pshufd xmm7, xmm7, 0xd5 __asm mulps xmm1, xmm7 /* f1ij */
+ pshufd xmm4, xmm6, 0x2a __asm mulps xmm2, xmm4 /* f2ij */
+ pshufd xmm6, xmm6, 0x3f __asm mulps xmm3, xmm6 /* f3ij */
+
+ mov edi, [ptmp ] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v0i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm0 __asm subps xmm4, xmm5 /* u0i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+ 4] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v0j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm0 __asm addps xmm6, xmm7 /* u0j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ mov edi, [ptmp+ 8] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v1i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm1 __asm subps xmm4, xmm5 /* u1i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+12] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v1j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm1 __asm addps xmm6, xmm7 /* u1j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ mov edi, [ptmp+16] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v2i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm2 __asm subps xmm4, xmm5 /* u2i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+20] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v2j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm2 __asm addps xmm6, xmm7 /* u2j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ mov edi, [ptmp+24] __asm movaps xmm4, XMMWORD PTR [edi + ecx] /* v3i */
+ pshufd xmm5, xmm4, 0xff __asm mulps xmm5, xmm3 __asm subps xmm4, xmm5 /* u3i */
+ movaps XMMWORD PTR [edi + ecx], xmm4
+
+ mov edi, [ptmp+28] __asm movaps xmm6, XMMWORD PTR [edi + ecx] /* v3j */
+ pshufd xmm7, xmm6, 0xff __asm mulps xmm7, xmm3 __asm addps xmm6, xmm7 /* u3j */
+ movaps XMMWORD PTR [edi + ecx], xmm6
+
+ add eax, 16
+ add edx, 16
+
+ cmp edx, esi
+ jb forBegin
+forEnd:
+ }
+}
+
+// clang-format:enable
+
+#endif
+
+#pragma warning(pop)
diff --git a/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h
new file mode 100644
index 00000000..59cec2d9
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/LowLevelCloth/src/windows/CuFactory.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+#pragma once
+
+#include "Factory.h"
+#include "Allocator.h"
+
+namespace physx
+{
+ class PxCudaContextManager;
+}
+
+namespace nvidia
+{
+namespace cloth
+{
+
+class CuFabric;
+class CuCloth;
+template <typename>
+class ClothImpl;
+
+/**
+\brief CUDA-backed implementation of the low-level cloth Factory interface.
+\details Creates fabrics/cloths whose simulation data is managed through the
+supplied PxCudaContextManager. Non-copyable (assignment declared, not defined).
+*/
+class CuFactory : public UserAllocated, public Factory
+{
+  protected:
+	// non-copyable: declared but intentionally left undefined
+	CuFactory& operator=(const CuFactory&);
+
+  public:
+	typedef CuFabric FabricType;
+	typedef ClothImpl<CuCloth> ImplType;
+
+	CuFactory(PxCudaContextManager*);
+	virtual ~CuFactory();
+
+	// --- creation (Factory interface) ---
+	virtual Fabric* createFabric(uint32_t numParticles, Range<const uint32_t> phases, Range<const uint32_t> sets,
+	                             Range<const float> restvalues, Range<const uint32_t> indices,
+	                             Range<const uint32_t> anchors, Range<const float> tetherLengths);
+
+	virtual Cloth* createCloth(Range<const PxVec4> particles, Fabric& fabric);
+
+	virtual Solver* createSolver(profile::PxProfileZone* profiler, PxTaskManager* taskMgr);
+
+	virtual Cloth* clone(const Cloth& cloth);
+
+	// --- data extraction back into caller-provided ranges (Factory interface) ---
+	virtual void extractFabricData(const Fabric& fabric, Range<uint32_t> phases, Range<uint32_t> sets,
+	                               Range<float> restvalues, Range<uint32_t> indices, Range<uint32_t> anchors,
+	                               Range<float> tetherLengths) const;
+
+	virtual void extractCollisionData(const Cloth& cloth, Range<PxVec4> spheres, Range<uint32_t> capsules,
+	                                  Range<PxVec4> planes, Range<uint32_t> convexes, Range<PxVec3> triangles) const;
+
+	virtual void extractMotionConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractSeparationConstraints(const Cloth& cloth, Range<PxVec4> destConstraints) const;
+
+	virtual void extractParticleAccelerations(const Cloth& cloth, Range<PxVec4> destAccelerations) const;
+
+	virtual void extractVirtualParticles(const Cloth& cloth, Range<uint32_t[4]> destIndices,
+	                                     Range<PxVec3> destWeights) const;
+
+	virtual void extractSelfCollisionIndices(const Cloth& cloth, Range<uint32_t> destIndices) const;
+
+	virtual void extractRestPositions(const Cloth& cloth, Range<PxVec4> destRestPositions) const;
+
+  public:
+	// copies [srcIt, srcEnd) to dstIt; presumably a device-to-host copy — TODO confirm against CuFactory.cpp
+	void copyToHost(const void* srcIt, const void* srcEnd, void* dstIt) const;
+
+  public:
+	// fabrics created by this factory — assumed to be the live-fabric registry; verify against createFabric impl
+	Vector<CuFabric*>::Type mFabrics;
+
+	PxCudaContextManager* mContextManager;
+
+	uint32_t mNumThreadsPerBlock;
+
+	const uint32_t mMaxThreadsPerBlock;
+};
+}
+}
diff --git a/APEX_1.4/module/clothing/embedded/PxClothFabric.h b/APEX_1.4/module/clothing/embedded/PxClothFabric.h
new file mode 100644
index 00000000..78d41228
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/PxClothFabric.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_NX_CLOTH_FABRIC
+#define PX_PHYSICS_NX_CLOTH_FABRIC
+/** \addtogroup cloth
+ @{
+*/
+
+
+#if PX_DOXYGEN == 0
+namespace nvidia
+{
+#endif
+
/**
\brief Type of a constraint phase in a cloth fabric.
\see PxClothFabric for an explanation of concepts on phase and set.
*/
struct PxClothFabricPhaseType
{
	enum Enum
	{
		eINVALID    = 0, //!< invalid type
		eVERTICAL   = 1, //!< resists stretching or compression, usually along the gravity
		eHORIZONTAL = 2, //!< resists stretching or compression, perpendicular to the gravity
		eBENDING    = 3, //!< resists out-of-plane bending in angle-based formulation
		eSHEARING   = 4, //!< resists in-plane shearing along (typically) diagonal edges
		eCOUNT      = 5  //!< number of valid phase types, internal use only
	};
};
+
+/**
+\brief References a set of constraints that can be solved in parallel.
+\see PxClothFabric for an explanation of the concepts on phase and set.
+*/
+struct PxClothFabricPhase
+{
+ PxClothFabricPhase(PxClothFabricPhaseType::Enum type =
+ PxClothFabricPhaseType::eINVALID, uint32_t index = 0);
+
+ /**
+ \brief Type of constraints to solve.
+ */
+ PxClothFabricPhaseType::Enum phaseType;
+
+ /**
+ \brief Index of the set that contains the particle indices.
+ */
+ uint32_t setIndex;
+};
+
+PX_INLINE PxClothFabricPhase::PxClothFabricPhase(
+ PxClothFabricPhaseType::Enum type, uint32_t index)
+ : phaseType(type)
+ , setIndex(index)
+{}
+
+/**
+\brief References all the data required to create a fabric.
+\see PxPhysics.createClothFabric(), PxClothFabricCooker.getDescriptor()
+*/
+class PxClothFabricDesc
+{
+public:
+	/** \brief The number of particles needed when creating a PxCloth instance from the fabric. */
+	uint32_t nbParticles;
+
+	/** \brief The number of solver phases. */
+	uint32_t nbPhases;
+	/** \brief Array defining which constraints to solve each phase. See #PxClothFabric.getPhases(). */
+	// Note: all pointer members reference caller-owned memory; the descriptor does not copy or own it.
+	const PxClothFabricPhase* phases;
+
+	/** \brief The number of sets in the fabric. */
+	uint32_t nbSets;
+	/** \brief Array with an index per set which points one entry beyond the last constraint of the set. See #PxClothFabric.getSets(). */
+	const uint32_t* sets;
+
+	/** \brief Array of particle indices which specifies the pair of constrained vertices. See #PxClothFabric.getParticleIndices(). */
+	const uint32_t* indices;
+	/** \brief Array of rest values for each constraint. See #PxClothFabric.getRestvalues(). */
+	const float* restvalues;
+
+	/** \brief Size of tetherAnchors and tetherLengths arrays, needs to be multiple of nbParticles. */
+	// Zero means the fabric has no tether constraints; the two tether arrays may then be NULL.
+	uint32_t nbTethers;
+	/** \brief Array of particle indices specifying the tether anchors. See #PxClothFabric.getTetherAnchors(). */
+	const uint32_t* tetherAnchors;
+	/** \brief Array of rest distance between tethered particle pairs. See #PxClothFabric.getTetherLengths(). */
+	const float* tetherLengths;
+
+	/**
+	\brief constructor sets to default.
+	*/
+	PX_INLINE PxClothFabricDesc();
+
+	/**
+	\brief (re)sets the structure to the default.
+	*/
+	PX_INLINE void setToDefault();
+
+	/**
+	\brief Returns true if the descriptor is valid.
+	\return True if the current settings are valid
+	*/
+	PX_INLINE bool isValid() const;
+};
+
+// Constructor simply resets all members, keeping construction and setToDefault() in sync.
+PX_INLINE PxClothFabricDesc::PxClothFabricDesc()
+{
+	setToDefault();
+}
+
+PX_INLINE void PxClothFabricDesc::setToDefault()
+{
+	// Zero-fill is valid here: the descriptor only holds integers and raw (non-owning) pointers.
+	memset(this, 0, sizeof(PxClothFabricDesc));
+}
+
+// All mandatory counts and arrays must be non-zero/non-NULL; the tether arrays
+// are only required when nbTethers > 0.
+PX_INLINE bool PxClothFabricDesc::isValid() const
+{
+	return (nbParticles && nbPhases && phases && restvalues && nbSets
+	       && sets && indices && (!nbTethers || (tetherAnchors && tetherLengths)));
+}
+
+
+#if PX_DOXYGEN == 0
+} // namespace nvidia
+#endif
+
+/** @} */
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/PxClothMeshDesc.h b/APEX_1.4/module/clothing/embedded/PxClothMeshDesc.h
new file mode 100644
index 00000000..c60aef5b
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/PxClothMeshDesc.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_NX_CLOTHMESHDESC
+#define PX_PHYSICS_NX_CLOTHMESHDESC
+/** \addtogroup cooking
+@{
+*/
+
+#include "ExtClothCoreUtilityTypes.h"
+#include "PxVec3.h"
+
+#if PX_DOXYGEN == 0
+namespace nvidia
+{
+#endif
+
+/**
+\brief Descriptor class for a cloth mesh.
+
+@see PxCooking.cookClothMesh()
+
+*/
+class PxClothMeshDesc
+{
+public:
+
+	/**
+	\brief Pointer to first vertex point.
+	*/
+	// Note: all PxBoundedData members reference caller-owned memory; nothing is copied or owned.
+	PxBoundedData points;
+
+	/**
+	\brief Determines whether particle is simulated or static.
+	A positive value denotes that the particle is being simulated, zero denotes a static particle.
+	This data is used to generate tether and zero stretch constraints.
+	If invMasses.data is null, all particles are assumed to be simulated
+	and no tether and zero stretch constraints are being generated.
+	*/
+	// When present, invMasses.count must equal points.count (see isValid()).
+	PxBoundedData invMasses;
+
+	/**
+	\brief Pointer to the first triangle.
+
+	These are triplets of 0 based indices:
+	vert0 vert1 vert2
+	vert0 vert1 vert2
+	vert0 vert1 vert2
+	...
+
+	where vert* is either a 32 or 16 bit unsigned integer. There are a total of 3*count indices.
+	The stride determines the byte offset to the next index triple.
+
+	This is declared as a void pointer because it is actually either an uint16_t or a uint32_t pointer.
+	*/
+	// Triangles and quads may both be supplied; at least one of the two is required (see isValid()).
+	PxBoundedData triangles;
+
+	/**
+	\brief Pointer to the first quad.
+
+	These are quadruples of 0 based indices:
+	vert0 vert1 vert2 vert3
+	vert0 vert1 vert2 vert3
+	vert0 vert1 vert2 vert3
+	...
+
+	where vert* is either a 32 or 16 bit unsigned integer. There are a total of 4*count indices.
+	The stride determines the byte offset to the next index quadruple.
+
+	This is declared as a void pointer because it is actually either an uint16_t or a uint32_t pointer.
+	*/
+	PxBoundedData quads;
+
+	/**
+	\brief Flags bits, combined from values of the enum ::PxMeshFlag
+	*/
+	// e16_BIT_INDICES selects uint16_t indices; only allowed for meshes with <= 0xffff points.
+	PxMeshFlags flags;
+
+	/**
+	\brief constructor sets to default.
+	*/
+	PX_INLINE PxClothMeshDesc();
+	/**
+	\brief (re)sets the structure to the default.
+	*/
+	PX_INLINE void setToDefault();
+	/**
+	\brief Returns true if the descriptor is valid.
+	\return True if the current settings are valid
+	*/
+	PX_INLINE bool isValid() const;
+};
+
+PX_INLINE PxClothMeshDesc::PxClothMeshDesc() //constructor sets to default
+{
+}
+
+PX_INLINE void PxClothMeshDesc::setToDefault()
+{
+ *this = PxClothMeshDesc();
+}
+
+PX_INLINE bool PxClothMeshDesc::isValid() const
+{
+ if(points.count < 3) //at least 1 trig's worth of points
+ return false;
+ if(points.count > 0xffff && flags & PxMeshFlag::e16_BIT_INDICES)
+ return false;
+ if(!points.data)
+ return false;
+ if(points.stride < sizeof(physx::PxVec3)) //should be at least one point's worth of data
+ return false;
+
+ if(invMasses.data && invMasses.stride < sizeof(float))
+ return false;
+ if(invMasses.data && invMasses.count != points.count)
+ return false;
+
+ if (!triangles.count && !quads.count) // no support for non-indexed mesh
+ return false;
+ if (triangles.count && !triangles.data)
+ return false;
+ if (quads.count && !quads.data)
+ return false;
+
+ uint32_t indexSize = (flags & PxMeshFlag::e16_BIT_INDICES) ? sizeof(uint16_t) : sizeof(uint32_t);
+ if(triangles.count && triangles.stride < indexSize*3)
+ return false;
+ if(quads.count && quads.stride < indexSize*4)
+ return false;
+
+ return true;
+}
+
+#if PX_DOXYGEN == 0
+} // namespace nvidia
+#endif
+
+/** @} */
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/PxClothTypes.h b/APEX_1.4/module/clothing/embedded/PxClothTypes.h
new file mode 100644
index 00000000..a210d6cc
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/PxClothTypes.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
+
+
+#ifndef PX_PHYSICS_NX_CLOTH_TYPES
+#define PX_PHYSICS_NX_CLOTH_TYPES
+/** \addtogroup cloth
+ @{
+*/
+
+#include "PxPhysXConfig.h"
+#include "PxFlags.h"
+
+#include "PxVec3.h"
+
+#if PX_DOXYGEN == 0
+namespace nvidia
+{
+#endif
+
/**
 \brief Flags controlling optional behaviors of the cloth solver.
 \details Each bit toggles one solver feature. Flags can be set at cloth
 construction time (\see PxPhysics.createCloth() ) or individually after the
 cloth has been created (\see PxCloth.setClothFlag() ).
 */
struct PxClothFlag
{
	enum Enum
	{
		eGPU             = 1 << 0, //!< turn on/off gpu based solver
		eSWEPT_CONTACT   = 1 << 1, //!< use swept contact (continuous collision)
		eSCENE_COLLISION = 1 << 2, //!< collide against rigid body shapes in scene
		eCOUNT           = 3       //!< number of flag bits, internal use only
	};
};
+
+typedef PxFlags<PxClothFlag::Enum,uint16_t> PxClothFlags;
+PX_FLAGS_OPERATORS(PxClothFlag::Enum, uint16_t)
+
+/**
+ \brief Per particle data for cloth.
+ \details Defines position of the cloth particle as well as inverse mass.
+ When inverse mass is set to 0, the particle gets fully constrained
+ to the position during simulation.
+ \see PxPhysics.createCloth()
+ \see PxCloth.setParticles()
+*/
+struct PxClothParticle
+{
+ PxVec3 pos; //!< position of the particle (in cloth local space)
+ float invWeight; //!< inverse mass of the particle. If set to 0, the particle is fully constrained.
+
+ /**
+ \brief Default constructor, performs no initialization.
+ */
+ PxClothParticle() {}
+ PxClothParticle(const PxVec3& pos_, float invWeight_)
+ : pos(pos_), invWeight(invWeight_){}
+};
+
+/**
+\brief Constraints for cloth particle motion.
+\details Defines a spherical volume to which the motion of a particle should be constrained.
+@see PxCloth.setMotionConstraints()
+*/
+struct PxClothParticleMotionConstraint
+{
+ PxVec3 pos; //!< Center of the motion constraint sphere (in cloth local space)
+ float radius; //!< Maximum distance the particle can move away from the sphere center.
+
+ /**
+ \brief Default constructor, performs no initialization.
+ */
+ PxClothParticleMotionConstraint() {}
+ PxClothParticleMotionConstraint(const PxVec3& p, float r)
+ : pos(p), radius(r){}
+};
+
+/**
+\brief Separation constraints for cloth particle movement
+\details Defines a spherical volume such that corresponding particles should stay outside.
+@see PxCloth.setSeparationConstraints()
+*/
+struct PxClothParticleSeparationConstraint
+{
+ PxVec3 pos; //!< Center of the constraint sphere (in cloth local space)
+ float radius; //!< Radius of the constraint sphere such that the particle stay outside of this sphere.
+
+ /**
+ \brief Default constructor, performs no initialization.
+ */
+ PxClothParticleSeparationConstraint() {}
+ PxClothParticleSeparationConstraint(const PxVec3& p, float r)
+ : pos(p), radius(r){}
+};
+
+#if PX_DOXYGEN == 0
+} // namespace nvidia
+#endif
+
+/** @} */
+#endif
diff --git a/APEX_1.4/module/clothing/embedded/Simulation.cpp b/APEX_1.4/module/clothing/embedded/Simulation.cpp
new file mode 100644
index 00000000..3705a156
--- /dev/null
+++ b/APEX_1.4/module/clothing/embedded/Simulation.cpp
@@ -0,0 +1,2488 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+#include "Simulation.h"
+
+#include "ModuleClothingImpl.h"
+#include "ClothingScene.h"
+#include "ClothingCookedPhysX3Param.h"
+
+#include "DebugRenderParams.h"
+#include "ClothingDebugRenderParams.h"
+//#include "RenderDebugInterface.h"
+#include "RenderDebugInterface.h"
+
+#include "ModuleClothingHelpers.h"
+#include "ClothStructs.h"
+
+// only for the phase flags
+#include "ExtClothFabricCooker.h"
+
+// from LowLevelCloth
+#include "Cloth.h"
+#include "Fabric.h"
+#include "Factory.h"
+#include "Range.h"
+#include "Solver.h"
+
+#include "ApexSDKIntl.h"
+#include "SceneIntl.h"
+#include "PxCudaContextManager.h"
+#include "PxGpuDispatcher.h"
+
+#include "PsIntrinsics.h"
+#include "ProfilerCallback.h"
+
+#include <ApexCollision.h>
+#include "ApexMath.h"
+
+#include "Lock.h"
+
+#include "ClothingCollisionImpl.h"
+
+// visualize convexes
+#include "ApexSharedUtils.h"
+
+// pvd
+#include "ApexPvdClient.h"
+
+#include "PxPvdDataStream.h"
+#include "PxPvdUserRenderer.h"
+
+#if PX_PHYSICS_VERSION_MAJOR == 3
+#include "ScopedPhysXLock.h"
+#endif
+
+namespace nvidia
+{
+namespace clothing
+{
+
+using namespace physx;
+
+/**
+\brief Constructor: initializes all members to their "no cloth created yet" state.
+\param clothingScene Owning scene; must not be NULL.
+\param useCuda Request a GPU solver; only honored on Windows platforms.
+*/
+Simulation::Simulation(ClothingScene* clothingScene, bool useCuda) : SimulationAbstract(clothingScene),
+	mCookedData(NULL),
+	mIndices(NULL),
+	mRestPositions(NULL),
+	mConstrainCoeffs(NULL),
+	mCloth(NULL),
+	mNumAssetSpheres(0),
+	mNumAssetCapsules(0),
+	mNumAssetCapsulesInvalid(0),
+	mNumAssetConvexes(0),
+	mConstrainConstantsDirty(false),
+	mMotionConstrainScale(1.0f),
+	mMotionConstrainBias(0.0f),
+	mNumBackstopConstraints(-1),
+	mScaledGravity(0.0f),
+	mLastTimestep(0.0f),
+	mLocalSpaceSim(false),
+	mGlobalPose(PxMat44(PxIdentity)),
+	mGlobalPosePrevious(PxMat44(PxIdentity)),
+	mGlobalPoseNormalized(PxMat44(PxIdentity)),
+	mGlobalPoseNormalizedInv(PxMat44(PxIdentity)),
+	mActorScale(0.0f),
+	mTetherLimit(0.0f),
+	mTeleported(false),
+	mIsStatic(false)
+{
+	PX_ASSERT(clothingScene != NULL);
+	PX_UNUSED(useCuda);
+
+	// CUDA is only available on Windows builds; everywhere else force the CPU path
+#if PX_WINDOWS_FAMILY
+	mUseCuda = useCuda;
+#else
+	mUseCuda = false; // disabled on consoles
+#endif
+}
+
+
+
+/**
+\brief Destructor: unregisters the cloth from its solver and destroys it.
+\details The clothing scene is locked around the removal so the solver cannot
+be stepping the cloth while it is deleted.
+*/
+Simulation::~Simulation()
+{
+	if (mCloth != NULL)
+	{
+		mClothingScene->lockScene();
+		mClothingScene->getClothSolver(mUseCuda)->removeCloth(mCloth);
+		delete mCloth;
+		mClothingScene->unlockScene();
+		mCloth = NULL;
+	}
+}
+
+
+
+/**
+\brief Whether this simulation type should be cached because creation is expensive.
+\return Always false; unused objects are deliberately not cached.
+*/
+bool Simulation::needsExpensiveCreation()
+{
+	// disable caching of unused objects!
+	return false;
+}
+
+
+
+/**
+\brief Whether the framework must adapt the target simulation frequency.
+\return Always false; the embedded cloth solver handles this itself.
+*/
+bool Simulation::needsAdaptiveTargetFrequency()
+{
+	// this is handled by the cloth solver directly
+	return false;
+}
+
+
+
+/**
+\brief Whether the framework must substep the skinned positions manually.
+\return Always false; the solver interpolates the skinned positions itself.
+*/
+bool Simulation::needsManualSubstepping()
+{
+	// the solver will interpolate the skinned positions itself
+	return false;
+}
+
+
+
+/**
+\brief Whether gravity must be transformed into local space by the framework.
+\return Always false for this simulation backend.
+*/
+bool Simulation::needsLocalSpaceGravity()
+{
+	return false;
+}
+
+
+
+uint32_t Simulation::getNumSolverIterations() const
+{
+ uint32_t numSolverIterations = 0;
+ if (mCloth != NULL)
+ {
+ numSolverIterations = (uint32_t)PxMax(1, int(mLastTimestep * mCloth->getSolverFrequency() + 0.5f));
+ }
+ return numSolverIterations;
+}
+
+
+
+/**
+\brief Stores the cooked cloth data and the actor scale for later initialization.
+\param cookedData Must be a ClothingCookedPhysX3Param instance; any other class is rejected.
+\param actorScale Uniform actor scale; asserted to be positive.
+\return true if the cooked data was accepted.
+*/
+bool Simulation::setCookedData(NvParameterized::Interface* cookedData, float actorScale)
+{
+	PX_ASSERT(cookedData != NULL);
+
+	mActorScale = actorScale;
+	PX_ASSERT(mActorScale > 0.0f);
+
+	// only the PhysX3 cooked format is supported by this simulation backend
+	if (::strcmp(cookedData->className(), ClothingCookedPhysX3Param::staticClassName()) != 0)
+	{
+		PX_ALWAYS_ASSERT();
+		return false;
+	}
+
+	mCookedData = static_cast<ClothingCookedPhysX3Param*>(cookedData);
+
+	return true;
+}
+
+
+/**
+\brief Creates the low-level cloth objects (fabric + cloth) for the given physical mesh.
+\param _physicalMeshId Id of the physical mesh; the matching entry is searched in the
+       mCookedData linked list.
+\param indices Simulation mesh indices; stored for later use.
+\param restPositions Simulation mesh rest positions; stored for later use.
+\param material Optional clothing material, applied after the phase configs are set up.
+\param scaledGravity Gravity passed along when applying the material.
+\return true if a cloth object exists after initialization.
+*/
+bool Simulation::initPhysics(uint32_t _physicalMeshId, uint32_t* indices, PxVec3* restPositions, tMaterial* material, const PxMat44& /*globalPose*/, const PxVec3& scaledGravity, bool /*localSpaceSim*/)
+{
+	PX_ASSERT(mCookedData != NULL);
+
+	// walk the cooked-data chain to the entry for this physical mesh.
+	// NOTE(review): if no entry matches, this dereferences a NULL nextCookedData —
+	// assumes callers guarantee a match, TODO confirm
+	while (mCookedData->physicalMeshId != _physicalMeshId)
+	{
+		mCookedData = static_cast<ClothingCookedPhysX3Param*>(mCookedData->nextCookedData);
+	}
+
+	PX_ASSERT(mCookedData != NULL);
+	PX_ASSERT(mCookedData->physicalMeshId == _physicalMeshId);
+
+	mIndices = indices;
+	mRestPositions = restPositions;
+
+	if (mCookedData != NULL)
+	{
+#if PX_PHYSICS_VERSION_MAJOR == 3
+		SCOPED_PHYSX_LOCK_WRITE(mClothingScene->getApexScene());
+#else
+		WRITE_LOCK(*mClothingScene->getApexScene());
+#endif
+		// PH: mUseCuda is passed by reference. If for whatever reason a FactoryGPU could not be created, a FactoryCPU is returned and mUseCuda will be false
+		ClothFactory factory = mClothingScene->getClothFactory(mUseCuda);
+		nvidia::Mutex::ScopedLock _wlockFactory(*factory.mutex);
+
+
+		// find if there's a shared fabric
+		// CPU fabrics are cached in a single slot; GPU fabrics are cached per factory
+		cloth::Fabric* fabric = NULL;
+		if (factory.factory->getPlatform() == cloth::Factory::CPU)
+		{
+			fabric = (cloth::Fabric*)mCookedData->fabricCPU;
+		}
+		else
+		{
+			for (int32_t i = 0; i < mCookedData->fabricGPU.arraySizes[0]; ++i)
+			{
+				if (mCookedData->fabricGPU.buf[i].factory == factory.factory)
+				{
+					fabric = (cloth::Fabric*)mCookedData->fabricGPU.buf[i].fabricGPU;
+					break;
+				}
+			}
+		}
+
+		// no cached fabric for this factory yet: build one from the cooked arrays
+		if (fabric == NULL)
+		{
+			nvidia::Array<uint32_t> phases((uint32_t)mCookedData->deformablePhaseDescs.arraySizes[0]);
+			for (uint32_t i = 0; i < phases.size(); i++)
+				phases[i] = mCookedData->deformablePhaseDescs.buf[i].setIndex;
+			nvidia::Array<uint32_t> sets((uint32_t)mCookedData->deformableSets.arraySizes[0]);
+			for (uint32_t i = 0; i < sets.size(); i++)
+			{
+				sets[i] = mCookedData->deformableSets.buf[i].fiberEnd;
+			}
+			cloth::Range<uint32_t> indices(mCookedData->deformableIndices.buf, mCookedData->deformableIndices.buf + mCookedData->deformableIndices.arraySizes[0]);
+			cloth::Range<float> restLengths(mCookedData->deformableRestLengths.buf, mCookedData->deformableRestLengths.buf + mCookedData->deformableRestLengths.arraySizes[0]);
+			cloth::Range<uint32_t> tetherAnchors(mCookedData->tetherAnchors.buf, mCookedData->tetherAnchors.buf + mCookedData->tetherAnchors.arraySizes[0]);
+			cloth::Range<float> tetherLengths(mCookedData->tetherLengths.buf, mCookedData->tetherLengths.buf + mCookedData->tetherLengths.arraySizes[0]);
+
+			PX_PROFILE_ZONE("ClothingActorImpl::createClothFabric", GetInternalApexSDK()->getContextId());
+
+			// TODO use PhysX interface to scale tethers when available
+			// NOTE(review): this scales the shared cooked data in place — if a second
+			// factory later builds its own fabric, the lengths get scaled again; verify
+			for (int i = 0; i < mCookedData->tetherLengths.arraySizes[0]; ++i)
+			{
+				mCookedData->tetherLengths.buf[i] *= simulation.restLengthScale;
+			}
+
+			fabric = factory.factory->createFabric(
+			             mCookedData->numVertices,
+			             cloth::Range<uint32_t>(phases.begin(), phases.end()),
+			             cloth::Range<uint32_t>(sets.begin(), sets.end()),
+			             restLengths,
+			             indices,
+			             tetherAnchors,
+			             tetherLengths
+			         );
+
+
+			// store new fabric pointer so it can be shared
+			if (factory.factory->getPlatform() == cloth::Factory::CPU)
+			{
+				mCookedData->fabricCPU = fabric;
+			}
+			else
+			{
+				// GPU fabrics: append a (factory, fabric) pair to the parameterized array
+				NvParameterized::Handle handle(*mCookedData);
+				int32_t arraysize = 0;
+
+				if (mCookedData->getParameterHandle("fabricGPU", handle) == NvParameterized::ERROR_NONE)
+				{
+					handle.getArraySize(arraysize, 0);
+					handle.resizeArray(arraysize + 1);
+					PX_ASSERT(mCookedData->fabricGPU.arraySizes[0] == arraysize+1);
+
+					ClothingCookedPhysX3ParamNS::FabricGPU_Type fabricGPU;
+					fabricGPU.fabricGPU = fabric;
+					fabricGPU.factory = factory.factory;
+					mCookedData->fabricGPU.buf[arraysize] = fabricGPU;
+				}
+			}
+
+
+			if (simulation.restLengthScale != 1.0f && fabric != NULL)
+			{
+				uint32_t numPhases = phases.size();
+				// NOTE(review): restValueScales is allocated and released without ever
+				// being written or read — looks like dead code left from an older API
+				float* restValueScales = (float*)GetInternalApexSDK()->getTempMemory(numPhases * sizeof(float));
+				(fabric)->scaleRestvalues( simulation.restLengthScale );
+				GetInternalApexSDK()->releaseTempMemory(restValueScales);
+			}
+		}
+
+		// create the cloth object from the (possibly shared) fabric
+		if (fabric != NULL && mCloth == NULL)
+		{
+			PX_ASSERT(mCookedData->deformableInvVertexWeights.arraySizes[0] == (int32_t)mCookedData->numVertices);
+
+			// initial particle state: current skinned position + per-vertex inverse weight
+			Array<PxVec4> startPositions(mCookedData->numVertices);
+			for (uint32_t i = 0; i < mCookedData->numVertices; i++)
+			{
+				startPositions[i] = PxVec4(sdkWritebackPosition[i], mCookedData->deformableInvVertexWeights.buf[i]);
+			}
+
+			const PxVec4* pos = (const PxVec4*)startPositions.begin();
+
+			cloth::Range<const PxVec4> startPos(pos, pos + startPositions.size());
+
+			PX_PROFILE_ZONE("ClothingActorImpl::createCloth", GetInternalApexSDK()->getContextId());
+
+			mCloth = factory.factory->createCloth(startPos, *((cloth::Fabric*)fabric));
+		}
+
+		if (mCloth != NULL)
+		{
+			// setup capsules
+			// only the first 32 capsules (64 sphere indices) are passed to the solver
+			const uint32_t numSupportedCapsules = 32;
+			const uint32_t* collisionIndicesEnd = (mCollisionCapsules.size() > 2 * numSupportedCapsules) ? &mCollisionCapsules[2 * numSupportedCapsules] : mCollisionCapsules.end();
+			cloth::Range<const uint32_t> cIndices(mCollisionCapsules.begin(), collisionIndicesEnd);
+			mCloth->setCapsules(cIndices,0,mCloth->getNumCapsules());
+
+			// setup convexes
+			cloth::Range<const uint32_t> convexes(mCollisionConvexes.begin(), mCollisionConvexes.end());
+			mCloth->setConvexes(convexes,0,mCloth->getNumConvexes());
+
+			// register with the solver under the scene lock
+			mClothingScene->lockScene();
+			mClothingScene->getClothSolver(mUseCuda)->addCloth(mCloth);
+			mClothingScene->unlockScene();
+			mIsStatic = false;
+
+			// add virtual particles
+			const uint32_t numVirtualParticleIndices = (uint32_t)mCookedData->virtualParticleIndices.arraySizes[0];
+			const uint32_t numVirtualParticleWeights = (uint32_t)mCookedData->virtualParticleWeights.arraySizes[0];
+			if (numVirtualParticleIndices > 0)
+			{
+				cloth::Range<const uint32_t[4]> vIndices((const uint32_t(*)[4])(mCookedData->virtualParticleIndices.buf), (const uint32_t(*)[4])(mCookedData->virtualParticleIndices.buf + numVirtualParticleIndices));
+				cloth::Range<const PxVec3> weights((PxVec3*)mCookedData->virtualParticleWeights.buf, (PxVec3*)(mCookedData->virtualParticleWeights.buf + numVirtualParticleWeights));
+				mCloth->setVirtualParticles(vIndices, weights);
+			}
+
+			// optional sparse self collision (module-level setting)
+			const uint32_t numSelfcollisionIndices = (uint32_t)mCookedData->selfCollisionIndices.arraySizes[0];
+			ModuleClothingImpl* module = static_cast<ModuleClothingImpl*>(mClothingScene->getModule());
+			if (module->useSparseSelfCollision() && numSelfcollisionIndices > 0)
+			{
+				cloth::Range<const uint32_t> vIndices(mCookedData->selfCollisionIndices.buf, mCookedData->selfCollisionIndices.buf + numSelfcollisionIndices);
+				mCloth->setSelfCollisionIndices(vIndices);
+			}
+
+			applyCollision();
+
+			mTeleported = true; // need to clear inertia
+		}
+	}
+
+
+	// configure phases
+	mPhaseConfigs.clear();
+
+	// if this is hit, PhaseConfig has changed. check if we need to adapt something below.
+	PX_COMPILE_TIME_ASSERT(sizeof(cloth::PhaseConfig) == 20);
+
+	// one default (full stiffness) config per cooked phase
+	const uint32_t numPhaseDescs = (uint32_t)mCookedData->deformablePhaseDescs.arraySizes[0];
+	for (uint32_t i = 0; i < numPhaseDescs; ++i)
+	{
+		cloth::PhaseConfig phaseConfig;
+		phaseConfig.mPhaseIndex = uint16_t(i);
+		phaseConfig.mStiffness = 1.0f;
+		phaseConfig.mStiffnessMultiplier = 1.0f;
+
+		mPhaseConfigs.pushBack(phaseConfig);
+	}
+
+	if (mCloth != NULL)
+	{
+		cloth::Range<cloth::PhaseConfig> phaseConfig(mPhaseConfigs.begin(), mPhaseConfigs.end());
+		mCloth->setPhaseConfig(phaseConfig);
+	}
+
+	// apply clothing material after phases are set up
+	if (material != NULL)
+	{
+		applyClothingMaterial(material, scaledGravity);
+	}
+
+	physicalMeshId = _physicalMeshId;
+
+	return (mCloth != NULL);
+}
+
+
+/**
+\brief Gathers the asset's collision shapes (capsules, convexes) and initializes the
+       actor collision state, then runs a first updateCollision() pass.
+\details Regular (boneActor) and new (boneSphere) collision volumes are mutually
+         exclusive; if both are present the regular ones are dropped with a warning.
+         Only 32 collision spheres are supported by the solver; excess shapes are
+         tracked in the *Invalid arrays so they can still be debug-rendered.
+*/
+void Simulation::initCollision(tBoneActor* boneActors, uint32_t numBoneActors,
+                               tBoneSphere* boneSpheres, uint32_t numBoneSpheres,
+                               uint16_t* spherePairIndices, uint32_t numSpherePairIndices,
+                               tBonePlane* bonePlanes, uint32_t numBonePlanes,
+                               uint32_t* convexes, uint32_t numConvexes, tBoneEntry* bones,
+                               const PxMat44* boneTransforms,
+                               ResourceList& actorPlanes,
+                               ResourceList& actorConvexes,
+                               ResourceList& actorSpheres,
+                               ResourceList& actorCapsules,
+                               ResourceList& actorTriangleMeshes,
+                               const tActorDescTemplate& /*actorDesc*/, const tShapeDescTemplate& /*shapeDesc*/, float actorScale,
+                               const PxMat44& globalPose, bool localSpaceSim)
+{
+	// these need to be initialized here, because they are read in
+	// updateCollision
+	mLocalSpaceSim = localSpaceSim;
+	setGlobalPose(globalPose); // initialize current frame
+	setGlobalPose(globalPose); // initialize previous frame
+
+	// nothing to collide against: leave all collision arrays empty
+	if (numBoneActors + numBoneSpheres + actorPlanes.getSize() + actorSpheres.getSize() + actorTriangleMeshes.getSize() == 0)
+	{
+		return;
+	}
+
+	if (numBoneActors > 0 && numBoneSpheres > 0)
+	{
+		// ignore case where both exist
+		APEX_INVALID_PARAMETER("This asset contains regular collision volumes and new ones. Having both is not supported, ignoring the regular ones");
+		numBoneActors = 0;
+	}
+
+	mActorScale = actorScale;
+
+	// Note: each capsule will have two spheres at each end, nothing is shared, so the index map is quite trivial so far
+	for (uint32_t i = 0; i < numBoneActors; i++)
+	{
+		if (boneActors[i].convexVerticesCount == 0)
+		{
+			PX_ASSERT(boneActors[i].capsuleRadius > 0.0f);
+			// only the first 32 sphere indices are valid for the solver; the rest
+			// are kept separately so they can be reported/debug-rendered
+			if (mCollisionCapsules.size() < 32)
+			{
+				uint32_t index = mCollisionCapsules.size();
+				mCollisionCapsules.pushBack(index);
+				mCollisionCapsules.pushBack(index + 1);
+			}
+			else
+			{
+				uint32_t index = mCollisionCapsules.size() + mCollisionCapsulesInvalid.size();
+				mCollisionCapsulesInvalid.pushBack(index);
+				mCollisionCapsulesInvalid.pushBack(index + 1);
+			}
+		}
+	}
+
+	// now add the sphere pairs for PhysX3 capsules
+	for (uint32_t i = 0; i < numSpherePairIndices; i += 2)
+	{
+		if (spherePairIndices[i] < 32 && spherePairIndices[i + 1] < 32)
+		{
+			mCollisionCapsules.pushBack(spherePairIndices[i]);
+			mCollisionCapsules.pushBack(spherePairIndices[i + 1]);
+		}
+		else
+		{
+			mCollisionCapsulesInvalid.pushBack(spherePairIndices[i]);
+			mCollisionCapsulesInvalid.pushBack(spherePairIndices[i + 1]);
+		}
+	}
+	// remember how many entries came from the asset (actor shapes are appended later)
+	mNumAssetCapsules = mCollisionCapsules.size();
+	mNumAssetCapsulesInvalid = mCollisionCapsulesInvalid.size();
+
+	// convexes
+	for (uint32_t i = 0; i < numConvexes; ++i)
+	{
+		mCollisionConvexes.pushBack(convexes[i]);
+	}
+	mNumAssetConvexes = mCollisionConvexes.size();
+
+	// notify triangle meshes of initialization
+	for (uint32_t i = 0; i < actorTriangleMeshes.getSize(); ++i)
+	{
+		ClothingTriangleMeshImpl* mesh = (ClothingTriangleMeshImpl*)(actorTriangleMeshes.getResource(i));
+		mesh->setId(-1); // this makes sure that mesh->update does not try read non-existing previous frame data
+	}
+
+	// first full collision update with the data gathered above
+	updateCollision(boneActors, numBoneActors, boneSpheres, numBoneSpheres, bonePlanes, numBonePlanes, bones, boneTransforms,
+	                actorPlanes, actorConvexes, actorSpheres, actorCapsules, actorTriangleMeshes, false);
+
+	if (!mCollisionCapsulesInvalid.empty())
+	{
+		PX_ASSERT(mCollisionSpheres.size() > 32);
+		if (mCollisionSpheres.size() > 32)
+		{
+			APEX_INVALID_PARAMETER("This asset has %d collision volumes, but only 32 are supported. %d will be ignored!", mCollisionSpheres.size(), mCollisionSpheres.size() - 32);
+		}
+	}
+}
+
+
+
+class CollisionCompare
+{
+public:
+ PX_INLINE bool operator()(const ApexResourceInterface* a, const ApexResourceInterface* b) const
+ {
+ ClothingCollisionImpl* collisionA = (ClothingCollisionImpl*)a;
+ ClothingCollisionImpl* collisionB = (ClothingCollisionImpl*)b;
+ return (uint32_t)collisionA->getId() < (uint32_t)collisionB->getId(); // cast to uint32_t so we get -1 at the end
+ }
+};
+
+
+
+void Simulation::updateCollision(tBoneActor* boneActors, uint32_t numBoneActors,
+ tBoneSphere* boneSpheres, uint32_t numBoneSpheres,
+ tBonePlane* bonePlanes, uint32_t numBonePlanes,
+ tBoneEntry* bones, const PxMat44* boneTransforms,
+ ResourceList& actorPlanes,
+ ResourceList& actorConvexes,
+ ResourceList& actorSpheres,
+ ResourceList& actorCapsules,
+ ResourceList& actorTriangleMeshes,
+ bool /*teleport*/)
+{
+ if (numBoneActors > 0 && numBoneSpheres > 0)
+ {
+ // error message already emitted in initCollision
+ numBoneActors = 0;
+ }
+
+ // Note: if we have more than 32 collision spheres, we add them to the array, but we don't pass more than 32 of them to the PxCloth (allows to still debug render them in red)
+
+ const float collisionThickness = simulation.thickness / 2.0f;
+
+ PX_ASSERT(mActorScale != 0.0f);
+
+ if (numBoneActors > 0)
+ {
+ // old style
+ if (mCollisionSpheres.empty())
+ {
+ // resize them the first time
+ uint32_t count = 0;
+ for (uint32_t i = 0; i < numBoneActors; i++)
+ {
+ count += (boneActors[i].convexVerticesCount == 0) ? 2 : 0;
+ }
+ mNumAssetSpheres = count;
+ mCollisionSpheres.resize(count);
+ }
+
+ uint32_t writeIndex = 0;
+ for (uint32_t i = 0; i < numBoneActors; i++)
+ {
+ if (boneActors[i].convexVerticesCount == 0)
+ {
+ PX_ASSERT(boneActors[i].capsuleRadius > 0.0f);
+ if (boneActors[i].capsuleRadius > 0.0f)
+ {
+ const int32_t boneIndex = boneActors[i].boneIndex;
+ PX_ASSERT(boneIndex >= 0);
+ if (boneIndex >= 0)
+ {
+ const PxMat44 boneBindPose = bones[boneIndex].bindPose;
+ const PxMat44& diff = boneTransforms[boneIndex];
+
+ const PxMat44 globalPose = diff * boneBindPose * (PxMat44)boneActors[i].localPose;
+
+ const PxVec3 vertex(0.0f, boneActors[i].capsuleHeight * 0.5f, 0.0f);
+ const float radius = (boneActors[i].capsuleRadius + collisionThickness) * mActorScale;
+ mCollisionSpheres[writeIndex++] = PxVec4(globalPose.transform(vertex), radius);
+ mCollisionSpheres[writeIndex++] = PxVec4(globalPose.transform(-vertex), radius);
+ }
+ }
+ }
+ }
+ PX_ASSERT(writeIndex == mNumAssetSpheres);
+ }
+ else if (numBoneSpheres > 0)
+ {
+ // new style
+
+ // write physx3 bone spheres
+ mNumAssetSpheres = numBoneSpheres;
+ mCollisionSpheres.resize(numBoneSpheres);
+ for (uint32_t i = 0; i < mCollisionSpheres.size(); ++i)
+ {
+ const int32_t boneIndex = boneSpheres[i].boneIndex;
+ PX_ASSERT(boneIndex >= 0);
+
+ const PxMat44 boneBindPose = bones[boneIndex].bindPose;
+ const PxMat44& diff = boneTransforms[boneIndex];
+
+ PxVec3 globalPos = diff.transform(boneBindPose.transform(boneSpheres[i].localPos));
+
+ mCollisionSpheres[i] = PxVec4(globalPos, (boneSpheres[i].radius + collisionThickness) * mActorScale);
+ }
+ }
+
+ // collision spheres from actor
+ if (mReleasedSphereIds.size() > 0)
+ {
+ // make sure the order of id's doesn't change
+ CollisionCompare compare;
+ actorSpheres.sort(compare);
+ }
+ mCollisionSpheres.resize(mNumAssetSpheres + actorSpheres.getSize());
+ for (uint32_t i = 0; i < actorSpheres.getSize(); ++i)
+ {
+ uint32_t sphereId = mNumAssetSpheres + i;
+ ClothingSphereImpl* actorSphere = DYNAMIC_CAST(ClothingSphereImpl*)(actorSpheres.getResource(i));
+ actorSphere->setId((int32_t)sphereId);
+ PxVec3 pos = actorSphere->getPosition();
+ if (mLocalSpaceSim)
+ {
+ pos = mGlobalPoseNormalizedInv.transform(pos);
+ }
+
+ PxVec4 sphere(pos, actorSphere->getRadius());
+ mCollisionSpheres[sphereId] = sphere;
+ }
+
+ // collision capsules from actor
+ mCollisionCapsules.resizeUninitialized(mNumAssetCapsules);
+ mCollisionCapsulesInvalid.resizeUninitialized(mNumAssetCapsulesInvalid);
+ for (uint32_t i = 0; i < actorCapsules.getSize(); ++i)
+ {
+ ClothingCapsuleImpl* actorCapsule = DYNAMIC_CAST(ClothingCapsuleImpl*)(actorCapsules.getResource(i));
+ ClothingSphereImpl** spheres = (ClothingSphereImpl**)actorCapsule->getSpheres();
+ uint32_t s0 = (uint32_t)spheres[0]->getId();
+ uint32_t s1 = (uint32_t)spheres[1]->getId();
+ if (s0 > 32 || s1 > 32)
+ {
+ mCollisionCapsulesInvalid.pushBack(s0);
+ mCollisionCapsulesInvalid.pushBack(s1);
+ }
+ else
+ {
+ mCollisionCapsules.pushBack(s0);
+ mCollisionCapsules.pushBack(s1);
+ }
+ }
+
+
+ // collision planes of convexes
+ mCollisionPlanes.resize(numBonePlanes + actorPlanes.getSize());
+ for (uint32_t i = 0; i < numBonePlanes; ++i)
+ {
+ const int32_t boneIndex = bonePlanes[i].boneIndex;
+ PX_ASSERT(boneIndex >= 0);
+ if (boneIndex >= 0)
+ {
+ const PxMat44 boneBindPose = bones[boneIndex].bindPose;
+ const PxMat44& diff = boneTransforms[boneIndex];
+
+ PxVec3 p = diff.transform(boneBindPose.transform(bonePlanes[i].n * -bonePlanes[i].d));
+ PxVec3 n = diff.rotate(boneBindPose.rotate(bonePlanes[i].n));
+
+ PxPlane skinnedPlane(p, n);
+
+ mCollisionPlanes[i] = PxVec4(skinnedPlane.n, skinnedPlane.d);
+ }
+ }
+
+
+ // collision convexes and planes from actor
+ mCollisionConvexes.resizeUninitialized(mNumAssetConvexes);
+ mCollisionConvexesInvalid.clear();
+
+ // planes
+ if (mReleasedPlaneIds.size() > 0)
+ {
+ // make sure the order of id's doesn't change
+ CollisionCompare compare;
+ actorPlanes.sort(compare);
+ }
+ for (uint32_t i = 0; i < actorPlanes.getSize(); ++i)
+ {
+ uint32_t planeId = (uint32_t)(numBonePlanes + i);
+ ClothingPlaneImpl* actorPlane = DYNAMIC_CAST(ClothingPlaneImpl*)(actorPlanes.getResource(i));
+ actorPlane->setId((int32_t)planeId);
+ PxPlane plane = actorPlane->getPlane();
+ if (mLocalSpaceSim)
+ {
+ PxVec3 p = plane.pointInPlane();
+ plane = PxPlane(mGlobalPoseNormalizedInv.transform(p), mGlobalPoseNormalizedInv.rotate(plane.n));
+ }
+ mCollisionPlanes[planeId] = PxVec4(plane.n, plane.d);
+
+ // create a convex for unreferenced planes (otherwise they don't collide)
+ if (actorPlane->getRefCount() == 0 && planeId <= 32)
+ {
+ mCollisionConvexes.pushBack(1u << planeId);
+ }
+ }
+
+ // convexes
+ for (uint32_t i = 0; i < actorConvexes.getSize(); ++i)
+ {
+ ClothingConvexImpl* convex = DYNAMIC_CAST(ClothingConvexImpl*)(actorConvexes.getResource(i));
+
+ uint32_t convexMask = 0;
+ ClothingPlaneImpl** planes = (ClothingPlaneImpl**)convex->getPlanes();
+ for (uint32_t j = 0; j < convex->getNumPlanes(); ++j)
+ {
+ ClothingPlaneImpl* plane = planes[j];
+ uint32_t planeId = (uint32_t)plane->getId();
+ if (planeId > 32)
+ {
+ convexMask = 0;
+ break;
+ }
+ convexMask |= 1 << planeId;
+ }
+
+ if (convexMask > 0)
+ {
+ mCollisionConvexes.pushBack(convexMask);
+ }
+ else
+ {
+ mCollisionConvexesInvalid.pushBack(convex);
+ }
+ }
+
+ // triangles
+ PX_ASSERT(mCollisionTrianglesOld.empty());
+ nvidia::Array<PxVec3> collisionTrianglesTemp; // mCollisionTriangles is used in update, so we cannot clear it
+ for (uint32_t i = 0; i < actorTriangleMeshes.getSize(); ++i)
+ {
+ ClothingTriangleMeshImpl* mesh = (ClothingTriangleMeshImpl*)(actorTriangleMeshes.getResource(i));
+
+ const PxMat44& pose = mesh->getPose();
+ PxTransform tm(pose);
+ if (mLocalSpaceSim)
+ {
+ tm = PxTransform(mGlobalPoseNormalizedInv) * tm;
+ }
+
+ mesh->update(tm, mCollisionTriangles, mCollisionTrianglesOld, collisionTrianglesTemp);
+ }
+ mCollisionTriangles.swap(collisionTrianglesTemp);
+}
+
+
+
+void Simulation::releaseCollision(ClothingCollisionImpl& collision)
+{
+ ClothingSphereImpl* sphere = DYNAMIC_CAST(ClothingSphereImpl*)(collision.isSphere());
+ if (sphere != NULL)
+ {
+ int32_t id = sphere->getId();
+ if (id != -1)
+ {
+ mReleasedSphereIds.pushBack((uint32_t)id);
+ }
+ return;
+ }
+
+ ClothingPlaneImpl* plane = DYNAMIC_CAST(ClothingPlaneImpl*)(collision.isPlane());
+ if (plane != NULL)
+ {
+ int32_t id = plane->getId();
+ if (id != -1)
+ {
+ mReleasedPlaneIds.pushBack((uint32_t)id);
+ }
+ return;
+ }
+}
+
+
+
// Hook for propagating actor/shape descriptor templates to collision shapes.
// This embedded-cloth backend keeps no such descriptors, so it is a no-op.
void Simulation::updateCollisionDescs(const tActorDescTemplate& /*actorDesc*/, const tShapeDescTemplate& /*shapeDesc*/)
{
}
+
+
+
// Not supported by this simulation backend; asserts if ever called.
void Simulation::disablePhysX(Actor* /*dummy*/)
{
	PX_ASSERT(false);
}
+
+
+
// Not supported by this simulation backend; asserts if ever called.
void Simulation::reenablePhysX(Actor* /*newMaster*/, const PxMat44& /*globalPose*/)
{
	PX_ASSERT(false);
}
+
+
+
+void Simulation::fetchResults(bool computePhysicsMeshNormals)
+{
+ if (mCloth != NULL)
+ {
+ {
+ cloth::Range<PxVec4> particles = mCloth->getCurrentParticles();
+
+ PX_ASSERT(particles.size() == sdkNumDeformableVertices);
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ sdkWritebackPosition[i] = particles[i].getXYZ();
+ PX_ASSERT(sdkWritebackPosition[i].isFinite());
+ }
+ }
+
+ // compute the normals
+ if (computePhysicsMeshNormals)
+ {
+ memset(sdkWritebackNormal, 0, sizeof(PxVec3) * sdkNumDeformableVertices);
+ for (uint32_t i = 0; i < sdkNumDeformableIndices; i += 3)
+ {
+ PxVec3 v1 = sdkWritebackPosition[mIndices[i + 1]] - sdkWritebackPosition[mIndices[i]];
+ PxVec3 v2 = sdkWritebackPosition[mIndices[i + 2]] - sdkWritebackPosition[mIndices[i]];
+ PxVec3 faceNormal = v1.cross(v2);
+
+ for (uint32_t j = 0; j < 3; j++)
+ {
+ sdkWritebackNormal[mIndices[i + j]] += faceNormal;
+ }
+ }
+
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ sdkWritebackNormal[i].normalize();
+ }
+ }
+ }
+ else
+ {
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ sdkWritebackPosition[i] = skinnedPhysicsPositions[i];
+ sdkWritebackNormal[i] = skinnedPhysicsNormals[i];
+ }
+ }
+}
+
+
+
+
// The cloth solver moves particles every step, so the simulation mesh is
// conservatively reported as always dirty (no dirty tracking in this backend).
bool Simulation::isSimulationMeshDirty() const
{
	return true; // always expect something to change
}
+
+
+
// No-op: this backend keeps no dirty flag (see isSimulationMeshDirty).
void Simulation::clearSimulationMeshDirt()
{
}
+
+
+
+void Simulation::setStatic(bool on)
+{
+ if (on)
+ {
+ if (mIsStatic && !mCloth->isAsleep())
+ {
+ APEX_INTERNAL_ERROR("Cloth has not stayed static. Something must have woken it up.");
+ }
+ mCloth->putToSleep();
+ }
+ else
+ {
+ mCloth->wakeUp();
+ }
+ mIsStatic = on;
+}
+
+
+
// Pressure is not supported by this backend; returns false so callers can
// detect the missing feature.
bool Simulation::applyPressure(float /*pressure*/)
{
	return false;
}
+
+
+
+bool Simulation::raycast(const PxVec3& rayOrigin, const PxVec3& rayDirection, float& _hitTime, PxVec3& _hitNormal, uint32_t& _vertexIndex)
+{
+ const uint32_t numIndices = sdkNumDeformableIndices;
+ float hitTime = PX_MAX_F32;
+ uint32_t hitIndex = 0xffffffff;
+ uint32_t hitVertexIndex = 0;
+ for (uint32_t i = 0; i < numIndices; i += 3)
+ {
+ float t = 0, u = 0, v = 0;
+
+ if (APEX_RayTriangleIntersect(rayOrigin, rayDirection,
+ sdkWritebackPosition[mIndices[i + 0]],
+ sdkWritebackPosition[mIndices[i + 1]],
+ sdkWritebackPosition[mIndices[i + 2]],
+ t, u, v))
+ {
+ if (t < hitTime)
+ {
+ hitTime = t;
+ hitIndex = i;
+ float w = 1 - u - v;
+ if (w >= u && w >= v)
+ {
+ hitVertexIndex = mIndices[i];
+ }
+ else if (u > w && u >= v)
+ {
+ hitVertexIndex = mIndices[i + 1];
+ }
+ else
+ {
+ hitVertexIndex = mIndices[i + 2];
+ }
+ }
+ }
+ }
+
+ if (hitIndex != 0xffffffff)
+ {
+ _hitTime = hitTime;
+ _hitNormal = PxVec3(0.0f, 1.0f, 0.0f);
+ _vertexIndex = hitVertexIndex;
+ return true;
+ }
+
+ return false;
+}
+
+
+
+void Simulation::attachVertexToGlobalPosition(uint32_t vertexIndex, const PxVec3& globalPosition)
+{
+ if (mCloth == NULL)
+ {
+ return;
+ }
+
+ cloth::Range<PxVec4> curParticles = mCloth->getCurrentParticles();
+ cloth::Range<PxVec4> prevParticles = mCloth->getPreviousParticles();
+
+ PX_ASSERT(vertexIndex < curParticles.size());
+ PX_ASSERT(vertexIndex < prevParticles.size());
+
+ // the .w component contains inverse mass of the vertex
+ // the solver needs it set on both current and previous
+ // (current contains an adjusted mass, scaled or zeroed by distance constraints)
+ curParticles[vertexIndex] = PxVec4(globalPosition, 0.0f);
+ prevParticles[vertexIndex].w = 0;
+}
+
+
+
+void Simulation::freeVertex(uint32_t vertexIndex)
+{
+ if (mCloth == NULL)
+ {
+ return;
+ }
+
+ const float weight = mCookedData->deformableInvVertexWeights.buf[vertexIndex];
+
+ cloth::Range<PxVec4> curParticles = mCloth->getPreviousParticles();
+ cloth::Range<PxVec4> prevParticles = mCloth->getPreviousParticles();
+
+ PX_ASSERT(vertexIndex < curParticles.size());
+ PX_ASSERT(vertexIndex < prevParticles.size());
+
+ // the .w component contains inverse mass of the vertex
+ // the solver needs it set on both current and previous
+ // (current contains an adjusted mass, scaled or zeroed by distance constraints)
+ curParticles[vertexIndex].w = weight;
+ prevParticles[vertexIndex].w = weight;
+}
+
+
+
+void Simulation::setGlobalPose(const PxMat44& globalPose)
+{
+ mGlobalPosePrevious = mGlobalPose;
+ mGlobalPose = mGlobalPoseNormalized = globalPose;
+
+ mGlobalPoseNormalized.column0.normalize();
+ mGlobalPoseNormalized.column1.normalize();
+ mGlobalPoseNormalized.column2.normalize();
+
+ mGlobalPoseNormalizedInv = mGlobalPoseNormalized.inverseRT();
+
+ mTeleported = false;
+}
+
+
+
+void Simulation::applyGlobalPose()
+{
+ if (mCloth == NULL || mIsStatic)
+ {
+ return;
+ }
+
+ PxTransform pose = mLocalSpaceSim ? PxTransform(mGlobalPoseNormalized) : PxTransform(PxIdentity);
+
+ mCloth->setTranslation(pose.p);
+ mCloth->setRotation(pose.q);
+
+ if (mTeleported)
+ {
+ mCloth->clearInertia();
+ }
+}
+
+
+
// This backend exposes no cooked data through the generic interface; always NULL.
NvParameterized::Interface* Simulation::getCookedData()
{
	return NULL;
}
+
+
+
// Records the substep size for later use; no validation is performed here.
void Simulation::verifyTimeStep(float substepSize)
{
	mLastTimestep = substepSize;
}
+
+
+#ifndef WITHOUT_DEBUG_VISUALIZE
// Debug-renders the valid collision convexes: each convex is a 32-bit mask of
// plane indices into mCollisionPlanes; ConvexMeshBuilder triangulates the
// intersection volume which is then drawn triangle by triangle.
void Simulation::visualizeConvexes(RenderDebugInterface& renderDebug)
{
	if(mCloth != NULL && mCollisionConvexes.size() > 0)
	{
		ConvexMeshBuilder builder(&mCollisionPlanes[0]);


		// scale the generated mesh relative to the cloth bounds
		float scale = mCloth->getBoundingBoxScale().maxElement();

		for(uint32_t i=0; i<mCollisionConvexes.size(); ++i)
		{
			builder(mCollisionConvexes[i], scale);
		}

		for (uint32_t i = 0; i < builder.mIndices.size(); i += 3)
		{
			RENDER_DEBUG_IFACE(&renderDebug)->debugTri(builder.mVertices[builder.mIndices[i]], builder.mVertices[builder.mIndices[i+1]], builder.mVertices[builder.mIndices[i+2]]);
		}
	}
}
+
+
+
// Debug-renders convexes that could not be passed to the solver (they reference
// plane ids outside the supported range). A convex hull is rebuilt from the
// plane set and drawn as wireframe edges; if hull construction yields no edges
// the individual planes are drawn instead.
void Simulation::visualizeConvexesInvalid(RenderDebugInterface& renderDebug)
{
	// this is rather slow and unprecise
	for (uint32_t i = 0; i < mCollisionConvexesInvalid.size(); ++i)
	{
		ClothingConvexImpl* convex = mCollisionConvexesInvalid[i];
		ClothingPlaneImpl** convexPlanes = (ClothingPlaneImpl**)convex->getPlanes();
		ConvexHullImpl hull;
		hull.init();
		Array<PxPlane> planes;
		for (uint32_t j = 0; j < convex->getNumPlanes(); ++j)
		{
			PxPlane plane = convexPlanes[j]->getPlane();
			if (mLocalSpaceSim)
			{
				// transform the plane into simulation (local) space
				PxVec3 p = plane.pointInPlane();
				plane = PxPlane(mGlobalPoseNormalizedInv.transform(p), mGlobalPoseNormalizedInv.rotate(plane.n));
			}
			planes.pushBack(plane);
		}

		hull.buildFromPlanes(planes.begin(), planes.size(), 0.1f);

		// TODO render triangles (or polygons)
		for (uint32_t j = 0; j < hull.getEdgeCount(); j++)
		{
			RENDER_DEBUG_IFACE(&renderDebug)->debugLine(hull.getVertex(hull.getEdgeEndpointIndex(j, 0)), hull.getVertex(hull.getEdgeEndpointIndex(j, 1)));
		}

		if (hull.getEdgeCount() == 0)
		{
			// hull construction failed (e.g. unbounded volume): draw the raw planes
			float planeSize = mCloth ? mCloth->getBoundingBoxScale().maxElement() * 0.3f : 1.0f;
			for (uint32_t j = 0; j < planes.size(); ++j)
			{
				RENDER_DEBUG_IFACE(&renderDebug)->debugPlane(PxPlane(planes[j].n, planes[j].d), planeSize, planeSize);
			}
		}
	}
}
+#endif
+
+
+
// Main debug-visualization entry point. Depending on the flags in
// clothingDebugParams this draws: collision shapes (spheres/capsules/convexes/
// triangles), the fabric's constraint fibers per phase type, tether constraints,
// mass-scale deltas, virtual collision particles, self-collision samples and
// self-collision attenuation pairs. Pure rendering; no simulation state changes
// except the lazily built attenuation cache (createAttenuationData).
void Simulation::visualize(RenderDebugInterface& renderDebug, ClothingDebugRenderParams& clothingDebugParams)
{
#ifdef WITHOUT_DEBUG_VISUALIZE
	PX_UNUSED(renderDebug);
	PX_UNUSED(clothingDebugParams);
#else
	if (!clothingDebugParams.Actors)
	{
		return;
	}

	using RENDER_DEBUG::DebugColors;
	using RENDER_DEBUG::DebugRenderState;
	const PxMat44 globalPose = *RENDER_DEBUG_IFACE(&renderDebug)->getPoseTyped();

	// --- collision shapes (solid and/or wireframe) ---
	if (clothingDebugParams.CollisionShapes || clothingDebugParams.CollisionShapesWire)
	{
		RENDER_DEBUG_IFACE(&renderDebug)->pushRenderState();

		// Wireframe only when solid is not set, when both are on, just do the solid thing
		if (!clothingDebugParams.CollisionShapes)
		{
			RENDER_DEBUG_IFACE(&renderDebug)->removeFromCurrentState(DebugRenderState::SolidShaded);
			RENDER_DEBUG_IFACE(&renderDebug)->removeFromCurrentState(DebugRenderState::SolidWireShaded);
		}
		else
		{
			RENDER_DEBUG_IFACE(&renderDebug)->addToCurrentState(DebugRenderState::SolidShaded);
			RENDER_DEBUG_IFACE(&renderDebug)->removeFromCurrentState(DebugRenderState::SolidWireShaded);
		}

		const uint32_t colorLightGray = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::LightGray);
		const uint32_t colorGray = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Gray);
		const uint32_t colorRed = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Red);

		RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorLightGray);

		// track which spheres belong to a capsule, so lone spheres can be drawn separately
		PX_ALLOCA(usedSpheres, bool, mCollisionSpheres.size());
		for (uint32_t i = 0; i < mCollisionSpheres.size(); i++)
		{
			usedSpheres[i] = false;
		}

		// valid capsules first, then the invalid ones (drawn in red)
		const uint32_t numIndices1 = mCollisionCapsules.size();
		const uint32_t numIndices2 = mCollisionCapsulesInvalid.size();
		const uint32_t numIndices = numIndices2 + numIndices1;
		for (uint32_t i = 0; i < numIndices; i += 2)
		{
			const bool valid = i < numIndices1;
			const uint32_t index1 = valid ? mCollisionCapsules[i + 0] : mCollisionCapsulesInvalid[i + 0 - numIndices1];
			const uint32_t index2 = valid ? mCollisionCapsules[i + 1] : mCollisionCapsulesInvalid[i + 1 - numIndices1];

			RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(valid ? colorLightGray : colorRed);

			PxVec3 pos1 = mCollisionSpheres[index1].getXYZ();
			PxVec3 pos2 = mCollisionSpheres[index2].getXYZ();

			PxVec3 capsuleAxis = pos1 - pos2;
			const float axisHeight = capsuleAxis.normalize();

			PxMat44 capsulePose;
			{
				// construct matrix from this
				const PxVec3 capsuleDefaultAxis(0.0f, 1.0f, 0.0f);
				PxVec3 axis = capsuleDefaultAxis.cross(capsuleAxis).getNormalized();
				const float angle = PxAcos(capsuleDefaultAxis.dot(capsuleAxis));
				// degenerate rotation (parallel/anti-parallel axes): fall back to the y axis
				if (angle < 0.001f || angle + 0.001 > PxPi || axis.isZero())
				{
					axis = PxVec3(0.0f, 1.0f, 0.0f);
				}
				PxQuat q(angle, axis);
				capsulePose = PxMat44(q);
				capsulePose.setPosition((pos1 + pos2) * 0.5f);
			}

			const float radius1 = mCollisionSpheres[index1].w;
			const float radius2 = mCollisionSpheres[index2].w;

			RENDER_DEBUG_IFACE(&renderDebug)->setPose(globalPose * capsulePose);
			RENDER_DEBUG_IFACE(&renderDebug)->debugCapsuleTapered(radius1, radius2, axisHeight, 2);

			usedSpheres[index1] = true;
			usedSpheres[index2] = true;
		}

		// spheres not referenced by any capsule; red when beyond the 32-shape solver limit
		for (uint32_t i = 0; i < mCollisionSpheres.size(); i++)
		{
			if (!usedSpheres[i])
			{
				RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(i < 32 ? colorGray : colorRed);
				RENDER_DEBUG_IFACE(&renderDebug)->debugSphere(mCollisionSpheres[i].getXYZ(), mCollisionSpheres[i].w);
			}
		}
		RENDER_DEBUG_IFACE(&renderDebug)->setPose(globalPose);
		RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorLightGray);
		visualizeConvexes(renderDebug);
		RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorRed);
		visualizeConvexesInvalid(renderDebug);

		// collision triangles
		PX_ASSERT(mCollisionTriangles.size() % 3 == 0);
		uint32_t numTriangleVertsInCloth = mCloth ? 3*mCloth->getNumTriangles() : mCollisionTriangles.size();
		for (uint32_t i = 0; i < mCollisionTriangles.size(); i += 3)
		{
			if (i < numTriangleVertsInCloth)
			{
				// only 500 triangles simulated in cuda
				RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorLightGray);
			}
			else
			{
				RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorRed);
			}
			RENDER_DEBUG_IFACE(&renderDebug)->debugTri(mCollisionTriangles[i + 0], mCollisionTriangles[i + 1], mCollisionTriangles[i + 2]);
		}

		RENDER_DEBUG_IFACE(&renderDebug)->popRenderState();
	}

	// --- fabric constraint fibers, filtered by phase type ---
	if (clothingDebugParams.LengthFibers ||
	        clothingDebugParams.CrossSectionFibers ||
	        clothingDebugParams.BendingFibers ||
	        clothingDebugParams.ShearingFibers)
	{
		const uint32_t colorGreen = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Green);
		const uint32_t colorRed = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Red);

		for (uint32_t pc = 0; pc < mPhaseConfigs.size(); ++pc)
		{
			const uint32_t phaseIndex = mPhaseConfigs[pc].mPhaseIndex;
			PX_ASSERT(phaseIndex < (uint32_t)mCookedData->deformablePhaseDescs.arraySizes[0]);

			const uint32_t setIndex = mCookedData->deformablePhaseDescs.buf[phaseIndex].setIndex;

			const PxClothFabricPhaseType::Enum type = (PxClothFabricPhaseType::Enum)mCookedData->deformablePhaseDescs.buf[phaseIndex].phaseType;

			float stretchRangeMultiplier = mPhaseConfigs[pc].mStretchLimit;
			float compressionRangeMultiplier = mPhaseConfigs[pc].mCompressionLimit;

			// encode the stiffness multiplier as gray brightness (0..255)
			float stiffnessScale = mPhaseConfigs[pc].mStiffnessMultiplier;
			uint8_t brightness = (uint8_t)(64 * stiffnessScale + 64);
			if (stiffnessScale == 1.f)
			{
				brightness = 255;
			}
			else if (stiffnessScale == 0.f)
			{
				brightness = 0;
			}
			uint32_t rangeColor = uint32_t(brightness | (brightness << 8) | (brightness << 16));
			uint32_t stretchRangeColor = rangeColor;
			uint32_t compressionRangeColor = rangeColor;
			if (stretchRangeMultiplier > 1.f)
			{
				// red
				rangeColor |= 0xFF << 16;
			}
			else if (compressionRangeMultiplier < 1.f)
			{
				// blue
				rangeColor |= 0xFF << 0;
			}
			if (stiffnessScale == 1)
			{
				rangeColor = 0xFFFFFF;
			}

			// does this phase match any of the requested fiber categories?
			bool ok = false;
			ok |= clothingDebugParams.LengthFibers && type == PxClothFabricPhaseType::eVERTICAL;
			ok |= clothingDebugParams.CrossSectionFibers && type == PxClothFabricPhaseType::eHORIZONTAL;
			ok |= clothingDebugParams.BendingFibers && type == PxClothFabricPhaseType::eBENDING;
			ok |= clothingDebugParams.ShearingFibers && type == PxClothFabricPhaseType::eSHEARING;

			if (ok)
			{
				const uint32_t fromIndex = setIndex ? mCookedData->deformableSets.buf[setIndex - 1].fiberEnd : 0;
				const uint32_t toIndex = mCookedData->deformableSets.buf[setIndex].fiberEnd;

				if ((int32_t)toIndex > mCookedData->deformableIndices.arraySizes[0])
				{
					break;
				}

				for (uint32_t f = fromIndex; f < toIndex; ++f)
				{
					uint32_t posIndex1 = mCookedData->deformableIndices.buf[2 * f];
					uint32_t posIndex2 = mCookedData->deformableIndices.buf[2 * f + 1];

					PX_ASSERT((int32_t)posIndex2 <= mCookedData->deformableIndices.arraySizes[0]);
					PX_ASSERT(mCookedData->deformableIndices.buf[posIndex1] < sdkNumDeformableVertices);

					PxVec3 pos1 = sdkWritebackPosition[posIndex1];
					PxVec3 pos2 = sdkWritebackPosition[posIndex2];

					// e1/e2 mark the rest length centered on the fiber's midpoint
					const float restLength = mCookedData->deformableRestLengths.buf[f] * simulation.restLengthScale;
					PxVec3 dir = pos2 - pos1;
					PxVec3 middle = pos1 + 0.5f * dir;
					const float simLength = dir.normalize();
					PxVec3 edge = dir * restLength;
					PxVec3 e1 = middle - 0.5f * edge;
					PxVec3 e2 = middle + 0.5f * edge;

					if (clothingDebugParams.FiberRange && type != PxClothFabricPhaseType::eBENDING)
					{
						// draw the allowed stretch/compression range around the rest length
						PxVec3 stretchRangeOffset = edge;
						PxVec3 compressionRangeOffset = edge;

						if (stretchRangeMultiplier > 1.f)
						{
							stretchRangeOffset *= 0.5f * (1.0f - stretchRangeMultiplier);

							RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(stretchRangeColor);
							RENDER_DEBUG_IFACE(&renderDebug)->debugLine(e1, e1 + stretchRangeOffset);
							RENDER_DEBUG_IFACE(&renderDebug)->debugLine(e2, e2 - stretchRangeOffset);
						}

						if (compressionRangeMultiplier < 1.f)
						{
							compressionRangeOffset *= 0.5f * (1.0f - compressionRangeMultiplier);

							RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(compressionRangeColor);
							RENDER_DEBUG_IFACE(&renderDebug)->debugLine(e1, e1 + compressionRangeOffset);
							RENDER_DEBUG_IFACE(&renderDebug)->debugLine(e2, e2 - compressionRangeOffset);
						}

						RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(0xFFFFFFFF);
						RENDER_DEBUG_IFACE(&renderDebug)->debugPoint(pos1, 0.01f);
						RENDER_DEBUG_IFACE(&renderDebug)->debugPoint(pos2, 0.01f);
						if (compressionRangeMultiplier < 1.0f)
						{
							RENDER_DEBUG_IFACE(&renderDebug)->debugLine(e1 + compressionRangeOffset, e2 - compressionRangeOffset);
						}
						else
						{
							RENDER_DEBUG_IFACE(&renderDebug)->debugLine(e1, e2);
						}
					}
					else
					{
						// green when at/below rest length, red overhang when stretched
						if (simLength < restLength || type == PxClothFabricPhaseType::eBENDING)
						{
							RENDER_DEBUG_IFACE(&renderDebug)->debugGradientLine(pos1, pos2, colorGreen, colorGreen);
						}
						else
						{
							RENDER_DEBUG_IFACE(&renderDebug)->debugGradientLine(pos1, e1, colorRed, colorRed);
							RENDER_DEBUG_IFACE(&renderDebug)->debugGradientLine(e1, e2, colorGreen, colorGreen);
							RENDER_DEBUG_IFACE(&renderDebug)->debugGradientLine(e2, pos2, colorRed, colorRed);
						}
					}
				}
			}
		}
	}

	// --- tether constraints (dark blue = slack, light blue = taut, red = over limit) ---
	if (clothingDebugParams.TethersActive || clothingDebugParams.TethersInactive)
	{
		const uint32_t colorDarkBlue = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Blue);
		const uint32_t colorLightBlue = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::LightBlue);
		const uint32_t colorGreen = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Green);
		const uint32_t colorRed = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Red);

		const uint32_t numTetherAnchors = (uint32_t)mCookedData->tetherAnchors.arraySizes[0];
		for (uint32_t i = 0; i < numTetherAnchors; ++i)
		{
			uint32_t anchorIndex = mCookedData->tetherAnchors.buf[i];
			PX_ASSERT(anchorIndex < sdkNumDeformableVertices);
			const PxVec3 p1 = sdkWritebackPosition[anchorIndex];
			// NOTE(review): tether i pairs anchor i with vertex (i % numVertices);
			// presumably tethers are stored in vertex-major batches — verify against cooker
			const PxVec3 p2 = sdkWritebackPosition[i % sdkNumDeformableVertices];
			PxVec3 dir = p2 - p1;
			const float d = dir.normalize();
			const float tetherLength = mCookedData->tetherLengths.buf[i];

			if (d < tetherLength)
			{
				if (d < tetherLength * 0.99)
				{
					if (clothingDebugParams.TethersInactive)
					{
						RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorDarkBlue);
						RENDER_DEBUG_IFACE(&renderDebug)->debugLine(p1, p2);
					}
				}
				else if (clothingDebugParams.TethersActive)
				{
					RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorLightBlue);
					RENDER_DEBUG_IFACE(&renderDebug)->debugLine(p1, p2);
				}
			}
			else if (clothingDebugParams.TethersActive)
			{
				const PxVec3 p = p1 + tetherLength * dir;
				RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorLightBlue);
				RENDER_DEBUG_IFACE(&renderDebug)->debugLine(p1, p);
				RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorGreen);
				const PxVec3 p_ = p1 + dir * PxMin(tetherLength * mTetherLimit, d);
				RENDER_DEBUG_IFACE(&renderDebug)->debugLine(p, p_);

				if (d > tetherLength * mTetherLimit)
				{
					RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorRed);
					RENDER_DEBUG_IFACE(&renderDebug)->debugLine(p_, p2);
				}
			}
		}
	}

	// --- collision mass scale: points where cur/prev inverse mass differ ---
	if (clothingDebugParams.MassScale && mCloth != NULL && mCloth->getCollisionMassScale() > 0.0f)
	{
		cloth::Range<const PxVec4> curParticles = mCloth->getCurrentParticles();
		cloth::Range<const PxVec4> prevParticles = mCloth->getPreviousParticles();

		uint32_t colorRed = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Red);

		RENDER_DEBUG_IFACE(&renderDebug)->pushRenderState();
		RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(colorRed);

		// draw a point anywhere the mass difference between cur and prev is non-zero
		for (uint32_t i = 0; i < curParticles.size(); ++i)
		{
			float curInvMass = curParticles[i][3];
			float prevInvMass = prevParticles[i][3];
			float massDelta = curInvMass - prevInvMass;

			// ignore prevInvMass of 0.0f because it is probably a motion constraint
			if (massDelta > 0.0f && prevInvMass > 0.0f)
			{
				RENDER_DEBUG_IFACE(&renderDebug)->debugPoint(PxVec3(curParticles[i][0], curParticles[i][1], curParticles[i][2]), massDelta * 10.0f);
			}
		}

		RENDER_DEBUG_IFACE(&renderDebug)->popRenderState();
	}

	// --- virtual collision particles: barycentric point plus weighted spokes ---
	if (clothingDebugParams.VirtualCollision)
	{
		uint32_t colorParticle = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::Gold);
		uint32_t colorVertex = RENDER_DEBUG_IFACE(&renderDebug)->getDebugColor(DebugColors::White);

		// layout: 3 vertex indices + 1 weight-table index per virtual particle
		const uint32_t numVirtualParticleIndices = (uint32_t)mCookedData->virtualParticleIndices.arraySizes[0];
		for (uint32_t i = 0; i < numVirtualParticleIndices; i += 4)
		{
			const PxVec3 positions[3] =
			{
				sdkWritebackPosition[mCookedData->virtualParticleIndices.buf[i + 0]],
				sdkWritebackPosition[mCookedData->virtualParticleIndices.buf[i + 1]],
				sdkWritebackPosition[mCookedData->virtualParticleIndices.buf[i + 2]],
			};

			const uint32_t weightIndex = mCookedData->virtualParticleIndices.buf[i + 3];

			PxVec3 particlePos(0.0f);

			uint32_t colors[3] =
			{
				colorVertex,
				colorVertex,
				colorVertex,
			};

			for (uint32_t j = 0; j < 3; j++)
			{
				const float weight = mCookedData->virtualParticleWeights.buf[3 * weightIndex + j];
				particlePos += weight * positions[j];

				// dim each corner color by its barycentric weight
				uint8_t* colorParts = (uint8_t*)(colors + j);
				for (uint32_t k = 0; k < 4; k++)
				{
					colorParts[k] = (uint8_t)(weight * colorParts[k]);
				}
			}

			for (uint32_t j = 0; j < 3; j++)
			{
				RENDER_DEBUG_IFACE(&renderDebug)->debugGradientLine(particlePos, positions[j], colorParticle, colors[j]);
			}
		}
	}

	// --- sparse self-collision sample points ---
	ModuleClothingImpl* module = static_cast<ModuleClothingImpl*>(mClothingScene->getModule());
	if (clothingDebugParams.SelfCollision && module->useSparseSelfCollision())
	{
		RENDER_DEBUG_IFACE(&renderDebug)->pushRenderState();
		RENDER_DEBUG_IFACE(&renderDebug)->addToCurrentState(DebugRenderState::SolidShaded);

		const PxVec3* const positions = sdkWritebackPosition;
		uint32_t* indices = mCookedData->selfCollisionIndices.buf;
		uint32_t numIndices = (uint32_t)mCookedData->selfCollisionIndices.arraySizes[0];

		PxMat44 pose = PxMat44(PxIdentity);
		for (uint32_t i = 0; i < numIndices; ++i)
		{
			uint32_t index = indices[i];
			pose.setPosition(positions[index]);
			RENDER_DEBUG_IFACE(&renderDebug)->debugSphere(pose.getPosition(), 0.5f * mCloth->getSelfCollisionDistance());
		}
		RENDER_DEBUG_IFACE(&renderDebug)->popRenderState();
	}

	// --- self-collision attenuation pairs (built lazily, see createAttenuationData) ---
	if (clothingDebugParams.SelfCollisionAttenuation > 0.0f)
	{
		createAttenuationData();

		RENDER_DEBUG_IFACE(&renderDebug)->pushRenderState();

		for (uint32_t i = 0; i < mSelfCollisionAttenuationPairs.size(); i += 2)
		{
			float val = mSelfCollisionAttenuationValues[i/2];
			PX_ASSERT(val <= 1.0f);

			if (val > clothingDebugParams.SelfCollisionAttenuation)
				continue;

			// blue for fully ignored pairs, gray ramp for attenuated ones
			uint8_t c = (uint8_t)(UINT8_MAX * val);
			if (val == 0.0f)
			{
				RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(0x000000FF);
			}
			else
			{
				RENDER_DEBUG_IFACE(&renderDebug)->setCurrentColor(uint32_t(c << 16 | c << 8 | c));
			}
			PxVec3 p0 = sdkWritebackPosition[mSelfCollisionAttenuationPairs[i]];
			PxVec3 p1 = sdkWritebackPosition[mSelfCollisionAttenuationPairs[i+1]];
			RENDER_DEBUG_IFACE(&renderDebug)->debugLine(p0, p1);
		}

		RENDER_DEBUG_IFACE(&renderDebug)->popRenderState();
	}
#endif // WITHOUT_DEBUG_VISUALIZE
}
+
+
+#ifndef WITHOUT_DEBUG_VISUALIZE
+// solver logic is replicated here
+void Simulation::createAttenuationData()
+{
+ if (mSelfCollisionAttenuationPairs.size() > 0)
+ return;
+
+ PX_ASSERT(mSelfCollisionAttenuationValues.size() == 0);
+
+ float collD2 = mCloth->getSelfCollisionDistance();
+ collD2 = collD2 * collD2;
+
+ // it's just debug rendering, n^2 probably won't hurt us here
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; ++i)
+ {
+ for (uint32_t j = i+1; j < sdkNumDeformableVertices; ++j)
+ {
+ float restD2 = (mRestPositions[j] - mRestPositions[i]).magnitudeSquared();
+ if (restD2 < collD2)
+ {
+ // closer than rest distance. pair is ignored by selfcollision
+ mSelfCollisionAttenuationPairs.pushBack(PxMin(i,j));
+ mSelfCollisionAttenuationPairs.pushBack(PxMax(i,j));
+ mSelfCollisionAttenuationValues.pushBack(0.0f);
+ }
+ else if(restD2 < 4*collD2)
+ {
+ // within the doubled rest distance. selfcollision stiffness is attenuated
+ mSelfCollisionAttenuationPairs.pushBack(PxMin(i,j));
+ mSelfCollisionAttenuationPairs.pushBack(PxMax(i,j));
+
+ float ratio = sqrtf(restD2 / collD2) - 1.0f;
+ mSelfCollisionAttenuationValues.pushBack(ratio);
+ }
+ }
+ }
+}
+#endif
+
+
+
+#ifndef WITHOUT_PVD
// Streams the simulated mesh to the PhysX Visual Debugger as yellow triangles.
// For local-space simulation the positions are transformed back into world
// space with mGlobalPose before drawing. Uses SDK temp memory for the
// triangle scratch buffer (allocated and released within this call).
void Simulation::updatePvd(pvdsdk::PvdDataStream& /*pvdStream*/, pvdsdk::PvdUserRenderer& pvdRenderer, ApexResourceInterface* clothingActor, bool localSpaceSim)
{
	// update rendering
	pvdRenderer.setInstanceId(clothingActor);

	PX_ASSERT(sdkNumDeformableIndices%3 == 0);
	uint32_t numTriangles = sdkNumDeformableIndices/3;
	pvdsdk::PvdDebugTriangle* pvdTriangles = (pvdsdk::PvdDebugTriangle*)GetInternalApexSDK()->getTempMemory(numTriangles * sizeof(pvdsdk::PvdDebugTriangle));
	uint32_t color = (uint32_t)RENDER_DEBUG::DebugColors::Yellow;
	for (uint32_t i = 0; i < numTriangles; ++i)
	{
		if (localSpaceSim)
		{
			pvdTriangles[i].pos0 = mGlobalPose.transform(sdkWritebackPosition[mIndices[3*i+0]]);
			pvdTriangles[i].pos1 = mGlobalPose.transform(sdkWritebackPosition[mIndices[3*i+1]]);
			pvdTriangles[i].pos2 = mGlobalPose.transform(sdkWritebackPosition[mIndices[3*i+2]]);
		}
		else
		{
			pvdTriangles[i].pos0 = sdkWritebackPosition[mIndices[3*i+0]];
			pvdTriangles[i].pos1 = sdkWritebackPosition[mIndices[3*i+1]];
			pvdTriangles[i].pos2 = sdkWritebackPosition[mIndices[3*i+2]];
		}

		pvdTriangles[i].color0 = color;
		pvdTriangles[i].color1 = color;
		pvdTriangles[i].color2 = color;
	}
	pvdRenderer.drawTriangles(pvdTriangles, numTriangles);
	GetInternalApexSDK()->releaseTempMemory(pvdTriangles);
}
+#endif
+
+
+
// GPU shared-memory classification is not implemented for this backend;
// the former CUDA query is kept below for reference but disabled.
GpuSimMemType::Enum Simulation::getGpuSimMemType() const
{
	// should we remove this?
	/*
	if (mUseCuda && mCloth != NULL)
	{
		cloth::Solver* solver = mClothingScene->getClothSolver(mUseCuda);
		if (solver != NULL)
		{
			uint32_t numSharedPos = solver->getNumSharedPositions(mCloth);
			GpuSimMemType::Enum type = (GpuSimMemType::Enum)numSharedPos;
			return type;
		}
	}
	*/
	return GpuSimMemType::UNDEFINED;
}
+
+
+
+void Simulation::setPositions(PxVec3* /*positions*/)
+{
+ PX_ALWAYS_ASSERT();
+ // not necessary for now, maybe when supporting physics LOD
+}
+
+
+
+void Simulation::setConstrainCoefficients(const tConstrainCoeffs* assetCoeffs, float maxDistanceBias, float maxDistanceScale, float /*maxDistanceDeform*/, float /*actorScale*/)
+{
+ if (mCloth == NULL)
+ {
+ return;
+ }
+
+ // Note: the spherical constraint distances are only computed here. They get set in the updateConstrainPositions method
+ // The reason for this is that it doesn't behave well when being set twice. Also skinnedPhysicsPositions are not
+ // always initialized when this method is called!
+
+ PX_ASSERT(mConstrainCoeffs == NULL || mConstrainCoeffs == assetCoeffs);
+ mConstrainCoeffs = assetCoeffs;
+
+ // Note: maxDistanceScale already has actorScale included...
+
+ mMotionConstrainBias = -maxDistanceBias;
+ mMotionConstrainScale = maxDistanceScale;
+
+ if (mNumBackstopConstraints == -1)
+ {
+ mNumBackstopConstraints = 0;
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ mNumBackstopConstraints += assetCoeffs[i].collisionSphereRadius > 0.0f ? 1 : 0;
+ }
+ }
+
+ if (mConstrainConstants.size() != sdkNumDeformableVertices)
+ {
+ mConstrainConstants.resize(sdkNumDeformableVertices, ConstrainConstants());
+
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ mConstrainConstants[i].motionConstrainDistance = PxMax(0.0f, assetCoeffs[i].maxDistance);
+ mConstrainConstants[i].backstopDistance = PxMax(0.0f, assetCoeffs[i].collisionSphereDistance) + assetCoeffs[i].collisionSphereRadius;
+ mConstrainConstants[i].backstopRadius = assetCoeffs[i].collisionSphereRadius;
+ }
+
+ mConstrainConstantsDirty = true;
+ }
+}
+
+
+
+void Simulation::getVelocities(PxVec3* velocities) const
+{
+ if (mCloth == NULL)
+ {
+ return;
+ }
+
+ PX_PROFILE_ZONE("SimulationPxCloth::getVelocities", GetInternalApexSDK()->getContextId());
+
+ PX_ALIGN(16, PxMat44 oldFrameDiff) = PxMat44(PxIdentity);
+ bool useOldFrame = false;
+ if (mGlobalPose != mGlobalPosePrevious && mLocalSpaceSim && mLastTimestep > 0.0f)
+ {
+ oldFrameDiff = mGlobalPosePrevious;
+ oldFrameDiff.column0.normalize();
+ oldFrameDiff.column1.normalize();
+ oldFrameDiff.column2.normalize();
+ const float w = mCloth->getPreviousIterationDt() / mLastTimestep;
+ oldFrameDiff = interpolateMatrix(w, oldFrameDiff, mGlobalPoseNormalized);
+ oldFrameDiff = mGlobalPoseNormalized.inverseRT() * oldFrameDiff;
+ useOldFrame = true;
+ }
+
+ const float previousIterDt = mCloth->getPreviousIterationDt();
+ const float invTimeStep = previousIterDt > 0.0f ? 1.0f / previousIterDt : 0.0f;
+
+ const cloth::Range<PxVec4> newPositions = mCloth->getCurrentParticles();
+ const cloth::Range<PxVec4> oldPositions = mCloth->getPreviousParticles();
+
+ if (useOldFrame)
+ {
+ // use SIMD code only here, it was slower for the non-matrix-multiply codepath :(
+
+ // In localspace (and if the localspace has changed, i.e. frameDiff != ID) the previous positions are in a
+ // different frame, interpolated for each iteration. We need to generate that interpolated frame (20 lines above)
+ // and then apply the diff to the previous positions to move them into the same frame as the current positions.
+ // This is the same frame as we refer to 'current local space'.
+ using namespace physx::shdfnd::aos;
+ const Vec3V invTime = V3Load(invTimeStep);
+ PX_ASSERT(((size_t)(&newPositions[0].x) & 0xf) == 0); // 16 byte aligned?
+ PX_ASSERT(((size_t)(&oldPositions[0].x) & 0xf) == 0); // 16 byte aligned?
+ const Mat34V frameDiff = (Mat34V&)oldFrameDiff;
+
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ const Vec3V newPos = Vec3V_From_Vec4V(V4LoadA(&newPositions[i].x));
+ const Vec3V oldPos = Vec3V_From_Vec4V(V4LoadA(&oldPositions[i].x));
+ const Vec3V oldPosReal = M34MulV3(frameDiff, oldPos);
+
+ const Vec3V velocity = V3Mul(V3Sub(newPos, oldPosReal), invTime);
+ V3StoreU(velocity, velocities[i]);
+ }
+ }
+ else
+ {
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ const PxVec3& newPos = PxVec3(newPositions[i].x, newPositions[i].y, newPositions[i].z);
+ const PxVec3& oldPos = PxVec3(oldPositions[i].x, oldPositions[i].y, oldPositions[i].z);
+
+ PxVec3 d = newPos - oldPos;
+ velocities[i] = d * invTimeStep;
+ }
+ }
+
+ // no unmap since we only read
+}
+
+
+
// Rewrites the previous particle positions so the cloth solver sees the given
// per-vertex velocities on its next iteration:
//   previous = current - velocity * previousIterationDt
// Only the .xyz of the previous positions is replaced; the .w component (the
// particle weight) is preserved.
void Simulation::setVelocities(PxVec3* velocities)
{
	if (mCloth == NULL || mIsStatic)
	{
		return;
	}

	PX_PROFILE_ZONE("ClothingActorImpl::setVelocities", GetInternalApexSDK()->getContextId());

	const float timeStep = mCloth->getPreviousIterationDt();

	cloth::Range<PxVec4> newPositions = mCloth->getCurrentParticles();
	cloth::Range<PxVec4> oldPositions = mCloth->getPreviousParticles(); // read the data, the .w is vital!

	// assuming the weights are still up to date!

	// When simulating in local space and the global pose changed, the previous
	// particles live in an older, per-iteration interpolated frame. Rebuild
	// that frame so the rewritten previous positions land where the solver
	// expects them.
	PX_ALIGN(16, PxMat44 oldFrameDiff) = PxMat44(PxIdentity);
	bool useOldFrame = false;
	if (mGlobalPose != mGlobalPosePrevious && mLocalSpaceSim)
	{
		// strip scale from the rotation part before interpolating
		oldFrameDiff = mGlobalPosePrevious;
		oldFrameDiff.column0.normalize();
		oldFrameDiff.column1.normalize();
		oldFrameDiff.column2.normalize();
		const float w = mCloth->getPreviousIterationDt() / mLastTimestep;
		oldFrameDiff = interpolateMatrix(w, oldFrameDiff, mGlobalPoseNormalized);
		oldFrameDiff = oldFrameDiff.inverseRT() * mGlobalPoseNormalized;
		useOldFrame = true;
	}

	if (useOldFrame)
	{
		using namespace physx::shdfnd::aos;

		const Vec3V time = V3Load(timeStep);

		PX_ASSERT(((size_t)(&newPositions[0].x) & 0xf) == 0); // 16 byte aligned?
		PX_ASSERT(((size_t)(&oldPositions[0].x) & 0xf) == 0); // 16 byte aligned?
		const Mat34V frameDiff = (Mat34V&)oldFrameDiff;
		// select mask: xyz from the transformed position, .w from the old particle
		BoolV mask = BTTTF();

		for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
		{
			const Vec3V velocity = V3LoadU(velocities[i]);
			const Vec4V newPos = V4LoadA(&newPositions[i].x);
			const Vec4V oldWeight = V4Load(oldPositions[i].w);
			const Vec3V oldPosReal = V3NegMulSub(velocity, time, Vec3V_From_Vec4V(newPos)); // newPos - velocity * time
			const Vec3V oldPos = M34MulV3(frameDiff, oldPosReal);
			const Vec4V oldPosOut = V4Sel(mask, Vec4V_From_Vec3V(oldPos), oldWeight);

			aos::V4StoreA(oldPosOut, &oldPositions[i].x);
		}
	}
	else
	{
		// scalar path: writes through PxVec3*, so .w stays untouched by construction
		for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
		{
			PxVec3* oldPos = (PxVec3*)(oldPositions.begin() + i);
			const PxVec3* const newPos = (const PxVec3 * const)(newPositions.begin() + i);
			*oldPos = *newPos - velocities[i] * timeStep;
		}
	}
}
+
+
+
+bool Simulation::applyWind(PxVec3* velocities, const PxVec3* normals, const tConstrainCoeffs* coeffs, const PxVec3& wind, float adaption, float /*dt*/)
+{
+ if (mCloth == NULL || mIsStatic)
+ {
+ return false;
+ }
+
+ // here we leave velocities untouched
+
+ if (adaption > 0.0f)
+ {
+ cloth::Range<PxVec4> accelerations = mCloth->getParticleAccelerations();
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
+ {
+ PxVec3 velocity = velocities[i];
+ PxVec3 dv = wind - velocity;
+
+ if (coeffs[i].maxDistance > 0.0f && !dv.isZero())
+ {
+ // scale the wind depending on angle
+ PxVec3 normalizedDv = dv;
+ normalizedDv *= ModuleClothingHelpers::invSqrt(normalizedDv.magnitudeSquared());
+ const float dot = normalizedDv.dot(normals[i]);
+ dv *= PxMin(1.0f, PxAbs(dot) * adaption); // factor should not exceed 1.0f
+
+ // We set the acceleration such that we get
+ // end velocity = velocity + (wind - velocity) * dot * adaption * dt.
+ // using
+ // end velocity = velocity + acceleration * dt
+ accelerations[i] = PxVec4(dv, 0.0f);
+ }
+ else
+ {
+ accelerations[i].setZero();
+ }
+ }
+ }
+ else
+ {
+ mCloth->clearParticleAccelerations();
+ }
+
+ return false;
+}
+
+
// Handles actor teleports for the cloth simulation and stores the local-space flag.
// \param weight        > 0 marks that a teleport happened this frame
// \param reset         if true, cloth state is hard-reset onto the skinned positions
// \param localSpaceSim whether the simulation runs in actor-local space
void Simulation::setTeleportWeight(float weight, bool reset, bool localSpaceSim)
{
	if (mCloth != NULL && weight > 0.0f && !mIsStatic)
	{
		mTeleported = true;

		if (reset)
		{
			// hard reset: snap current AND previous particles to the skinned
			// positions (zeroing all implicit velocity) and drop accelerations
			cloth::Range<PxVec4> curPos = mCloth->getCurrentParticles();
			cloth::Range<PxVec4> prevPos = mCloth->getPreviousParticles();

			const uint32_t numParticles = (uint32_t)curPos.size();
			for (uint32_t i = 0; i < numParticles; i++)
			{
				// preserve each particle's .w (weight) while replacing positions
				curPos[i] = PxVec4(skinnedPhysicsPositions[i], curPos[i].w);
				prevPos[i] = PxVec4(skinnedPhysicsPositions[i], prevPos[i].w);
			}
			mCloth->clearParticleAccelerations();
		}
		else if (!localSpaceSim)
		{
			// soft teleport in global-space simulation: move the particles by the
			// delta between the previous and current normalized global pose so
			// the cloth follows the actor without gaining velocity
			cloth::Range<PxVec4> curPos = mCloth->getCurrentParticles();
			cloth::Range<PxVec4> prevPos = mCloth->getPreviousParticles();

			const uint32_t numParticles = (uint32_t)curPos.size();

			// strip scale from the previous pose's rotation part
			PxMat44 globalPosePreviousNormalized = mGlobalPosePrevious;
			globalPosePreviousNormalized.column0.normalize();
			globalPosePreviousNormalized.column1.normalize();
			globalPosePreviousNormalized.column2.normalize();

			const PxMat44 realTransform = mGlobalPoseNormalized * globalPosePreviousNormalized.inverseRT();

			for (uint32_t i = 0; i < numParticles; i++)
			{
				// current positions are rebuilt from the written-back sim result,
				// previous positions are transformed in place
				curPos[i] = PxVec4(realTransform.transform(sdkWritebackPosition[i]), curPos[i].w);
				prevPos[i] = PxVec4(realTransform.transform(prevPos[i].getXYZ()), prevPos[i].w);
			}
		}
	}

	mLocalSpaceSim = localSpaceSim;
}
+
+
+
+void Simulation::setSolverIterations(uint32_t /*iterations*/)
+{
+ /*
+ if (mCloth != NULL)
+ {
+ mSolverIterationsPerSecond = iterations * 50.0f;
+ mCloth->setSolverFrequency(mSolverIterationsPerSecond);
+ }
+ */
+}
+
+
+
// Uploads motion (spherical) constraints and backstop (separation) constraints
// to the cloth solver whenever the cached constants or skinned positions are
// dirty, and always refreshes the motion-constraint scale/bias.
// setConstrainCoefficients must have run before (it fills mConstrainConstants).
void Simulation::updateConstrainPositions(bool isDirty)
{
	if (mCloth == NULL || mIsStatic)
	{
		return;
	}

	PX_ASSERT(mConstrainCoeffs != NULL); // guarantees that setConstrainCoefficients has been called before!

	if (mConstrainConstantsDirty || isDirty)
	{
		if (mTeleported)
		{
			// NOTE(review): clearing first presumably prevents the solver from
			// interpolating from the stale, pre-teleport constraints — confirm
			mCloth->clearMotionConstraints();
		}
		cloth::Range<PxVec4> sphericalConstraints = mCloth->getMotionConstraints();

		PX_ASSERT(sphericalConstraints.size() == sdkNumDeformableVertices);

		for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
		{
			// we also must write the .w component to be sure everything works!
			sphericalConstraints[i] = PxVec4(skinnedPhysicsPositions[i], mConstrainConstants[i].motionConstrainDistance);
		}

		if (mNumBackstopConstraints > 0)
		{
			if (mTeleported)
			{
				mCloth->clearSeparationConstraints();
			}
			cloth::Range<PxVec4> backstopConstraints = mCloth->getSeparationConstraints();

			for (uint32_t i = 0; i < sdkNumDeformableVertices; i++)
			{
				// sphere center sits backstopDistance behind the skinned surface
				// along the negative skinned normal; .w carries the sphere radius
				backstopConstraints[i] = PxVec4(skinnedPhysicsPositions[i] - mConstrainConstants[i].backstopDistance * skinnedPhysicsNormals[i], mConstrainConstants[i].backstopRadius);
			}
		}

		mConstrainConstantsDirty = false;
	}

	mCloth->setMotionConstraintScaleBias(mMotionConstrainScale, mMotionConstrainBias);
}
+
+
+
+bool Simulation::applyClothingMaterial(tMaterial* material, PxVec3 scaledGravity)
+{
+ if (mCloth == NULL || material == NULL || mIsStatic)
+ {
+ return false;
+ }
+
+ // solver iterations
+ mCloth->setSolverFrequency(material->solverFrequency);
+
+ // filter window for handling dynamic timesteps. smooth over 2s.
+ mCloth->setAcceleationFilterWidth(2 * (uint32_t)material->solverFrequency);
+
+ // damping scale is here to remove the influence of the stiffness frequency from all damping values
+ // (or to be more precise, to use 10 as a stiffness frequency)
+ const float dampingStiffnessFrequency = 10.0f;
+ const float exponentDamping = dampingStiffnessFrequency / material->stiffnessFrequency * physx::shdfnd::log2(1 - material->damping);
+ const float exponentDrag = dampingStiffnessFrequency / material->stiffnessFrequency * physx::shdfnd::log2(1 - material->drag);
+ const float newDamping = 1.0f - ::expf(exponentDamping * 0.693147180559945309417f); // exp -> exp2, 0.69 = ln(2)
+ const float newDrag = 1.0f - ::expf(exponentDrag * 0.693147180559945309417f); // exp -> exp2
+
+ // damping
+ // TODO damping as vector
+ mCloth->setDamping(PxVec3(newDamping));
+
+ mCloth->setStiffnessFrequency(material->stiffnessFrequency);
+
+ // drag
+ // TODO expose linear and angular drag separately
+ mCloth->setLinearDrag(PxVec3(newDrag));
+ mCloth->setAngularDrag(PxVec3(newDrag));
+
+ // friction
+ mCloth->setFriction(material->friction);
+
+ // gravity
+ PxVec3 gravity;
+ gravity[0] = scaledGravity.x * material->gravityScale;
+ gravity[1] = scaledGravity.y * material->gravityScale;
+ gravity[2] = scaledGravity.z * material->gravityScale;
+ mScaledGravity = scaledGravity * material->gravityScale;
+ mCloth->setGravity(mScaledGravity);
+
+ // inertia scale
+ // TODO expose linear and angular inertia separately
+ mCloth->setLinearInertia(PxVec3(material->inertiaScale));
+ mCloth->setAngularInertia(PxVec3(material->inertiaScale));
+
+ // mass scale
+ mCloth->setCollisionMassScale(material->massScale);
+
+ // tether settings
+ mCloth->setTetherConstraintScale(material->tetherLimit);
+ mCloth->setTetherConstraintStiffness(material->tetherStiffness);
+
+
+ // remember for debug rendering
+ mTetherLimit = material->tetherLimit;
+
+ // self collision
+ // clear debug render data if it's not needed, or stale
+ if(mClothingScene->getDebugRenderParams()->SelfCollisionAttenuation == 0.0f || material->selfcollisionThickness * mActorScale != mCloth->getSelfCollisionDistance())
+ {
+ mSelfCollisionAttenuationPairs.clear();
+ mSelfCollisionAttenuationValues.clear();
+ }
+
+ if ( (mCloth->getSelfCollisionDistance() == 0.0f || mCloth->getSelfCollisionStiffness() == 0.0f)
+ && (material->selfcollisionThickness * mActorScale > 0.0f && material->selfcollisionStiffness > 0.0)
+ )
+ {
+ // turning on
+ setRestPositions(true);
+ }
+ else if( (mCloth->getSelfCollisionDistance() > 0.0f && mCloth->getSelfCollisionStiffness() > 0.0f)
+ && (material->selfcollisionThickness * mActorScale == 0.0f || material->selfcollisionStiffness == 0.0)
+ )
+ {
+ // turning off
+ setRestPositions(false);
+ }
+ mCloth->setSelfCollisionDistance(material->selfcollisionThickness * mActorScale);
+ mCloth->setSelfCollisionStiffness(material->selfcollisionStiffness);
+
+ for (uint32_t i = 0; i < mPhaseConfigs.size(); i++)
+ {
+ PxClothFabricPhaseType::Enum phaseType = (PxClothFabricPhaseType::Enum)mCookedData->deformablePhaseDescs.buf[mPhaseConfigs[i].mPhaseIndex].phaseType;
+
+ if (phaseType == PxClothFabricPhaseType::eVERTICAL)
+ {
+ mPhaseConfigs[i].mStiffness = material->verticalStretchingStiffness;
+ mPhaseConfigs[i].mStiffnessMultiplier = material->verticalStiffnessScaling.scale;
+ mPhaseConfigs[i].mCompressionLimit = material->verticalStiffnessScaling.compressionRange;
+ mPhaseConfigs[i].mStretchLimit = material->verticalStiffnessScaling.stretchRange;
+ }
+ else if (phaseType == PxClothFabricPhaseType::eHORIZONTAL)
+ {
+ mPhaseConfigs[i].mStiffness = material->horizontalStretchingStiffness;
+ mPhaseConfigs[i].mStiffnessMultiplier = material->horizontalStiffnessScaling.scale;
+ mPhaseConfigs[i].mCompressionLimit = material->horizontalStiffnessScaling.compressionRange;
+ mPhaseConfigs[i].mStretchLimit = material->horizontalStiffnessScaling.stretchRange;
+ }
+ else if (phaseType == PxClothFabricPhaseType::eBENDING)
+ {
+ mPhaseConfigs[i].mStiffness = material->bendingStiffness;
+ mPhaseConfigs[i].mStiffnessMultiplier = material->bendingStiffnessScaling.scale;
+ mPhaseConfigs[i].mCompressionLimit = material->bendingStiffnessScaling.compressionRange;
+ mPhaseConfigs[i].mStretchLimit = material->bendingStiffnessScaling.stretchRange;
+ }
+ else
+ {
+ PX_ASSERT(phaseType == PxClothFabricPhaseType::eSHEARING);
+ mPhaseConfigs[i].mStiffness = material->shearingStiffness;
+ mPhaseConfigs[i].mStiffnessMultiplier = material->shearingStiffnessScaling.scale;
+ mPhaseConfigs[i].mCompressionLimit = material->shearingStiffnessScaling.compressionRange;
+ mPhaseConfigs[i].mStretchLimit = material->shearingStiffnessScaling.stretchRange;
+ }
+ }
+
+ cloth::Range<cloth::PhaseConfig> phaseConfig(mPhaseConfigs.begin(), mPhaseConfigs.end());
+ mCloth->setPhaseConfig(phaseConfig);
+
+ return true;
+}
+
+
+
+void Simulation::setRestPositions(bool on)
+{
+ if (mCloth == NULL || mIsStatic)
+ return;
+
+
+ if (on)
+ {
+ PxVec4* tempRestPositions = (PxVec4*)GetInternalApexSDK()->getTempMemory(sdkNumDeformableVertices * sizeof(PxVec4));
+
+ for (uint32_t i = 0; i < sdkNumDeformableVertices; ++i)
+ {
+ tempRestPositions[i] = PxVec4(mRestPositions[i]*mActorScale, 0.0f);
+ }
+
+ mCloth->setRestPositions(cloth::Range<PxVec4>(tempRestPositions, tempRestPositions + sdkNumDeformableVertices));
+
+ GetInternalApexSDK()->releaseTempMemory(tempRestPositions);
+ }
+ else
+ {
+ mCloth->setRestPositions(cloth::Range<PxVec4>());
+ }
+}
+
+
+
+void Simulation::applyClothingDesc(tClothingDescTemplate& /*clothingTemplate*/)
+{
+}
+
+
+
+void Simulation::setInterCollisionChannels(uint32_t channels)
+{
+ if (mCloth != NULL)
+ {
+ mCloth->setUserData((void*)(size_t)channels);
+ }
+}
+
+
#if APEX_UE4
// Advances the embedded cloth solver by dt. Guards against a missing cloth
// instance, consistent with every other Simulation entry point (the old code
// dereferenced mCloth unconditionally and returned a void expression).
void Simulation::simulate(float dt)
{
	if (mCloth != NULL)
	{
		mCloth->simulate(dt);
	}
}
#endif
+
+
+void Simulation::setHalfPrecisionOption(bool isAllowed)
+{
+ if (mCloth != NULL)
+ {
+ mCloth->setHalfPrecisionOption(isAllowed);
+ }
+}
+
+
+
// Releases all cloth fabrics referenced by a cooked-data chain.
// Walks the nextCookedData linked list and, for each element, deletes the CPU
// fabric and every GPU fabric, then empties the fabricGPU array through an
// NvParameterized handle so the parameter block no longer references freed
// objects. Cooked data of any other class is left untouched.
void Simulation::releaseFabric(NvParameterized::Interface* _cookedData)
{
	if (::strcmp(_cookedData->className(), ClothingCookedPhysX3Param::staticClassName()) == 0)
	{
		ClothingCookedPhysX3Param* cookedData = static_cast<ClothingCookedPhysX3Param*>(_cookedData);

		while (cookedData != NULL)
		{
			if (cookedData->fabricCPU != NULL)
			{
				cloth::Fabric* fabric = static_cast<cloth::Fabric*>(cookedData->fabricCPU);
				delete fabric;
				cookedData->fabricCPU = NULL;
			}

			// delete every GPU fabric in the array
			for (int32_t i = 0; i < cookedData->fabricGPU.arraySizes[0]; ++i)
			{
				cloth::Fabric* fabric = static_cast<cloth::Fabric*>(cookedData->fabricGPU.buf[i].fabricGPU);
				delete fabric;
			}
			// shrink the parameterized array so it cannot point at freed fabrics
			NvParameterized::Handle handle(*cookedData);
			if (cookedData->getParameterHandle("fabricGPU", handle) == NvParameterized::ERROR_NONE)
			{
				handle.resizeArray(0);
			}

			cookedData = static_cast<ClothingCookedPhysX3Param*>(cookedData->nextCookedData);
		}
	}
}
+
+
+
// Mirrors the staged collision shapes (spheres, capsules, planes, convexes,
// triangle meshes) into the low-level cloth. Released sphere/plane ids are
// removed highest-first so the remaining ids stay valid; both spheres and
// planes are capped at 32 entries. On teleport, interpolation is cleared so
// shapes do not sweep across the jump.
void Simulation::applyCollision()
{
	if (mCloth != NULL && !mIsStatic)
	{
		// spheres
		uint32_t numReleased = mReleasedSphereIds.size();
		if (numReleased > 0)
		{
			// remove all deleted spheres
			// biggest id's first, such that we don't
			// invalidate remaining id's
			nvidia::sort<uint32_t>(&mReleasedSphereIds[0], numReleased);
			for (int32_t i = (int32_t)numReleased-1; i >= 0; --i)
			{
				uint32_t id = mReleasedSphereIds[(uint32_t)i];
				if(id < 32)
				{
					// empty range deletes the sphere at [id, id+1)
					mCloth->setSpheres(cloth::Range<const PxVec4>(),id, id+1);
				}
			}
			mReleasedSphereIds.clear();
		}
		// at most 32 spheres are supported by the solver
		PxVec4* end = (mCollisionSpheres.size() > 32) ? mCollisionSpheres.begin() + 32 : mCollisionSpheres.end();
		cloth::Range<const PxVec4> spheres((PxVec4*)mCollisionSpheres.begin(), end);
		mCloth->setSpheres(spheres, 0, mCloth->getNumSpheres());

		// capsules
		cloth::Range<const uint32_t> capsules(mCollisionCapsules.begin(), mCollisionCapsules.end());
		mCloth->setCapsules(capsules, 0, mCloth->getNumCapsules());

		// planes
		numReleased = mReleasedPlaneIds.size();
		if (numReleased > 0)
		{
			// remove all deleted planes
			// biggest id's first, such that we don't
			// invalidate remaining id's
			nvidia::sort<uint32_t>(&mReleasedPlaneIds[0], numReleased);
			for (int32_t i = (int32_t)numReleased-1; i >= 0; --i)
			{
				uint32_t id = mReleasedPlaneIds[(uint32_t)i];
				if(id < 32)
				{
					mCloth->setPlanes(cloth::Range<const PxVec4>(),id, id+1);
				}
			}
			mReleasedPlaneIds.clear();
		}

		// planes are capped at 32 as well
		end = (mCollisionPlanes.size() > 32) ? mCollisionPlanes.begin() + 32 : mCollisionPlanes.end();
		cloth::Range<const PxVec4> planes((PxVec4*)mCollisionPlanes.begin(), end);
		mCloth->setPlanes(planes, 0, mCloth->getNumPlanes());

		// convexes
		cloth::Range<const uint32_t> convexes(mCollisionConvexes.begin(), mCollisionConvexes.end());
		mCloth->setConvexes(convexes,0,mCloth->getNumConvexes());

		// triangle meshes
		// If mCollisionTrianglesOld is empty, updateCollision hasn't been called.
		// In that case there have been no changes, so use the same buffer for old
		// and new triangle positions.
		cloth::Range<const PxVec3> trianglesOld(
			(mCollisionTrianglesOld.size() > 0) ? mCollisionTrianglesOld.begin() : mCollisionTriangles.begin(),
			(mCollisionTrianglesOld.size() > 0) ? mCollisionTrianglesOld.end() : mCollisionTriangles.end()
			);
		cloth::Range<const PxVec3> triangles(mCollisionTriangles.begin(), mCollisionTriangles.end());
		mCloth->setTriangles(trianglesOld, triangles, 0);
		mCollisionTrianglesOld.clear();

		mCloth->enableContinuousCollision(!simulation.disableCCD);
		if (mTeleported)
		{
			// don't sweep collision shapes across a teleport
			mCloth->clearInterpolation();
		}
	}
}
+
+
+
+bool Simulation::allocateHostMemory(MappedArray& mappedArray)
+{
+ bool allocated = false;
+ if (mappedArray.hostMemory.size() != mappedArray.deviceMemory.size())
+ {
+ mappedArray.hostMemory.resize((uint32_t)mappedArray.deviceMemory.size());
+ allocated = true; // read the first time to init the data!
+ }
+ return allocated;
+}
+
+
+
// Sets up the per-scene simulate task. Solvers and their profile zones are
// created lazily in getSolver(); this constructor only stores collaborators.
ClothingSceneSimulateTask::ClothingSceneSimulateTask(SceneIntl* apexScene, ClothingScene* scene, ModuleClothingImpl* module, profile::PxProfileZoneManager* manager) :
	mModule(module),
	mApexScene(apexScene),
	mScene(scene),
	mSimulationDelta(0.0f),
	mSolverGPU(NULL), mSolverCPU(NULL),
	mProfileSolverGPU(NULL), mProfileSolverCPU(NULL),
	mWaitForSolverTask(NULL),
	mProfileManager(manager),
	mFailedGpuFactory(false)
{
	// silence unused-member warnings on configurations that compile out
	// the GPU/profiling paths
	PX_UNUSED(mFailedGpuFactory);
#ifndef PHYSX_PROFILE_SDK
	PX_UNUSED(mProfileSolverGPU);
#endif
}
+
+
+
+ClothingSceneSimulateTask::~ClothingSceneSimulateTask()
+{
+ PX_ASSERT(mSolverGPU == NULL);
+
+ if (mSolverCPU != NULL)
+ {
+ delete mSolverCPU;
+ mSolverCPU = NULL;
+ }
+
+#ifdef PHYSX_PROFILE_SDK
+ if (mProfileSolverGPU != NULL)
+ {
+ mProfileSolverGPU->release();
+ mProfileSolverGPU = NULL;
+ }
+
+ if (mProfileSolverCPU != NULL)
+ {
+ mProfileSolverCPU->release();
+ mProfileSolverCPU = NULL;
+ }
+#endif
+}
+
+
+
+void ClothingSceneSimulateTask::setWaitTask(PxBaseTask* waitForSolver)
+{
+ mWaitForSolverTask = waitForSolver;
+}
+
+
+
+void ClothingSceneSimulateTask::setDeltaTime(float simulationDelta)
+{
+ mSimulationDelta = simulationDelta;
+}
+
+
+
+float ClothingSceneSimulateTask::getDeltaTime()
+{
+ return mSimulationDelta;
+}
+
+
+
// Lazily creates and returns the cloth solver matching the factory's platform.
// On Windows a CUDA factory gets (and caches) a GPU solver; otherwise the CPU
// solver is created on first use. Each solver optionally gets its own profile
// zone. Factory access is serialized through the factory's mutex.
// NOTE(review): a CUDA factory on a non-Windows build falls through to the end
// and may return a NULL mSolverCPU — confirm callers never pass that.
cloth::Solver* ClothingSceneSimulateTask::getSolver(ClothFactory factory)
{
	PX_ASSERT(factory.factory != NULL);
	PX_ASSERT(factory.mutex != NULL);

#if PX_WINDOWS_FAMILY
	if (factory.factory->getPlatform() == cloth::Factory::CUDA)
	{
		if (mSolverGPU == NULL)
		{
			PX_ASSERT(mProfileSolverGPU == NULL);
			if (mProfileManager != NULL)
			{
#ifdef PHYSX_PROFILE_SDK
				// dedicated profile zone for the GPU solver
				mProfileSolverGPU = &mProfileManager->createProfileZone("CUDA Cloth", profile::PxProfileNames());
#endif
			}

			// solver creation must not race with other factory users
			nvidia::Mutex::ScopedLock wlock(*factory.mutex);
			mSolverGPU = factory.factory->createSolver(mProfileSolverGPU, mApexScene->getTaskManager());
		}

		PX_ASSERT(mSolverGPU != NULL);
		return mSolverGPU;
	}
#endif

	if (factory.factory->getPlatform() == cloth::Factory::CPU && mSolverCPU == NULL)
	{
		PX_ASSERT(mProfileSolverCPU == NULL);
		if (mProfileManager != NULL)
		{
#ifdef PHYSX_PROFILE_SDK
			mProfileSolverCPU = &mProfileManager->createProfileZone("CPU Cloth", profile::PxProfileNames());
#endif
		}

		nvidia::Mutex::ScopedLock wlock(*factory.mutex);
		mSolverCPU = factory.factory->createSolver(mProfileSolverCPU, mApexScene->getTaskManager());
	}

	PX_ASSERT(mSolverCPU != NULL);
	return mSolverCPU;
}
+
+
+
+void ClothingSceneSimulateTask::clearGpuSolver()
+{
+#if PX_WINDOWS_FAMILY
+
+#ifdef PHYSX_PROFILE_SDK
+ if (mProfileSolverGPU != NULL)
+ {
+ mProfileSolverGPU->release();
+ mProfileSolverGPU = NULL;
+ }
+#endif
+
+ if (mSolverGPU != NULL)
+ {
+ delete mSolverGPU;
+ mSolverGPU = NULL;
+ }
+#endif
+}
+
+
+
// Kicks the cloth solvers for this frame: marks the scene as simulating,
// pushes the module's inter-collision settings into both solvers, then starts
// each solver's simulate() task chain; the chains signal mWaitForSolverTask
// when finished.
void ClothingSceneSimulateTask::run()
{
	PX_ASSERT(mSimulationDelta > 0.0f);
	PX_ASSERT(mWaitForSolverTask != NULL);

	// reset again by WaitForSolverTask::run once the solvers are done
	mScene->setSceneRunning(true);

	PxBaseTask* task1 = NULL;
	PxBaseTask* task2 = NULL;

	float interCollisionDistance = mModule->getInterCollisionDistance();
	float interCollisionStiffness = mModule->getInterCollisionStiffness();
	uint32_t interCollisionIterations = mModule->getInterCollisionIterations();

	if (mSolverCPU != NULL)
	{
		mSolverCPU->setInterCollisionDistance(interCollisionDistance);
		mSolverCPU->setInterCollisionStiffness(interCollisionStiffness);
		mSolverCPU->setInterCollisionNbIterations(interCollisionIterations);
		mSolverCPU->setInterCollisionFilter(interCollisionFilter);

		task1 = &mSolverCPU->simulate(mSimulationDelta, *mWaitForSolverTask);
	}
	if (mSolverGPU != NULL)
	{
		mSolverGPU->setInterCollisionDistance(interCollisionDistance);
		mSolverGPU->setInterCollisionStiffness(interCollisionStiffness);
		mSolverGPU->setInterCollisionNbIterations(interCollisionIterations);
		mSolverGPU->setInterCollisionFilter(interCollisionFilter);

		task2 = &mSolverGPU->simulate(mSimulationDelta, *mWaitForSolverTask);
	}

	// only remove the references when both simulate() methods have been called
	if (task1 != NULL)
	{
		task1->removeReference();
	}

	if (task2 != NULL)
	{
		task2->removeReference();
	}
}
+
+
+
+const char* ClothingSceneSimulateTask::getName() const
+{
+ return "Simulate";
+}
+
+
+
+bool ClothingSceneSimulateTask::interCollisionFilter(void* user0, void* user1)
+{
+ size_t collisionChannels0 = reinterpret_cast<size_t>(user0);
+ size_t collisionChannels1 = reinterpret_cast<size_t>(user1);
+ return (collisionChannels0 & collisionChannels1) > 0;
+}
+
+
+
+WaitForSolverTask::WaitForSolverTask(ClothingScene* scene) :
+ mScene(scene)
+{
+}
+
+
+
+void WaitForSolverTask::run()
+{
+ mScene->setSceneRunning(false);
+ mScene->embeddedPostSim();
+}
+
+const char* WaitForSolverTask::getName() const
+{
+ return "WaitForSolverTask";
+}
+
+}
+} // namespace nvidia