Initial commit:

PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
author: git perforce import user <a@b> 2016-10-25 12:29:14 -0600
committer: Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees> 2016-10-25 18:56:37 -0500
commit: 3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree: fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/module/iofx/src/IofxManagerGPU.cpp
download: physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
1 files changed, 1319 insertions, 0 deletions
diff --git a/APEX_1.4/module/iofx/src/IofxManagerGPU.cpp b/APEX_1.4/module/iofx/src/IofxManagerGPU.cpp
new file mode 100644
index 00000000..06d1209a
--- /dev/null
+++ b/APEX_1.4/module/iofx/src/IofxManagerGPU.cpp
@@ -0,0 +1,1319 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+
+#include "Apex.h"
+#include "ApexDefs.h"
+
+#if APEX_CUDA_SUPPORT
+
+#include "ApexSDKIntl.h"
+#include "SceneIntl.h"
+#include "ModifierImpl.h"
+#include "IofxActor.h"
+#include "IofxManagerGPU.h"
+#include "IofxAssetImpl.h"
+#include "IofxSceneGPU.h"
+
+#include "ModuleIofxImpl.h"
+#include "IofxActorGPU.h"
+
+#include "PxGpuTask.h"
+#include "ApexCutil.h"
+
+#include "RandStateHelpers.h"
+
+#include "IofxRenderData.h"
+
+#define CUDA_OBJ(name) SCENE_CUDA_OBJ(mIofxScene, name)
+
+namespace nvidia
+{
+namespace iofx
+{
+
+class IofxAssetSceneInstGPU : public IofxAssetSceneInst
+{
+public:
+	IofxAssetSceneInstGPU(IofxAssetImpl* asset, uint32_t semantics, IofxScene* scene)
+		: IofxAssetSceneInst(asset, semantics)
+		, _constMemGroup(SCENE_CUDA_OBJ(*scene, modifierStorage))
+	{
+		_totalRandomCount = 0;
+
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(_constMemGroup)
+
+		_storage_.alloc(_assetParamsHandle);
+		AssetParams assetParams;
+		buildModifierList(assetParams.spawnModifierList, _asset->mSpawnModifierStack);
+		buildModifierList(assetParams.continuousModifierList, _asset->mContinuousModifierStack);
+		_storage_.update(_assetParamsHandle, assetParams);
+	}
+	virtual ~IofxAssetSceneInstGPU() {}
+
+	InplaceHandle<AssetParams> getAssetParamsHandle() const
+	{
+		return _assetParamsHandle;
+	}
+
+private:
+
+	void buildModifierList(ModifierList& list, const ModifierStack& stack)
+	{
+		InplaceStorage& _storage_ = _constMemGroup.getStorage();
+
+		class Mapper : public ModifierParamsMapperGPU
+		{
+		public:
+			InplaceStorage* storage;
+
+			InplaceHandleBase paramsHandle;
+			uint32_t paramsRandomCount;
+
+			virtual InplaceStorage& getStorage()
+			{
+				return *storage;
+			}
+
+			virtual void  onParams(InplaceHandleBase handle, uint32_t randomCount)
+			{
+				paramsHandle = handle;
+				paramsRandomCount = randomCount;
+			}
+
+		} mapper;
+		mapper.storage = &_storage_;
+
+		list.resize(_storage_, stack.size());
+
+		uint32_t index = 0;
+		for (ModifierStack::ConstIterator it = stack.begin(); it != stack.end(); ++it)
+		{
+			uint32_t type = (*it)->getModifierType();
+			//NxU32 usage = (*it)->getModifierUsage();
+			//if ((usage & usageStage) == usageStage && (usage & usageClass) == usageClass)
+			{
+				const ModifierImpl* modifier = ModifierImpl::castFrom(*it);
+				modifier->mapParamsGPU(mapper);
+
+				ModifierListElem listElem;
+				listElem.type = type;
+				listElem.paramsHandle = mapper.paramsHandle;
+				list.updateElem(_storage_, listElem, index);
+
+				_totalRandomCount += mapper.paramsRandomCount;
+			}
+			++index;
+		}
+	}
+
+	ApexCudaConstMemGroup		_constMemGroup;
+	InplaceHandle<AssetParams>	_assetParamsHandle;
+	uint32_t				_totalRandomCount;
+};
+
+class IofxManagerClientGPU : public IofxManagerClient
+{
+public:
+	IofxManagerClientGPU(IofxAssetSceneInst* assetSceneInst, uint32_t actorClassID, const IofxManagerClientIntl::Params& params, IofxScene* scene)
+		: IofxManagerClient(assetSceneInst, actorClassID, params)
+		, _constMemGroup(SCENE_CUDA_OBJ(*scene, modifierStorage))
+	{
+		setParamsGPU();
+	}
+
+	InplaceHandle<ClientParams> getClientParamsHandle() const
+	{
+		return _clientParamsHandle;
+	}
+
+	// IofxManagerClientIntl interface
+	virtual void setParams(const IofxManagerClientIntl::Params& params)
+	{
+		IofxManagerClient::setParams(params);
+		setParamsGPU();
+	}
+
+private:
+	void setParamsGPU()
+	{
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(_constMemGroup)
+
+		ClientParams clientParams;
+		if (_clientParamsHandle.allocOrFetch(_storage_, clientParams))
+		{
+			clientParams.assetParamsHandle = static_cast<IofxAssetSceneInstGPU*>(_assetSceneInst)->getAssetParamsHandle();
+		}
+		clientParams.objectScale = _params.objectScale;
+		_clientParamsHandle.update(_storage_, clientParams);
+	}
+
+	ApexCudaConstMemGroup		_constMemGroup;
+	InplaceHandle<ClientParams>	_clientParamsHandle;
+};
+
+
+IofxManagerClient* IofxManagerGPU::createClient(IofxAssetSceneInst* assetSceneInst, uint32_t actorClassID, const IofxManagerClientIntl::Params& params)
+{
+	return PX_NEW(IofxManagerClientGPU)(assetSceneInst, actorClassID, params, &mIofxScene);
+}
+
+IofxAssetSceneInst* IofxManagerGPU::createAssetSceneInst(IofxAssetImpl* asset,uint32_t semantics)
+{
+	return PX_NEW(IofxAssetSceneInstGPU)(asset, semantics, &mIofxScene);
+}
+
+class IofxManagerLaunchTask : public PxGpuTask, public UserAllocated
+{
+public:
+	IofxManagerLaunchTask(IofxManagerGPU* actor) : mActor(actor) {}
+	const char* getName() const
+	{
+		return "IofxManagerLaunchTask";
+	}
+	void         run()
+	{
+		PX_ALWAYS_ASSERT();
+	}
+	bool         launchInstance(CUstream stream, int kernelIndex)
+	{
+		return mActor->cudaLaunch(stream, kernelIndex);
+	}
+	PxGpuTaskHint::Enum getTaskHint() const
+	{
+		return PxGpuTaskHint::Kernel;
+	}
+
+protected:
+	IofxManagerGPU* mActor;
+};
+
+IofxManagerGPU::IofxManagerGPU(SceneIntl& scene, const IofxManagerDescIntl& desc, IofxManager& mgr, const ApexMirroredPlace::Enum defaultPlace)
+	: mManager(mgr)
+	, mIofxScene(*mgr.mIofxScene)
+	, mCopyQueue(*scene.getTaskManager()->getGpuDispatcher())
+	, mDefaultPlace(defaultPlace)
+	, mCuSpawnScale(scene)
+	, mCuSpawnSeed(scene)
+	, mCuBlockPRNGs(scene)
+	, mCuSortedActorIDs(scene)
+	, mCuSortedStateIDs(scene)
+	, mCuSortTempKeys(scene)
+	, mCuSortTempValues(scene)
+	, mCuSortTemp(scene)
+	, mCuMinBounds(scene)
+	, mCuMaxBounds(scene)
+	, mCuTempMinBounds(scene)
+	, mCuTempMaxBounds(scene)
+	, mCuTempActorIDs(scene)
+	, mCuActorStart(scene)
+	, mCuActorEnd(scene)
+	, mCuActorVisibleEnd(scene)
+	, mCurSeed(0)
+	, mTargetBufDevPtr(NULL)
+	, mCountActorIDs(0)
+	, mNumberVolumes(0)
+	, mNumberActorClasses(0)
+	, mEmptySimulation(false)
+	, mVolumeConstMemGroup(CUDA_OBJ(migrationStorage))
+	, mRemapConstMemGroup(CUDA_OBJ(remapStorage))
+	, mModifierConstMemGroup(CUDA_OBJ(modifierStorage))
+{
+	mTaskLaunch = PX_NEW(IofxManagerLaunchTask)(this);
+
+	const uint32_t maxObjectCount = desc.maxObjectCount;
+	const uint32_t maxInStateCount = desc.maxInStateCount;
+	uint32_t usageClass = 0;
+	uint32_t blockSize = MAX_THREADS_PER_BLOCK;
+
+	if (mManager.mIsMesh)
+	{
+		usageClass = ModifierUsage_Mesh;
+		//blockSize = CUDA_OBJ(meshModifiersKernel).getBlockDim().x;
+	}
+	else
+	{
+		usageClass = ModifierUsage_Sprite;
+		//blockSize = CUDA_OBJ(spriteModifiersKernel).getBlockDim().x;
+	}
+
+	mCuSpawnScale.reserve(mManager.mOutStateOffset + maxObjectCount, ApexMirroredPlace::GPU);
+	mCuSpawnSeed.reserve(mManager.mOutStateOffset + maxObjectCount, ApexMirroredPlace::GPU);
+
+	mCuSortedActorIDs.reserve(maxInStateCount, defaultPlace);
+	mCuSortedStateIDs.reserve(maxInStateCount, defaultPlace);
+
+	mCuSortTempKeys.reserve(maxInStateCount, ApexMirroredPlace::GPU);
+	mCuSortTempValues.reserve(maxInStateCount, ApexMirroredPlace::GPU);
+	mCuSortTemp.reserve(MAX_BOUND_BLOCKS * NEW_SORT_KEY_DIGITS, ApexMirroredPlace::GPU);
+
+	mCuTempMinBounds.reserve(WARP_SIZE * 2, ApexMirroredPlace::GPU);
+	mCuTempMaxBounds.reserve(WARP_SIZE * 2, ApexMirroredPlace::GPU);
+	mCuTempActorIDs.reserve(WARP_SIZE * 2, ApexMirroredPlace::GPU);
+
+	// alloc volumeConstMem
+	{
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(mVolumeConstMemGroup)
+
+		mVolumeParamsArrayHandle.alloc(_storage_);
+		mActorClassIDBitmapArrayHandle.alloc(_storage_);
+	}
+
+	// alloc remapConstMem
+	{
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(mRemapConstMemGroup)
+
+		mActorIDRemapArrayHandle.alloc(_storage_);
+	}
+
+	// alloc modifierConstMem
+	{
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(mModifierConstMemGroup)
+
+		mClientParamsHandleArrayHandle.alloc(_storage_);
+
+		if (mManager.mIsMesh)
+		{
+			mMeshOutputLayoutHandle.alloc(_storage_);
+		}
+		else
+		{
+			mSpriteOutputLayoutHandle.alloc(_storage_);
+		}
+	}
+
+	InitDevicePRNGs(scene, blockSize, mRandThreadLeap, mRandGridLeap, mCuBlockPRNGs);
+}
+
+void IofxManagerGPU::release()
+{
+	delete this;
+}
+
+IofxManagerGPU::~IofxManagerGPU()
+{
+	delete mTaskLaunch;
+}
+
+
+void IofxManagerGPU::submitTasks()
+{
+	mNumberActorClasses = mManager.mActorClassTable.size();
+	mNumberVolumes = mManager.mVolumeTable.size();
+	mCountActorIDs = mManager.mActorTable.size() * mNumberVolumes;
+
+	// update volumeConstMem
+	if (mNumberVolumes)
+	{
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(mVolumeConstMemGroup)
+
+		VolumeParamsArray volumeParamsArray;
+		_storage_.fetch(mVolumeParamsArrayHandle, volumeParamsArray);
+		volumeParamsArray.resize(_storage_, mNumberVolumes);
+		_storage_.update(mVolumeParamsArrayHandle, volumeParamsArray);
+
+
+		ActorClassIDBitmapArray actorClassIDBitmapArray;
+		_storage_.fetch(mActorClassIDBitmapArrayHandle, actorClassIDBitmapArray);
+		actorClassIDBitmapArray.resize(_storage_, mManager.mVolumeActorClassBitmap.size());
+		_storage_.update(mActorClassIDBitmapArrayHandle, actorClassIDBitmapArray);
+
+		actorClassIDBitmapArray.updateRange(_storage_, &mManager.mVolumeActorClassBitmap.front(), actorClassIDBitmapArray.getSize());
+
+		for (uint32_t i = 0 ; i < mNumberVolumes ; i++)
+		{
+			VolumeParams volumeParams;
+			IofxManager::VolumeData& vd = mManager.mVolumeTable[ i ];
+			if (vd.vol)
+			{
+				volumeParams.bounds = vd.mBounds;
+				volumeParams.priority = vd.mPri;
+			}
+			else
+			{
+				volumeParams.bounds.setEmpty();
+				volumeParams.priority = 0;
+			}
+			volumeParamsArray.updateElem(_storage_, volumeParams, i);
+		}
+	}
+	else
+	{
+		APEX_DEBUG_WARNING("IofxManager: There is no render volume!");
+	}
+
+	// update remapConstMem
+	{
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(mRemapConstMemGroup)
+
+		ActorIDRemapArray actorIDRemapArray;
+		_storage_.fetch(mActorIDRemapArrayHandle, actorIDRemapArray);
+		actorIDRemapArray.resize(_storage_, mNumberActorClasses);
+		for (uint32_t i = 0 ; i < mNumberActorClasses ; ++i)
+		{
+			actorIDRemapArray.updateElem(_storage_, mManager.mActorClassTable[i].actorID, i);
+		}
+		_storage_.update(mActorIDRemapArrayHandle, actorIDRemapArray);
+	}
+
+	// update modifierConstMem
+	{
+		APEX_CUDA_CONST_MEM_GROUP_SCOPE(mModifierConstMemGroup)
+
+		ClientParamsHandleArray clientParamsHandleArray;
+		_storage_.fetch(mClientParamsHandleArrayHandle, clientParamsHandleArray);
+		clientParamsHandleArray.resize(_storage_, mNumberActorClasses);
+		for (uint32_t i = 0 ; i < mNumberActorClasses ; ++i)
+		{
+			InplaceHandle<ClientParams> clientParamsHandle;
+			IofxManagerClientGPU* clientGPU = static_cast<IofxManagerClientGPU*>(mManager.mActorClassTable[i].client);
+			if (clientGPU != NULL)
+			{
+				clientParamsHandle = clientGPU->getClientParamsHandle();
+			}
+			clientParamsHandleArray.updateElem(_storage_, clientParamsHandle, i);
+		}
+		_storage_.update(mClientParamsHandleArrayHandle, clientParamsHandleArray);
+
+		if (mManager.mIsMesh)
+		{
+			MeshOutputLayout meshOutputLayout;
+
+			IosObjectGpuData* mWorkingData = DYNAMIC_CAST(IosObjectGpuData*)(mManager.mWorkingIosData);
+			IofxSharedRenderDataMeshImpl* meshRenderData = DYNAMIC_CAST(IofxSharedRenderDataMeshImpl*)(mWorkingData->renderData);
+			const IofxMeshRenderLayout& meshRenderLayout = meshRenderData->getRenderLayout();
+
+			mOutputDWords = meshRenderLayout.stride >> 2;
+			meshOutputLayout.stride = meshRenderLayout.stride;
+			::memcpy(meshOutputLayout.offsets, meshRenderLayout.offsets, sizeof(meshOutputLayout.offsets));
+
+			_storage_.update(mMeshOutputLayoutHandle, meshOutputLayout);
+		}
+		else
+		{
+			SpriteOutputLayout spriteOutputLayout;
+
+			IosObjectGpuData* mWorkingData = DYNAMIC_CAST(IosObjectGpuData*)(mManager.mWorkingIosData);
+			IofxSharedRenderDataSpriteImpl* spriteRenderData = DYNAMIC_CAST(IofxSharedRenderDataSpriteImpl*)(mWorkingData->renderData);
+			const IofxSpriteRenderLayout& spriteRenderLayout = spriteRenderData->getRenderLayout();
+			
+			mOutputDWords = spriteRenderLayout.stride >> 2;
+			spriteOutputLayout.stride = spriteRenderLayout.stride;
+			::memcpy(spriteOutputLayout.offsets, spriteRenderLayout.offsets, sizeof(spriteOutputLayout.offsets));
+
+			_storage_.update(mSpriteOutputLayoutHandle, spriteOutputLayout);
+		}
+	}
+
+}
+
+
+#pragma warning(push)
+#pragma warning(disable:4312) // conversion from 'CUdeviceptr' to 'uint32_t *' of greater size
+
+PxTaskID IofxManagerGPU::launchGpuTasks()
+{
+	PxTaskManager* tm = mIofxScene.mApexScene->getTaskManager();
+	tm->submitUnnamedTask(*mTaskLaunch, PxTaskType::TT_GPU);
+	mTaskLaunch->finishBefore(mManager.mPostUpdateTaskID);
+	return mTaskLaunch->getTaskID();
+}
+
+void IofxManagerGPU::launchPrep()
+{
+	IosObjectGpuData* mWorkingData = DYNAMIC_CAST(IosObjectGpuData*)(mManager.mWorkingIosData);
+
+	if (!mWorkingData->numParticles)
+	{
+		mEmptySimulation = true;
+		return;
+	}
+
+	mCurSeed = static_cast<uint32_t>(mIofxScene.mApexScene->getSeed());
+
+	PxTaskManager* tm = mIofxScene.mApexScene->getTaskManager();
+	PxCudaContextManager* ctx = tm->getGpuDispatcher()->getCudaContextManager();
+	{
+		PxScopedCudaLock s(*ctx);
+
+		mTargetTextureCount = 0;
+		mTargetBufDevPtr = 0;
+		if (!mManager.mIsMesh)
+		{
+			IofxSharedRenderDataSpriteImpl* spriteRenderData = DYNAMIC_CAST(IofxSharedRenderDataSpriteImpl*)(mWorkingData->renderData);
+			const IofxSpriteRenderLayout& spriteRenderLayout = spriteRenderData->getRenderLayout();
+
+			mTargetTextureCount = spriteRenderLayout.surfaceCount;
+			for( uint32_t i = 0; i < mTargetTextureCount; ++i )
+			{
+				const CUarray cuArray = spriteRenderData->getSurfaceMappedCudaArray(i);
+				if (cuArray != NULL)
+				{
+					mTargetCudaArrayList[i].assign(cuArray, false);
+				}
+				else
+				{
+					CUarray_format format = CUarray_format(0);
+					uint32_t numChannels = 0;
+					switch (spriteRenderLayout.surfaceElements[i])
+					{
+					case IofxSpriteRenderLayoutSurfaceElement::POSITION_FLOAT4:
+					case IofxSpriteRenderLayoutSurfaceElement::SCALE_ORIENT_SUBTEX_FLOAT4:
+					case IofxSpriteRenderLayoutSurfaceElement::COLOR_FLOAT4:
+						format = CU_AD_FORMAT_FLOAT;
+						numChannels = 4;
+						break;
+					case IofxSpriteRenderLayoutSurfaceElement::COLOR_RGBA8:
+					case IofxSpriteRenderLayoutSurfaceElement::COLOR_BGRA8:
+						format = CU_AD_FORMAT_UNSIGNED_INT32;
+						numChannels = 1;
+					default:
+						PX_ALWAYS_ASSERT();
+						break;
+					}
+					const UserRenderSurfaceDesc& desc = spriteRenderLayout.surfaceDescs[i];
+					mTargetCudaArrayList[i].create(format, numChannels, uint32_t(desc.width), uint32_t(desc.height), 0, true);
+				}
+			}
+			for( uint32_t i = mTargetTextureCount; i < IofxSpriteRenderLayout::MAX_SURFACE_COUNT; ++i ) {
+				mTargetCudaArrayList[i].release();
+			}
+		}
+
+		if (mTargetTextureCount == 0)
+		{
+			const CUdeviceptr cudaPtr = mWorkingData->renderData->getBufferMappedCudaPtr();
+			if (cudaPtr != 0)
+			{
+				mTargetOutputBuffer.release();
+				mTargetBufDevPtr = reinterpret_cast<uint32_t*>(cudaPtr);
+			}
+			else
+			{
+				const size_t size = mWorkingData->renderData->getRenderBufferSize();
+				if (size > 0)
+				{
+					mTargetOutputBuffer.realloc(size, ctx);
+					mTargetBufDevPtr = static_cast<uint32_t*>( mTargetOutputBuffer.getGpuPtr() );
+				}
+			}
+		}
+	}
+
+	const uint32_t numActorIDValues = mCountActorIDs + 2;
+	mCuActorStart.setSize(numActorIDValues, ApexMirroredPlace::CPU_GPU);
+	mCuActorEnd.setSize(numActorIDValues, ApexMirroredPlace::CPU_GPU);
+	mCuActorVisibleEnd.setSize(numActorIDValues, ApexMirroredPlace::CPU_GPU);
+	mCuMinBounds.setSize(numActorIDValues, ApexMirroredPlace::CPU_GPU);
+	mCuMaxBounds.setSize(numActorIDValues, ApexMirroredPlace::CPU_GPU);
+
+	mCuSortedActorIDs.setSize(mWorkingData->maxStateID, mDefaultPlace);
+	mCuSortedStateIDs.setSize(mWorkingData->maxStateID, mDefaultPlace);
+
+	mManager.positionMass.setSize(mWorkingData->maxInputID, ApexMirroredPlace::CPU_GPU);
+	mManager.velocityLife.setSize(mWorkingData->maxInputID, ApexMirroredPlace::CPU_GPU);
+	mManager.actorIdentifiers.setSize(mWorkingData->maxInputID, ApexMirroredPlace::CPU_GPU);
+	mManager.inStateToInput.setSize(mWorkingData->maxStateID, ApexMirroredPlace::CPU_GPU);
+	mManager.outStateToInput.setSize(mWorkingData->numParticles, ApexMirroredPlace::CPU_GPU);
+	if (mWorkingData->iosSupportsCollision)
+	{
+		mManager.collisionNormalFlags.setSize(mWorkingData->maxInputID, ApexMirroredPlace::CPU_GPU);
+	}
+	if (mWorkingData->iosSupportsDensity)
+	{
+		mManager.density.setSize(mWorkingData->maxInputID, ApexMirroredPlace::CPU_GPU);
+	}
+	if (mWorkingData->iosSupportsUserData)
+	{
+		mManager.userData.setSize(mWorkingData->maxInputID, ApexMirroredPlace::CPU_GPU);
+	}
+
+	mEmptySimulation = false;
+}
+
+#pragma warning(pop)
+
+
+///
+PX_INLINE uint32_t getHighestBitShift(uint32_t x)
+{
+	PX_ASSERT(isPowerOfTwo(x));
+	return highestSetBit(x);
+}
+
+void IofxManagerGPU::cudaLaunchRadixSort(CUstream stream, unsigned int numElements, unsigned int keyBits, unsigned int startBit, bool useSyncKernels)
+{
+	if (useSyncKernels)
+	{
+		//we use OLD Radix Sort on Tesla (SM < 2), because it is faster
+		CUDA_OBJ(radixSortSyncKernel)(
+			stream, numElements,
+			mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+			mCuSortTempKeys.getGpuPtr(), mCuSortTempValues.getGpuPtr(),
+			mCuSortTemp.getGpuPtr(), keyBits, startBit
+		);
+	}
+	else
+	{
+#if 1
+		//NEW Radix Sort
+		unsigned int totalThreads = (numElements + NEW_SORT_VECTOR_SIZE - 1) / NEW_SORT_VECTOR_SIZE;
+		if (CUDA_OBJ(newRadixSortBlockKernel).isSingleBlock(totalThreads))
+		{
+			//launch just a single block for small sizes
+			CUDA_OBJ(newRadixSortBlockKernel)(
+				stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH,
+				numElements, keyBits, startBit,
+				mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr()
+			);
+		}
+		else
+		{
+			for (unsigned int bit = startBit; bit < startBit + keyBits; bit += RADIX_SORT_NBITS)
+			{
+				uint32_t gridSize = 
+					CUDA_OBJ(newRadixSortStepKernel)(
+						stream, totalThreads,
+						numElements, bit,
+						mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+						mCuSortTempKeys.getGpuPtr(), mCuSortTempValues.getGpuPtr(),
+						mCuSortTemp.getGpuPtr(),
+						1, 0
+					);
+
+				//launch just a single block
+				CUDA_OBJ(newRadixSortStepKernel)(
+					stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH,
+					numElements, bit,
+					mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+					mCuSortTempKeys.getGpuPtr(), mCuSortTempValues.getGpuPtr(),
+					mCuSortTemp.getGpuPtr(),
+					2, gridSize
+				);
+
+				CUDA_OBJ(newRadixSortStepKernel)(
+					stream, totalThreads,
+					numElements, bit,
+					mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+					mCuSortTempKeys.getGpuPtr(), mCuSortTempValues.getGpuPtr(),
+					mCuSortTemp.getGpuPtr(),
+					3, 0
+				);
+
+				mCuSortedActorIDs.swapGpuPtr(mCuSortTempKeys);
+				mCuSortedStateIDs.swapGpuPtr(mCuSortTempValues);
+			}
+		}
+#else
+		//OLD Radix Sort
+		for (unsigned int startBit = 0; startBit < keyBits; startBit += RADIX_SORT_NBITS)
+		{
+			int gridSize = 
+				CUDA_OBJ(radixSortStep1Kernel)(
+					stream, numElements,
+					mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+					mCuSortTempKeys.getGpuPtr(), mCuSortTempValues.getGpuPtr(),
+					mCuSortTemp.getGpuPtr(), startBit
+				);
+
+			//launch just 1 block
+			CUDA_OBJ(radixSortStep2Kernel)(
+				stream, CUDA_OBJ(radixSortStep2Kernel).getBlockDim().x,
+				mCuSortTemp.getGpuPtr(), gridSize
+			);
+
+			CUDA_OBJ(radixSortStep3Kernel)(
+				stream, numElements,
+				mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+				mCuSortTempKeys.getGpuPtr(), mCuSortTempValues.getGpuPtr(),
+				mCuSortTemp.getGpuPtr(), startBit
+			);
+		}
+#endif
+	}
+}
+
+bool IofxManagerGPU::cudaLaunch(CUstream stream, int kernelIndex)
+{
+	PxTaskManager* tm = mIofxScene.mApexScene->getTaskManager();
+
+	if (mEmptySimulation)
+	{
+		return false;
+	}
+
+	const uint32_t numActorIDValues = mCountActorIDs + 2;
+	//value <  mCountActorIDs     - valid particle with volume
+	//value == mCountActorIDs     - homeless particle (no volume or invalid actor class)
+	//value == mCountActorIDs + 1 - NOT_A_PARTICLE
+
+
+	IofxSceneGPU* sceneGPU = static_cast<IofxSceneGPU*>(&mIofxScene);
+	bool useSyncKernels = !sceneGPU->getGpuDispatcher()->getCudaContextManager()->supportsArchSM20();
+
+	IosObjectGpuData* mWorkingData = DYNAMIC_CAST(IosObjectGpuData*)(mManager.mWorkingIosData);
+
+	switch (kernelIndex)
+	{
+	case 0:
+		if (mManager.mOnStartCallback)
+		{
+			(*mManager.mOnStartCallback)(stream);
+		}
+		mCopyQueue.reset(stream, 24);
+		if (!mManager.mCudaIos && mWorkingData->maxInputID > 0)
+		{
+			mManager.positionMass.copyHostToDeviceQ(mCopyQueue);
+			mManager.velocityLife.copyHostToDeviceQ(mCopyQueue);
+			mManager.actorIdentifiers.copyHostToDeviceQ(mCopyQueue);
+			mManager.inStateToInput.copyHostToDeviceQ(mCopyQueue);
+			if (mWorkingData->iosSupportsCollision)
+			{
+				mManager.collisionNormalFlags.copyHostToDeviceQ(mCopyQueue);
+			}
+			if (mWorkingData->iosSupportsDensity)
+			{
+				mManager.density.copyHostToDeviceQ(mCopyQueue);
+			}
+			if (mWorkingData->iosSupportsUserData)
+			{
+				mManager.userData.copyHostToDeviceQ(mCopyQueue);
+			}
+			mCopyQueue.flushEnqueued();
+		}
+		break;
+
+	case 1:
+		/* Volume Migration (input space) */
+		CUDA_OBJ(volumeMigrationKernel)(stream,
+		                                PxMax(mWorkingData->maxInputID, numActorIDValues),
+										mVolumeConstMemGroup.getStorage().mappedHandle(mVolumeParamsArrayHandle),
+		                                mVolumeConstMemGroup.getStorage().mappedHandle(mActorClassIDBitmapArrayHandle),
+		                                mNumberActorClasses, mNumberVolumes, numActorIDValues,
+		                                mManager.actorIdentifiers.getGpuPtr(), mWorkingData->maxInputID,
+		                                (const float4*)mManager.positionMass.getGpuPtr(),
+		                                mCuActorStart.getGpuPtr(), mCuActorEnd.getGpuPtr(), mCuActorVisibleEnd.getGpuPtr()
+		                               );
+		break;
+
+	case 2:
+		{
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefRemapPositions,      mManager.positionMass)
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefRemapActorIDs,       mManager.actorIdentifiers)
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefRemapInStateToInput, mManager.inStateToInput)
+
+			/* if mDistanceSortingEnabled, sort on camera distance first, else directly make ActorID keys */
+			CUDA_OBJ(makeSortKeys)(stream, mWorkingData->maxStateID,
+								   mManager.inStateToInput.getGpuPtr(), mWorkingData->maxInputID,
+								   mManager.mActorTable.size(), mCountActorIDs,
+								   mRemapConstMemGroup.getStorage().mappedHandle(mActorIDRemapArrayHandle),
+								   (const float4*)mManager.positionMass.getGpuPtr(), mManager.mDistanceSortingEnabled,
+								   mWorkingData->eyePosition, mWorkingData->eyeDirection, mWorkingData->zNear,
+								   mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr());
+
+			if (mManager.mDistanceSortingEnabled)
+			{
+				cudaLaunchRadixSort(stream, mWorkingData->maxStateID, 32, 0, useSyncKernels);
+
+				/* Generate ActorID sort keys, using distance sorted stateID values */
+				CUDA_OBJ(remapKernel)(stream, mWorkingData->maxStateID,
+									  mManager.inStateToInput.getGpuPtr(), mWorkingData->maxInputID,
+									  mManager.mActorTable.size(), mCountActorIDs,
+									  mRemapConstMemGroup.getStorage().mappedHandle(mActorIDRemapArrayHandle),
+									  mCuSortedStateIDs.getGpuPtr(), mCuSortedActorIDs.getGpuPtr());
+			}
+		}
+		break;
+
+	case 3:
+		/* ActorID Sort (output state space) */
+		// input: mCuSortedActorIDs == actorIDs, in distance sorted order
+		// input: mCuSortedStateIDs == stateIDs, in distance sorted order
+
+		// output: mCuSortedActorIDs == sorted ActorIDs
+		// output: mCuSortedStateIDs == output-to-input state
+		{
+			//SortedActorIDs could contain values from 0 to mCountActorIDs + 1 (included),
+			//so keybits should cover at least mCountActorIDs + 2 numbers
+			uint32_t keybits = 0;
+			while ((1U << keybits) < numActorIDValues)
+			{
+				++keybits;
+			}
+
+			cudaLaunchRadixSort(stream, mWorkingData->maxStateID, keybits, 0, useSyncKernels);
+		}
+		break;
+
+	case 4:
+		/* Per-IOFX actor particle range detection */
+		CUDA_OBJ(actorRangeKernel)(stream, mWorkingData->maxStateID,
+		                           mCuSortedActorIDs.getGpuPtr(), mCountActorIDs,
+		                           mCuActorStart.getGpuPtr(), mCuActorEnd.getGpuPtr(), mCuActorVisibleEnd.getGpuPtr(),
+								   mCuSortedStateIDs.getGpuPtr()
+		                          );
+		break;
+
+	case 5:
+		/* Modifiers (output state space) */
+		{
+			PX_PROFILE_ZONE("IofxManagerGPUModifiers", GetInternalApexSDK()->getContextId());
+			ModifierCommonParams commonParams = mWorkingData->getCommonParams();
+
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefPositionMass,     mManager.positionMass)
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefVelocityLife,     mManager.velocityLife)
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefInStateToInput,   mManager.inStateToInput)
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefStateSpawnSeed,   mCuSpawnSeed)
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefStateSpawnScale,  mCuSpawnScale)
+
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefActorIDs,         mManager.actorIdentifiers)
+
+			if (mWorkingData->iosSupportsCollision)
+			{
+				CUDA_OBJ(texRefCollisionNormalFlags).bindTo(mManager.collisionNormalFlags);
+			}
+			if (mWorkingData->iosSupportsDensity)
+			{
+				CUDA_OBJ(texRefDensity).bindTo(mManager.density);
+			}
+			if (mWorkingData->iosSupportsUserData)
+			{
+				CUDA_OBJ(texRefUserData).bindTo(mManager.userData);
+			}
+
+			PRNGInfo rand;
+			rand.g_stateSpawnSeed = mCuSpawnSeed.getGpuPtr();
+			rand.g_randBlock = mCuBlockPRNGs.getGpuPtr();
+			rand.randGrid = mRandGridLeap;
+			rand.randThread = mRandThreadLeap;
+			rand.seed = mCurSeed;
+
+			if (mManager.mIsMesh)
+			{
+				// 3x3 matrix => 9 float scalars => 3 slices
+
+				APEX_CUDA_TEXTURE_SCOPE_BIND(texRefMeshPrivState0, *mManager.privState.slices[0]);
+				APEX_CUDA_TEXTURE_SCOPE_BIND(texRefMeshPrivState1, *mManager.privState.slices[1]);
+				APEX_CUDA_TEXTURE_SCOPE_BIND(texRefMeshPrivState2, *mManager.privState.slices[2]);
+
+				MeshPrivateStateArgs meshPrivStateArgs;
+				meshPrivStateArgs.g_state[0] = mManager.privState.a[0];
+				meshPrivStateArgs.g_state[1] = mManager.privState.a[1];
+				meshPrivStateArgs.g_state[2] = mManager.privState.a[2];
+
+				CUDA_OBJ(meshModifiersKernel)(ApexKernelConfig(MAX_SMEM_BANKS * mOutputDWords, WARP_SIZE * PxMax<uint32_t>(mOutputDWords, 4)), 
+											  stream, mWorkingData->numParticles,
+											  mManager.mInStateOffset, mManager.mOutStateOffset,
+											  mModifierConstMemGroup.getStorage().mappedHandle(mClientParamsHandleArrayHandle),
+											  commonParams,
+											  mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+											  mManager.outStateToInput.getGpuPtr(),
+											  meshPrivStateArgs, mCuSpawnScale.getGpuPtr(),
+											  rand, mTargetBufDevPtr,
+											  mModifierConstMemGroup.getStorage().mappedHandle(mMeshOutputLayoutHandle)
+											 );
+			}
+			else
+			{
+				// 1 float scalar => 1 slice
+
+				APEX_CUDA_TEXTURE_SCOPE_BIND(texRefSpritePrivState0, *mManager.privState.slices[0]);
+
+				SpritePrivateStateArgs spritePrivStateArgs;
+				spritePrivStateArgs.g_state[0] = mManager.privState.a[0];
+
+				IofxSharedRenderDataSpriteImpl* renderDataSprite = static_cast<IofxSharedRenderDataSpriteImpl*>(mWorkingData->renderData);
+				const IofxSpriteRenderLayout& spriteRenderLayout = renderDataSprite->getRenderLayout();
+
+				if (mTargetTextureCount > 0)
+				{
+					SpriteTextureOutputLayout outputLayout;
+					outputLayout.textureCount = mTargetTextureCount;
+					for (uint32_t i = 0; i < outputLayout.textureCount; ++i)
+					{
+						outputLayout.textureData[i].layout = static_cast<uint16_t>(spriteRenderLayout.surfaceElements[i]);
+
+						uint32_t width = mTargetCudaArrayList[i].getWidth();
+						//width should be a power of 2 and a multiply of WARP_SIZE
+						PX_ASSERT(isPowerOfTwo(width));
+						PX_ASSERT((width & (WARP_SIZE - 1)) == 0);
+						outputLayout.textureData[i].widthShift = static_cast<uint8_t>(highestSetBit(width));
+
+						outputLayout.textureData[i].pitchShift = 0; //unused in GPU mode!
+						outputLayout.texturePtr[i] = NULL; //unused in GPU mode!
+					}
+
+					if (0 < outputLayout.textureCount) APEX_CUDA_SURFACE_BIND(surfRefOutput0, mTargetCudaArrayList[0], ApexCudaMemFlags::OUT);
+					if (1 < outputLayout.textureCount) APEX_CUDA_SURFACE_BIND(surfRefOutput1, mTargetCudaArrayList[1], ApexCudaMemFlags::OUT);
+					if (2 < outputLayout.textureCount) APEX_CUDA_SURFACE_BIND(surfRefOutput2, mTargetCudaArrayList[2], ApexCudaMemFlags::OUT);
+					if (3 < outputLayout.textureCount) APEX_CUDA_SURFACE_BIND(surfRefOutput3, mTargetCudaArrayList[3], ApexCudaMemFlags::OUT);
+
+					CUDA_OBJ(spriteTextureModifiersKernel)(stream, mWorkingData->numParticles,
+														   mManager.mInStateOffset, mManager.mOutStateOffset,
+														   mModifierConstMemGroup.getStorage().mappedHandle(mClientParamsHandleArrayHandle),
+														   commonParams,
+														   mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+														   mManager.outStateToInput.getGpuPtr(),
+														   spritePrivStateArgs, mCuSpawnScale.getGpuPtr(),
+														   rand, outputLayout
+														  );
+
+					if (0 < outputLayout.textureCount) APEX_CUDA_SURFACE_UNBIND(surfRefOutput0);
+					if (1 < outputLayout.textureCount) APEX_CUDA_SURFACE_UNBIND(surfRefOutput1);
+					if (2 < outputLayout.textureCount) APEX_CUDA_SURFACE_UNBIND(surfRefOutput2);
+					if (3 < outputLayout.textureCount) APEX_CUDA_SURFACE_UNBIND(surfRefOutput3);
+				}
+				else
+				{
+					CUDA_OBJ(spriteModifiersKernel)(ApexKernelConfig(MAX_SMEM_BANKS * mOutputDWords, WARP_SIZE * PxMax<uint32_t>(mOutputDWords, 4)),
+													stream, mWorkingData->numParticles,
+													mManager.mInStateOffset, mManager.mOutStateOffset,
+													mModifierConstMemGroup.getStorage().mappedHandle(mClientParamsHandleArrayHandle),
+													commonParams,
+													mCuSortedActorIDs.getGpuPtr(), mCuSortedStateIDs.getGpuPtr(),
+													mManager.outStateToInput.getGpuPtr(),
+													spritePrivStateArgs, mCuSpawnScale.getGpuPtr(),
+													rand, mTargetBufDevPtr,
+													mModifierConstMemGroup.getStorage().mappedHandle(mSpriteOutputLayoutHandle)
+												   );
+				}
+			}
+
+			if (mWorkingData->iosSupportsCollision)
+			{
+				CUDA_OBJ(texRefCollisionNormalFlags).unbind();
+			}
+			if (mWorkingData->iosSupportsDensity)
+			{
+				CUDA_OBJ(texRefDensity).unbind();
+			}
+			if (mWorkingData->iosSupportsUserData)
+			{
+				CUDA_OBJ(texRefUserData).unbind();
+			}
+		}
+		break;
+
+	case 6:
+		if (mCountActorIDs > 0)
+		{
+			/* Per-IOFX actor BBox generation */
+			APEX_CUDA_TEXTURE_SCOPE_BIND(texRefBBoxPositions, mManager.positionMass)
+
+			if (useSyncKernels)
+			{
+				CUDA_OBJ(bboxSyncKernel)(
+					stream, mWorkingData->numParticles,
+					mCuSortedActorIDs.getGpuPtr(),
+					mManager.outStateToInput.getGpuPtr(),
+					(const float4*)mManager.positionMass.getGpuPtr(),
+					(float4*)mCuMinBounds.getGpuPtr(), (float4*)mCuMaxBounds.getGpuPtr(),
+					mCuTempActorIDs.getGpuPtr(),
+					(float4*)mCuTempMinBounds.getGpuPtr(), (float4*)mCuTempMaxBounds.getGpuPtr()
+				);
+			}
+			else
+			{
+				uint32_t bboxGridSize =
+					CUDA_OBJ(bboxKernel)(
+						stream, mWorkingData->numParticles,
+						mCuSortedActorIDs.getGpuPtr(),
+						mManager.outStateToInput.getGpuPtr(),
+						(const float4*)mManager.positionMass.getGpuPtr(),
+						(float4*)mCuMinBounds.getGpuPtr(), (float4*)mCuMaxBounds.getGpuPtr(),
+						mCuTempActorIDs.getGpuPtr(),
+						(float4*)mCuTempMinBounds.getGpuPtr(), (float4*)mCuTempMaxBounds.getGpuPtr(),
+						1, 0
+					);
+
+				CUDA_OBJ(bboxKernel)(
+					stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH,
+					mCuSortedActorIDs.getGpuPtr(),
+					mManager.outStateToInput.getGpuPtr(),
+					(const float4*)mManager.positionMass.getGpuPtr(),
+					(float4*)mCuMinBounds.getGpuPtr(), (float4*)mCuMaxBounds.getGpuPtr(),
+					mCuTempActorIDs.getGpuPtr(),
+					(float4*)mCuTempMinBounds.getGpuPtr(), (float4*)mCuTempMaxBounds.getGpuPtr(),
+					2, bboxGridSize
+				);
+			}
+		}
+		break;
+
+	case 7:
+		if (mTargetTextureCount > 0)
+		{
+			IofxSharedRenderDataSpriteImpl* spriteRenderData = DYNAMIC_CAST(IofxSharedRenderDataSpriteImpl*)(mWorkingData->renderData);
+			PX_ASSERT(spriteRenderData->getRenderLayout().surfaceCount == mTargetTextureCount);
+
+			for (uint32_t i = 0; i < mTargetTextureCount; ++i)
+			{
+				UserRenderSurface::MappedInfo mappedInfo;
+				if (spriteRenderData->getSurfaceMappedInfo(i, mappedInfo))
+				{
+					const size_t surfaceWidth = spriteRenderData->getRenderLayout().surfaceDescs[i].width;
+					size_t copyHeight = (mWorkingData->numParticles + surfaceWidth - 1) / surfaceWidth;
+					mTargetCudaArrayList[i].copyToHost(stream, mappedInfo.pData, mappedInfo.rowPitch, 0, 0, copyHeight);
+				}
+			}
+		}
+		else
+		{
+			void* mappedPtr = mWorkingData->renderData->getBufferMappedPtr();
+			if (mappedPtr)
+			{
+				size_t size = (mOutputDWords << 2) * mWorkingData->numParticles;
+				mTargetOutputBuffer.copyToHost(stream, mappedPtr, size);
+			}
+		}
+		if (mCountActorIDs > 0)
+		{
+			mCuMinBounds.copyDeviceToHostQ(mCopyQueue);
+			mCuMaxBounds.copyDeviceToHostQ(mCopyQueue);
+		}
+		mCuActorStart.copyDeviceToHostQ(mCopyQueue);
+		mCuActorEnd.copyDeviceToHostQ(mCopyQueue);
+		mCuActorVisibleEnd.copyDeviceToHostQ(mCopyQueue);
+
+
+		if (mCuSortedActorIDs.cpuPtrIsValid())
+		{
+			mManager.inStateToInput.copyDeviceToHostQ(mCopyQueue);
+			mManager.actorIdentifiers.copyDeviceToHostQ(mCopyQueue);
+			mManager.outStateToInput.copyDeviceToHostQ(mCopyQueue);
+			mManager.positionMass.copyDeviceToHostQ(mCopyQueue);
+
+			mCuSortedActorIDs.copyDeviceToHostQ(mCopyQueue);
+			mCuSortedStateIDs.copyDeviceToHostQ(mCopyQueue);
+		}
+		else if (!mManager.mCudaIos)
+		{
+			mManager.actorIdentifiers.copyDeviceToHostQ(mCopyQueue);
+			mManager.outStateToInput.copyDeviceToHostQ(mCopyQueue);
+		}
+
+		mCopyQueue.flushEnqueued();
+
+		if (mManager.mOnFinishCallback)
+		{
+			(*mManager.mOnFinishCallback)(stream);
+		}
+
+		tm->getGpuDispatcher()->addCompletionPrereq(*tm->getTaskFromID(mManager.mPostUpdateTaskID));
+		return false;
+
+	default:
+		PX_ALWAYS_ASSERT();
+		return false;
+	}
+
+	return true;
+}
+
+void IofxManagerGPU::fetchResults()
+{
+	IosObjectGpuData* mWorkingData = DYNAMIC_CAST(IosObjectGpuData*)(mManager.mWorkingIosData);
+	PX_UNUSED(mWorkingData);
+
+#if 0
+	{
+		ApexMirroredArray<uint32_t> actorID(*mIofxScene.mApexScene);
+		ApexMirroredArray<PxVec4> outMinBounds(*mIofxScene.mApexScene);
+		ApexMirroredArray<PxVec4> outMaxBounds(*mIofxScene.mApexScene);
+		ApexMirroredArray<PxVec4> outDebugInfo(*mIofxScene.mApexScene);
+		ApexMirroredArray<uint32_t> tmpLastActorID(*mIofxScene.mApexScene);
+		tmpLastActorID.setSize(64, ApexMirroredPlace::CPU_GPU);
+
+		const uint32_t NE = 2000;
+		actorID.setSize(NE, ApexMirroredPlace::CPU_GPU);
+
+		Array<uint32_t> actorCounts;
+		actorCounts.reserve(1000);
+
+		uint32_t NA = 0;
+		for (uint32_t ie = 0; ie < NE; ++NA)
+		{
+			uint32_t num_ie = rand(1, 100); // We need to use QDSRand here s.t. seed could be preset during tests!
+			uint32_t next_ie = PxMin(ie + num_ie, NE);
+
+			actorCounts.pushBack(next_ie - ie);
+
+			for (; ie < next_ie; ++ie)
+			{
+				actorID[ie] = NA;
+			}
+		}
+		outMinBounds.setSize(NA, ApexMirroredPlace::CPU_GPU);
+		outMaxBounds.setSize(NA, ApexMirroredPlace::CPU_GPU);
+		outDebugInfo.setSize(NA, ApexMirroredPlace::CPU_GPU);
+
+		for (uint32_t ia = 0; ia < NA; ++ia)
+		{
+			outMinBounds[ia].setZero();
+			outMaxBounds[ia].setZero();
+		}
+
+		PxTaskManager* tm = mIofxScene.mApexScene->getTaskManager();
+		PxCudaContextManager* ctx = tm->getGpuDispatcher()->getCudaContextManager();
+		PxScopedCudaLock s(*ctx);
+
+		mCopyQueue.reset(0, 4);
+
+		actorID.copyHostToDeviceQ(mCopyQueue);
+		outMinBounds.copyHostToDeviceQ(mCopyQueue);
+		outMaxBounds.copyHostToDeviceQ(mCopyQueue);
+		mCopyQueue.flushEnqueued();
+
+		CUDA_OBJ(bboxKernel2)(0, NE, actorID.getGpuPtr(), NULL, 0, (float4*)outDebugInfo.getGpuPtr(), (float4*)outMinBounds.getGpuPtr(), (float4*)outMaxBounds.getGpuPtr()/*, tmpLastActorID.getGpuPtr()*/);
+
+		outMinBounds.copyDeviceToHostQ(mCopyQueue);
+		outMaxBounds.copyDeviceToHostQ(mCopyQueue);
+		outDebugInfo.copyDeviceToHostQ(mCopyQueue);
+		tmpLastActorID.copyDeviceToHostQ(mCopyQueue);
+		mCopyQueue.flushEnqueued();
+
+		CUT_SAFE_CALL(cuCtxSynchronize());
+
+		uint32_t errors = 0;
+		float totCount = 0;
+		for (uint32_t ie = 0; ie < NE; ++ie)
+		{
+			uint32_t id = actorID[ie];
+			if (ie == 0 || actorID[ie - 1] != id)
+			{
+				uint32_t count = actorCounts[id];
+				const PxVec4& bounds = outMinBounds[id];
+				if (bounds.x != count)
+				{
+					++errors;
+				}
+				if (bounds.y != count * 2)
+				{
+					++errors;
+				}
+				if (bounds.z != count * 3)
+				{
+					++errors;
+				}
+				totCount += count;
+			}
+		}
+
+	}
+#endif
+
+#if 0
+	{
+		PxTaskManager* tm = mIofxScene.mApexScene->getTaskManager();
+		PxCudaContextManager* ctx = tm->getGpuDispatcher()->getCudaContextManager();
+
+		PxScopedCudaLock s(*ctx);
+
+		CUT_SAFE_CALL(cuCtxSynchronize());
+	}
+#endif
+#if DEBUG_GPU
+	{
+		nvidia::Array<int> valuesCounters(mWorkingData->maxStateID, 0);
+		uint32_t lastKey = uint32_t(-1);
+		for (uint32_t i = 0; i < mWorkingData->maxStateID; ++i)
+		{
+			uint32_t currKey = mCuSortedActorIDs.get(i);
+			PX_ASSERT(currKey < mCountActorIDs + 2);
+			if (lastKey != uint32_t(-1))
+			{
+				PX_ASSERT(lastKey <= currKey);
+			}
+			if (lastKey != currKey)
+			{
+				if (mCuActorStart[currKey] != i)
+				{
+					int temp = 0;
+					temp++;
+				}
+				PX_ASSERT(mCuActorStart[currKey] == i);
+				if (lastKey != uint32_t(-1))
+				{
+					if (mCuActorEnd[lastKey] != i)
+					{
+						int temp = 0;
+						temp++;
+					}
+					PX_ASSERT(mCuActorEnd[lastKey] == i);
+				}
+			}
+			lastKey = currKey;
+
+			uint32_t currValue = (mCuSortedStateIDs.get(i) & STATE_ID_MASK);
+			PX_ASSERT(currValue < mWorkingData->maxStateID);
+			if (currValue < mWorkingData->maxStateID)
+			{
+				valuesCounters[currValue] += 1;
+			}
+		}
+		if (lastKey != uint32_t(-1))
+		{
+			PX_ASSERT(mCuActorEnd[lastKey] == mWorkingData->maxStateID);
+		}
+		for (uint32_t i = 0; i < mWorkingData->maxStateID; ++i)
+		{
+			PX_ASSERT(valuesCounters[i] == 1);
+		}
+	}
+#endif
+
+	/* Swap input/output state offsets */
+	mManager.swapStates();
+
+	if (mEmptySimulation)
+	{
+		for (uint32_t i = 0 ; i < mNumberVolumes ; i++)
+		{
+			IofxManager::VolumeData& d = mManager.mVolumeTable[ i ];
+			if (d.vol == 0)
+			{
+				continue;
+			}
+
+			for (uint32_t j = 0 ; j < mManager.mActorTable.size() ; j++)
+			{
+				IofxActorImpl* iofx = d.mActors[ j ];
+				if (iofx && iofx != DEFERRED_IOFX_ACTOR)
+				{
+					iofx->mResultBounds.setEmpty();
+					iofx->mResultRange.startIndex = 0;
+					iofx->mResultRange.objectCount = 0;
+					iofx->mResultVisibleCount = 0;
+				}
+			}
+		}
+	}
+	else
+	{
+		PX_ASSERT(mCuActorStart.cpuPtrIsValid() && mCuActorEnd.cpuPtrIsValid());
+		if (!mCuActorStart.cpuPtrIsValid() || !mCuActorEnd.cpuPtrIsValid())
+		{
+			// Workaround for issue seen by a customer
+			APEX_INTERNAL_ERROR("Bad cpuPtr in IofxManagerGPU::fetchResults");
+			return;
+		}
+#ifndef NDEBUG
+		//check Actor Ranges
+		{
+			uint32_t totalCount = 0;
+			//range with the last index (= mCountActorIDs) contains homeless particles!
+			for (uint32_t i = 0 ; i <= mCountActorIDs ; i++)
+			{
+				const uint32_t rangeStart = mCuActorStart[ i ];
+				const uint32_t rangeEnd = mCuActorEnd[ i ];
+				const uint32_t rangeVisibleEnd = mCuActorVisibleEnd[ i ];
+
+				PX_ASSERT(rangeStart < mWorkingData->numParticles);
+				PX_ASSERT(rangeEnd <= mWorkingData->numParticles);
+				PX_ASSERT(rangeStart <= rangeEnd);
+				PX_ASSERT(rangeStart <= rangeVisibleEnd && rangeVisibleEnd <= rangeEnd);
+				PX_UNUSED(rangeVisibleEnd);
+
+				const uint32_t rangeCount = rangeEnd - rangeStart;
+				totalCount += rangeCount;
+			}
+			PX_ASSERT(totalCount == mWorkingData->numParticles);
+		}
+#endif
+
+		uint32_t aid = 0;
+		for (uint32_t i = 0 ; i < mNumberVolumes ; i++)
+		{
+			IofxManager::VolumeData& d = mManager.mVolumeTable[ i ];
+			if (d.vol == 0)
+			{
+				aid += mManager.mActorTable.size();
+				continue;
+			}
+
+			for (uint32_t j = 0 ; j < mManager.mActorTable.size() ; j++)
+			{
+				const uint32_t rangeStart = mCuActorStart[ aid ];
+				const uint32_t rangeEnd = mCuActorEnd[ aid ];
+				const uint32_t rangeVisibleEnd = mCuActorVisibleEnd[ aid ];
+
+				const uint32_t rangeCount = rangeEnd - rangeStart;
+				const uint32_t visibleCount = rangeVisibleEnd - rangeStart;
+
+				if (d.mActors[ j ] == DEFERRED_IOFX_ACTOR && mManager.mActorTable[ j ] != NULL &&
+				        (mIofxScene.mModule->mDeferredDisabled || rangeCount))
+				{
+					IofxActorImpl* iofxActor = PX_NEW(IofxActorGPU)(mManager.mActorTable[j]->getRenderResID(), &mIofxScene, mManager);
+					if (d.vol->addIofxActor(*iofxActor))
+					{
+						d.mActors[ j ] = iofxActor;
+
+						mManager.initIofxActor(iofxActor, j, d.vol);
+
+						// lock this renderable because the APEX scene will unlock it after this method is called
+						iofxActor->renderDataLock();
+					}
+					else
+					{
+						iofxActor->release();
+					}
+				}
+
+				IofxActorImpl* iofxActor = d.mActors[ j ];
+				if (iofxActor && iofxActor != DEFERRED_IOFX_ACTOR)
+				{
+					iofxActor->mResultBounds.setEmpty();
+					if (rangeCount > 0)
+					{
+						iofxActor->mResultBounds.minimum = mCuMinBounds[ aid ].getXYZ();
+						iofxActor->mResultBounds.maximum = mCuMaxBounds[ aid ].getXYZ();
+					}
+					PX_ASSERT(iofxActor->mRenderBounds.isFinite());
+					iofxActor->mResultRange.startIndex = rangeStart;
+					iofxActor->mResultRange.objectCount = rangeCount;
+					iofxActor->mResultVisibleCount = visibleCount;
+				}
+
+				aid++;
+			}
+		}
+	}
+
+}
+
+
+/**
+ * Called from render thread context, just before renderer calls update/dispatch on any IOFX
+ * actors.  Map/Unmap render resources as required.  "Mapped" means the graphics buffer has been
+ * mapped into our CUDA context where our kernels can write directly into it.
+ */
+void IofxManager::fillMapUnmapArraysForInterop(nvidia::Array<CUgraphicsResource> &toMapArray, nvidia::Array<CUgraphicsResource> &toUnmapArray)
+{
+	if (mInteropFlags == RenderInteropFlags::CUDA_INTEROP)
+	{
+		mResultIosData->renderData->fillMapUnmapArraysForInterop(toMapArray, toUnmapArray);
+		mStagingIosData->renderData->fillMapUnmapArraysForInterop(toMapArray, toUnmapArray);
+	}
+}
+
+
+void IofxManager::mapBufferResultsForInterop(bool mapSuccess, bool unmapSuccess)
+{
+	if (mInteropFlags == RenderInteropFlags::CUDA_INTEROP)
+	{
+		mResultIosData->renderData->mapBufferResultsForInterop(mapSuccess, unmapSuccess);
+		mStagingIosData->renderData->mapBufferResultsForInterop(mapSuccess, unmapSuccess);
+	}
+}
+
+}
+} // namespace nvidia
+
+#endif
author	git perforce import user <a@b>	2016-10-25 12:29:14 -0600
committer	Sheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>	2016-10-25 18:56:37 -0500
commit	3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
tree	fa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/module/iofx/src/IofxManagerGPU.cpp
download	physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip