diff options
| author | sschirm <[email protected]> | 2016-12-23 14:20:36 +0100 |
|---|---|---|
| committer | sschirm <[email protected]> | 2016-12-23 14:56:17 +0100 |
| commit | ef6937e69e8ee3f409cf9d460d5ad300a65d5924 (patch) | |
| tree | 710426e8daa605551ce3f34b581897011101c30f /APEX_1.4/module/pxparticleios/src/ParticleIosActorGPU.cpp | |
| parent | Initial commit: (diff) | |
| download | physx-3.4-ef6937e69e8ee3f409cf9d460d5ad300a65d5924.tar.xz physx-3.4-ef6937e69e8ee3f409cf9d460d5ad300a65d5924.zip | |
PhysX 3.4 / APEX 1.4 release candidate @21506124
Diffstat (limited to 'APEX_1.4/module/pxparticleios/src/ParticleIosActorGPU.cpp')
| -rw-r--r-- | APEX_1.4/module/pxparticleios/src/ParticleIosActorGPU.cpp | 993 |
1 files changed, 0 insertions, 993 deletions
diff --git a/APEX_1.4/module/pxparticleios/src/ParticleIosActorGPU.cpp b/APEX_1.4/module/pxparticleios/src/ParticleIosActorGPU.cpp deleted file mode 100644 index 407c24b4..00000000 --- a/APEX_1.4/module/pxparticleios/src/ParticleIosActorGPU.cpp +++ /dev/null @@ -1,993 +0,0 @@ -/* - * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - */ - - -#include "ApexDefs.h" -#if APEX_CUDA_SUPPORT - -#include "Apex.h" -#include "SceneIntl.h" -#include "ApexSDKIntl.h" -#include "ParticleIosActor.h" -#include "ParticleIosActorGPU.h" -#include "ParticleIosAssetImpl.h" - -#include "iofx/IofxAsset.h" -#include "iofx/IofxActor.h" - -#include "ModuleParticleIosImpl.h" -#include "ParticleIosScene.h" -#include "RenderDebugInterface.h" -#include "AuthorableObjectIntl.h" - -#include "PxMath.h" - -#define DEBUG_GPU 0 -#define USE_PHYSX_TASK_SYNC 1 - -//CUDA -#include "PxGpuTask.h" -#include "ApexCutil.h" - -#define CUDA_OBJ(name) SCENE_CUDA_OBJ(*mParticleIosScene, name) - -#include "PxParticleBase.h" -#include "PxParticleSystem.h" -#include "PxParticleDeviceExclusive.h" - -namespace nvidia -{ -namespace pxparticleios -{ - -using namespace physx; - -#pragma warning(disable: 4355) // 'this' : used in base member initializer list - -ParticleIosActorGPU::ParticleIosActorGPU( - ResourceList& list, - ParticleIosAssetImpl& asset, - ParticleIosScene& scene, - IofxAsset& iofxAsset) - : ParticleIosActorImpl(list, asset, scene, iofxAsset, true) - , mCopyQueue(*scene.getApexScene().getTaskManager()->getGpuDispatcher()) - , mHoleScanSum(scene.getApexScene(), PX_ALLOC_INFO("mHoleScanSum", PARTICLES)) - , mMoveIndices(scene.getApexScene(), PX_ALLOC_INFO("mMoveIndices", PARTICLES)) - , mTmpReduce(scene.getApexScene(), PX_ALLOC_INFO("mTmpReduce", PARTICLES)) - , mTmpHistogram(scene.getApexScene(), PX_ALLOC_INFO("mTmpHistogram", PARTICLES)) - , mTmpScan(scene.getApexScene(), PX_ALLOC_INFO("mTmpScan", PARTICLES)) - , mTmpScan1(scene.getApexScene(), PX_ALLOC_INFO("mTmpScan1", PARTICLES)) - , mTmpOutput(scene.getApexScene(), PX_ALLOC_INFO("mTmpOutput", PARTICLES)) - , mTmpBoundParams(scene.getApexScene(), PX_ALLOC_INFO("mTmpBoundParams", PARTICLES)) - , mLaunchTask(*this) - , mTriggerTask(*this) - , mCuSyncEvent(0) -{ - initStorageGroups(CUDA_OBJ(simulateStorage)); - -#if DEBUG_GPU - const ApexMirroredPlace::Enum defaultPlace = ApexMirroredPlace::CPU_GPU; -#else - const ApexMirroredPlace::Enum defaultPlace = ApexMirroredPlace::GPU; -#endif - - mTmpOutput.setSize(4, ApexMirroredPlace::CPU_GPU); - mTmpBoundParams.setSize(2, defaultPlace); - - const unsigned int ScanWarpsPerBlock = MAX_WARPS_PER_BLOCK; //CUDA_OBJ(scanKernel).getBlockDim().x / WARP_SIZE; - - mTmpReduce.reserve(MAX_BOUND_BLOCKS * 4, defaultPlace); - mTmpHistogram.reserve(MAX_BOUND_BLOCKS * HISTOGRAM_SIMULATE_BIN_COUNT, defaultPlace); - mTmpScan.reserve(MAX_BOUND_BLOCKS * ScanWarpsPerBlock, defaultPlace); - mTmpScan1.reserve(MAX_BOUND_BLOCKS * ScanWarpsPerBlock, defaultPlace); - - mField.reserve(mMaxParticleCount, defaultPlace); - mLifeTime.reserve(mMaxParticleCount, defaultPlace); - - mLifeSpan.reserve(mMaxTotalParticleCount, ApexMirroredPlace::CPU_GPU); - mInjector.reserve(mMaxTotalParticleCount, ApexMirroredPlace::CPU_GPU); - mBenefit.reserve(mMaxTotalParticleCount, ApexMirroredPlace::CPU_GPU); - - { - uint32_t size = mGridDensityParams.GridResolution; - if(size > 0) - { - mGridDensityGrid.setSize(size*size*size,ApexMirroredPlace::GPU); - mGridDensityGridLowPass.setSize(size*size*size,ApexMirroredPlace::GPU); - } - } - - mHoleScanSum.reserve(mMaxTotalParticleCount, defaultPlace); - mMoveIndices.reserve(mMaxTotalParticleCount, defaultPlace); - -#if USE_PHYSX_TASK_SYNC - { - PxCudaContextManager* ctxMgr = mParticleIosScene->getApexScene().getTaskManager()->getGpuDispatcher()->getCudaContextManager(); - PxScopedCudaLock _lock_(*ctxMgr); - - CUT_SAFE_CALL(cuEventCreate(&mCuSyncEvent, CU_EVENT_DISABLE_TIMING)); - } -#endif -} - -ParticleIosActorGPU::~ParticleIosActorGPU() -{ -#if USE_PHYSX_TASK_SYNC - { - PxCudaContextManager* ctxMgr = mParticleIosScene->getApexScene().getTaskManager()->getGpuDispatcher()->getCudaContextManager(); - PxScopedCudaLock _lock_(*ctxMgr); - - CUT_SAFE_CALL(cuEventDestroy(mCuSyncEvent)); - } -#endif -} - -PxTaskID ParticleIosActorGPU::submitTasks(PxTaskManager* tm) -{ - ParticleIosActorImpl::submitTasks(tm); - mInjectorsCounters.setSize(mInjectorList.getSize(), ApexMirroredPlace::CPU_GPU); - - if (mAsset->getParticleDesc()->Enable == false) - { - return mInjectTask.getTaskID(); - } - - tm->submitUnnamedTask(mTriggerTask); - PxTaskID taskID = tm->submitUnnamedTask(mLaunchTask, PxTaskType::TT_GPU); - - SCOPED_PHYSX_LOCK_WRITE(mParticleIosScene->getModulePhysXScene()); - - if (!PxParticleDeviceExclusive::isEnabled(*mParticleActor->is<PxParticleBase>())) - { - PxParticleDeviceExclusive::enable(*mParticleActor->is<PxParticleBase>()); - } - PxParticleDeviceExclusive::setValidParticleRange(*mParticleActor->is<PxParticleBase>(), mParticleCount); - -#if USE_PHYSX_TASK_SYNC - PxParticleDeviceExclusive::addLaunchTaskDependent(*mParticleActor->is<PxParticleBase>(), mLaunchTask); - - PxBaseTask* physxTask = PxParticleDeviceExclusive::getLaunchTask(*mParticleActor->is<PxParticleBase>()); - static_cast<ParticleIosSceneGPU*>(mParticleIosScene)->getGpuDispatcher()->addPreLaunchDependent(*physxTask); - physxTask->removeReference(); -#endif - return taskID; -} - -void ParticleIosActorGPU::setTaskDependencies(PxTaskID taskStartAfterID, PxTaskID taskFinishBeforeID) -{ - if (mAsset->getParticleDesc()->Enable == false) - { - ParticleIosActorImpl::setTaskDependencies(taskStartAfterID, taskFinishBeforeID, NULL, true); - return; - } - -#if USE_PHYSX_TASK_SYNC - PX_UNUSED(taskStartAfterID); - PX_UNUSED(taskFinishBeforeID); - ParticleIosActorImpl::setTaskDependencies(PxTaskID(0xFFFFFFFF), PxTaskID(0xFFFFFFFF), &mLaunchTask, true); - - mTriggerTask.startAfter(mParticleIosScene->getApexScene().getTaskManager()->getNamedTask(AST_PHYSX_SIMULATE)); - mTriggerTask.finishBefore(mLaunchTask.getTaskID()); -#else - ParticleIosActorImpl::setTaskDependencies(taskStartAfterID, taskFinishBeforeID, &mLaunchTask, true); -#endif - -#if 0 - if (tm->getGpuDispatcher()->getCudaContextManager()->supportsArchSM20()) - { - /* For Fermi devices, it pays to launch all IOS together. This also forces - * The IOFX managers to step at the same time. - */ - PxTaskID interlock = tm->getNamedTask("IOS::StepInterlock"); - mLaunchTask.startAfter(interlock); - } -#endif -} - -void ParticleIosActorGPU::trigger() -{ -#if USE_PHYSX_TASK_SYNC - static_cast<ParticleIosSceneGPU*>(mParticleIosScene)->getGpuDispatcher()->getPreLaunchTask().removeReference(); -#endif -} - -bool ParticleIosActorGPU::launch(CUstream stream, int kernelIndex) -{ - float deltaTime = mParticleIosScene->getApexScene().getPhysXSimulateTime(); - - uint32_t activeCount = mLastActiveCount + mInjectedCount; - mParticleBudget = mMaxParticleCount; - if (mParticleBudget > activeCount) - { - mParticleBudget = activeCount; - } - uint32_t targetCount = mParticleBudget; - if (targetCount == 0) - { - //reset output - float* pTmpOutput = (float*)mTmpOutput.getPtr(); - mTmpOutput[STATUS_LAST_ACTIVE_COUNT] = 0; - pTmpOutput[STATUS_LAST_BENEFIT_SUM] = 0.0f; - pTmpOutput[STATUS_LAST_BENEFIT_MIN] = +FLT_MAX; - pTmpOutput[STATUS_LAST_BENEFIT_MAX] = -FLT_MAX; - - for(uint32_t i = 0; i < mInjectorsCounters.getSize(); ++i) - { - mInjectorsCounters[i] = 0; - } - - //skip simulation & just call IofxManager - mIofxMgr->updateEffectsData(deltaTime, 0, 0, 0, stream); - return false; - } - - uint32_t lastCount = mParticleCount; - uint32_t injectCount = mInjectedCount; - uint32_t totalCount = lastCount + injectCount; - PX_ASSERT(targetCount <= totalCount); - - uint32_t boundCount = 0; - if (activeCount > targetCount) - { - boundCount = activeCount - targetCount; - } - - ParticleIosSceneGPU* sceneGPU = static_cast<ParticleIosSceneGPU*>(mParticleIosScene); - bool useSyncKernels = !sceneGPU->getGpuDispatcher()->getCudaContextManager()->supportsArchSM20(); - - switch (kernelIndex) - { - case 0: - if (!mFieldSamplerQuery && mOnStartCallback) - { - (*mOnStartCallback)(stream); - } - - // Copy particle data for newly injected particles - mCopyQueue.reset(stream, 24); - mIofxMgr->outputHostToDevice(mCopyQueue); - if (mInjectedCount > 0) - { - mBufDesc.pmaPositionMass->copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount); - mBufDesc.pmaVelocityLife->copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount); - mBufDesc.pmaActorIdentifiers->copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount); - mLifeSpan.copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount); - mInjector.copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount); - mBenefit.copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount); - mBufDesc.pmaUserData->copyHostToDeviceQ(mCopyQueue,mInjectedCount,mParticleCount); - } -#if DEBUG_GPU - mBenefit.copyDeviceToHostQ(mCopyQueue, mParticleCount); -#endif - mCopyQueue.flushEnqueued(); - return true; - - case 1: - if (totalCount > 0) - { - float benefitMin = PxMin(mLastBenefitMin, mInjectedBenefitMin); - float benefitMax = PxMax(mLastBenefitMax, mInjectedBenefitMax); - PX_ASSERT(benefitMin <= benefitMax); - benefitMax *= 1.00001f; - - if (useSyncKernels) - { - CUDA_OBJ(histogramSyncKernel)( - stream, totalCount, - mBenefit.getGpuPtr(), boundCount, - benefitMin, benefitMax, - mTmpBoundParams.getGpuPtr(), - mTmpHistogram.getGpuPtr() - ); - } - else - { - uint32_t histogramGridSize = - CUDA_OBJ(histogramKernel)( - stream, totalCount, - createApexCudaMemRef(mBenefit, ApexCudaMemFlags::IN), - boundCount, benefitMin, benefitMax, - createApexCudaMemRef(mTmpBoundParams, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpHistogram, ApexCudaMemFlags::OUT), - 1, 0 - ); - - //launch just 1 block - CUDA_OBJ(histogramKernel)( - stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH, - createApexCudaMemRef(mBenefit, ApexCudaMemFlags::IN), boundCount, - benefitMin, benefitMax, - createApexCudaMemRef(mTmpBoundParams, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mTmpHistogram, ApexCudaMemFlags::IN_OUT), - 2, histogramGridSize - ); - } - } - return true; - - case 2: - if (totalCount > 0) - { - float benefitMin = PxMin(mLastBenefitMin, mInjectedBenefitMin); - float benefitMax = PxMax(mLastBenefitMax, mInjectedBenefitMax); - PX_ASSERT(benefitMin <= benefitMax); - benefitMax *= 1.00001f; - - if (useSyncKernels) - { - CUDA_OBJ(scanSyncKernel)( - stream, totalCount, - benefitMin, benefitMax, - mHoleScanSum.getGpuPtr(), mBenefit.getGpuPtr(), - mTmpBoundParams.getGpuPtr(), - mTmpScan.getGpuPtr(), mTmpScan1.getGpuPtr() - ); - } - else - { - uint32_t scanGridSize = - CUDA_OBJ(scanKernel)( - stream, totalCount, - benefitMin, benefitMax, - createApexCudaMemRef(mHoleScanSum, ApexCudaMemFlags::IN), - createApexCudaMemRef(mBenefit, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpBoundParams, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mTmpScan1, ApexCudaMemFlags::OUT), - 1, 0 - ); - - //launch just 1 block - CUDA_OBJ(scanKernel)( - stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH, - benefitMin, benefitMax, - createApexCudaMemRef(mHoleScanSum, ApexCudaMemFlags::IN), - createApexCudaMemRef(mBenefit, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpBoundParams, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::IN_OUT), - createApexCudaMemRef(mTmpScan1, ApexCudaMemFlags::IN_OUT), - 2, scanGridSize - ); - - CUDA_OBJ(scanKernel)( - stream, totalCount, - benefitMin, benefitMax, - createApexCudaMemRef(mHoleScanSum, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mBenefit, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpBoundParams, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpScan1, ApexCudaMemFlags::IN), - 3, 0 - ); - } - } - return true; - - case 3: - { - if (totalCount > 0) - { - APEX_CUDA_TEXTURE_SCOPE_BIND(texRefCompactScanSum, mHoleScanSum); - const uint32_t injectorCount = mInjectorList.getSize(); - - CUDA_OBJ(compactKernel)( - stream, - PxMax(totalCount, injectorCount), - targetCount, - totalCount, - injectorCount, - createApexCudaMemRef(mMoveIndices, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mInjectorsCounters, ApexCudaMemFlags::OUT) - ); - } - return true; - } - - case 4: - if (targetCount > 0) - { - uint32_t histogramGridSize = 0; - { - PxCudaReadWriteParticleBuffers buffers; - memset(&buffers, 0, sizeof(buffers)); - - CUstream physxCuStream = 0; - { - SCOPED_PHYSX_LOCK_READ(&mParticleIosScene->getApexScene()); - - PxParticleDeviceExclusive::getReadWriteCudaBuffers(*mParticleActor->is<PxParticleBase>(), buffers); - PX_ASSERT( buffers.positions && buffers.velocities && buffers.collisionNormals && buffers.flags); - -#if USE_PHYSX_TASK_SYNC - physxCuStream = PxParticleDeviceExclusive::getCudaStream(*mParticleActor->is<PxParticleBase>()); -#endif - } - PX_UNUSED(physxCuStream); -#if USE_PHYSX_TASK_SYNC - //sync physx & apex cuda streams! - if (stream != 0 && physxCuStream != 0) - { - CUT_SAFE_CALL(cuEventRecord(mCuSyncEvent, physxCuStream)); - CUT_SAFE_CALL(cuStreamWaitEvent(stream, mCuSyncEvent, 0)); - } -#endif - - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefMoveIndices, mMoveIndices, totalCount); - - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefPositionMass, *mBufDesc.pmaPositionMass, totalCount); - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefVelocityLife, *mBufDesc.pmaVelocityLife, totalCount); - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefIofxActorIDs, *mBufDesc.pmaActorIdentifiers, totalCount); - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefLifeSpan, mLifeSpan, totalCount); - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefLifeTime, mLifeTime, totalCount); - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefInjector, mInjector, totalCount); - - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefUserData,*mBufDesc.pmaUserData, totalCount); - - - APEX_CUDA_TEXTURE_SCOPE_BIND_PTR(texRefPxPosition, (float4*)buffers.positions, lastCount); - APEX_CUDA_TEXTURE_SCOPE_BIND_PTR(texRefPxVelocity, (float4*)buffers.velocities, lastCount); - APEX_CUDA_TEXTURE_SCOPE_BIND_PTR(texRefPxCollision, (float4*)buffers.collisionNormals, lastCount); - if(buffers.densities) - { - CUDA_OBJ(texRefPxDensity).bindTo(buffers.densities, lastCount); - } - APEX_CUDA_TEXTURE_SCOPE_BIND_PTR(texRefNvFlags, (unsigned int*)buffers.flags, lastCount); - - const PxVec3& eyePos = mParticleIosScene->getApexScene().getEyePosition(); - ParticleIosSceneGPU* sceneGPU = static_cast<ParticleIosSceneGPU*>(mParticleIosScene); - - if (mFieldSamplerQuery != NULL) - { - APEX_CUDA_TEXTURE_SCOPE_BIND_SIZE(texRefField, mField, totalCount); - - histogramGridSize = CUDA_OBJ(simulateApplyFieldKernel)(stream, - targetCount, - lastCount, - deltaTime, - eyePos, - sceneGPU->mInjectorConstMemGroup.getStorage().mappedHandle(sceneGPU->mInjectorParamsArrayHandle), - mInjectorsCounters.getSize(), - createApexCudaMemRef(mHoleScanSum, targetCount, ApexCudaMemFlags::IN), - createApexCudaMemRef(mInputIdToParticleIndex, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpScan, 1, ApexCudaMemFlags::IN), //g_moveCount - createApexCudaMemRef(mTmpHistogram, targetCount, ApexCudaMemFlags::OUT), //targetCount ???? - createApexCudaMemRef(mInjectorsCounters, mInjectorsCounters.getSize(), ApexCudaMemFlags::OUT), - createApexCudaMemRef((float4*)mBufDesc.pmaPositionMass->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef((float4*)mBufDesc.pmaVelocityLife->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef((float4*)mBufDesc.pmaCollisionNormalFlags->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef((unsigned int*)mBufDesc.pmaUserData->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mLifeSpan, targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mLifeTime, targetCount, ApexCudaMemFlags::OUT), - mBufDesc.pmaDensity != NULL ? createApexCudaMemRef((float*)mBufDesc.pmaDensity->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT) : ApexCudaMemRef<float>(NULL, 0), - createApexCudaMemRef(mInjector, targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef(*(mBufDesc.pmaActorIdentifiers), targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mBenefit, targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef((float4*)buffers.positions, targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef((float4*)buffers.velocities, targetCount, ApexCudaMemFlags::OUT), - createApexCudaMemRef((float4*)buffers.collisionNormals, targetCount, ApexCudaMemFlags::IN), - buffers.densities != NULL ? createApexCudaMemRef((float*)buffers.densities, targetCount, ApexCudaMemFlags::OUT) : ApexCudaMemRef<float>(NULL, 0), - createApexCudaMemRef((unsigned int*)buffers.flags, targetCount, ApexCudaMemFlags::OUT), - mGridDensityParams - ); - } - else - { - histogramGridSize = CUDA_OBJ(simulateKernel)(stream, - targetCount, - lastCount, - deltaTime, - eyePos, - sceneGPU->mInjectorConstMemGroup.getStorage().mappedHandle(sceneGPU->mInjectorParamsArrayHandle), - mInjectorsCounters.getSize(), - mHoleScanSum.getGpuPtr(), - mInputIdToParticleIndex.getGpuPtr(), - mTmpScan.getGpuPtr(), - mTmpHistogram.getGpuPtr(), - mInjectorsCounters.getGpuPtr(), - (float4*)mBufDesc.pmaPositionMass->getGpuPtr(), - (float4*)mBufDesc.pmaVelocityLife->getGpuPtr(), - (float4*)mBufDesc.pmaCollisionNormalFlags->getGpuPtr(), - (unsigned int*)mBufDesc.pmaUserData->getGpuPtr(), - mLifeSpan.getGpuPtr(), - mLifeTime.getGpuPtr(), - mBufDesc.pmaDensity != NULL ? (float*)mBufDesc.pmaDensity->getGpuPtr() : NULL, - mInjector.getGpuPtr(), - mBufDesc.pmaActorIdentifiers->getGpuPtr(), - mBenefit.getGpuPtr(), - (float4*)buffers.positions, - (float4*)buffers.velocities, - (float4*)buffers.collisionNormals, - buffers.densities != NULL ? (float*)buffers.densities : NULL, - (unsigned int*) buffers.flags, - mGridDensityParams - ); - } - if(buffers.densities) - { - CUDA_OBJ(texRefPxDensity).unbind(); - } - } - //new kernel invocation - to merge temp histograms - { - if(mInjectorsCounters.getSize() <= HISTOGRAM_SIMULATE_BIN_COUNT) - { - CUDA_OBJ(mergeHistogramKernel)(stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH, - mInjectorsCounters.getGpuPtr(), - mTmpHistogram.getGpuPtr(), - histogramGridSize, - mInjectorsCounters.getSize() - ); - } - - } - // calculate grid grid density - if (mGridDensityParams.Enabled) - { - mGridDensityParams.DensityOrigin = mDensityOrigin; - const unsigned int dim = mGridDensityParams.GridResolution; - // refreshed non-shared params - { - ParticleIosAssetParam* params = (ParticleIosAssetParam*)(mAsset->getAssetNvParameterized()); - const SimpleParticleSystemParams* gridParams = static_cast<SimpleParticleSystemParams*>(params->particleType); - mGridDensityParams.GridSize = gridParams->GridDensity.GridSize; - mGridDensityParams.GridMaxCellCount = gridParams->GridDensity.MaxCellCount; - } - // extract frustum - if(mParticleIosScene->getApexScene().getNumProjMatrices() > 0) - { - PxMat44 matDen = PxMat44(PxIdentity); - GridDensityFrustumParams frustum; - PxMat44 matModel = mParticleIosScene->getApexScene().getViewMatrix(); - PxMat44 matProj = mParticleIosScene->getApexScene().getProjMatrix(); - PxMat44 mat = matProj*matModel; - PxMat44 matInv = inverse(mat); - const float targetDepth = mGridDensityParams.GridSize; - // for debug vis - mDensityDebugMatInv = matInv; - // to calculate w transform - float nearDimX = distance(matInv.transform(PxVec4(-1.f,0.f,0.f,1.f)),matInv.transform(PxVec4(1.f,0.f,0.f,1.f))); - float farDimX = distance(matInv.transform(PxVec4(-1.f,0.f,1.f,1.f)),matInv.transform(PxVec4(1.f,0.f,1.f,1.f))); - float nearDimY = distance(matInv.transform(PxVec4(0.f,-1.f,0.f,1.f)),matInv.transform(PxVec4(0.f,1.f,0.f,1.f))); - float farDimY = distance(matInv.transform(PxVec4(0.f,-1.f,1.f,1.f)),matInv.transform(PxVec4(0.f,1.f,1.f,1.f))); - float dimZ = distance(matInv.transform(PxVec4(0.f, 0.f,0.f,1.f)),matInv.transform(PxVec4(0.f,0.f,1.f,1.f))); - float myFarDimX = nearDimX*(1.f-targetDepth/dimZ) + farDimX*(targetDepth/dimZ); - float myFarDimY = nearDimY*(1.f-targetDepth/dimZ) + farDimY*(targetDepth/dimZ); - // grab necessary frustum coordinates - PxVec4 origin4 = matInv.transform(PxVec4(-1.f, 1.f,0.f,1.f)); - PxVec4 basisX4 = matInv.transform(PxVec4( 1.f, 1.f,0.f,1.f)); - PxVec4 basisY4 = matInv.transform(PxVec4(-1.f,-1.f,0.f,1.f)); - PxVec4 zDepth4 = matInv.transform(PxVec4(-1.f, 1.f,1.f,1.f)); - // create vec3 versions - PxVec3 origin3(origin4.x/origin4.w,origin4.y/origin4.w,origin4.z/origin4.w); - PxVec3 basisX3(basisX4.x/basisX4.w,basisX4.y/basisX4.w,basisX4.z/basisX4.w); - PxVec3 basisY3(basisY4.x/basisY4.w,basisY4.y/basisY4.w,basisY4.z/basisY4.w); - PxVec3 zDepth3(zDepth4.x/zDepth4.w,zDepth4.y/zDepth4.w,zDepth4.z/zDepth4.w); - // make everthing relative to origin - basisX3 -= origin3; - basisY3 -= origin3; - zDepth3 -= origin3; - // find third basis - PxVec3 basisZ3(basisX3.cross(basisY3)); - basisZ3.normalize(); - basisZ3*= targetDepth; - // build scale,rotation,translation matrix - PxMat44 mat1Inv = PxMat44(PxIdentity); - mat1Inv.column0 = PxVec4(basisX3,0.f); - mat1Inv.column1 = PxVec4(basisY3,0.f); - mat1Inv.column2 = PxVec4(basisZ3,0.f); - mat1Inv.column3 = PxVec4(origin3,1.f); - PxMat44 mat1 = inverse(mat1Inv); - // do perspective transform - PxMat44 mat2 = PxMat44(PxIdentity); - { - float left = -3.0f; - float right = 1.0f; - float top = 1.0f; - float bottom = -3.0f; - float nearVal = nearDimX/(0.5f*(myFarDimX-nearDimX)); - //float farVal = nearVal + 1.f; - // build matrix - mat2.column0.x = -2.f*nearVal/(right-left); - mat2.column1.y = -2.f*nearVal/(top-bottom); - mat2.column2.x = (right+left)/(right-left); - mat2.column2.y = (top+bottom)/(top-bottom); - //mat2.column2.z = -(farVal+nearVal)/(farVal-nearVal); - mat2.column2.w = -1.f; - //mat2.column3.z = -(2.f*farVal*nearVal)/(farVal-nearVal); - mat2.column3.w = 0.f; - } - // shrink to calculate density just outside of frustum - PxMat44 mat3 = PxMat44(PxIdentity); - float factor = (float)(mGridDensityParams.GridResolution-4) / (mGridDensityParams.GridResolution); - { - mat3.column0.x = factor; - mat3.column1.y = factor; - mat3.column2.z = factor; - mat3.column3.x = (1.0f-factor)/2.0f; - mat3.column3.y = (1.0f-factor)/2.0f; - mat3.column3.z = (1.0f-factor)/2.0f; - } - // create final matrix - matDen = mat3*mat2*mat1; - // create frustum info - frustum.nearDimX = factor*nearDimX; - frustum.farDimX = factor*myFarDimX; - frustum.nearDimY = factor*nearDimY; - frustum.farDimY = factor*myFarDimY; - frustum.dimZ = factor*targetDepth; - // launch frustum kernels - CUDA_OBJ(gridDensityGridClearKernel)(stream, dim*dim*dim, - mGridDensityGrid.getGpuPtr(), - mGridDensityParams - ); - CUDA_OBJ(gridDensityGridFillFrustumKernel)(stream, targetCount, - (float4*)mBufDesc.pmaPositionMass->getGpuPtr(), - mGridDensityGrid.getGpuPtr(), - mGridDensityParams, - matDen, - frustum - ); - CUDA_OBJ(gridDensityGridLowPassKernel)(stream, dim*dim*dim, - mGridDensityGrid.getGpuPtr(), - mGridDensityGridLowPass.getGpuPtr(), - mGridDensityParams - ); - CUDA_OBJ(gridDensityGridApplyFrustumKernel)(stream, targetCount, - mBufDesc.pmaDensity != NULL ? (float*)mBufDesc.pmaDensity->getGpuPtr() : NULL, - (float4*)mBufDesc.pmaPositionMass->getGpuPtr(), - mGridDensityGridLowPass.getGpuPtr(), - mGridDensityParams, - matDen, - frustum - ); - } - } - } - return true; - - case 5: - if (targetCount > 0) - { - if (useSyncKernels) - { - CUDA_OBJ(reduceSyncKernel)( - stream, targetCount, - mBenefit.getGpuPtr(), (float4*)mTmpOutput.getGpuPtr(), mTmpReduce.getGpuPtr() - ); - } - else - { - uint32_t reduceGridSize = - CUDA_OBJ(reduceKernel)( - stream, targetCount, - createApexCudaMemRef(mBenefit, ApexCudaMemFlags::IN), - createApexCudaMemRef((float4*)mTmpOutput.getGpuPtr(), 1, ApexCudaMemFlags::IN), - createApexCudaMemRef(mTmpReduce, ApexCudaMemFlags::OUT), - 1, 0 - ); - - //launch just 1 block - CUDA_OBJ(reduceKernel)( - stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH, - createApexCudaMemRef(mBenefit, ApexCudaMemFlags::IN), - createApexCudaMemRef((float4*)mTmpOutput.getGpuPtr(), 1, ApexCudaMemFlags::OUT), - createApexCudaMemRef(mTmpReduce, ApexCudaMemFlags::IN), - 2, reduceGridSize - ); - } - } - return true; - - case 6: - if (totalCount > 0) - { - APEX_CUDA_TEXTURE_SCOPE_BIND(texRefHoleScanSum, mHoleScanSum); - APEX_CUDA_TEXTURE_SCOPE_BIND(texRefMoveIndices, mMoveIndices); - - CUDA_OBJ(stateKernel)(stream, totalCount, - lastCount, targetCount, - createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::IN), - createApexCudaMemRef(*mBufDesc.pmaInStateToInput, ApexCudaMemFlags::OUT), - createApexCudaMemRef(*mBufDesc.pmaOutStateToInput, ApexCudaMemFlags::IN) - ); - } - return true; - - case 7: - mTmpOutput.copyDeviceToHostQ(mCopyQueue); - mInjectorsCounters.copyDeviceToHostQ(mCopyQueue); -#if DEBUG_GPU - mHoleScanSum.copyDeviceToHostQ(mCopyQueue, totalCount); - mMoveIndices.copyDeviceToHostQ(mCopyQueue, totalCount); - mTmpScan.copyDeviceToHostQ(mCopyQueue, 1); - mTmpBoundParams.copyDeviceToHostQ(mCopyQueue, 2); - //mTmpHistogram.copyDeviceToHostQ(mCopyQueue, HISTOGRAM_BIN_COUNT); - mBufDesc.pmaInStateToInput->copyDeviceToHostQ(mCopyQueue, totalCount); -#endif - mCopyQueue.flushEnqueued(); - - /* Oh! Manager of the IOFX! do your thing */ - mIofxMgr->updateEffectsData(deltaTime, targetCount, targetCount, totalCount, stream); - return false; - } - return false; -} - - -#if DEBUG_GPU -template<typename T, typename F> -void dumpArray(const char* name, ApexMirroredArray<T>& inpArray, uint32_t size, F func) -{ - char buf[256 * 1024]; - char* str = buf; - str += sprintf(str, "%s[%d]=", name, size); - for (uint32_t i = 0; i < PxMin<uint32_t>(size, 1024); ++i) - { - str += func(str, inpArray.get(i)); - } - APEX_DEBUG_INFO(buf); -} -#endif - -void ParticleIosActorGPU::fetchResults() -{ - ParticleIosActorImpl::fetchResults(); - if (mAsset->getParticleDesc()->Enable == false) - { - return; - } -#if DEBUG_GPU - if (mParticleBudget > 0) - { - uint32_t targetCount = mParticleBudget; - uint32_t lastCount = mParticleCount; - uint32_t totalCount = lastCount + mInjectedCount; - uint32_t activeCount = mLastActiveCount + mInjectedCount; - uint32_t boundCount = (activeCount > targetCount) ? (activeCount - targetCount) : 0; - - uint32_t lastActiveCount = 0; - for (uint32_t i = 0; i < lastCount; ++i) - { - float benefit = mBenefit[i]; - if (benefit > -FLT_MAX) - { - ++lastActiveCount; - } - } - if (lastActiveCount != mLastActiveCount) - { - APEX_DEBUG_INFO("lastCount=%d, totalCount=%d, targetCount=%d", lastCount, totalCount, targetCount); - struct FBenefit { PX_INLINE uint32_t operator () (char* str, float x) { return sprintf(str, "%f,", x); } }; - dumpArray("mBenefit", mBenefit, lastCount, FBenefit() ); - APEX_INTERNAL_ERROR("lastActiveCount(%d) != mLastActiveCount(%d)", lastActiveCount, mLastActiveCount); - } - PX_ASSERT(lastActiveCount == mLastActiveCount); - - //Test src hole count - uint32_t moveCount = mTmpScan[0]; - - uint32_t holeCount = 0; - for (uint32_t i = 0; i < totalCount; ++i) - { - uint32_t holeScanSum = mHoleScanSum[i]; - if (holeScanSum & HOLE_SCAN_FLAG) - { - ++holeCount; - } - PX_ASSERT(holeCount == (holeScanSum & ~HOLE_SCAN_FLAG)); - } - if (totalCount != targetCount + holeCount) - { - APEX_DEBUG_INFO("lastCount=%d, lastActiveCount=%d, injectCount=%d, totalCount=%d, targetCount=%d, moveCount=%d, holeCount=%d", lastCount, mLastActiveCount, mInjectedCount, totalCount, targetCount, moveCount, holeCount); - APEX_DEBUG_INFO("boundCount=%d, mTmpBoundParams[0]=%d, mTmpBoundParams[1]=%d", boundCount, mTmpBoundParams[0], mTmpBoundParams[1]); - struct FBenefit { PX_INLINE uint32_t operator () (char* str, float x) { return sprintf(str, "%f,", x); } }; - dumpArray("mBenefit", mBenefit, totalCount, FBenefit() ); - struct FHoleScanSum { PX_INLINE uint32_t operator () (char* str, uint32_t x) { return sprintf(str, "%x,", x); } }; - dumpArray("mHoleScanSum", mHoleScanSum, totalCount, FHoleScanSum() ); - //struct FHistogram { PX_INLINE uint32_t operator () (char* str, uint32_t x) { return sprintf(str, "%d,", x); } }; - //dumpArray("mTmpHistogram", mTmpHistogram, HISTOGRAM_BIN_COUNT, FHistogram() ); - APEX_INTERNAL_ERROR("totalCount (%d) != targetCount + holeCount (%d)", totalCount, targetCount + holeCount); - } - PX_ASSERT(totalCount == targetCount + holeCount); - - PX_ASSERT(moveCount <= holeCount); - for (uint32_t i = 0; i < moveCount; ++i) - { - uint32_t holeIndex = mMoveIndices[i]; - PX_ASSERT(holeIndex < targetCount); - - uint32_t holeScanSum = mHoleScanSum[holeIndex]; - PX_ASSERT((holeScanSum & HOLE_SCAN_FLAG) != 0); - PX_ASSERT( i + 1 == (holeScanSum & HOLE_SCAN_MASK) ); - } - for (uint32_t i = moveCount; i < moveCount*2; ++i) - { - uint32_t nonHoleIndex = mMoveIndices[i]; - PX_ASSERT(nonHoleIndex >= targetCount); - PX_ASSERT(nonHoleIndex < totalCount); - - uint32_t nonHoleScanSum = mHoleScanSum[nonHoleIndex]; - PX_ASSERT((nonHoleScanSum & HOLE_SCAN_FLAG) == 0); - PX_ASSERT( i + 1 == moveCount + (((nonHoleIndex + 1) - nonHoleScanSum) - (targetCount - moveCount)) ); - } - - uint32_t validInputCount = 0; - for (uint32_t i = 0; i < totalCount; ++i) - { - uint32_t inputId = mBufDesc.pmaInStateToInput->get(i); - if (inputId != IosBufferDescIntl::NOT_A_PARTICLE) - { - inputId &= ~IosBufferDescIntl::NEW_PARTICLE_FLAG; - PX_ASSERT(inputId < targetCount); - - validInputCount++; - } - } - PX_ASSERT(validInputCount == targetCount); - } -#endif - - mParticleCount = mParticleBudget; - - float* pTmpOutput = (float*)mTmpOutput.getPtr(); - mLastActiveCount = mTmpOutput[STATUS_LAST_ACTIVE_COUNT]; - mLastBenefitSum = pTmpOutput[STATUS_LAST_BENEFIT_SUM]; - mLastBenefitMin = pTmpOutput[STATUS_LAST_BENEFIT_MIN]; - mLastBenefitMax = pTmpOutput[STATUS_LAST_BENEFIT_MAX]; -} - - -PxMat44 ParticleIosActorGPU::inverse(const PxMat44& in) -{ - PxMat44 ret; - float inv[16]; - float* invOut = &ret.column0.x; - const float* m = &in.column0.x; - int i; - - inv[0] = m[5] * m[10] * m[15] - - m[5] * m[11] * m[14] - - m[9] * m[6] * m[15] + - m[9] * m[7] * m[14] + - m[13] * m[6] * m[11] - - m[13] * m[7] * m[10]; - - inv[4] = -m[4] * m[10] * m[15] + - m[4] * m[11] * m[14] + - m[8] * m[6] * m[15] - - m[8] * m[7] * m[14] - - m[12] * m[6] * m[11] + - m[12] * m[7] * m[10]; - - inv[8] = m[4] * m[9] * m[15] - - m[4] * m[11] * m[13] - - m[8] * m[5] * m[15] + - m[8] * m[7] * m[13] + - m[12] * m[5] * m[11] - - m[12] * m[7] * m[9]; - - inv[12] = -m[4] * m[9] * m[14] + - m[4] * m[10] * m[13] + - m[8] * m[5] * m[14] - - m[8] * m[6] * m[13] - - m[12] * m[5] * m[10] + - m[12] * m[6] * m[9]; - - inv[1] = -m[1] * m[10] * m[15] + - m[1] * m[11] * m[14] + - m[9] * m[2] * m[15] - - m[9] * m[3] * m[14] - - m[13] * m[2] * m[11] + - m[13] * m[3] * m[10]; - - inv[5] = m[0] * m[10] * m[15] - - m[0] * m[11] * m[14] - - m[8] * m[2] * m[15] + - m[8] * m[3] * m[14] + - m[12] * m[2] * m[11] - - m[12] * m[3] * m[10]; - - inv[9] = -m[0] * m[9] * m[15] + - m[0] * m[11] * m[13] + - m[8] * m[1] * m[15] - - m[8] * m[3] * m[13] - - m[12] * m[1] * m[11] + - m[12] * m[3] * m[9]; - - inv[13] = m[0] * m[9] * m[14] - - m[0] * m[10] * m[13] - - m[8] * m[1] * m[14] + - m[8] * m[2] * m[13] + - m[12] * m[1] * m[10] - - m[12] * m[2] * m[9]; - - inv[2] = m[1] * m[6] * m[15] - - m[1] * m[7] * m[14] - - m[5] * m[2] * m[15] + - m[5] * m[3] * m[14] + - m[13] * m[2] * m[7] - - m[13] * m[3] * m[6]; - - inv[6] = -m[0] * m[6] * m[15] + - m[0] * m[7] * m[14] + - m[4] * m[2] * m[15] - - m[4] * m[3] * m[14] - - m[12] * m[2] * m[7] + - m[12] * m[3] * m[6]; - - inv[10] = m[0] * m[5] * m[15] - - m[0] * m[7] * m[13] - - m[4] * m[1] * m[15] + - m[4] * m[3] * m[13] + - m[12] * m[1] * m[7] - - m[12] * m[3] * m[5]; - - inv[14] = -m[0] * m[5] * m[14] + - m[0] * m[6] * m[13] + - m[4] * m[1] * m[14] - - m[4] * m[2] * m[13] - - m[12] * m[1] * m[6] + - m[12] * m[2] * m[5]; - - inv[3] = -m[1] * m[6] * m[11] + - m[1] * m[7] * m[10] + - m[5] * m[2] * m[11] - - m[5] * m[3] * m[10] - - m[9] * m[2] * m[7] + - m[9] * m[3] * m[6]; - - inv[7] = m[0] * m[6] * m[11] - - m[0] * m[7] * m[10] - - m[4] * m[2] * m[11] + - m[4] * m[3] * m[10] + - m[8] * m[2] * m[7] - - m[8] * m[3] * m[6]; - - inv[11] = -m[0] * m[5] * m[11] + - m[0] * m[7] * m[9] + - m[4] * m[1] * m[11] - - m[4] * m[3] * m[9] - - m[8] * m[1] * m[7] + - m[8] * m[3] * m[5]; - - inv[15] = m[0] * m[5] * m[10] - - m[0] * m[6] * m[9] - - m[4] * m[1] * m[10] + - m[4] * m[2] * m[9] + - m[8] * m[1] * m[6] - - m[8] * m[2] * m[5]; - - float det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12]; - - if (det == 0) - return PxMat44(PxIdentity); - - det = 1.0f / det; - - for (i = 0; i < 16; i++) - invOut[i] = inv[i] * det; - - return ret; -} - -float ParticleIosActorGPU::distance(PxVec4 a, PxVec4 b) -{ - PxVec3 a3(a.x/a.w,a.y/a.w,a.z/a.w); - PxVec3 b3(b.x/b.w,b.y/b.w,b.z/b.w); - PxVec3 diff(b3-a3); - return diff.magnitude(); -} - -} -} // namespace nvidia - -#endif //APEX_CUDA_SUPPORT - |