aboutsummaryrefslogtreecommitdiff
path: root/APEX_1.4/module/basicios/src/BasicIosActorGPU.cpp
diff options
context:
space:
mode:
authorgit perforce import user <a@b>2016-10-25 12:29:14 -0600
committerSheikh Dawood Abdul Ajees <Sheikh Dawood Abdul Ajees>2016-10-25 18:56:37 -0500
commit3dfe2108cfab31ba3ee5527e217d0d8e99a51162 (patch)
treefa6485c169e50d7415a651bf838f5bcd0fd3bfbd /APEX_1.4/module/basicios/src/BasicIosActorGPU.cpp
downloadphysx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.tar.xz
physx-3.4-3dfe2108cfab31ba3ee5527e217d0d8e99a51162.zip
Initial commit:
PhysX 3.4.0 Update @ 21294896 APEX 1.4.0 Update @ 21275617 [CL 21300167]
Diffstat (limited to 'APEX_1.4/module/basicios/src/BasicIosActorGPU.cpp')
-rw-r--r--APEX_1.4/module/basicios/src/BasicIosActorGPU.cpp802
1 files changed, 802 insertions, 0 deletions
diff --git a/APEX_1.4/module/basicios/src/BasicIosActorGPU.cpp b/APEX_1.4/module/basicios/src/BasicIosActorGPU.cpp
new file mode 100644
index 00000000..0a4ef4ca
--- /dev/null
+++ b/APEX_1.4/module/basicios/src/BasicIosActorGPU.cpp
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2008-2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+
+
+#include "ApexDefs.h"
+#if APEX_CUDA_SUPPORT
+
+#include "Apex.h"
+#include "SceneIntl.h"
+#include "ApexSDKIntl.h"
+
+#include "BasicIosActor.h"
+#include "BasicIosActorGPU.h"
+#include "BasicIosAssetImpl.h"
+#include "IofxAsset.h"
+#include "IofxActor.h"
+#include "ModuleBasicIosImpl.h"
+#include "BasicIosSceneGPU.h"
+#include "RenderDebugInterface.h"
+#include "AuthorableObjectIntl.h"
+
+#include "PxMath.h"
+
+//CUDA
+#include "PxGpuTask.h"
+#include "ApexCutil.h"
+
+#define CUDA_OBJ(name) SCENE_CUDA_OBJ(*mBasicIosScene, name)
+
+namespace nvidia
+{
+namespace basicios
+{
+
+#pragma warning(disable: 4355) // 'this' : used in base member initializer list
+
+/* Builds the GPU flavour of the Basic IOS actor.
+ *
+ * Forwards list/asset/scene/iofxAsset to the BASIC_IOS_ACTOR base, binds the copy queue to the
+ * scene's GPU dispatcher, constructs the temporary arrays on the APEX scene and then sizes or
+ * reserves every buffer used by launch(). Buffers that are not pinned to an explicit
+ * ApexMirroredPlace are placed according to defaultPlace.
+ */
+BasicIosActorGPU::BasicIosActorGPU(
+	ResourceList& list,
+	BasicIosAssetImpl& asset,
+	BasicIosScene& scene,
+	nvidia::apex::IofxAsset& iofxAsset,
+	const ApexMirroredPlace::Enum defaultPlace)
+	: BASIC_IOS_ACTOR(list, asset, scene, iofxAsset, true)
+	, mCopyQueue(*scene.getApexScene().getTaskManager()->getGpuDispatcher())
+	, mHoleScanSum(scene.getApexScene())
+	, mMoveIndices(scene.getApexScene())
+	, mTmpReduce(scene.getApexScene())
+	, mTmpHistogram(scene.getApexScene())
+	, mTmpScan(scene.getApexScene())
+	, mTmpScan1(scene.getApexScene())
+	, mTmpOutput(scene.getApexScene())
+	, mTmpOutput1(scene.getApexScene())
+	, mLaunchTask(*this)
+{
+	initStorageGroups(CUDA_OBJ(simulateStorage));
+
+	// Fixed-size status buffers, mirrored on both CPU and GPU.
+	mTmpOutput.setSize(4, ApexMirroredPlace::CPU_GPU);
+	mTmpOutput1.setSize(2, ApexMirroredPlace::CPU_GPU);
+
+	// Scratch space for the reduce / histogram / scan kernels.
+	// (Alternative once considered for the warp count: CUDA_OBJ(scanKernel).getBlockDim().x / WARP_SIZE)
+	const unsigned int warpsPerScanBlock = MAX_WARPS_PER_BLOCK;
+	mTmpReduce.reserve(MAX_BOUND_BLOCKS * 4, defaultPlace);
+	mTmpHistogram.reserve(MAX_BOUND_BLOCKS * HISTOGRAM_SIMULATE_BIN_COUNT, defaultPlace);
+	mTmpScan.reserve(MAX_BOUND_BLOCKS * warpsPerScanBlock, defaultPlace);
+	mTmpScan1.reserve(MAX_BOUND_BLOCKS * warpsPerScanBlock, defaultPlace);
+
+	// Per-particle state.
+	mLifeTime.reserve(mMaxParticleCount, defaultPlace);
+
+	mLifeSpan.reserve(mMaxTotalParticleCount, ApexMirroredPlace::CPU_GPU);
+	mInjector.reserve(mMaxTotalParticleCount, ApexMirroredPlace::CPU_GPU);
+	mBenefit.reserve(mMaxTotalParticleCount, ApexMirroredPlace::CPU_GPU);
+
+	// Density grids are cubic (resolution^3 cells) and live on the GPU only.
+	const uint32_t gridResolution = mGridDensityParams.GridResolution;
+	if (gridResolution > 0)
+	{
+		const uint32_t gridCellCount = gridResolution * gridResolution * gridResolution;
+		mGridDensityGrid.setSize(gridCellCount, ApexMirroredPlace::GPU);
+		mGridDensityGridLowPass.setSize(gridCellCount, ApexMirroredPlace::GPU);
+	}
+
+	// Collision geometry buffers are reserved only for the collision types the asset enables.
+	if (mAsset->mParams->collisionWithConvex)
+	{
+		mConvexPlanes.reserve(MAX_CONVEX_PLANES_COUNT, ApexMirroredPlace::CPU_GPU);
+		mConvexVerts.reserve(MAX_CONVEX_VERTS_COUNT, ApexMirroredPlace::CPU_GPU);
+		mConvexPolygonsData.reserve(MAX_CONVEX_POLYGONS_DATA_SIZE, ApexMirroredPlace::CPU_GPU);
+	}
+	if (mAsset->mParams->collisionWithTriangleMesh)
+	{
+		mTrimeshVerts.reserve(MAX_TRIMESH_VERTS_COUNT, ApexMirroredPlace::CPU_GPU);
+		mTrimeshIndices.reserve(MAX_TRIMESH_INDICES_COUNT, ApexMirroredPlace::CPU_GPU);
+	}
+
+	// Outputs of the scan and compact stages.
+	mHoleScanSum.reserve(mMaxTotalParticleCount, defaultPlace);
+	mMoveIndices.reserve(mMaxTotalParticleCount, defaultPlace);
+}
+
+// Destructor: the body is intentionally empty, no explicit cleanup is performed here.
+BasicIosActorGPU::~BasicIosActorGPU()
+{
+}
+
+/* Submits this actor's tasks for the current step: runs the base-class submission, sizes the
+ * per-injector counter buffer to the current injector list and hands the launch task to the
+ * scene's task manager as a GPU task.
+ */
+void BasicIosActorGPU::submitTasks()
+{
+	BasicIosActorImpl::submitTasks();
+
+	// One counter slot per registered injector, mirrored on CPU and GPU.
+	const uint32_t injectorCount = mInjectorList.getSize();
+	mInjectorsCounters.setSize(injectorCount, ApexMirroredPlace::CPU_GPU);
+
+	mBasicIosScene->getApexScene().getTaskManager()->submitUnnamedTask(mLaunchTask, PxTaskType::TT_GPU);
+}
+
+/* Wires up the dependencies of the launch task. The base class sets the common dependencies;
+ * on devices that report SM 2.0 support the launch task is additionally ordered after the
+ * shared "IOS::StepInterlock" named task.
+ */
+void BasicIosActorGPU::setTaskDependencies()
+{
+	BasicIosActorImpl::setTaskDependencies(&mLaunchTask, true);
+
+	PxTaskManager* taskManager = mBasicIosScene->getApexScene().getTaskManager();
+	const bool hasSM20 = taskManager->getGpuDispatcher()->getCudaContextManager()->supportsArchSM20();
+	if (!hasSM20)
+	{
+		return;
+	}
+
+	/* For Fermi devices, it pays to launch all IOS together. This also forces
+	 * the IOFX managers to step at the same time.
+	 */
+	mLaunchTask.startAfter(taskManager->getNamedTask("IOS::StepInterlock"));
+}
+
+/* Enqueues one stage of this actor's GPU simulation step on the given CUDA stream.
+ *
+ * stream       CUDA stream on which the copies and kernels of the stage are queued.
+ * kernelIndex  Stage selector; the handled values are 0..7:
+ *                0 - host-to-device copies (IofxManager output, injected particles, collision geometry)
+ *                1 - histogram kernel(s) over mBenefit
+ *                2 - scan kernel(s) over mBenefit / mHoleScanSum
+ *                3 - compactKernel (mMoveIndices, mTmpScan, mInjectorsCounters)
+ *                4 - simulate kernel, histogram merge and, if enabled, the grid density kernels
+ *                5 - reduce kernel(s) over mBenefit into mTmpOutput
+ *                6 - stateKernel (pmaInStateToInput / pmaOutStateToInput)
+ *                7 - device-to-host copies and the IofxManager update
+ *
+ * Returns true for stages 0..6. Returns false for stage 7, for any other index, and for every
+ * index when the target particle count is zero (in that case only the status output is reset
+ * and the IofxManager is updated with zero counts).
+ * NOTE(review): presumably "true" asks the caller to come back with the next kernelIndex -
+ * confirm against the task that drives this function.
+ */
+bool BasicIosActorGPU::launch(CUstream stream, int kernelIndex)
+{
+ float deltaTime = mBasicIosScene->getApexScene().getPhysXSimulateTime();
+
+ // The particle budget is the active count (last step + newly injected) clamped to mMaxParticleCount.
+ uint32_t activeCount = mLastActiveCount + mInjectedCount;
+ mParticleBudget = mMaxParticleCount;
+ if (mParticleBudget > activeCount)
+ {
+ mParticleBudget = activeCount;
+ }
+ uint32_t targetCount = mParticleBudget;
+
+ if (targetCount == 0)
+ {
+ //reset output
+ // mTmpOutput is written both through its own element type and through a float view of the same storage.
+ float* pTmpOutput = (float*)mTmpOutput.getPtr();
+ mTmpOutput[STATUS_LAST_ACTIVE_COUNT] = 0;
+ pTmpOutput[STATUS_LAST_BENEFIT_SUM] = 0.0f;
+ pTmpOutput[STATUS_LAST_BENEFIT_MIN] = +FLT_MAX;
+ pTmpOutput[STATUS_LAST_BENEFIT_MAX] = -FLT_MAX;
+
+ for(uint32_t i = 0; i < mInjectorsCounters.getSize(); ++i)
+ {
+ mInjectorsCounters[i] = 0;
+ }
+
+ //skip simulation & just call IofxManager
+ mIofxMgr->updateEffectsData(deltaTime, 0, 0, 0, stream);
+ return false;
+ }
+
+ uint32_t lastCount = mParticleCount;
+ uint32_t injectCount = mInjectedCount;
+ uint32_t totalCount = lastCount + injectCount;
+ PX_ASSERT(targetCount <= totalCount);
+ // boundCount: how many active particles exceed the budget (0 when everything fits).
+ uint32_t boundCount = 0;
+ if (activeCount > targetCount)
+ {
+ boundCount = activeCount - targetCount;
+ }
+
+ BasicIosSceneGPU* sceneGPU = static_cast<BasicIosSceneGPU*>(mBasicIosScene);
+ // The "*SyncKernel" variants are used on devices that do not report SM 2.0 support.
+ bool useSyncKernels = !sceneGPU->getGpuDispatcher()->getCudaContextManager()->supportsArchSM20();
+
+ switch (kernelIndex)
+ {
+ case 0:
+ // Stage 0: uploads. The start callback is only invoked here when no field sampler query is set.
+ if (!mFieldSamplerQuery && mOnStartCallback)
+ {
+ (*mOnStartCallback)(stream);
+ }
+
+ mCopyQueue.reset(stream, 24);
+ mIofxMgr->outputHostToDevice(mCopyQueue);
+ if (mInjectedCount > 0)
+ {
+ // NOTE(review): the trailing arguments look like (count, offset), i.e. only the newly injected
+ // range placed after the existing mParticleCount particles is uploaded - confirm argument order.
+ mBufDesc.pmaPositionMass->copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount);
+ mBufDesc.pmaVelocityLife->copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount);
+ mBufDesc.pmaActorIdentifiers->copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount);
+ mLifeSpan.copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount);
+ mInjector.copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount);
+ mBenefit.copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount);
+ mBufDesc.pmaUserData->copyHostToDeviceQ(mCopyQueue, mInjectedCount, mParticleCount);
+ }
+ if (mAsset->mParams->collisionWithConvex)
+ {
+ mConvexPlanes.copyHostToDeviceQ(mCopyQueue);
+ mConvexVerts.copyHostToDeviceQ(mCopyQueue);
+ mConvexPolygonsData.copyHostToDeviceQ(mCopyQueue);
+ }
+ if (mAsset->mParams->collisionWithTriangleMesh)
+ {
+ mTrimeshVerts.copyHostToDeviceQ(mCopyQueue);
+ mTrimeshIndices.copyHostToDeviceQ(mCopyQueue);
+ }
+ mCopyQueue.flushEnqueued();
+ return true;
+
+ case 1:
+ // Stage 1: histogram of the benefit values over [benefitMin, benefitMax].
+ if (totalCount > 0)
+ {
+ float benefitMin = PxMin(mLastBenefitMin, mInjectedBenefitMin);
+ float benefitMax = PxMax(mLastBenefitMax, mInjectedBenefitMax);
+ PX_ASSERT(benefitMin <= benefitMax);
+ // NOTE(review): slight widening of the upper bound, presumably so the maximum value
+ // does not land exactly on the range end - confirm against the kernel's binning.
+ benefitMax *= 1.00001f;
+
+ if (useSyncKernels)
+ {
+ CUDA_OBJ(histogramSyncKernel)(
+ stream, totalCount,
+ mBenefit.getGpuPtr(), boundCount,
+ benefitMin, benefitMax,
+ mTmpOutput1.getGpuPtr(),
+ mTmpHistogram.getGpuPtr()
+ );
+ }
+ else
+ {
+ // Two launches of the same kernel; the value returned by the first launch is passed
+ // as the last argument of the second (single block) launch.
+ uint32_t histogramGridSize =
+ CUDA_OBJ(histogramKernel)(
+ stream, totalCount,
+ createApexCudaMemRef(mBenefit, totalCount, ApexCudaMemFlags::IN),
+ boundCount, benefitMin, benefitMax,
+ createApexCudaMemRef(mTmpOutput1, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpHistogram, ApexCudaMemFlags::OUT),
+ 1, 0
+ );
+
+ //launch just 1 block
+ CUDA_OBJ(histogramKernel)(
+ stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH,
+ createApexCudaMemRef(mBenefit, totalCount, ApexCudaMemFlags::IN),
+ boundCount, benefitMin, benefitMax,
+ createApexCudaMemRef(mTmpOutput1, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mTmpHistogram, ApexCudaMemFlags::IN_OUT),
+ 2, histogramGridSize
+ );
+ }
+ }
+ return true;
+
+ case 2:
+ // Stage 2: scan, using the same benefit range as stage 1.
+ if (totalCount > 0)
+ {
+ float benefitMin = PxMin(mLastBenefitMin, mInjectedBenefitMin);
+ float benefitMax = PxMax(mLastBenefitMax, mInjectedBenefitMax);
+ PX_ASSERT(benefitMin <= benefitMax);
+ benefitMax *= 1.00001f;
+
+ if (useSyncKernels)
+ {
+ CUDA_OBJ(scanSyncKernel)(
+ stream, totalCount,
+ benefitMin, benefitMax,
+ mHoleScanSum.getGpuPtr(), mBenefit.getGpuPtr(),
+ mTmpOutput1.getGpuPtr(),
+ mTmpScan.getGpuPtr(), mTmpScan1.getGpuPtr()
+ );
+ }
+ else
+ {
+ // Three launches of scanKernel (trailing arguments 1/0, 2/scanGridSize, 3/0);
+ // only the third one flags mHoleScanSum as an output.
+ uint32_t scanGridSize =
+ CUDA_OBJ(scanKernel)(
+ stream, totalCount,
+ benefitMin, benefitMax,
+ createApexCudaMemRef(mHoleScanSum, totalCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mBenefit, totalCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpOutput1, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mTmpScan1, ApexCudaMemFlags::OUT),
+ 1, 0
+ );
+
+ //launch just 1 block
+ CUDA_OBJ(scanKernel)(
+ stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH,
+ benefitMin, benefitMax,
+ createApexCudaMemRef(mHoleScanSum, totalCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mBenefit, totalCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpOutput1, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::IN_OUT),
+ createApexCudaMemRef(mTmpScan1, ApexCudaMemFlags::IN_OUT),
+ 2, scanGridSize
+ );
+
+ CUDA_OBJ(scanKernel)(
+ stream, totalCount,
+ benefitMin, benefitMax,
+ createApexCudaMemRef(mHoleScanSum, totalCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mBenefit, totalCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpOutput1, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpScan1, ApexCudaMemFlags::IN),
+ 3, 0
+ );
+ }
+ }
+ return true;
+
+ case 3:
+ {
+ // Stage 3: compaction. mHoleScanSum is bound as a texture for the duration of this scope.
+ if (totalCount > 0)
+ {
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefCompactScanSum, mHoleScanSum);
+ const uint32_t injectorCount = mInjectorList.getSize();
+
+ // The launch is sized to cover both the particles and the injector counters.
+ CUDA_OBJ(compactKernel)(
+ stream,
+ PxMax(totalCount, injectorCount),
+ targetCount,
+ totalCount,
+ injectorCount,
+ createApexCudaMemRef(mMoveIndices, totalCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mTmpScan, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mInjectorsCounters, ApexCudaMemFlags::OUT)
+ );
+
+ }
+ return true;
+ }
+
+
+ case 4:
+ // Stage 4: simulation, merge of the per-block histograms and optional grid density.
+ if (targetCount > 0)
+ {
+ // Set by whichever simulate kernel is launched below and consumed by mergeHistogramKernel.
+ uint32_t histogramGridSize = 0;
+ {
+ // Texture bindings live until the end of this scope, i.e. only around the simulate launch.
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefMoveIndices, mMoveIndices);
+
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefPositionMass, *mBufDesc.pmaPositionMass);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefVelocityLife, *mBufDesc.pmaVelocityLife);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefIofxActorIDs, *mBufDesc.pmaActorIdentifiers);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefLifeSpan, mLifeSpan);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefLifeTime, mLifeTime);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefInjector, mInjector);
+
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefUserData, *mBufDesc.pmaUserData);
+
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefConvexPlanes, mConvexPlanes);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefConvexVerts, mConvexVerts);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefConvexPolygonsData, mConvexPolygonsData);
+
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefTrimeshVerts, mTrimeshVerts);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefTrimeshIndices, mTrimeshIndices);
+
+ // Gravity direction passed to the kernels is the negated up vector.
+ PxVec3 gravity = -mUp;
+ const PxVec3& eyePos = mBasicIosScene->getApexScene().getEyePosition();
+
+ // With a field sampler query the field-applying kernel variant is used and mField is bound as well.
+ if (mFieldSamplerQuery != NULL)
+ {
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefField, mField);
+
+ histogramGridSize = CUDA_OBJ(simulateApplyFieldKernel)(stream,
+ targetCount,
+ lastCount,
+ deltaTime,
+ gravity,
+ eyePos,
+ sceneGPU->mInjectorConstMemGroup.getStorage().mappedHandle(sceneGPU->mInjectorParamsArrayHandle), mInjectorsCounters.getSize(),
+ createApexCudaMemRef(mHoleScanSum, targetCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpScan, 1, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpHistogram, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mInjectorsCounters, mInjectorsCounters.getSize(), ApexCudaMemFlags::OUT),
+ createApexCudaMemRef((float4*)mBufDesc.pmaPositionMass->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef((float4*)mBufDesc.pmaVelocityLife->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef((float4*)mBufDesc.pmaCollisionNormalFlags->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mBufDesc.pmaUserData->getGpuPtr(), targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mLifeSpan, targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mLifeTime, targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mInjector, targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(*(mBufDesc.pmaActorIdentifiers), targetCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mBenefit, targetCount, ApexCudaMemFlags::OUT),
+ mSimulationStorageGroup.getStorage().mappedHandle(mSimulationParamsHandle)
+ );
+ }
+ else
+ {
+ histogramGridSize = CUDA_OBJ(simulateKernel)(stream, targetCount,
+ lastCount, deltaTime, gravity, eyePos,
+ sceneGPU->mInjectorConstMemGroup.getStorage().mappedHandle(sceneGPU->mInjectorParamsArrayHandle), mInjectorsCounters.getSize(),
+ mHoleScanSum.getGpuPtr(), mTmpScan.getGpuPtr(), mTmpHistogram.getGpuPtr(), mInjectorsCounters.getGpuPtr(),
+ (float4*)mBufDesc.pmaPositionMass->getGpuPtr(),
+ (float4*)mBufDesc.pmaVelocityLife->getGpuPtr(),
+ (float4*)mBufDesc.pmaCollisionNormalFlags->getGpuPtr(),
+ mBufDesc.pmaUserData->getGpuPtr(),
+ mLifeSpan.getGpuPtr(), mLifeTime.getGpuPtr(), mInjector.getGpuPtr(), mBufDesc.pmaActorIdentifiers->getGpuPtr(),
+ mBenefit.getGpuPtr(), mSimulationStorageGroup.getStorage().mappedHandle(mSimulationParamsHandle)
+ );
+ }
+ }
+ //new kernel invocation - to merge temp histograms
+ {
+ // The merge is skipped when there are more injector counters than HISTOGRAM_SIMULATE_BIN_COUNT.
+ if(mInjectorsCounters.getSize() <= HISTOGRAM_SIMULATE_BIN_COUNT)
+ {
+ CUDA_OBJ(mergeHistogramKernel)(stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH,
+ createApexCudaMemRef(mInjectorsCounters, mInjectorsCounters.getSize(), ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpHistogram, ApexCudaMemFlags::IN),
+ histogramGridSize,
+ mInjectorsCounters.getSize()
+ );
+ }
+
+ }
+ // calculate grid density
+ if (mGridDensityParams.Enabled)
+ {
+ mGridDensityParams.DensityOrigin = mDensityOrigin;
+ const unsigned int dim = mGridDensityParams.GridResolution;
+ // refreshed non-shared params
+ {
+ BasicIOSAssetParam* gridParams = (BasicIOSAssetParam*)(mAsset->getAssetNvParameterized());
+ mGridDensityParams.GridSize = gridParams->GridDensity.GridSize;
+ mGridDensityParams.GridMaxCellCount = gridParams->GridDensity.MaxCellCount;
+ }
+ // extract frustum
+ // The density kernels below are only launched when the scene has at least one projection matrix.
+ if (mBasicIosScene->getApexScene().getNumProjMatrices() > 0)
+ {
+ PxMat44 matDen = PxMat44(PxIdentity);
+ GridDensityFrustumParams frustum;
+ PxMat44 matModel = mBasicIosScene->getApexScene().getViewMatrix();
+ PxMat44 matProj = mBasicIosScene->getApexScene().getProjMatrix();
+ PxMat44 mat = matProj*matModel;
+ PxMat44 matInv = inverse(mat);
+ const float targetDepth = mGridDensityParams.GridSize;
+ // for debug vis
+ mDensityDebugMatInv = matInv;
+ // to calculate w transform
+ // Extents are measured between pairs of points (x/y = +-1, z = 0 or 1) pushed through matInv;
+ // distance() performs the divide by w.
+ float nearDimX = distance(matInv.transform(PxVec4(-1.f,0.f,0.f,1.f)),matInv.transform(PxVec4(1.f,0.f,0.f,1.f)));
+ float farDimX = distance(matInv.transform(PxVec4(-1.f,0.f,1.f,1.f)),matInv.transform(PxVec4(1.f,0.f,1.f,1.f)));
+ float nearDimY = distance(matInv.transform(PxVec4(0.f,-1.f,0.f,1.f)),matInv.transform(PxVec4(0.f,1.f,0.f,1.f)));
+ float farDimY = distance(matInv.transform(PxVec4(0.f,-1.f,1.f,1.f)),matInv.transform(PxVec4(0.f,1.f,1.f,1.f)));
+ float dimZ = distance(matInv.transform(PxVec4(0.f, 0.f,0.f,1.f)),matInv.transform(PxVec4(0.f,0.f,1.f,1.f)));
+ // Linear interpolation between the near and far extents at depth targetDepth.
+ float myFarDimX = nearDimX*(1.f-targetDepth/dimZ) + farDimX*(targetDepth/dimZ);
+ float myFarDimY = nearDimY*(1.f-targetDepth/dimZ) + farDimY*(targetDepth/dimZ);
+ // grab necessary frustum coordinates
+ PxVec4 origin4 = matInv.transform(PxVec4(-1.f, 1.f,0.f,1.f));
+ PxVec4 basisX4 = matInv.transform(PxVec4( 1.f, 1.f,0.f,1.f));
+ PxVec4 basisY4 = matInv.transform(PxVec4(-1.f,-1.f,0.f,1.f));
+ PxVec4 zDepth4 = matInv.transform(PxVec4(-1.f, 1.f,1.f,1.f));
+ // create vec3 versions
+ PxVec3 origin3(origin4.x/origin4.w,origin4.y/origin4.w,origin4.z/origin4.w);
+ PxVec3 basisX3(basisX4.x/basisX4.w,basisX4.y/basisX4.w,basisX4.z/basisX4.w);
+ PxVec3 basisY3(basisY4.x/basisY4.w,basisY4.y/basisY4.w,basisY4.z/basisY4.w);
+ PxVec3 zDepth3(zDepth4.x/zDepth4.w,zDepth4.y/zDepth4.w,zDepth4.z/zDepth4.w);
+ // make everything relative to origin
+ basisX3 -= origin3;
+ basisY3 -= origin3;
+ // NOTE(review): zDepth3 is not read again after this adjustment within this function.
+ zDepth3 -= origin3;
+ // find third basis
+ PxVec3 basisZ3(basisX3.cross(basisY3));
+ basisZ3.normalize();
+ basisZ3*= targetDepth;
+ // see how skewed the eye point is
+ PxVec3 eye;
+ {
+ // find the eye point
+ // Two short segments are built at opposite corners (1,1) and (-1,-1), from z = 0 to z = 0.01;
+ // the 3x3 system [a b a x b] * coord = a3 - c3 is solved and the eye is placed on the
+ // second segment's line at parameter coord.y.
+ PxVec4 a4 = matInv.transform(PxVec4(1.f,1.f,0.00f,1.f));
+ PxVec4 b4 = matInv.transform(PxVec4(1.f,1.f,0.01f,1.f));
+ PxVec4 c4 = matInv.transform(PxVec4(-1.f,-1.f,0.00f,1.f));
+ PxVec4 d4 = matInv.transform(PxVec4(-1.f,-1.f,0.01f,1.f));
+ PxVec3 a3 = a4.getXYZ()/a4.w;
+ PxVec3 b3 = b4.getXYZ()/b4.w;
+ PxVec3 c3 = c4.getXYZ()/c4.w;
+ PxVec3 d3 = d4.getXYZ()/d4.w;
+ PxVec3 a = b3-a3;
+ PxVec3 b = d3-c3;
+ PxVec3 c = a.cross(b);
+ PxVec3 d = a3-c3;
+ PxMat33 m(a,b,c);
+ PxMat33 mInv = m.getInverse();
+ PxVec3 coord = mInv.transform(d);
+ eye = c3 + (d3-c3)*coord.y;
+ }
+ // build scale,rotation,translation matrix
+ PxMat44 mat1Inv = PxMat44(PxIdentity);
+ mat1Inv.column0 = PxVec4(basisX3,0.f);
+ mat1Inv.column1 = PxVec4(basisY3,0.f);
+ mat1Inv.column2 = PxVec4(basisZ3,0.f);
+ mat1Inv.column3 = PxVec4(origin3,1.f);
+ PxMat44 mat1 = inverse(mat1Inv);
+ PxVec3 eyeOffset = mat1.transform(eye);
+ // do perspective transform
+ PxMat44 mat2 = PxMat44(PxIdentity);
+ {
+ float xshift = -2.f*(eyeOffset.x-0.5f);
+ float yshift = -2.f*(eyeOffset.y-0.5f);
+ float left = -3.0f + xshift;
+ float right = 1.0f + xshift;
+ float top = 1.0f + yshift;
+ float bottom = -3.0f + yshift;
+ float nearVal = nearDimX/(0.5f*(myFarDimX-nearDimX));
+ //float farVal = nearVal + 1.f;
+ // build matrix
+ mat2.column0.x = -2.f*nearVal/(right-left);
+ mat2.column1.y = -2.f*nearVal/(top-bottom);
+ mat2.column2.x = (right+left)/(right-left);
+ mat2.column2.y = (top+bottom)/(top-bottom);
+ //mat2.column2.z = -(farVal+nearVal)/(farVal-nearVal);
+ mat2.column2.w = -1.f;
+ //mat2.column3.z = -(2.f*farVal*nearVal)/(farVal-nearVal);
+ mat2.column3.w = 0.f;
+ }
+ // shrink to calculate density just outside of frustum
+ PxMat44 mat3 = PxMat44(PxIdentity);
+ // Uniform scale of at most 0.75, re-centred by (1 - factor) / 2 on every axis.
+ float factor = PxMin((float)(mGridDensityParams.GridResolution-4) / (mGridDensityParams.GridResolution),0.75f);
+ {
+ mat3.column0.x = factor;
+ mat3.column1.y = factor;
+ mat3.column2.z = factor;
+ mat3.column3.x = (1.0f-factor)/2.0f;
+ mat3.column3.y = (1.0f-factor)/2.0f;
+ mat3.column3.z = (1.0f-factor)/2.0f;
+ }
+ // create final matrix
+ matDen = mat3*mat2*mat1;
+ // create frustum info
+ frustum.nearDimX = factor*nearDimX;
+ frustum.farDimX = factor*myFarDimX;
+ frustum.nearDimY = factor*nearDimY;
+ frustum.farDimY = factor*myFarDimY;
+ frustum.dimZ = factor*targetDepth;
+ // launch frustum kernels
+ // Order: clear grid, fill from particle positions, low-pass into the second grid, apply to particles.
+ CUDA_OBJ(gridDensityGridClearKernel)(stream, dim*dim*dim,
+ mGridDensityGrid.getGpuPtr(),
+ mGridDensityParams
+ );
+ CUDA_OBJ(gridDensityGridFillFrustumKernel)(stream, targetCount,
+ (float4*)mBufDesc.pmaPositionMass->getGpuPtr(),
+ mGridDensityGrid.getGpuPtr(),
+ mGridDensityParams,
+ matDen,
+ frustum
+ );
+ CUDA_OBJ(gridDensityGridLowPassKernel)(stream, dim*dim*dim,
+ mGridDensityGrid.getGpuPtr(),
+ mGridDensityGridLowPass.getGpuPtr(),
+ mGridDensityParams
+ );
+ // The density output pointer is NULL when the buffer descriptor has no density array.
+ CUDA_OBJ(gridDensityGridApplyFrustumKernel)(stream, targetCount,
+ mBufDesc.pmaDensity != NULL ? (float*)mBufDesc.pmaDensity->getGpuPtr() : NULL,
+ (float4*)mBufDesc.pmaPositionMass->getGpuPtr(),
+ mGridDensityGridLowPass.getGpuPtr(),
+ mGridDensityParams,
+ matDen,
+ frustum
+ );
+ }
+ }
+ }
+ return true;
+
+ case 5:
+ // Stage 5: reduction over the benefit values of the surviving particles into mTmpOutput.
+ if (targetCount > 0)
+ {
+ if (useSyncKernels)
+ {
+ CUDA_OBJ(reduceSyncKernel)(
+ stream, targetCount,
+ mBenefit.getGpuPtr(), (float4*)mTmpOutput.getGpuPtr(), mTmpReduce.getGpuPtr()
+ );
+ }
+ else
+ {
+ // Two launches; the second (single block) one receives the value returned by the first.
+ uint32_t reduceGridSize =
+ CUDA_OBJ(reduceKernel)(
+ stream, targetCount,
+ createApexCudaMemRef(mBenefit, targetCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef((float4*)mTmpOutput.getGpuPtr(), 1, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(mTmpReduce, ApexCudaMemFlags::OUT),
+ 1, 0
+ );
+
+ //launch just 1 block
+ CUDA_OBJ(reduceKernel)(
+ stream, APEX_CUDA_SINGLE_BLOCK_LAUNCH,
+ createApexCudaMemRef(mBenefit, targetCount, ApexCudaMemFlags::IN),
+ createApexCudaMemRef((float4*)mTmpOutput.getGpuPtr(), 1, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(mTmpReduce, ApexCudaMemFlags::IN),
+ 2, reduceGridSize
+ );
+ }
+ }
+ return true;
+
+ case 6:
+ // Stage 6: state kernel; reads pmaOutStateToInput and writes pmaInStateToInput.
+ if (totalCount > 0)
+ {
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefHoleScanSum, mHoleScanSum);
+ APEX_CUDA_TEXTURE_SCOPE_BIND(texRefMoveIndices, mMoveIndices);
+
+ CUDA_OBJ(stateKernel)(stream, totalCount,
+ lastCount, targetCount,
+ createApexCudaMemRef(mTmpScan, 1, ApexCudaMemFlags::IN),
+ createApexCudaMemRef(*mBufDesc.pmaInStateToInput, totalCount, ApexCudaMemFlags::OUT),
+ createApexCudaMemRef(*mBufDesc.pmaOutStateToInput, lastCount, ApexCudaMemFlags::IN)
+ );
+
+ }
+ return true;
+
+ case 7:
+ // Stage 7: read back the status output and injector counters, then hand over to the IofxManager.
+ mTmpOutput.copyDeviceToHostQ(mCopyQueue);
+ mInjectorsCounters.copyDeviceToHostQ(mCopyQueue);
+
+ // The remaining read-backs only happen when mTmpScan has a valid CPU pointer.
+ // NOTE(review): presumably this is the case only for CPU-mirrored (debug/validation) placement - confirm.
+ if (mTmpScan.cpuPtrIsValid())
+ {
+ mTmpOutput1.copyDeviceToHostQ(mCopyQueue);
+ mTmpScan.copyDeviceToHostQ(mCopyQueue, 1);
+ mHoleScanSum.copyDeviceToHostQ(mCopyQueue, totalCount);
+ mMoveIndices.copyDeviceToHostQ(mCopyQueue, totalCount);
+ mTmpHistogram.copyDeviceToHostQ(mCopyQueue, HISTOGRAM_BIN_COUNT);
+ mBenefit.copyDeviceToHostQ(mCopyQueue, totalCount);
+ mBufDesc.pmaInStateToInput->copyDeviceToHostQ(mCopyQueue, totalCount);
+ mBufDesc.pmaPositionMass->copyDeviceToHostQ(mCopyQueue, targetCount);
+ }
+ mCopyQueue.flushEnqueued();
+
+ /* Oh! Manager of the IOFX! do your thing */
+ mIofxMgr->updateEffectsData(deltaTime, targetCount, targetCount, totalCount, stream);
+ return false;
+ }
+ // Unhandled kernelIndex.
+ return false;
+}
+
+/* Collects the results of the finished step: runs the base-class fetch, validates the
+ * benefit / hole / in-state data for all particles that took part in the step, commits the
+ * particle budget as the new particle count and caches the status values found in mTmpOutput.
+ */
+void BasicIosActorGPU::fetchResults()
+{
+	BASIC_IOS_ACTOR::fetchResults();
+
+	// The checks run over the previous particles plus the ones injected for this step.
+	const uint32_t checkedCount = mParticleCount + mInjectedCount;
+	checkBenefit(checkedCount);
+	checkHoles(checkedCount);
+	checkInState(checkedCount);
+	/*checkHistogram(PxMax<int32_t>(mLastActiveCount + mInjectedCount - mParticleBudget, 0), mTmpHistogram[ mTmpOutput1[1] ], mTmpHistogram[ HISTOGRAM_BIN_COUNT - 1 ]);*/
+
+	// This can be done only after TaskLaunchAfterLod is finished!
+	mParticleCount = mParticleBudget;
+
+	// The active count is read through the array's own element type, the benefit statistics
+	// through a float view of the same storage.
+	float* statusAsFloat = (float*)mTmpOutput.getPtr();
+	mLastActiveCount = mTmpOutput[STATUS_LAST_ACTIVE_COUNT];
+	mLastBenefitSum = statusAsFloat[STATUS_LAST_BENEFIT_SUM];
+	mLastBenefitMin = statusAsFloat[STATUS_LAST_BENEFIT_MIN];
+	mLastBenefitMax = statusAsFloat[STATUS_LAST_BENEFIT_MAX];
+}
+
+
+/* Returns the inverse of a general 4x4 matrix.
+ *
+ * The matrix is read and written as 16 consecutive floats starting at column0.x
+ * (assumes PxMat44 stores its four columns contiguously - TODO confirm). Each inv[] entry is
+ * a signed 3x3 minor of the input (cofactor expansion); the determinant is then formed from
+ * the first four input floats and the matching cofactors, and every entry is scaled by 1/det.
+ *
+ * in       matrix to invert.
+ * returns  the inverse, or the identity matrix when the determinant is exactly zero.
+ */
+PxMat44 BasicIosActorGPU::inverse(const PxMat44& in)
+{
+ PxMat44 ret;
+ float inv[16];
+ float* invOut = &ret.column0.x;
+ const float* m = &in.column0.x;
+ int i;
+
+ // Unscaled cofactors; filled in the order 0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15.
+ inv[0] = m[5] * m[10] * m[15] -
+ m[5] * m[11] * m[14] -
+ m[9] * m[6] * m[15] +
+ m[9] * m[7] * m[14] +
+ m[13] * m[6] * m[11] -
+ m[13] * m[7] * m[10];
+
+ inv[4] = -m[4] * m[10] * m[15] +
+ m[4] * m[11] * m[14] +
+ m[8] * m[6] * m[15] -
+ m[8] * m[7] * m[14] -
+ m[12] * m[6] * m[11] +
+ m[12] * m[7] * m[10];
+
+ inv[8] = m[4] * m[9] * m[15] -
+ m[4] * m[11] * m[13] -
+ m[8] * m[5] * m[15] +
+ m[8] * m[7] * m[13] +
+ m[12] * m[5] * m[11] -
+ m[12] * m[7] * m[9];
+
+ inv[12] = -m[4] * m[9] * m[14] +
+ m[4] * m[10] * m[13] +
+ m[8] * m[5] * m[14] -
+ m[8] * m[6] * m[13] -
+ m[12] * m[5] * m[10] +
+ m[12] * m[6] * m[9];
+
+ inv[1] = -m[1] * m[10] * m[15] +
+ m[1] * m[11] * m[14] +
+ m[9] * m[2] * m[15] -
+ m[9] * m[3] * m[14] -
+ m[13] * m[2] * m[11] +
+ m[13] * m[3] * m[10];
+
+ inv[5] = m[0] * m[10] * m[15] -
+ m[0] * m[11] * m[14] -
+ m[8] * m[2] * m[15] +
+ m[8] * m[3] * m[14] +
+ m[12] * m[2] * m[11] -
+ m[12] * m[3] * m[10];
+
+ inv[9] = -m[0] * m[9] * m[15] +
+ m[0] * m[11] * m[13] +
+ m[8] * m[1] * m[15] -
+ m[8] * m[3] * m[13] -
+ m[12] * m[1] * m[11] +
+ m[12] * m[3] * m[9];
+
+ inv[13] = m[0] * m[9] * m[14] -
+ m[0] * m[10] * m[13] -
+ m[8] * m[1] * m[14] +
+ m[8] * m[2] * m[13] +
+ m[12] * m[1] * m[10] -
+ m[12] * m[2] * m[9];
+
+ inv[2] = m[1] * m[6] * m[15] -
+ m[1] * m[7] * m[14] -
+ m[5] * m[2] * m[15] +
+ m[5] * m[3] * m[14] +
+ m[13] * m[2] * m[7] -
+ m[13] * m[3] * m[6];
+
+ inv[6] = -m[0] * m[6] * m[15] +
+ m[0] * m[7] * m[14] +
+ m[4] * m[2] * m[15] -
+ m[4] * m[3] * m[14] -
+ m[12] * m[2] * m[7] +
+ m[12] * m[3] * m[6];
+
+ inv[10] = m[0] * m[5] * m[15] -
+ m[0] * m[7] * m[13] -
+ m[4] * m[1] * m[15] +
+ m[4] * m[3] * m[13] +
+ m[12] * m[1] * m[7] -
+ m[12] * m[3] * m[5];
+
+ inv[14] = -m[0] * m[5] * m[14] +
+ m[0] * m[6] * m[13] +
+ m[4] * m[1] * m[14] -
+ m[4] * m[2] * m[13] -
+ m[12] * m[1] * m[6] +
+ m[12] * m[2] * m[5];
+
+ inv[3] = -m[1] * m[6] * m[11] +
+ m[1] * m[7] * m[10] +
+ m[5] * m[2] * m[11] -
+ m[5] * m[3] * m[10] -
+ m[9] * m[2] * m[7] +
+ m[9] * m[3] * m[6];
+
+ inv[7] = m[0] * m[6] * m[11] -
+ m[0] * m[7] * m[10] -
+ m[4] * m[2] * m[11] +
+ m[4] * m[3] * m[10] +
+ m[8] * m[2] * m[7] -
+ m[8] * m[3] * m[6];
+
+ inv[11] = -m[0] * m[5] * m[11] +
+ m[0] * m[7] * m[9] +
+ m[4] * m[1] * m[11] -
+ m[4] * m[3] * m[9] -
+ m[8] * m[1] * m[7] +
+ m[8] * m[3] * m[5];
+
+ inv[15] = m[0] * m[5] * m[10] -
+ m[0] * m[6] * m[9] -
+ m[4] * m[1] * m[10] +
+ m[4] * m[2] * m[9] +
+ m[8] * m[1] * m[6] -
+ m[8] * m[2] * m[5];
+
+ // Determinant from the first four input floats and their cofactors.
+ float det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12];
+
+ // Only an exactly zero determinant is rejected; near-singular input is still inverted.
+ if (det == 0)
+ return PxMat44(PxIdentity);
+
+ det = 1.0f / det;
+
+ // Scale the cofactors by 1/det straight into the result matrix storage.
+ for (i = 0; i < 16; i++)
+ invOut[i] = inv[i] * det;
+
+ return ret;
+}
+
+/* Euclidean distance between two homogeneous points.
+ * Each input is first projected to 3D by dividing x, y and z by its own w component;
+ * the result is the length of the difference of the two projected points.
+ * No check is made for w == 0.
+ */
+float BasicIosActorGPU::distance(PxVec4 a, PxVec4 b)
+{
+	// Perspective divide, done per component.
+	const PxVec3 projectedA(a.x / a.w, a.y / a.w, a.z / a.w);
+	const PxVec3 projectedB(b.x / b.w, b.y / b.w, b.z / b.w);
+
+	const PxVec3 delta(projectedB - projectedA);
+	return delta.magnitude();
+}
+
+}
+} // namespace nvidia
+
+#endif //APEX_CUDA_SUPPORT