summaryrefslogtreecommitdiff
path: root/src/FFT_Simulation_Manager_CUDA.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/FFT_Simulation_Manager_CUDA.cpp')
-rw-r--r--src/FFT_Simulation_Manager_CUDA.cpp766
1 files changed, 766 insertions, 0 deletions
diff --git a/src/FFT_Simulation_Manager_CUDA.cpp b/src/FFT_Simulation_Manager_CUDA.cpp
new file mode 100644
index 0000000..c6aae8a
--- /dev/null
+++ b/src/FFT_Simulation_Manager_CUDA.cpp
@@ -0,0 +1,766 @@
+// This code contains NVIDIA Confidential Information and is disclosed
+// under the Mutual Non-Disclosure Agreement.
+//
+// Notice
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT,
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+//
+// NVIDIA Corporation assumes no responsibility for the consequences of use of such
+// information or for any infringement of patents or other rights of third parties that may
+// result from its use. No license is granted by implication or otherwise under any patent
+// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless
+// expressly authorized by NVIDIA. Details are subject to change without notice.
+// This code supersedes and replaces all information previously supplied.
+// NVIDIA Corporation products are not authorized for use as critical
+// components in life support devices or systems without express written approval of
+// NVIDIA Corporation.
+//
+// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved.
+//
+// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
+// rights in and to this software and related documentation and any modifications thereto.
+// Any use, reproduction, disclosure or distribution of this software and related
+// documentation without an express license agreement from NVIDIA Corporation is
+// strictly prohibited.
+//
+
+#include "FFT_Simulation_Manager_CUDA_impl.h"
+#include "FFT_Simulation_CUDA_impl.h"
+
+#ifdef SUPPORT_CUDA
+#include <malloc.h>
+#include <string.h>
+
+#if defined(TARGET_PLATFORM_NIXLIKE)
+#define _alloca alloca
+#endif
+
+extern "C"
+{
+ cudaError cuda_GetConstantsSize(size_t* size);
+ cudaError cuda_GetConstantsAddress(void** ptr);
+}
+
+NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl() :
+ m_NextKickID(0),
+ m_StagingCursorIsValid(false),
+ m_StagingCursorKickID(0)
+{
+ m_numCudaDevices = 0;
+ m_activeCudaDeviceIndex = 0;
+ m_pCudaDeviceInfos = NULL;
+ m_cudaResourcesInitialised = false;
+ m_d3dAPI = nv_water_d3d_api_undefined;
+}
+
+NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::~NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl()
+{
+ releaseAll();
+}
+
+void NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseAll()
+{
+ if(m_cudaResourcesInitialised)
+ {
+ releaseCudaResources();
+ }
+
+ switch(m_d3dAPI)
+ {
+#if WAVEWORKS_ENABLE_D3D9
+ case nv_water_d3d_api_d3d9:
+ {
+ SAFE_RELEASE(m_d3d._9.m_pd3d9Device);
+ }
+ break;
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+ case nv_water_d3d_api_d3d10:
+ {
+ SAFE_RELEASE(m_d3d._10.m_pd3d10Device);
+ }
+ break;
+#endif
+#if WAVEWORKS_ENABLE_D3D11
+ case nv_water_d3d_api_d3d11:
+ {
+ SAFE_RELEASE(m_d3d._11.m_pd3d11Device);
+ }
+ break;
+#endif
+ }
+
+ assert(0 == m_Simulations.size()); // It is an error to destroy a non-empty manager
+ m_Simulations.erase_all();
+
+ m_d3dAPI = nv_water_d3d_api_undefined;
+
+ SAFE_DELETE_ARRAY(m_pCudaDeviceInfos);
+ m_numCudaDevices = 0;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D9(IDirect3DDevice9* pD3DDevice)
+{
+#if WAVEWORKS_ENABLE_D3D9
+ if(nv_water_d3d_api_d3d9 != m_d3dAPI)
+ {
+ releaseAll();
+ }
+ else if(m_d3d._9.m_pd3d9Device != pD3DDevice)
+ {
+ releaseAll();
+ }
+
+ if(nv_water_d3d_api_undefined == m_d3dAPI)
+ {
+ m_d3dAPI = nv_water_d3d_api_d3d9;
+ m_d3d._9.m_pd3d9Device = pD3DDevice;
+ m_d3d._9.m_pd3d9Device->AddRef();
+
+ CUDA_V_RETURN(cudaD3D9GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D9DeviceListAll));
+ int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+ CUDA_V_RETURN(cudaD3D9GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D9DeviceListAll));
+ m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+ memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+ for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+ {
+ m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+ }
+ }
+
+ return S_OK;
+#else
+ return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D10(ID3D10Device* pD3DDevice)
+{
+#if WAVEWORKS_ENABLE_D3D10
+ if(nv_water_d3d_api_d3d10 != m_d3dAPI)
+ {
+ releaseAll();
+ }
+ else if(m_d3d._10.m_pd3d10Device != pD3DDevice)
+ {
+ releaseAll();
+ }
+
+ if(nv_water_d3d_api_undefined == m_d3dAPI)
+ {
+ m_d3dAPI = nv_water_d3d_api_d3d10;
+ m_d3d._10.m_pd3d10Device = pD3DDevice;
+ m_d3d._10.m_pd3d10Device->AddRef();
+
+ CUDA_V_RETURN(cudaD3D10GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D10DeviceListAll));
+ int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+ CUDA_V_RETURN(cudaD3D10GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D10DeviceListAll));
+ m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+ memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+ for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+ {
+ m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+ }
+ }
+
+ return S_OK;
+#else
+ return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D11(ID3D11Device* pD3DDevice)
+{
+#if WAVEWORKS_ENABLE_D3D11
+ if(nv_water_d3d_api_d3d11 != m_d3dAPI)
+ {
+ releaseAll();
+ }
+ else if(m_d3d._11.m_pd3d11Device != pD3DDevice)
+ {
+ releaseAll();
+ }
+
+ if(nv_water_d3d_api_undefined == m_d3dAPI)
+ {
+ m_d3dAPI = nv_water_d3d_api_d3d11;
+ m_d3d._11.m_pd3d11Device = pD3DDevice;
+ m_d3d._11.m_pd3d11Device->AddRef();
+
+ CUDA_V_RETURN(cudaD3D11GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D11DeviceListAll));
+ int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+ CUDA_V_RETURN(cudaD3D11GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D11DeviceListAll));
+ m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+ memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+ for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+ {
+ m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+ }
+ }
+
+ return S_OK;
+#else
+ return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initGL2(void* pGLContext)
+{
+#if WAVEWORKS_ENABLE_GL
+ if(nv_water_d3d_api_gl2 != m_d3dAPI)
+ {
+ releaseAll();
+ }
+ else if(m_d3d._GL2.m_pGLContext != pGLContext)
+ {
+ releaseAll();
+ }
+
+ if(nv_water_d3d_api_undefined == m_d3dAPI)
+ {
+ m_d3dAPI = nv_water_d3d_api_gl2;
+
+ CUDA_V_RETURN(cudaGLGetDevices(&m_numCudaDevices, NULL, 0, cudaGLDeviceListAll));
+ int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+ CUDA_API_RETURN(cudaGLGetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, cudaGLDeviceListAll));
+ m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+ memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+ for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+ {
+ m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+ }
+ }
+ return S_OK;
+#else
+ return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initNoGraphics()
+{
+ if(nv_water_d3d_api_none != m_d3dAPI)
+ {
+ releaseAll();
+ }
+
+ if(nv_water_d3d_api_undefined == m_d3dAPI)
+ {
+ m_d3dAPI = nv_water_d3d_api_none;
+
+ int cuda_device;
+ CUDA_V_RETURN(cudaGetDevice(&cuda_device));
+
+ m_numCudaDevices = 1;
+ m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+ memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+ m_pCudaDeviceInfos->m_cudaDevice = cuda_device;
+ }
+
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::allocateCudaResources()
+{
+ for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+ {
+ CudaDeviceInfo& dev_state = m_pCudaDeviceInfos[cuda_dev_index];
+ CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));
+
+ CUDA_V_RETURN(cuda_GetConstantsSize(&dev_state.m_constants_size));
+ CUDA_V_RETURN(cuda_GetConstantsAddress(&dev_state.m_constants_address));
+ CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_constants, dev_state.m_constants_size));
+ CUDA_V_RETURN(cudaMemset(dev_state.m_device_constants, 0, dev_state.m_constants_size));
+
+ CUDA_V_RETURN(cudaStreamCreateWithFlags(&dev_state.m_kernel_stream,cudaStreamNonBlocking));
+ CUDA_V_RETURN(cudaStreamCreateWithFlags(&dev_state.m_readback_stream,cudaStreamNonBlocking));
+ }
+
+ m_cudaResourcesInitialised = true;
+
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseCudaResources()
+{
+ for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+ {
+ CudaDeviceInfo& dev_state = m_pCudaDeviceInfos[cuda_dev_index];
+ CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));
+
+ CUDA_SAFE_FREE(dev_state.m_device_constants);
+
+ CUDA_V_RETURN(cudaStreamDestroy(dev_state.m_kernel_stream));
+ CUDA_V_RETURN(cudaStreamDestroy(dev_state.m_readback_stream));
+ }
+
+ m_cudaResourcesInitialised = false;
+
+ return S_OK;
+}
+
+NVWaveWorks_FFT_Simulation* NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params)
+{
+ NVWaveWorks_FFT_Simulation_CUDA_Impl* pResult = new NVWaveWorks_FFT_Simulation_CUDA_Impl(this,params);
+ m_Simulations.push_back(pResult);
+ return pResult;
+}
+
+void NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation)
+{
+ //remove from list
+ m_Simulations.erase(pSimulation);
+
+ SAFE_DELETE(pSimulation);
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& /*params*/, bool /*reinitOnly*/)
+{
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::checkForReadbackResults()
+{
+ HRESULT hr;
+
+ // The goal here is to evolve the readback state of all our simulations in lockstep, so that either all our simulations collect
+ // a single readback or else none do (IOW: 'some' is *not* permitted, because it would break lockstep)
+
+ NVWaveWorks_FFT_Simulation_CUDA_Impl** pBeginSimulationsSrc = (NVWaveWorks_FFT_Simulation_CUDA_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*));
+ memcpy(pBeginSimulationsSrc,m_Simulations.begin(),m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*));
+ NVWaveWorks_FFT_Simulation_CUDA_Impl** pEndSimulationsSrc = pBeginSimulationsSrc + m_Simulations.size();
+
+ NVWaveWorks_FFT_Simulation_CUDA_Impl** pBeginSimulationsNoResult = (NVWaveWorks_FFT_Simulation_CUDA_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*));;
+ NVWaveWorks_FFT_Simulation_CUDA_Impl** pEndSimulationsNoResult = pBeginSimulationsNoResult;
+
+ // Do an initial walk thru and see if any readbacks arrived (without blocking), and write any that did not get a readback result into dst
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = pBeginSimulationsSrc; pSim != pEndSimulationsSrc; ++pSim)
+ {
+ hr = (*pSim)->collectSingleReadbackResult(false);
+ if(FAILED(hr))
+ {
+ return hr;
+ }
+
+ if(S_FALSE == hr)
+ {
+ (*pEndSimulationsNoResult) = (*pSim);
+ ++pEndSimulationsNoResult;
+ }
+ }
+
+ // If no results are ready, we're in sync so don't try again
+ if((pEndSimulationsNoResult-pBeginSimulationsNoResult) != m_Simulations.size())
+ {
+ // Otherwise, wait on the remaining results
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = pBeginSimulationsNoResult; pSim != pEndSimulationsNoResult; ++pSim)
+ {
+ V_RETURN((*pSim)->collectSingleReadbackResult(true));
+ }
+ }
+
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::kick(Graphics_Context* /*pGC*/, double dSimTime, gfsdk_U64& kickID)
+{
+ HRESULT hr;
+
+ kickID = m_NextKickID;
+
+ if(!m_cudaResourcesInitialised)
+ {
+ V_RETURN(allocateCudaResources());
+ }
+
+ // Check for readback results - note that we do this at the manager level in order to guarantee lockstep between
+ // the simulations that form a cascade. We either want all of simulations to collect a result, or none - some is
+ // not an option
+ checkForReadbackResults();
+
+ // Be sure to use the correct cuda device for the current frame (important in SLI)
+ int cuda_device = -1;
+ if(1 == m_numCudaDevices)
+ {
+ m_activeCudaDeviceIndex = 0;
+ cuda_device = m_pCudaDeviceInfos[m_activeCudaDeviceIndex].m_cudaDevice;
+ CUDA_V_RETURN(cudaSetDevice(cuda_device));
+ }
+ else
+ {
+ // Multiple devices, we will have to do it the 'long' way
+ switch(m_d3dAPI)
+ {
+#if WAVEWORKS_ENABLE_D3D9
+ case nv_water_d3d_api_d3d9:
+ {
+ unsigned int cuda_device_count = 0;
+ CUDA_V_RETURN(cudaD3D9GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._9.m_pd3d9Device, cudaD3D9DeviceListCurrentFrame));
+ CUDA_V_RETURN(cudaSetDevice(cuda_device));
+ break;
+ }
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+ case nv_water_d3d_api_d3d10:
+ {
+ unsigned int cuda_device_count = 0;
+ CUDA_V_RETURN(cudaD3D10GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._10.m_pd3d10Device, cudaD3D10DeviceListCurrentFrame));
+ CUDA_V_RETURN(cudaSetDevice(cuda_device));
+ break;
+ }
+#endif
+#if WAVEWORKS_ENABLE_D3D11
+ case nv_water_d3d_api_d3d11:
+ {
+ unsigned int cuda_device_count = 0;
+ CUDA_V_RETURN(cudaD3D11GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._11.m_pd3d11Device, cudaD3D11DeviceListCurrentFrame));
+ CUDA_V_RETURN(cudaSetDevice(cuda_device));
+ break;
+ }
+#endif
+#if WAVEWORKS_ENABLE_GL
+ case nv_water_d3d_api_gl2:
+ {
+ unsigned int cuda_device_count = 0;
+ CUDA_V_RETURN(cudaGLGetDevices(&cuda_device_count, &cuda_device, 1, cudaGLDeviceListCurrentFrame));
+ CUDA_V_RETURN(cudaSetDevice(cuda_device));
+ break;
+ }
+#endif
+ case nv_water_d3d_api_none:
+ {
+ assert(1 == m_numCudaDevices); // Well by the time we get here we're guaranteed to hit this assert,
+ // but the assert neatly documents the violated expecation i.e. the only
+ // supported no-graphics CUDA path is single device
+ break;
+ }
+ default:
+ return E_FAIL;
+ }
+
+ // Match the current device to our list
+ for(unsigned int cuda_device_index = 0; cuda_device_index != m_numCudaDevices; ++cuda_device_index)
+ {
+ if(cuda_device == m_pCudaDeviceInfos[cuda_device_index].m_cudaDevice)
+ {
+ m_activeCudaDeviceIndex = cuda_device_index;
+ break;
+ }
+ }
+ }
+
+ const CudaDeviceInfo& active_dev_info = m_pCudaDeviceInfos[m_activeCudaDeviceIndex];
+
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ V_RETURN((*pSim)->preKick(pSim - m_Simulations.begin()));
+ }
+ CUDA_V_RETURN(cudaMemcpyAsync(active_dev_info.m_constants_address, active_dev_info.m_device_constants,
+ active_dev_info.m_constants_size, cudaMemcpyDeviceToDevice, active_dev_info.m_kernel_stream));
+
+ // Do all the CUDA work as far as interop
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ V_RETURN((*pSim)->kickPreInterop(dSimTime,kickID));
+ }
+
+ // Map for interop
+ V_RETURN(mapInteropResources(active_dev_info));
+
+ // Do all interop CUDA work
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ V_RETURN((*pSim)->kickWithinInterop(kickID));
+ }
+
+ // Unmap for interop
+ V_RETURN(unmapInteropResources(active_dev_info));
+
+ // Do post-interop CUDA work
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ V_RETURN((*pSim)->kickPostInterop(kickID));
+ }
+
+ m_StagingCursorIsValid = true;
+ m_StagingCursorKickID = kickID;
+ ++m_NextKickID;
+
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::mapInteropResources(const CudaDeviceInfo& cdi)
+{
+ switch(m_d3dAPI)
+ {
+#if WAVEWORKS_ENABLE_D3D9
+ case nv_water_d3d_api_d3d9:
+ {
+ const int num_resources = m_Simulations.size();
+ IDirect3DResource9** pInteropResources = (IDirect3DResource9**)alloca(sizeof(IDirect3DResource9*)*num_resources);
+ int i = 0;
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+ {
+ pInteropResources[i] = (*pSim)->getD3D9InteropResource(m_activeCudaDeviceIndex);
+ }
+ CUDA_V_RETURN(cudaD3D9MapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream?
+ break;
+ }
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+ case nv_water_d3d_api_d3d10:
+ {
+ const int num_resources = m_Simulations.size();
+ ID3D10Resource** pInteropResources = (ID3D10Resource**)alloca(sizeof(ID3D10Resource*)*num_resources);
+ int i = 0;
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+ {
+ pInteropResources[i] = (*pSim)->getD3D10InteropResource(m_activeCudaDeviceIndex);
+ }
+ CUDA_V_RETURN(cudaD3D10MapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream?
+ break;
+ }
+#endif
+#if WAVEWORKS_ENABLE_D3D11 || WAVEWORKS_ENABLE_GL
+ case nv_water_d3d_api_d3d11:
+ case nv_water_d3d_api_gl2:
+ {
+ const int num_resources = m_Simulations.size();
+ cudaGraphicsResource** pInteropResources = (cudaGraphicsResource**)alloca(sizeof(cudaGraphicsResource*)*num_resources);
+ int i = 0;
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+ {
+ pInteropResources[i] = (*pSim)->getInteropResource(m_activeCudaDeviceIndex);
+ }
+ CUDA_V_RETURN(cudaGraphicsMapResources(num_resources, pInteropResources, cdi.m_kernel_stream));
+ break;
+ }
+#endif
+ case nv_water_d3d_api_none:
+ {
+ // Nothing to do...
+ break;
+ }
+ default:
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::unmapInteropResources(const CudaDeviceInfo& cdi)
+{
+ switch(m_d3dAPI)
+ {
+#if WAVEWORKS_ENABLE_D3D9
+ case nv_water_d3d_api_d3d9:
+ {
+ const int num_resources = m_Simulations.size();
+ IDirect3DResource9** pInteropResources = (IDirect3DResource9**)alloca(sizeof(IDirect3DResource9*)*num_resources);
+ int i = 0;
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+ {
+ pInteropResources[i] = (*pSim)->getD3D9InteropResource(m_activeCudaDeviceIndex);
+ }
+ CUDA_V_RETURN(cudaD3D9UnmapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream?
+ break;
+ }
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+ case nv_water_d3d_api_d3d10:
+ {
+ const int num_resources = m_Simulations.size();
+ ID3D10Resource** pInteropResources = (ID3D10Resource**)alloca(sizeof(ID3D10Resource*)*num_resources);
+ int i = 0;
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+ {
+ pInteropResources[i] = (*pSim)->getD3D10InteropResource(m_activeCudaDeviceIndex);
+ }
+ CUDA_V_RETURN(cudaD3D10UnmapResources(num_resources, pInteropResources)); // @TODO: why no cu_stream?
+ break;
+ }
+#endif
+#if WAVEWORKS_ENABLE_D3D11 || WAVEWORKS_ENABLE_GL
+ case nv_water_d3d_api_d3d11:
+ case nv_water_d3d_api_gl2:
+ {
+ const int num_resources = m_Simulations.size();
+ cudaGraphicsResource** pInteropResources = (cudaGraphicsResource**)alloca(sizeof(cudaGraphicsResource*)*num_resources);
+ int i = 0;
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+ {
+ pInteropResources[i] = (*pSim)->getInteropResource(m_activeCudaDeviceIndex);
+ }
+ CUDA_V_RETURN(cudaGraphicsUnmapResources(num_resources, pInteropResources, cdi.m_kernel_stream));
+ break;
+ }
+#endif
+ case nv_water_d3d_api_none:
+ {
+ // Nothing to do...
+ break;
+ }
+ default:
+ return E_FAIL;
+ }
+
+ return S_OK;
+}
+
+bool NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getStagingCursor(gfsdk_U64* pKickID)
+{
+ if(pKickID && m_StagingCursorIsValid)
+ {
+ *pKickID = m_StagingCursorKickID;
+ }
+
+ return m_StagingCursorIsValid;
+}
+
+NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::advanceStagingCursor(bool /*block*/)
+{
+ // The CUDA pipeline pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately
+ return AdvanceCursorResult_None;
+}
+NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::waitStagingCursor()
+{
+ // The CUDA pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately
+ return WaitCursorResult_None;
+}
+
+bool NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getReadbackCursor(gfsdk_U64* pKickID)
+{
+ if(0 == m_Simulations.size())
+ return false;
+
+ // We rely on collectSingleReadbackResult() to maintain lockstep between the cascade members, therefore we can in theory
+ // query any member to get the readback cursor...
+
+ // ...but let's check that theory in debug builds!!!
+#ifdef _DEV
+ if(m_Simulations.size() > 1)
+ {
+ gfsdk_U64 sim0KickID;
+ bool sim0GRCresult = m_Simulations[0]->getReadbackCursor(&sim0KickID);
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin()+1; pSim != m_Simulations.end(); ++pSim)
+ {
+ gfsdk_U64 simNKickID;
+ bool simNGRCresult = (*pSim)->getReadbackCursor(&simNKickID);
+ assert(simNGRCresult == sim0GRCresult);
+ if(sim0GRCresult)
+ {
+ assert(sim0KickID == simNKickID);
+ }
+ }
+
+ }
+#endif
+
+ return m_Simulations[0]->getReadbackCursor(pKickID);
+}
+
+NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::advanceReadbackCursor(bool block)
+{
+ if(0 == m_Simulations.size())
+ return AdvanceCursorResult_None;
+
+ // First, check whether we even have readbacks in-flight
+ const bool hasReadbacksInFlightSim0 = m_Simulations[0]->hasReadbacksInFlight();
+
+ // Usual paranoid verficiation that we're maintaining lockstep...
+#ifdef _DEV
+ if(m_Simulations.size() > 1)
+ {
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin()+1; pSim != m_Simulations.end(); ++pSim)
+ {
+ assert(hasReadbacksInFlightSim0 == (*pSim)->hasReadbacksInFlight());
+ }
+ }
+#endif
+
+ if(!hasReadbacksInFlightSim0)
+ {
+ return AdvanceCursorResult_None;
+ }
+
+ if(!block)
+ {
+ // Non-blocking case - in order to maintain lockstep, either all of the simulations should consume a readback,
+ // or none. Therefore we need to do an initial pass to test whether the 'all' case applies (and bail if not)...
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ HRESULT hr = (*pSim)->canCollectSingleReadbackResultWithoutBlocking();
+ if(FAILED(hr))
+ {
+ return AdvanceCursorResult_Failed;
+ }
+ else if(S_FALSE == hr)
+ {
+ // Cannot advance, would have blocked -> bail
+ return AdvanceCursorResult_WouldBlock;
+ }
+ }
+ }
+
+ // We have readbacks in flight, and in the non-blocking case we *should* be in a position to consume them without
+ // any waiting, so just visit each simulation in turn with a blocking wait for the next readback to complete...
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ if(FAILED((*pSim)->collectSingleReadbackResult(true)))
+ {
+ return AdvanceCursorResult_Failed;
+ }
+ }
+
+ return AdvanceCursorResult_Succeeded;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::archiveDisplacements()
+{
+ HRESULT hr;
+
+ if(!getReadbackCursor(NULL))
+ {
+ return E_FAIL;
+ }
+
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ V_RETURN((*pSim)->archiveDisplacements());
+ }
+
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings)
+{
+ // CUDA implementation doesn't update these CPU implementation related timings
+ timings.time_start_to_stop = 0;
+ timings.time_total = 0;
+ timings.time_wait_for_completion = 0;
+ return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::beforeReallocateSimulation()
+{
+ HRESULT hr;
+
+ // A simulation is about to be reallocated...
+
+ // Implication 1: at least some displacement map contents will become undefined and
+ // will need a kick to make them valid again, which in turn means that we can no longer
+ // consider any kick that was previously staged as still being staged...
+ m_StagingCursorIsValid = false;
+
+ // Implication 2: some of the readback tracking will be reset, meaning we break
+ // lockstep. We can avoid this by forcible resetting all readback tracking
+ for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+ {
+ V_RETURN((*pSim)->resetReadbacks());
+ }
+
+ return S_OK;
+}
+
+#endif // SUPPORT_CUDA