1 files changed, 766 insertions, 0 deletions
diff --git a/src/FFT_Simulation_Manager_CUDA.cpp b/src/FFT_Simulation_Manager_CUDA.cpp
new file mode 100644
index 0000000..c6aae8a
--- /dev/null
+++ b/src/FFT_Simulation_Manager_CUDA.cpp
@@ -0,0 +1,766 @@
+// This code contains NVIDIA Confidential Information and is disclosed 
+// under the Mutual Non-Disclosure Agreement. 
+// 
+// Notice 
+// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES 
+// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO 
+// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, 
+// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+// 
+// NVIDIA Corporation assumes no responsibility for the consequences of use of such 
+// information or for any infringement of patents or other rights of third parties that may 
+// result from its use. No license is granted by implication or otherwise under any patent 
+// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless 
+// expressly authorized by NVIDIA.  Details are subject to change without notice. 
+// This code supersedes and replaces all information previously supplied. 
+// NVIDIA Corporation products are not authorized for use as critical 
+// components in life support devices or systems without express written approval of 
+// NVIDIA Corporation. 
+// 
+// Copyright � 2008- 2013 NVIDIA Corporation. All rights reserved.
+//
+// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
+// rights in and to this software and related documentation and any modifications thereto.
+// Any use, reproduction, disclosure or distribution of this software and related
+// documentation without an express license agreement from NVIDIA Corporation is
+// strictly prohibited.
+//
+
+#include "FFT_Simulation_Manager_CUDA_impl.h"
+#include "FFT_Simulation_CUDA_impl.h"
+
+#ifdef SUPPORT_CUDA
+#include <malloc.h>
+#include <string.h>
+
+#if defined(TARGET_PLATFORM_NIXLIKE)
+#define _alloca alloca
+#endif
+
+extern "C"
+{
+	cudaError cuda_GetConstantsSize(size_t* size);
+	cudaError cuda_GetConstantsAddress(void** ptr);
+}
+
+NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl() :
+	m_NextKickID(0),
+	m_StagingCursorIsValid(false),
+	m_StagingCursorKickID(0)
+{
+	m_numCudaDevices = 0;
+	m_activeCudaDeviceIndex = 0;
+	m_pCudaDeviceInfos = NULL;
+	m_cudaResourcesInitialised = false;
+	m_d3dAPI = nv_water_d3d_api_undefined;
+}
+
+NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::~NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl()
+{
+	releaseAll();
+}
+
+void NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseAll()
+{
+    if(m_cudaResourcesInitialised)
+    {
+        releaseCudaResources();
+    }
+
+    switch(m_d3dAPI)
+    {
+#if WAVEWORKS_ENABLE_D3D9
+    case nv_water_d3d_api_d3d9:
+        {
+			SAFE_RELEASE(m_d3d._9.m_pd3d9Device);
+        }
+        break;
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+    case nv_water_d3d_api_d3d10:
+        {
+			SAFE_RELEASE(m_d3d._10.m_pd3d10Device);
+        }
+		break;
+#endif
+#if WAVEWORKS_ENABLE_D3D11
+    case nv_water_d3d_api_d3d11:
+        {
+			SAFE_RELEASE(m_d3d._11.m_pd3d11Device);
+        }
+        break;
+#endif
+    }
+
+	assert(0 == m_Simulations.size());	// It is an error to destroy a non-empty manager
+	m_Simulations.erase_all();
+
+	m_d3dAPI = nv_water_d3d_api_undefined;
+
+	SAFE_DELETE_ARRAY(m_pCudaDeviceInfos);
+	m_numCudaDevices = 0;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D9(IDirect3DDevice9* pD3DDevice)
+{
+#if WAVEWORKS_ENABLE_D3D9
+    if(nv_water_d3d_api_d3d9 != m_d3dAPI)
+    {
+        releaseAll();
+    }
+    else if(m_d3d._9.m_pd3d9Device != pD3DDevice)
+    {
+        releaseAll();
+    }
+
+    if(nv_water_d3d_api_undefined == m_d3dAPI)
+    {
+        m_d3dAPI = nv_water_d3d_api_d3d9;
+        m_d3d._9.m_pd3d9Device = pD3DDevice;
+        m_d3d._9.m_pd3d9Device->AddRef();
+
+		CUDA_V_RETURN(cudaD3D9GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D9DeviceListAll));
+		int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+		CUDA_V_RETURN(cudaD3D9GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D9DeviceListAll));
+		m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+		memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+		for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+		{
+			m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+		}
+    }
+
+    return S_OK;
+#else
+    return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D10(ID3D10Device* pD3DDevice)
+{
+#if WAVEWORKS_ENABLE_D3D10
+    if(nv_water_d3d_api_d3d10 != m_d3dAPI)
+    {
+        releaseAll();
+    }
+    else if(m_d3d._10.m_pd3d10Device != pD3DDevice)
+    {
+        releaseAll();
+    }
+
+    if(nv_water_d3d_api_undefined == m_d3dAPI)
+    {
+        m_d3dAPI = nv_water_d3d_api_d3d10;
+        m_d3d._10.m_pd3d10Device = pD3DDevice;
+        m_d3d._10.m_pd3d10Device->AddRef();
+
+		CUDA_V_RETURN(cudaD3D10GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D10DeviceListAll));
+		int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+		CUDA_V_RETURN(cudaD3D10GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D10DeviceListAll));
+		m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+		memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+		for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+		{
+			m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+		}
+    }
+
+    return S_OK;
+#else
+    return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initD3D11(ID3D11Device* pD3DDevice)
+{
+#if WAVEWORKS_ENABLE_D3D11
+    if(nv_water_d3d_api_d3d11 != m_d3dAPI)
+    {
+        releaseAll();
+    }
+    else if(m_d3d._11.m_pd3d11Device != pD3DDevice)
+    {
+        releaseAll();
+    }
+
+    if(nv_water_d3d_api_undefined == m_d3dAPI)
+    {
+        m_d3dAPI = nv_water_d3d_api_d3d11;
+        m_d3d._11.m_pd3d11Device = pD3DDevice;
+        m_d3d._11.m_pd3d11Device->AddRef();
+
+		CUDA_V_RETURN(cudaD3D11GetDevices(&m_numCudaDevices, NULL, 0, pD3DDevice, cudaD3D11DeviceListAll));
+		int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+		CUDA_V_RETURN(cudaD3D11GetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, pD3DDevice, cudaD3D11DeviceListAll));
+		m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+		memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+		for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+		{
+			m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+		}
+    }
+
+    return S_OK;
+#else
+    return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initGL2(void* pGLContext)
+{
+#if WAVEWORKS_ENABLE_GL
+	if(nv_water_d3d_api_gl2 != m_d3dAPI)
+    {
+        releaseAll();
+    }
+    else if(m_d3d._GL2.m_pGLContext != pGLContext)
+    {
+        releaseAll();
+    }
+
+    if(nv_water_d3d_api_undefined == m_d3dAPI)
+    {
+        m_d3dAPI = nv_water_d3d_api_gl2;
+
+		CUDA_V_RETURN(cudaGLGetDevices(&m_numCudaDevices, NULL, 0, cudaGLDeviceListAll));
+		int* pCudaDevices = (int*)_alloca(m_numCudaDevices * sizeof(int));
+		CUDA_API_RETURN(cudaGLGetDevices(&m_numCudaDevices, pCudaDevices, m_numCudaDevices, cudaGLDeviceListAll));
+		m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+		memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+		for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+		{
+			m_pCudaDeviceInfos[cuda_dev_index].m_cudaDevice = pCudaDevices[cuda_dev_index];
+		}
+    }
+    return S_OK;
+#else
+	return E_FAIL;
+#endif
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::initNoGraphics()
+{
+    if(nv_water_d3d_api_none != m_d3dAPI)
+    {
+        releaseAll();
+    }
+
+    if(nv_water_d3d_api_undefined == m_d3dAPI)
+    {
+        m_d3dAPI = nv_water_d3d_api_none;
+
+		int cuda_device;
+		CUDA_V_RETURN(cudaGetDevice(&cuda_device));
+
+		m_numCudaDevices = 1;
+		m_pCudaDeviceInfos = new CudaDeviceInfo[m_numCudaDevices];
+		memset(m_pCudaDeviceInfos, 0, m_numCudaDevices * sizeof(CudaDeviceInfo));
+		m_pCudaDeviceInfos->m_cudaDevice = cuda_device;
+    }
+
+    return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::allocateCudaResources()
+{
+	for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+	{
+		CudaDeviceInfo& dev_state = m_pCudaDeviceInfos[cuda_dev_index];
+		CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));
+
+		CUDA_V_RETURN(cuda_GetConstantsSize(&dev_state.m_constants_size));
+		CUDA_V_RETURN(cuda_GetConstantsAddress(&dev_state.m_constants_address));
+		CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_constants, dev_state.m_constants_size));
+		CUDA_V_RETURN(cudaMemset(dev_state.m_device_constants, 0, dev_state.m_constants_size));
+
+		CUDA_V_RETURN(cudaStreamCreateWithFlags(&dev_state.m_kernel_stream,cudaStreamNonBlocking));
+		CUDA_V_RETURN(cudaStreamCreateWithFlags(&dev_state.m_readback_stream,cudaStreamNonBlocking));
+	}
+
+    m_cudaResourcesInitialised = true;
+
+    return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseCudaResources()
+{
+	for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
+	{
+		CudaDeviceInfo& dev_state = m_pCudaDeviceInfos[cuda_dev_index];
+		CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));
+
+		CUDA_SAFE_FREE(dev_state.m_device_constants);
+
+		CUDA_V_RETURN(cudaStreamDestroy(dev_state.m_kernel_stream));
+		CUDA_V_RETURN(cudaStreamDestroy(dev_state.m_readback_stream));
+	}
+
+    m_cudaResourcesInitialised = false;
+
+    return S_OK;
+}
+
+NVWaveWorks_FFT_Simulation* NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::createSimulation(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params)
+{
+	NVWaveWorks_FFT_Simulation_CUDA_Impl* pResult = new NVWaveWorks_FFT_Simulation_CUDA_Impl(this,params);
+	m_Simulations.push_back(pResult);
+	return pResult;
+}
+
+void NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::releaseSimulation(NVWaveWorks_FFT_Simulation* pSimulation)
+{
+	//remove from list
+	m_Simulations.erase(pSimulation);
+
+	SAFE_DELETE(pSimulation);
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::beforeReinit(const GFSDK_WaveWorks_Detailed_Simulation_Params& /*params*/, bool /*reinitOnly*/)
+{
+	return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::checkForReadbackResults()
+{
+	HRESULT hr;
+
+	// The goal here is to evolve the readback state of all our simulations in lockstep, so that either all our simulations collect
+	// a single readback or else none do (IOW: 'some' is *not* permitted, because it would break lockstep)
+
+	NVWaveWorks_FFT_Simulation_CUDA_Impl** pBeginSimulationsSrc = (NVWaveWorks_FFT_Simulation_CUDA_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*));
+	memcpy(pBeginSimulationsSrc,m_Simulations.begin(),m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*));
+	NVWaveWorks_FFT_Simulation_CUDA_Impl** pEndSimulationsSrc = pBeginSimulationsSrc + m_Simulations.size();
+
+	NVWaveWorks_FFT_Simulation_CUDA_Impl** pBeginSimulationsNoResult = (NVWaveWorks_FFT_Simulation_CUDA_Impl**)_alloca(m_Simulations.size() * sizeof(NVWaveWorks_FFT_Simulation_CUDA_Impl*));;
+	NVWaveWorks_FFT_Simulation_CUDA_Impl** pEndSimulationsNoResult = pBeginSimulationsNoResult;
+
+	// Do an initial walk thru and see if any readbacks arrived (without blocking), and write any that did not get a readback result into dst
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = pBeginSimulationsSrc; pSim != pEndSimulationsSrc; ++pSim)
+	{
+		hr = (*pSim)->collectSingleReadbackResult(false);
+		if(FAILED(hr))
+		{
+			return hr;
+		}
+
+		if(S_FALSE == hr)
+		{
+			(*pEndSimulationsNoResult) = (*pSim);
+			++pEndSimulationsNoResult;
+		}
+	}
+
+	// If no results are ready, we're in sync so don't try again
+	if((pEndSimulationsNoResult-pBeginSimulationsNoResult) != m_Simulations.size())
+	{
+		// Otherwise, wait on the remaining results
+		for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = pBeginSimulationsNoResult; pSim != pEndSimulationsNoResult; ++pSim)
+		{
+			V_RETURN((*pSim)->collectSingleReadbackResult(true));
+		}
+	}
+
+	return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::kick(Graphics_Context* /*pGC*/, double dSimTime, gfsdk_U64& kickID)
+{
+	HRESULT hr;
+
+	kickID = m_NextKickID;
+
+    if(!m_cudaResourcesInitialised)
+    {
+        V_RETURN(allocateCudaResources());
+	}
+
+	// Check for readback results - note that we do this at the manager level in order to guarantee lockstep between
+	// the simulations that form a cascade. We either want all of simulations to collect a result, or none - some is
+	// not an option
+	checkForReadbackResults();
+
+	// Be sure to use the correct cuda device for the current frame (important in SLI)
+	int cuda_device = -1;
+	if(1 == m_numCudaDevices)
+	{
+		m_activeCudaDeviceIndex = 0;
+		cuda_device = m_pCudaDeviceInfos[m_activeCudaDeviceIndex].m_cudaDevice;
+		CUDA_V_RETURN(cudaSetDevice(cuda_device));
+	}
+	else
+	{
+		// Multiple devices, we will have to do it the 'long' way
+		switch(m_d3dAPI)
+		{
+#if WAVEWORKS_ENABLE_D3D9
+		case nv_water_d3d_api_d3d9:
+			{
+				unsigned int cuda_device_count = 0;
+				CUDA_V_RETURN(cudaD3D9GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._9.m_pd3d9Device, cudaD3D9DeviceListCurrentFrame));
+				CUDA_V_RETURN(cudaSetDevice(cuda_device));
+				break;
+			}
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+		case nv_water_d3d_api_d3d10:
+			{
+				unsigned int cuda_device_count = 0;
+				CUDA_V_RETURN(cudaD3D10GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._10.m_pd3d10Device, cudaD3D10DeviceListCurrentFrame));
+				CUDA_V_RETURN(cudaSetDevice(cuda_device));
+				break;
+			}
+#endif
+#if WAVEWORKS_ENABLE_D3D11
+		case nv_water_d3d_api_d3d11:
+			{
+				unsigned int cuda_device_count = 0;
+				CUDA_V_RETURN(cudaD3D11GetDevices(&cuda_device_count, &cuda_device, 1, m_d3d._11.m_pd3d11Device, cudaD3D11DeviceListCurrentFrame));
+				CUDA_V_RETURN(cudaSetDevice(cuda_device));
+				break;
+			}
+#endif
+#if WAVEWORKS_ENABLE_GL
+		case nv_water_d3d_api_gl2:
+			{
+				unsigned int cuda_device_count = 0;
+				CUDA_V_RETURN(cudaGLGetDevices(&cuda_device_count, &cuda_device, 1, cudaGLDeviceListCurrentFrame));
+				CUDA_V_RETURN(cudaSetDevice(cuda_device));
+				break;
+			}
+#endif
+		case nv_water_d3d_api_none:
+			{
+				assert(1 == m_numCudaDevices);		// Well by the time we get here we're guaranteed to hit this assert,
+													// but the assert neatly documents the violated expecation i.e. the only
+													// supported no-graphics CUDA path is single device
+				break;
+			}
+        default:
+            return E_FAIL;
+		}
+
+		// Match the current device to our list
+		for(unsigned int cuda_device_index = 0; cuda_device_index != m_numCudaDevices; ++cuda_device_index)
+		{
+			if(cuda_device == m_pCudaDeviceInfos[cuda_device_index].m_cudaDevice)
+			{
+				m_activeCudaDeviceIndex = cuda_device_index;
+				break;
+			}
+		}
+	}
+	
+	const CudaDeviceInfo& active_dev_info = m_pCudaDeviceInfos[m_activeCudaDeviceIndex];
+
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+	{
+		V_RETURN((*pSim)->preKick(pSim - m_Simulations.begin()));
+	}
+	CUDA_V_RETURN(cudaMemcpyAsync(active_dev_info.m_constants_address, active_dev_info.m_device_constants, 
+		active_dev_info.m_constants_size, cudaMemcpyDeviceToDevice, active_dev_info.m_kernel_stream));
+
+	// Do all the CUDA work as far as interop
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+	{
+		V_RETURN((*pSim)->kickPreInterop(dSimTime,kickID));
+	}
+
+	// Map for interop
+	V_RETURN(mapInteropResources(active_dev_info));
+
+	// Do all interop CUDA work
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+	{
+		V_RETURN((*pSim)->kickWithinInterop(kickID));
+	}
+
+	// Unmap for interop
+	V_RETURN(unmapInteropResources(active_dev_info));
+
+	// Do post-interop CUDA work
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+	{
+		V_RETURN((*pSim)->kickPostInterop(kickID));
+	}
+
+	m_StagingCursorIsValid = true;
+	m_StagingCursorKickID = kickID;
+	++m_NextKickID;
+
+	return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::mapInteropResources(const CudaDeviceInfo& cdi)
+{
+	switch(m_d3dAPI)
+	{
+#if WAVEWORKS_ENABLE_D3D9
+	case nv_water_d3d_api_d3d9:
+		{
+			const int num_resources = m_Simulations.size();
+			IDirect3DResource9** pInteropResources = (IDirect3DResource9**)alloca(sizeof(IDirect3DResource9*)*num_resources);
+			int i = 0;
+			for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+			{
+				pInteropResources[i] = (*pSim)->getD3D9InteropResource(m_activeCudaDeviceIndex);
+			}
+			CUDA_V_RETURN(cudaD3D9MapResources(num_resources, pInteropResources));	// @TODO: why no cu_stream?
+			break;
+		}
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+	case nv_water_d3d_api_d3d10:
+		{
+			const int num_resources = m_Simulations.size();
+			ID3D10Resource** pInteropResources = (ID3D10Resource**)alloca(sizeof(ID3D10Resource*)*num_resources);
+			int i = 0;
+			for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+			{
+				pInteropResources[i] = (*pSim)->getD3D10InteropResource(m_activeCudaDeviceIndex);
+			}
+			CUDA_V_RETURN(cudaD3D10MapResources(num_resources, pInteropResources));	// @TODO: why no cu_stream?
+			break;
+		}
+#endif
+#if WAVEWORKS_ENABLE_D3D11 || WAVEWORKS_ENABLE_GL
+	case nv_water_d3d_api_d3d11:
+	case nv_water_d3d_api_gl2:
+		{
+			const int num_resources = m_Simulations.size();
+			cudaGraphicsResource** pInteropResources = (cudaGraphicsResource**)alloca(sizeof(cudaGraphicsResource*)*num_resources);
+			int i = 0;
+			for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+			{
+				pInteropResources[i] = (*pSim)->getInteropResource(m_activeCudaDeviceIndex);
+			}
+			CUDA_V_RETURN(cudaGraphicsMapResources(num_resources, pInteropResources, cdi.m_kernel_stream));
+			break;
+		}
+#endif
+	case nv_water_d3d_api_none:
+		{
+			// Nothing to do...
+			break;
+		}
+    default:
+        return E_FAIL;
+	}
+
+	return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::unmapInteropResources(const CudaDeviceInfo& cdi)
+{
+	switch(m_d3dAPI)
+	{
+#if WAVEWORKS_ENABLE_D3D9
+	case nv_water_d3d_api_d3d9:
+		{
+			const int num_resources = m_Simulations.size();
+			IDirect3DResource9** pInteropResources = (IDirect3DResource9**)alloca(sizeof(IDirect3DResource9*)*num_resources);
+			int i = 0;
+			for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+			{
+				pInteropResources[i] = (*pSim)->getD3D9InteropResource(m_activeCudaDeviceIndex);
+			}
+			CUDA_V_RETURN(cudaD3D9UnmapResources(num_resources, pInteropResources));	// @TODO: why no cu_stream?
+			break;
+		}
+#endif
+#if WAVEWORKS_ENABLE_D3D10
+	case nv_water_d3d_api_d3d10:
+		{
+			const int num_resources = m_Simulations.size();
+			ID3D10Resource** pInteropResources = (ID3D10Resource**)alloca(sizeof(ID3D10Resource*)*num_resources);
+			int i = 0;
+			for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+			{
+				pInteropResources[i] = (*pSim)->getD3D10InteropResource(m_activeCudaDeviceIndex);
+			}
+			CUDA_V_RETURN(cudaD3D10UnmapResources(num_resources, pInteropResources));	// @TODO: why no cu_stream?
+			break;
+		}
+#endif
+#if WAVEWORKS_ENABLE_D3D11 || WAVEWORKS_ENABLE_GL
+	case nv_water_d3d_api_d3d11:
+	case nv_water_d3d_api_gl2:
+		{
+			const int num_resources = m_Simulations.size();
+			cudaGraphicsResource** pInteropResources = (cudaGraphicsResource**)alloca(sizeof(cudaGraphicsResource*)*num_resources);
+			int i = 0;
+			for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim, ++i)
+			{
+				pInteropResources[i] = (*pSim)->getInteropResource(m_activeCudaDeviceIndex);
+			}
+			CUDA_V_RETURN(cudaGraphicsUnmapResources(num_resources, pInteropResources, cdi.m_kernel_stream));
+			break;
+		}
+#endif
+	case nv_water_d3d_api_none:
+		{
+			// Nothing to do...
+			break;
+		}
+    default:
+        return E_FAIL;
+	}
+
+	return S_OK;
+}
+
+bool NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getStagingCursor(gfsdk_U64* pKickID)
+{
+	if(pKickID && m_StagingCursorIsValid)
+	{
+		*pKickID = m_StagingCursorKickID;
+	}
+
+	return m_StagingCursorIsValid;
+}
+
+NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::advanceStagingCursor(bool /*block*/)
+{
+	// The CUDA pipeline pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately
+	return AdvanceCursorResult_None;
+}
+NVWaveWorks_FFT_Simulation_Manager::WaitCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::waitStagingCursor()
+{
+	// The CUDA pipeline is not async wrt the API, so there can never be any pending kicks and we can return immediately
+	return WaitCursorResult_None;
+}
+
+bool NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getReadbackCursor(gfsdk_U64* pKickID)
+{
+	if(0 == m_Simulations.size())
+		return false;
+
+	// We rely on collectSingleReadbackResult() to maintain lockstep between the cascade members, therefore we can in theory
+	// query any member to get the readback cursor...
+
+	// ...but let's check that theory in debug builds!!!
+#ifdef _DEV
+	if(m_Simulations.size() > 1)
+	{
+		gfsdk_U64 sim0KickID;
+		bool sim0GRCresult = m_Simulations[0]->getReadbackCursor(&sim0KickID);
+		for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin()+1; pSim != m_Simulations.end(); ++pSim)
+		{
+			gfsdk_U64 simNKickID;
+			bool simNGRCresult = (*pSim)->getReadbackCursor(&simNKickID);
+			assert(simNGRCresult == sim0GRCresult);
+			if(sim0GRCresult)
+			{
+				assert(sim0KickID == simNKickID);
+			}
+		}
+
+	}
+#endif
+
+	return m_Simulations[0]->getReadbackCursor(pKickID);
+}
+
+NVWaveWorks_FFT_Simulation_Manager::AdvanceCursorResult NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::advanceReadbackCursor(bool block)
+{
+	if(0 == m_Simulations.size())
+		return AdvanceCursorResult_None;
+
+	// First, check whether we even have readbacks in-flight
+	const bool hasReadbacksInFlightSim0 = m_Simulations[0]->hasReadbacksInFlight();
+
+	// Usual paranoid verficiation that we're maintaining lockstep...
+#ifdef _DEV
+	if(m_Simulations.size() > 1)
+	{
+		for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin()+1; pSim != m_Simulations.end(); ++pSim)
+		{
+			assert(hasReadbacksInFlightSim0 == (*pSim)->hasReadbacksInFlight());
+		}
+	}
+#endif
+
+	if(!hasReadbacksInFlightSim0)
+	{
+		return AdvanceCursorResult_None;
+	}
+
+	if(!block)
+	{
+		// Non-blocking case - in order to maintain lockstep, either all of the simulations should consume a readback,
+		// or none. Therefore we need to do an initial pass to test whether the 'all' case applies (and bail if not)...
+		for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+		{
+			HRESULT hr = (*pSim)->canCollectSingleReadbackResultWithoutBlocking();
+			if(FAILED(hr))
+			{
+				return AdvanceCursorResult_Failed;
+			}
+			else if(S_FALSE == hr)
+			{
+				// Cannot advance, would have blocked -> bail
+				return AdvanceCursorResult_WouldBlock;
+			}
+		}
+	}
+
+	// We have readbacks in flight, and in the non-blocking case we *should* be in a position to consume them without
+	// any waiting, so just visit each simulation in turn with a blocking wait for the next readback to complete...
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+	{
+		if(FAILED((*pSim)->collectSingleReadbackResult(true)))
+		{
+			return AdvanceCursorResult_Failed;
+		}
+	}
+
+	return AdvanceCursorResult_Succeeded;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::archiveDisplacements()
+{
+	HRESULT hr;
+
+	if(!getReadbackCursor(NULL))
+	{
+		return E_FAIL;
+	}
+
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+	{
+		V_RETURN((*pSim)->archiveDisplacements());
+	}
+
+	return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::getTimings(GFSDK_WaveWorks_Simulation_Manager_Timings& timings)
+{
+	// CUDA implementation doesn't update these CPU implementation related timings
+	timings.time_start_to_stop = 0;
+	timings.time_total = 0;
+	timings.time_wait_for_completion = 0;
+	return S_OK;
+}
+
+HRESULT NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::beforeReallocateSimulation()
+{
+	HRESULT hr;
+
+	// A simulation is about to be reallocated...
+
+	// Implication 1: at least some displacement map contents will become undefined and
+	// will need a kick to make them valid again, which in turn means that we can no longer
+	// consider any kick that was previously staged as still being staged...
+	m_StagingCursorIsValid = false;
+
+	// Implication 2: some of the readback tracking will be reset, meaning we break
+	// lockstep. We can avoid this by forcible resetting all readback tracking
+	for(NVWaveWorks_FFT_Simulation_CUDA_Impl** pSim = m_Simulations.begin(); pSim != m_Simulations.end(); ++pSim)
+	{
+		V_RETURN((*pSim)->resetReadbacks());
+	}
+
+	return S_OK;
+}
+
+#endif // SUPPORT_CUDA