// This code contains NVIDIA Confidential Information and is disclosed 
// under the Mutual Non-Disclosure Agreement. 
// 
// Notice 
// ALL NVIDIA DESIGN SPECIFICATIONS AND CODE ("MATERIALS") ARE PROVIDED "AS IS" NVIDIA MAKES 
// NO REPRESENTATIONS, WARRANTIES, EXPRESSED, IMPLIED, STATUTORY, OR OTHERWISE WITH RESPECT TO 
// THE MATERIALS, AND EXPRESSLY DISCLAIMS ANY IMPLIED WARRANTIES OF NONINFRINGEMENT, 
// MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
// 
// NVIDIA Corporation assumes no responsibility for the consequences of use of such 
// information or for any infringement of patents or other rights of third parties that may 
// result from its use. No license is granted by implication or otherwise under any patent 
// or patent rights of NVIDIA Corporation. No third party distribution is allowed unless 
// expressly authorized by NVIDIA.  Details are subject to change without notice. 
// This code supersedes and replaces all information previously supplied. 
// NVIDIA Corporation products are not authorized for use as critical 
// components in life support devices or systems without express written approval of 
// NVIDIA Corporation. 
// 
// Copyright © 2008- 2013 NVIDIA Corporation. All rights reserved.
//
// NVIDIA Corporation and its licensors retain all intellectual property and proprietary
// rights in and to this software and related documentation and any modifications thereto.
// Any use, reproduction, disclosure or distribution of this software and related
// documentation without an express license agreement from NVIDIA Corporation is
// strictly prohibited.
//
 
#include "Internal.h"
#ifdef SUPPORT_CUDA
#include "FFT_Simulation_CUDA_impl.h"
#include "FFT_Simulation_Manager_CUDA_impl.h"
#include "Simulation_Util.h"
#include "CircularFIFO.h"

#include <malloc.h>
#include <string.h>

namespace
{
#if WAVEWORKS_ENABLE_D3D10 || WAVEWORKS_ENABLE_D3D10
	const DXGI_SAMPLE_DESC kNoSample = {1, 0};
#endif

	bool cudaQueryResultIsError(cudaError_t result)
	{
		return result != cudaErrorNotReady && result != cudaSuccess;
	};

	typedef NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl::CudaDeviceInfo CudaDeviceInfo;
}

// CUDA stubs
extern "C"
{
	cudaError cuda_SetConstants (void* constants,
							float2* Gauss,
							float2* H0, 
							float2* Ht, 
							float4* Dt,
							float* Omega,
							int resolution,
							float fft_period,
							float window_in,
							float window_out,
							float2 wind_dir,
							float wind_speed,
							float wind_dependency,
							float wave_amplitude,
							float small_wave_fraction,
							float choppy_scale,
							cudaStream_t cu_stream);

	cudaError cuda_ComputeH0(int resolution, int constantsIndex, cudaStream_t cu_stream);
	cudaError cuda_ComputeRows(int resolution, double time, int constantsIndex, cudaStream_t cu_stream);
	cudaError cuda_ComputeColumns(float4* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream);
	cudaError cuda_ComputeColumns_array(cudaArray* displacement, int resolution, int constantsIndex, cudaStream_t cu_stream);
}

NVWaveWorks_FFT_Simulation_CUDA_Impl::NVWaveWorks_FFT_Simulation_CUDA_Impl(NVWaveWorks_FFT_Simulation_Manager_CUDA_Impl* pManager, const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params) :
	m_pManager(pManager),
	m_params(params)
{
	m_numCudaDevices = 0;
	m_pCudaDeviceStates = NULL;

    for(int slot = 0; slot != NumReadbackSlots; ++slot)
    {
		m_readback_slots[slot].m_host_Dxyz = NULL;
		m_readback_slots[slot].m_device_Dxyz = NULL;
		m_readback_slots[slot].m_cudaDevice = -1;
		m_readback_slots[slot].m_completion_evt = NULL;
		m_readback_slots[slot].m_staging_evt = NULL;
		m_readback_slots[slot].m_kickID = GFSDK_WaveWorks_InvalidKickID;
    }

    m_active_readback_slot = 0;
	m_active_readback_host_Dxyz = NULL;
    m_end_inflight_readback_slots = 1;
	m_working_readback_slot = NULL;

	m_pReadbackFIFO = NULL;

    m_active_timer_slot = 0;
    m_end_inflight_timer_slots = 1;
	m_timer_slots[m_active_timer_slot].m_elapsed_time = 0.f;						// Ensure first call to getTimings() gives reasonable results
	m_timer_slots[m_active_timer_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID;	// Ensure first call to getTimings() gives reasonable results
	m_working_timer_slot = NULL;

    m_DisplacementMapIsCUDARegistered = false;
	m_GaussAndOmegaInitialised = false;
    //m_H0Dirty = true;
    m_ReadbackInitialised = false;
	m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID;

	m_readback_element_size = 0;

    memset(&m_d3d, 0, sizeof(m_d3d));
    m_d3dAPI = nv_water_d3d_api_undefined;
}

NVWaveWorks_FFT_Simulation_CUDA_Impl::~NVWaveWorks_FFT_Simulation_CUDA_Impl()
{
    releaseAll();
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initD3D11(ID3D11Device* pD3DDevice)
{
#if WAVEWORKS_ENABLE_D3D11
    HRESULT hr;

    if(nv_water_d3d_api_d3d11 != m_d3dAPI)
    {
        releaseAll();
    }
    else if(m_d3d._11.m_pd3d11Device != pD3DDevice)
    {
        releaseAll();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        m_d3dAPI = nv_water_d3d_api_d3d11;
        m_d3d._11.m_pd3d11Device = pD3DDevice;
        m_d3d._11.m_pd3d11Device->AddRef();

		// Use 4x16F for D3D11
		m_readback_element_size = sizeof(ushort4);

		m_numCudaDevices = m_pManager->GetNumCudaDevices();
		m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices];
		memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState));
		for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
		{
			m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice;
		}

        V_RETURN(allocateAllResources());
    }

    return S_OK;
#else
    return E_FAIL;
#endif
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initGL2(void* pGLContext)
{
#if WAVEWORKS_ENABLE_GL
    HRESULT hr;

	if(nv_water_d3d_api_gl2 != m_d3dAPI)
    {
        releaseAll();
    }
	else if(m_d3d._GL2.m_pGLContext != pGLContext)
    {
        releaseAll();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        m_d3dAPI = nv_water_d3d_api_gl2;

		// Use 4x16F for GL2
		m_readback_element_size = sizeof(ushort4);

		m_numCudaDevices = m_pManager->GetNumCudaDevices();
		m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices];
		memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState));
		for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
		{
			m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice;
		}
        V_RETURN(allocateAllResources());
    }

    return S_OK;
#else
    return E_FAIL;
#endif
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initNoGraphics()
{
    HRESULT hr;

    if(nv_water_d3d_api_none != m_d3dAPI)
    {
        releaseAll();
    }

    if(nv_water_d3d_api_undefined == m_d3dAPI)
    {
        m_d3dAPI = nv_water_d3d_api_none;

		// Use 4x32F for no-gfx
		m_readback_element_size = sizeof(float4);

		m_numCudaDevices = m_pManager->GetNumCudaDevices();
		m_pCudaDeviceStates = new CudaDeviceState[m_numCudaDevices];
		memset(m_pCudaDeviceStates, 0, m_numCudaDevices * sizeof(CudaDeviceState));
		for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
		{
			m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice = m_pManager->GetCudaDeviceInfo(cuda_dev_index).m_cudaDevice;
		}

        V_RETURN(allocateAllResources());
    }

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::reinit(const GFSDK_WaveWorks_Detailed_Simulation_Params::Cascade& params)
{
	HRESULT hr;

    BOOL bRelease = FALSE;
    BOOL bAllocate = FALSE;
    BOOL bRecalcH0 = FALSE;
	BOOL bReinitGaussAndOmega = FALSE;

    if(params.fft_resolution != m_params.fft_resolution ||
        params.readback_displacements != m_params.readback_displacements)
    {
        bRelease = TRUE;
        bAllocate = TRUE;

		// We're reallocating, which breaks various lockstep/synchronization assumptions...
		V_RETURN(m_pManager->beforeReallocateSimulation());
    }

    if(	params.fft_period != m_params.fft_period || 
        params.fft_resolution != m_params.fft_resolution
        )
    {
        bReinitGaussAndOmega = TRUE;
    }

    if(	params.wave_amplitude != m_params.wave_amplitude ||
        params.wind_speed != m_params.wind_speed ||
        params.wind_dir.x != m_params.wind_dir.x ||
		params.wind_dir.y != m_params.wind_dir.y ||
        params.wind_dependency != m_params.wind_dependency ||
        params.small_wave_fraction != m_params.small_wave_fraction ||
        params.window_in != m_params.window_in ||
        params.window_out != m_params.window_out ||
		bReinitGaussAndOmega
        )
    {
        bRecalcH0 = TRUE;
    }

	m_params = params;

    if(bRelease)
    {
        releaseAllResources();
    }

    if(bAllocate)
    {
        V_RETURN(allocateAllResources());
    }

    if(bReinitGaussAndOmega)
    {
		m_GaussAndOmegaInitialised = false;
    }

	if(bRecalcH0)
	{
		for(unsigned int i = 0; i < m_numCudaDevices; i ++)
		{
			m_pCudaDeviceStates[i].m_H0Dirty = true;
		}
	}

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::allocateCudaResources()
{
	int gauss_size = m_resolution * m_resolution;
	int h0_size = (m_resolution + 1) * (m_resolution + 1);
	int omega_size = m_half_resolution_plus_one * m_half_resolution_plus_one;
	int htdt_size = m_half_resolution_plus_one * m_resolution;
	int output_size = m_resolution * m_resolution;

	for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
	{
		CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index];
		CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

		CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Gauss, gauss_size * sizeof(float2)));
		CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_H0, h0_size * sizeof(float2)));
		CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Omega, omega_size * sizeof(float2)));
		CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Ht, htdt_size * sizeof(float2))); 
		CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_device_Dt, htdt_size * sizeof(float4))); 

		// Optional completion events for displacements readback
		if(m_params.readback_displacements)
		{
			for(int slot = 0; slot != NumReadbackSlots; ++slot)
			{
				CUDA_V_RETURN(cudaEventCreate(&dev_state.m_readback_completion_evts[slot]));
				CUDA_V_RETURN(cudaEventCreate(&dev_state.m_readback_staging_evts[slot]));
				CUDA_V_RETURN(cudaMalloc((void **)&dev_state.m_readback_device_Dxyzs[slot], output_size * m_readback_element_size));
			}
		}

		// Timer events
		for(int slot = 0; slot != NumTimerSlots; ++slot)
		{
			CUDA_V_RETURN(cudaEventCreate(&dev_state.m_start_timer_evts[slot]));
			CUDA_V_RETURN(cudaEventCreate(&dev_state.m_stop_timer_evts[slot]));
			CUDA_V_RETURN(cudaEventCreate(&dev_state.m_start_fft_timer_evts[slot]));
			CUDA_V_RETURN(cudaEventCreate(&dev_state.m_stop_fft_timer_evts[slot]));
		}
	}

    // Optional page-locked mem for displacements readback
    if(m_params.readback_displacements)
    {
        for(int slot = 0; slot != NumReadbackSlots; ++slot)
        {
            CUDA_V_RETURN(cudaMallocHost((void **)&m_readback_slots[slot].m_host_Dxyz, output_size * m_readback_element_size));
            memset(m_readback_slots[slot].m_host_Dxyz, 0, output_size * m_readback_element_size);
        }

        m_active_readback_slot = 0;
		m_active_readback_host_Dxyz = NULL;
        m_end_inflight_readback_slots = 1;
		m_readback_slots[m_active_readback_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID;

		const int num_readback_FIFO_entries = m_params.num_readback_FIFO_entries;
		if(num_readback_FIFO_entries)
		{
			m_pReadbackFIFO = new CircularFIFO<ReadbackFIFOSlot>(num_readback_FIFO_entries);
			for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i)
			{
				ReadbackFIFOSlot& slot = m_pReadbackFIFO->raw_at(i);
				CUDA_V_RETURN(cudaMallocHost((void **)&slot.host_Dxyz, output_size * m_readback_element_size));
				memset(slot.host_Dxyz, 0, output_size * m_readback_element_size);
				slot.kickID = GFSDK_WaveWorks_InvalidKickID;
			}
		}

        m_ReadbackInitialised = true;
    }

	// Init timer slots
    m_active_timer_slot = 0;
    m_end_inflight_timer_slots = 1;
	m_timer_slots[m_active_timer_slot].m_elapsed_time = 0.f;						// Ensure first call to getTimings() gives reasonable results
	m_timer_slots[m_active_timer_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID;	// Ensure first call to getTimings() gives reasonable results

	for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
	{
		CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index];
		CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

		// clear
		CUDA_V_RETURN(cudaMemset(dev_state.m_device_Gauss, 0, gauss_size * sizeof(float2)));
		CUDA_V_RETURN(cudaMemset(dev_state.m_device_H0, 0, h0_size * sizeof(float2)));
		CUDA_V_RETURN(cudaMemset(dev_state.m_device_Omega, 0, omega_size * sizeof(float2)));
		CUDA_V_RETURN(cudaMemset(dev_state.m_device_Ht, 0, htdt_size * sizeof(float2)));
		CUDA_V_RETURN(cudaMemset(dev_state.m_device_Dt, 0, htdt_size * sizeof(float4)));
	}

    m_cudaResourcesInitialised = true;

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseCudaResources()
{
    HRESULT hr;

	if(m_ReadbackInitialised)
	{
		V_RETURN(waitForAllInFlightReadbacks());
	}

	V_RETURN(waitForAllInFlightTimers());

	for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
	{
		CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index];
		CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

		CUDA_SAFE_FREE(dev_state.m_device_Gauss);
		CUDA_SAFE_FREE(dev_state.m_device_H0);
		CUDA_SAFE_FREE(dev_state.m_device_Ht);
		CUDA_SAFE_FREE(dev_state.m_device_Dt);
		CUDA_SAFE_FREE(dev_state.m_device_Omega);

		if(m_ReadbackInitialised)
		{
			for(int slot = 0; slot != NumReadbackSlots; ++slot)
			{
				CUDA_V_RETURN(cudaEventDestroy(dev_state.m_readback_completion_evts[slot]));
				CUDA_V_RETURN(cudaEventDestroy(dev_state.m_readback_staging_evts[slot]));
				CUDA_SAFE_FREE(dev_state.m_readback_device_Dxyzs[slot]);
			}
		}

		// Timer events
		for(int slot = 0; slot != NumTimerSlots; ++slot)
		{
			CUDA_V_RETURN(cudaEventDestroy(dev_state.m_start_timer_evts[slot]));
			CUDA_V_RETURN(cudaEventDestroy(dev_state.m_stop_timer_evts[slot]));
			CUDA_V_RETURN(cudaEventDestroy(dev_state.m_start_fft_timer_evts[slot]));
			CUDA_V_RETURN(cudaEventDestroy(dev_state.m_stop_fft_timer_evts[slot]));
		}
	}

    if(m_ReadbackInitialised)
    {
        for(int slot = 0; slot != NumReadbackSlots; ++slot)
        {
            CUDA_SAFE_FREE_HOST(m_readback_slots[slot].m_host_Dxyz);
        }

        m_ReadbackInitialised = false;
    }

	if(m_pReadbackFIFO)
	{
		for(int i = 0; i != m_pReadbackFIFO->capacity(); ++i)
		{
			CUDA_SAFE_FREE_HOST(m_pReadbackFIFO->raw_at(i).host_Dxyz);
		}
		SAFE_DELETE(m_pReadbackFIFO);
	}

    m_cudaResourcesInitialised = false;

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::preKick(int constantsIndex)
{
    HRESULT hr;

	// Check for timers
	if(m_cudaResourcesInitialised)
	{
		V_RETURN(queryTimers());
	}

    // Register displacement map, if necessary
    if(!m_DisplacementMapIsCUDARegistered)
    {
        V_RETURN(registerDisplacementMapWithCUDA());
    }

    // Init cuda resources, if necessary
    if(!m_cudaResourcesInitialised)
    {
        V_RETURN(allocateCudaResources());
        V_RETURN(initGaussAndOmega());
    }
    else if(!m_GaussAndOmegaInitialised)
    {
        V_RETURN(initGaussAndOmega());
    }

	// Be sure to use the correct cuda device for the current frame (important in SLI)
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
	const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

	if(dev_state.m_H0Dirty || dev_state.m_constantsIndex != constantsIndex) 
	{
		void* device_constants = (char*)dev_info.m_device_constants + dev_info.m_constants_size / MAX_NUM_CASCADES * constantsIndex;

		float wind_dir_len = sqrtf(m_params.wind_dir.x*m_params.wind_dir.x + m_params.wind_dir.y*m_params.wind_dir.y);
		float2 wind_dir = { m_params.wind_dir.x / wind_dir_len, m_params.wind_dir.y / wind_dir_len };

		CUDA_V_RETURN(cuda_SetConstants(device_constants, dev_state.m_device_Gauss, dev_state.m_device_H0,
			dev_state.m_device_Ht, dev_state.m_device_Dt, dev_state.m_device_Omega,
			m_resolution, m_params.fft_period, m_params.window_in, m_params.window_out, 
			wind_dir, m_params.wind_speed, m_params.wind_dependency, m_params.wave_amplitude, 
			m_params.small_wave_fraction, m_params.choppy_scale, dev_info.m_kernel_stream));

		dev_state.m_constantsIndex = constantsIndex;
	}

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickPreInterop(double dSimTime, gfsdk_U64 kickID)
{
	HRESULT hr;

	// Be sure to use the correct cuda device for the current frame (important in SLI)
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
	const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);
	// already done in simulation manager (doing it again would flush pushbuffer)
	// CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

	// Start CUDA workload timer
	m_working_timer_slot = NULL;
	if(m_params.enable_CUDA_timers)
	{
		V_RETURN(consumeAvailableTimerSlot(dev_state, kickID, &m_working_timer_slot));
		assert(m_working_timer_slot != NULL);
		CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_start_timer_evt,dev_info.m_kernel_stream));
	}

	// ------------------------------ Update H(0) if necessary ------------------------------------
	if(dev_state.m_H0Dirty) 
	{
		updateH0(dev_state, dev_info.m_kernel_stream);
		dev_state.m_H0Dirty = false;
	}

    // ------------------------------ Calculate H(t) from H(0) ------------------------------------
	const double fModeSimTime = dSimTime * (double)m_params.time_scale;
    CUDA_V_RETURN(cuda_ComputeRows(m_resolution, fModeSimTime, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropD3D11(gfsdk_U64 kickID)
{
#if WAVEWORKS_ENABLE_D3D11
	HRESULT hr;

	assert(nv_water_d3d_api_d3d11 == m_d3dAPI);

	// Be sure to use the correct cuda device for the current frame (important in SLI)
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
	const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

    cudaArray* tex_array;
	cudaGraphicsResource* mapped_resource = m_d3d._11.m_pd3d11PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d11RegisteredDisplacementMapResource;
    CUDA_V_RETURN(cudaGraphicsSubResourceGetMappedArray(&tex_array, mapped_resource, 0, 0));

    // Fill displacement texture
	CUDA_V_RETURN(cuda_ComputeColumns_array(tex_array, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

    // Optionally, get data staged for readback
	m_working_readback_slot = NULL;
	if(m_ReadbackInitialised) {
		V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot));
		CUDA_V_RETURN(cudaMemcpy2DFromArrayAsync(	m_working_readback_slot->m_device_Dxyz,
													m_resolution * sizeof(ushort4),
													tex_array, 0, 0,
													m_resolution * sizeof(ushort4),
													m_resolution,
													cudaMemcpyDeviceToDevice,
													dev_info.m_kernel_stream
													));

		// The copy out of staging is done on a separate stream with the goal of allowing the copy to occur
		// in parallel with other GPU workloads, so we need to do some inter-stream sync here
		CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream));
	}

	// CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe
	if(m_working_timer_slot)
	{
		CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream));
	}
#endif

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropGL2(gfsdk_U64 kickID)
{
#if WAVEWORKS_ENABLE_GL
	HRESULT hr;

	assert(nv_water_d3d_api_gl2 == m_d3dAPI);

	// Be sure to use the correct cuda device for the current frame (important in SLI)
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
	const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

    cudaArray* tex_array;
	cudaGraphicsResource* mapped_resource = m_d3d._GL2.m_pGL2PerCudaDeviceResources[activeCudaDeviceIndex].m_pGL2RegisteredDisplacementMapResource;
    CUDA_V_RETURN(cudaGraphicsSubResourceGetMappedArray(&tex_array, mapped_resource, 0, 0));

    // Fill displacement texture
	CUDA_V_RETURN(cuda_ComputeColumns_array(tex_array, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

	// Copy to GL texture
	// CUDA_V_RETURN(cudaMemcpyToArray(tex_array, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice));
		
    // Optionally, get data staged for readback
	m_working_readback_slot = NULL;
	if(m_ReadbackInitialised) {
		V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot));
		CUDA_V_RETURN(cudaMemcpy2DFromArrayAsync(	m_working_readback_slot->m_device_Dxyz,
													m_resolution * sizeof(ushort4),
													tex_array, 0, 0,
													m_resolution * sizeof(ushort4),
													m_resolution,
													cudaMemcpyDeviceToDevice
													));
	}

	// CUDA workload is done, stop the clock and unmap as soon as we can so as not to block the graphics pipe
	if(m_working_timer_slot)
	{
		CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream));
	}
#endif

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInteropNoGfx(gfsdk_U64 kickID)
{
	HRESULT hr;

	assert(nv_water_d3d_api_none == m_d3dAPI);

	// Be sure to use the correct cuda device for the current frame (important in SLI)
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	CudaDeviceState& dev_state = m_pCudaDeviceStates[activeCudaDeviceIndex];
	const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

	int output_size = m_resolution * m_resolution;

	float4* device_displacementMap = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[activeCudaDeviceIndex].m_Device_displacementMap;

    // Fill displacement texture
	CUDA_V_RETURN(cuda_ComputeColumns(device_displacementMap, m_resolution, dev_state.m_constantsIndex, dev_info.m_kernel_stream));

    // Optionally, get data staged for readback
	m_working_readback_slot = NULL;
	if(m_ReadbackInitialised) {
		V_RETURN(consumeAvailableReadbackSlot(dev_state, kickID, &m_working_readback_slot));
		CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_device_Dxyz, device_displacementMap, output_size * sizeof(float4), cudaMemcpyDeviceToDevice, dev_info.m_kernel_stream));

		// The copy out of staging is done on a separate stream with the goal of allowing the copy to occur
		// in parallel with other GPU workloads, so we need to do some inter-stream sync here
		CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_staging_evt,dev_info.m_kernel_stream));
	}

	// CUDA workload is done, stop the clock
	if(m_working_timer_slot)
	{
		CUDA_V_RETURN(cudaEventRecord(m_working_timer_slot->m_stop_timer_evt,dev_info.m_kernel_stream));
	}

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickWithinInterop(gfsdk_U64 kickID)
{
	HRESULT hr;

    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
			V_RETURN(kickWithinInteropD3D11(kickID));
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
			V_RETURN(kickWithinInteropGL2(kickID));
        }
        break;
#endif
	case nv_water_d3d_api_none:
		{
			V_RETURN(kickWithinInteropNoGfx(kickID));
		}
		break;
	default:
		return E_FAIL;
	}

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::kickPostInterop(gfsdk_U64 kickID)
{
	// Be sure to use the correct cuda device for the current frame (important in SLI)
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	const CudaDeviceInfo& dev_info = m_pManager->GetCudaDeviceInfo(activeCudaDeviceIndex);

	const int output_size = m_resolution * m_resolution;

	if(m_working_readback_slot) {
		// Do readback out of staging area
		CUDA_V_RETURN(cudaStreamWaitEvent(dev_info.m_readback_stream,m_working_readback_slot->m_staging_evt,0));
		CUDA_V_RETURN(cudaMemcpyAsync(m_working_readback_slot->m_host_Dxyz, m_working_readback_slot->m_device_Dxyz, output_size * m_readback_element_size, cudaMemcpyDeviceToHost, dev_info.m_readback_stream));
		CUDA_V_RETURN(cudaEventRecord(m_working_readback_slot->m_completion_evt,dev_info.m_readback_stream));
	}

    // Update displacement map version
    m_DisplacementMapVersion = kickID;

	// We're done with slots
	m_working_readback_slot = NULL;
	m_working_timer_slot = NULL;

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::allocateAllResources()
{
    HRESULT hr;

	m_resolution = m_params.fft_resolution;
	m_half_resolution_plus_one = m_resolution / 2 + 1;

    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
			m_d3d._11.m_pd3d11PerCudaDeviceResources = new D3D11Objects::PerCudaDeviceResources[m_numCudaDevices];

            // Create displacement maps
            D3D11_TEXTURE2D_DESC displacementMapTD;
            displacementMapTD.Width = m_resolution;
            displacementMapTD.Height = m_resolution;
            displacementMapTD.MipLevels = 1;
            displacementMapTD.ArraySize = 1;
            displacementMapTD.Format = DXGI_FORMAT_R16G16B16A16_FLOAT;
            displacementMapTD.SampleDesc = kNoSample;
            displacementMapTD.Usage = D3D11_USAGE_DEFAULT;
            displacementMapTD.BindFlags = D3D11_BIND_SHADER_RESOURCE;
            displacementMapTD.CPUAccessFlags = 0;
            displacementMapTD.MiscFlags = 0;

			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index];
				V_RETURN(m_d3d._11.m_pd3d11Device->CreateTexture2D(&displacementMapTD, NULL, &pcdr.m_pd3d11DisplacementMapResource));
				V_RETURN(m_d3d._11.m_pd3d11Device->CreateShaderResourceView(pcdr.m_pd3d11DisplacementMapResource, NULL, &pcdr.m_pd3d11DisplacementMap));
				pcdr.m_pd3d11RegisteredDisplacementMapResource = NULL;
			}
        }
		break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
		{
			m_d3d._GL2.m_pGL2PerCudaDeviceResources = new GL2Objects::PerCudaDeviceResources[m_numCudaDevices];

            // Create displacement maps
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index];
				if(pcdr.m_GL2DisplacementMapTexture !=0 ) NVSDK_GLFunctions.glDeleteTextures(1, &pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glGenTextures(1,&pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, m_resolution, m_resolution, 0, GL_RGBA, GL_FLOAT, NULL); CHECK_GL_ERRORS;
				NVSDK_GLFunctions.glBindTexture(GL_TEXTURE_2D, 0); CHECK_GL_ERRORS;				
				pcdr.m_pGL2RegisteredDisplacementMapResource = NULL;
			}
        }
		break;
#endif
    case nv_water_d3d_api_none:
        {
			m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources = new NoGraphicsObjects::PerCudaDeviceResources[m_numCudaDevices];
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index];
				pcdr.m_Device_displacementMap = NULL;
			}
        }
		break;

    default:
        return E_FAIL;
    }

	// Remaining allocations are deferred, in order to ensure that they occur on the host's simulation thread
    m_cudaResourcesInitialised = false;
    m_DisplacementMapIsCUDARegistered = false;
	m_GaussAndOmegaInitialised = false;
    for(unsigned int i = 0; i < m_numCudaDevices; i++)
	{
		m_pCudaDeviceStates[i].m_H0Dirty = true;
	}
    m_ReadbackInitialised = false;

	// Displacement map contents are initially undefined
	m_DisplacementMapVersion = GFSDK_WaveWorks_InvalidKickID;

    return S_OK;
}

void NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseAll()
{
	releaseAllResources();

#if WAVEWORKS_ENABLE_GRAPHICS
    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
			SAFE_RELEASE(m_d3d._11.m_pd3d11Device);
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
			// nothing to do?
        }
        break;
#endif
	}
#endif

	m_d3dAPI = nv_water_d3d_api_undefined;

	SAFE_DELETE_ARRAY(m_pCudaDeviceStates);
	m_numCudaDevices = 0;
}

void NVWaveWorks_FFT_Simulation_CUDA_Impl::releaseAllResources()
{
    if(m_DisplacementMapIsCUDARegistered)
    {
        unregisterDisplacementMapWithCUDA();
    }

    if(m_cudaResourcesInitialised)
    {
        releaseCudaResources();
    }

    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index];
				SAFE_RELEASE(pcdr.m_pd3d11DisplacementMapResource);
				SAFE_RELEASE(pcdr.m_pd3d11DisplacementMap);
			}

			SAFE_DELETE_ARRAY(m_d3d._11.m_pd3d11PerCudaDeviceResources);
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index];
				if(pcdr.m_GL2DisplacementMapTexture !=0) NVSDK_GLFunctions.glDeleteTextures(1, &pcdr.m_GL2DisplacementMapTexture); CHECK_GL_ERRORS;
			}
			SAFE_DELETE_ARRAY(m_d3d._GL2.m_pGL2PerCudaDeviceResources);
        }
        break;
#endif
	case nv_water_d3d_api_none:
        {
			SAFE_DELETE_ARRAY(m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources);
        }
        break;
    }
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::registerDisplacementMapWithCUDA()
{
    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
			bool all_registered = true;
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index];
				if(pcdr.m_pd3d11DisplacementMapResource)
				{
					if(NULL == pcdr.m_pd3d11RegisteredDisplacementMapResource)
					{
						CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice));
						CUDA_V_RETURN(cudaGraphicsD3D11RegisterResource(&pcdr.m_pd3d11RegisteredDisplacementMapResource, pcdr.m_pd3d11DisplacementMapResource, cudaGraphicsRegisterFlagsSurfaceLoadStore));
						CUDA_V_RETURN(cudaGraphicsResourceSetMapFlags(pcdr.m_pd3d11RegisteredDisplacementMapResource, cudaGraphicsMapFlagsWriteDiscard));
					}
				}
				else
				{
					all_registered = false;
				}
			}
			m_DisplacementMapIsCUDARegistered = all_registered;
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
			bool all_registered = true;
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index];
				if(pcdr.m_GL2DisplacementMapTexture)
				{
					if(NULL == pcdr.m_pGL2RegisteredDisplacementMapResource)
					{
						CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice));
						CUDA_V_RETURN(cudaGraphicsGLRegisterImage(&pcdr.m_pGL2RegisteredDisplacementMapResource, pcdr.m_GL2DisplacementMapTexture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore));
					}
				}
				else
				{
					all_registered = false;
				}
			}
			m_DisplacementMapIsCUDARegistered = all_registered;
        }
        break;
#endif
    case nv_water_d3d_api_none:
        {
			int output_size = m_resolution * m_resolution;

			// Well this is something of a fake - there's no graphics, so no interop as such, however we can re-use all our existing infrastucture
			// if we use a simple CUDA device alloc instead
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index];
				CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice));
				CUDA_V_RETURN(cudaMalloc((void **)&pcdr.m_Device_displacementMap, output_size * sizeof(float4)));
			}
			m_DisplacementMapIsCUDARegistered = true;
        }
        break;
     default:
        return E_FAIL;
    }

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::unregisterDisplacementMapWithCUDA()
{
    switch(m_d3dAPI)
    {
#if WAVEWORKS_ENABLE_D3D11
    case nv_water_d3d_api_d3d11:
        {
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				D3D11Objects::PerCudaDeviceResources& pcdr = m_d3d._11.m_pd3d11PerCudaDeviceResources[cuda_dev_index];
				if(pcdr.m_pd3d11DisplacementMapResource)
				{
					CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice));
					CUDA_V_RETURN(cudaGraphicsUnregisterResource(pcdr.m_pd3d11RegisteredDisplacementMapResource));
					pcdr.m_pd3d11RegisteredDisplacementMapResource = NULL;
				}
			}
            m_DisplacementMapIsCUDARegistered = false;
        }
        break;
#endif
#if WAVEWORKS_ENABLE_GL
    case nv_water_d3d_api_gl2:
        {
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				GL2Objects::PerCudaDeviceResources& pcdr = m_d3d._GL2.m_pGL2PerCudaDeviceResources[cuda_dev_index];
				if(pcdr.m_GL2DisplacementMapTexture != 0)
				{
					CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice));
					CUDA_V_RETURN(cudaGraphicsUnregisterResource(pcdr.m_pGL2RegisteredDisplacementMapResource));
					pcdr.m_pGL2RegisteredDisplacementMapResource = NULL;
				}
			}
            m_DisplacementMapIsCUDARegistered = false;
        }
        break;
#endif
	case nv_water_d3d_api_none:
        {
			for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
			{
				NoGraphicsObjects::PerCudaDeviceResources& pcdr = m_d3d._noGFX.m_pNoGraphicsPerCudaDeviceResources[cuda_dev_index];
				if(pcdr.m_Device_displacementMap)
				{
					CUDA_V_RETURN(cudaSetDevice(m_pCudaDeviceStates[cuda_dev_index].m_cudaDevice));
					CUDA_SAFE_FREE(pcdr.m_Device_displacementMap);
				}
			}
            m_DisplacementMapIsCUDARegistered = false;
        }
        break;
     default:
        return E_FAIL;
    }

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::updateH0(const CudaDeviceState& cu_dev_state, cudaStream_t cu_kernel_stream)
{
	CUDA_V_RETURN(cuda_ComputeH0(m_resolution, cu_dev_state.m_constantsIndex, cu_kernel_stream));

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::initGaussAndOmega()
{
	int omega_width = m_resolution + 4;
	int gauss_width = gauss_map_resolution + 4;

	float2* gauss = new float2[gauss_map_size];
	float* omega = new float[omega_width * (m_resolution + 1)];

	GFSDK_WaveWorks_Simulation_Util::init_gauss(m_params, gauss);
	GFSDK_WaveWorks_Simulation_Util::init_omega(m_params, omega);

	// copy actually used gauss window around center of max resolution buffer
	// note that we need to generate full resolution to maintain pseudo-randomness
	float2* gauss_src = gauss + (gauss_map_resolution - m_resolution) / 2 * (1 + gauss_width);
	for(int i=0; i<m_resolution; ++i)
		memmove(gauss + i * m_resolution, gauss_src + i * gauss_width, m_resolution * sizeof(float2));

	// strip unneeded padding
	for(int i=0; i<m_half_resolution_plus_one; ++i)
		memmove(omega + i * m_half_resolution_plus_one, omega + i * omega_width, m_half_resolution_plus_one * sizeof(float));

	int gauss_size = m_resolution * m_resolution;
	int omega_size = m_half_resolution_plus_one * m_half_resolution_plus_one;

	for(unsigned int cuda_dev_index = 0; cuda_dev_index != m_numCudaDevices; ++cuda_dev_index)
	{
		CudaDeviceState& dev_state = m_pCudaDeviceStates[cuda_dev_index];
		CUDA_V_RETURN(cudaSetDevice(dev_state.m_cudaDevice));

		CUDA_V_RETURN(cudaMemcpy(dev_state.m_device_Gauss, gauss, gauss_size * sizeof(float2), cudaMemcpyHostToDevice));
		CUDA_V_RETURN(cudaMemcpy(dev_state.m_device_Omega, omega, omega_size * sizeof(float), cudaMemcpyHostToDevice));
	}

    SAFE_DELETE_ARRAY(gauss);
    SAFE_DELETE_ARRAY(omega);

    m_GaussAndOmegaInitialised = true;

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::waitForAllInFlightReadbacks()
{
	HRESULT hr;

	// Consume the readbacks
	int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots;
	while(wait_slot != m_end_inflight_readback_slots)
	{
		V_RETURN(collectSingleReadbackResult(true));
		wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots;
	}

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::waitForAllInFlightTimers()
{
    // The slot after the active slot is always the first in-flight slot
    for(	int slot = (m_active_timer_slot + 1) % NumTimerSlots;
            slot != m_end_inflight_timer_slots;
            slot = (slot + 1) % NumTimerSlots
            )
    {
		CUDA_V_RETURN(cudaSetDevice(m_timer_slots[slot].m_cudaDevice));
        CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[slot].m_start_timer_evt));
		CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[slot].m_stop_timer_evt));
    }

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::consumeAvailableReadbackSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, ReadbackSlot** ppSlot)
{
    if(m_active_readback_slot == m_end_inflight_readback_slots)
    {
        // No slots available - we must wait for the oldest in-flight readback to complete
        int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots;
		CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice));
        CUDA_V_RETURN(cudaEventSynchronize(m_readback_slots[wait_slot].m_completion_evt));
        m_active_readback_slot = wait_slot;
		m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz;

		// Restore the CUDA device
		CUDA_V_RETURN(cudaSetDevice(cu_dev_state.m_cudaDevice));
    }

    // Consume a slot!
    *ppSlot = &m_readback_slots[m_end_inflight_readback_slots];
	(*ppSlot)->m_cudaDevice = cu_dev_state.m_cudaDevice;
	(*ppSlot)->m_completion_evt = cu_dev_state.m_readback_completion_evts[m_end_inflight_readback_slots];
	(*ppSlot)->m_staging_evt = cu_dev_state.m_readback_staging_evts[m_end_inflight_readback_slots];
	(*ppSlot)->m_device_Dxyz = cu_dev_state.m_readback_device_Dxyzs[m_end_inflight_readback_slots];
	(*ppSlot)->m_kickID = kickID;
    m_end_inflight_readback_slots = (m_end_inflight_readback_slots + 1) % NumReadbackSlots;

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::consumeAvailableTimerSlot(CudaDeviceState& cu_dev_state, gfsdk_U64 kickID, TimerSlot** ppSlot)
{
	HRESULT hr;

    if(m_active_timer_slot == m_end_inflight_timer_slots)
    {
        // No slots available - we must wait for the oldest in-flight timer to complete
        int wait_slot = (m_active_timer_slot + 1) % NumTimerSlots;
		CUDA_V_RETURN(cudaSetDevice(m_timer_slots[wait_slot].m_cudaDevice));
        CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[wait_slot].m_start_timer_evt));
		CUDA_V_RETURN(cudaEventSynchronize(m_timer_slots[wait_slot].m_stop_timer_evt));
        m_active_timer_slot = wait_slot;
		V_RETURN(getElapsedTimeForActiveSlot());

		// Restore the CUDA device
		CUDA_V_RETURN(cudaSetDevice(cu_dev_state.m_cudaDevice));
    }

    // Consume a slot!
    *ppSlot = &m_timer_slots[m_end_inflight_timer_slots];
	(*ppSlot)->m_cudaDevice = cu_dev_state.m_cudaDevice;
	(*ppSlot)->m_start_timer_evt = cu_dev_state.m_start_timer_evts[m_end_inflight_timer_slots];
	(*ppSlot)->m_stop_timer_evt = cu_dev_state.m_stop_timer_evts[m_end_inflight_timer_slots];
	(*ppSlot)->m_elapsed_time = 0.f;
	(*ppSlot)->m_kickID = kickID;
    m_end_inflight_timer_slots = (m_end_inflight_timer_slots + 1) % NumTimerSlots;

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::resetReadbacks()
{
	HRESULT hr;

	if(!m_ReadbackInitialised)
	{
		// Nothing to reset
		return S_OK;
	}

	V_RETURN(waitForAllInFlightReadbacks());

    m_active_readback_slot = 0;
	m_active_readback_host_Dxyz = NULL;
    m_end_inflight_readback_slots = 1;
	m_readback_slots[m_active_readback_slot].m_kickID = GFSDK_WaveWorks_InvalidKickID;

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::canCollectSingleReadbackResultWithoutBlocking()
{
	if(!m_ReadbackInitialised)
	{
		return S_FALSE;
	}

    const int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots;
	if(wait_slot == m_end_inflight_readback_slots)
	{
		// Nothing in-flight...
		return S_FALSE;
	}

	// Do the query
	CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice));
	const cudaError_t query_result = cudaEventQuery(m_readback_slots[wait_slot].m_completion_evt);
	if(cudaSuccess == query_result)
	{
		// Whaddyaknow, it's ready!
		return S_OK;
	}
	else if(cudaQueryResultIsError(query_result))
	{
		// Fail
		return E_FAIL;
	}
	else
	{
		// Not ready
		return S_FALSE;
	}
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::collectSingleReadbackResult(bool blocking)
{
	if(!m_ReadbackInitialised)
	{
		return S_FALSE;
	}

    const int wait_slot = (m_active_readback_slot + 1) % NumReadbackSlots;

    // Just consume one readback result per check (per function name!)
    if(wait_slot != m_end_inflight_readback_slots)
    {
		CUDA_V_RETURN(cudaSetDevice(m_readback_slots[wait_slot].m_cudaDevice));

		if(blocking)
		{
			CUDA_V_RETURN(cudaEventSynchronize(m_readback_slots[wait_slot].m_completion_evt));
            m_active_readback_slot = wait_slot;
			m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz;
			return S_OK;
		}
		else
		{
			const cudaError_t query_result = cudaEventQuery(m_readback_slots[wait_slot].m_completion_evt);
			if(cudaSuccess == query_result)
			{
				m_active_readback_slot = wait_slot;
				m_active_readback_host_Dxyz = m_readback_slots[wait_slot].m_host_Dxyz;
				return S_OK;
			}
			else if(cudaQueryResultIsError(query_result))
			{
				return E_FAIL;
			}
		}
    }

	// Nothing in-flight, or else not ready yet
    return S_FALSE;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::getElapsedTimeForActiveSlot()
{
	float elapsed_ms;
	CUDA_V_RETURN(cudaEventElapsedTime(&elapsed_ms, m_timer_slots[m_active_timer_slot].m_start_timer_evt, m_timer_slots[m_active_timer_slot].m_stop_timer_evt));
	m_timer_slots[m_active_timer_slot].m_elapsed_time = elapsed_ms;

	return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::queryTimers()
{
	HRESULT hr;

    const int wait_slot = (m_active_timer_slot + 1) % NumTimerSlots;

    // Just consume one timer result per check
    if(wait_slot != m_end_inflight_timer_slots)
    {
		CUDA_V_RETURN(cudaSetDevice(m_timer_slots[wait_slot].m_cudaDevice));
        const cudaError_t query_result_start = cudaEventQuery(m_timer_slots[wait_slot].m_start_timer_evt);
		const cudaError_t query_result_stop = cudaEventQuery(m_timer_slots[wait_slot].m_stop_timer_evt);
        if(cudaSuccess == query_result_start && cudaSuccess == query_result_stop)
        {
            m_active_timer_slot = wait_slot;
			V_RETURN(getElapsedTimeForActiveSlot());
        }
        else if(cudaQueryResultIsError(query_result_start) || cudaQueryResultIsError(query_result_stop))
        {
            return E_FAIL;
        }
    }

    return S_OK;
}

void NVWaveWorks_FFT_Simulation_CUDA_Impl::addDisplacements(	const BYTE* pReadbackData,
																const gfsdk_float2* inSamplePoints,
																gfsdk_float4* outDisplacements,
																UINT numSamples,
																float multiplier
																)
{
	switch(m_d3dAPI)
	{
	case nv_water_d3d_api_d3d11:
	case nv_water_d3d_api_gl2:
		// These paths use the surface<>-based variants of the CUDA kernels, which output to 16F
		{
			const UINT row_pitch = sizeof(ushort4) * m_resolution;
			GFSDK_WaveWorks_Simulation_Util::add_displacements_float16(m_params, pReadbackData, row_pitch, inSamplePoints, outDisplacements, numSamples, multiplier);
		}
		break;
	default:
		{
			const UINT row_pitch = sizeof(float4) * m_resolution;
			GFSDK_WaveWorks_Simulation_Util::add_displacements_float32(m_params, pReadbackData, row_pitch, inSamplePoints, outDisplacements, numSamples, multiplier);
		}
		break;
	}
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::addDisplacements(	const gfsdk_float2* inSamplePoints,
																gfsdk_float4* outDisplacements,
																UINT numSamples
																)
{
	if(!getReadbackCursor(NULL))
	{
		return S_OK;
	}

    const BYTE* pRB = reinterpret_cast<BYTE*>(m_active_readback_host_Dxyz);
	addDisplacements(pRB, inSamplePoints, outDisplacements, numSamples);

    return S_OK;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::addArchivedDisplacements(	float coord,
																		const gfsdk_float2* inSamplePoints,
																		gfsdk_float4* outDisplacements,
																		UINT numSamples
																		)
{
	if(NULL == m_pReadbackFIFO)
	{
		// No FIFO, nothing to add
		return S_OK;
	}
	else if(0 == m_pReadbackFIFO->range_count())
	{
		// No entries, nothing to add
		return S_OK;
	}

	const float coordMax = float(m_pReadbackFIFO->range_count()-1);

	// Clamp coord to archived range
	float coord_clamped = coord;
	if(coord_clamped < 0.f)
		coord_clamped = 0.f;
	else if(coord_clamped > coordMax)
		coord_clamped = coordMax;

	// Figure out what interp is required
	const float coord_round = floorf(coord_clamped);
	const float coord_frac = coord_clamped - coord_round;
	const int coord_lower = (int)coord_round;
	if(0.f != coord_frac)
	{
		const int coord_upper = coord_lower + 1;

		addDisplacements(
			(const BYTE*)m_pReadbackFIFO->range_at(coord_lower).host_Dxyz,
			inSamplePoints, outDisplacements, numSamples,
			1.f-coord_frac);

		addDisplacements(
			(const BYTE*)m_pReadbackFIFO->range_at(coord_upper).host_Dxyz,
			inSamplePoints, outDisplacements, numSamples,
			coord_frac);
	}
	else
	{
		addDisplacements(
			(const BYTE*)m_pReadbackFIFO->range_at(coord_lower).host_Dxyz,
			inSamplePoints, outDisplacements, numSamples,
			1.f);
	}

    return S_OK;
}

bool NVWaveWorks_FFT_Simulation_CUDA_Impl::getReadbackCursor(gfsdk_U64* pKickID)
{
	if(!m_params.readback_displacements || !m_ReadbackInitialised)
	{
		return false;
	}

	if(GFSDK_WaveWorks_InvalidKickID == m_readback_slots[m_active_readback_slot].m_kickID)
	{
		// No results yet
		return false;
	}

	if(pKickID)
	{
		*pKickID = m_readback_slots[m_active_readback_slot].m_kickID;
	}

	return true;
}

bool NVWaveWorks_FFT_Simulation_CUDA_Impl::hasReadbacksInFlight() const
{
	if(!m_params.readback_displacements || !m_ReadbackInitialised)
	{
		return false;
	}

	int begin_inflight_readback_slots = (m_active_readback_slot + 1) % NumReadbackSlots;
	return begin_inflight_readback_slots != m_end_inflight_readback_slots;
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::getTimings(NVWaveWorks_FFT_Simulation_Timings& timings) const
{
	timings.GPU_simulation_time = m_timer_slots[m_active_timer_slot].m_elapsed_time;
	timings.GPU_FFT_simulation_time = 0.0f;
	return S_OK;
}

ID3D11ShaderResourceView** NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapD3D11()
{
#if WAVEWORKS_ENABLE_D3D11
	assert(m_d3dAPI == nv_water_d3d_api_d3d11);
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	return m_d3d._11.m_pd3d11PerCudaDeviceResources ? &m_d3d._11.m_pd3d11PerCudaDeviceResources[activeCudaDeviceIndex].m_pd3d11DisplacementMap : NULL;
#else
    return NULL;
#endif
}

GLuint NVWaveWorks_FFT_Simulation_CUDA_Impl::GetDisplacementMapGL2()
{
#if WAVEWORKS_ENABLE_GL
	assert(m_d3dAPI == nv_water_d3d_api_gl2);
	const int activeCudaDeviceIndex = m_pManager->GetActiveCudaDeviceIndex();
	return m_d3d._GL2.m_pGL2PerCudaDeviceResources ? m_d3d._GL2.m_pGL2PerCudaDeviceResources[activeCudaDeviceIndex].m_GL2DisplacementMapTexture : NULL;
#else
	return 0;
#endif
}

cudaGraphicsResource* NVWaveWorks_FFT_Simulation_CUDA_Impl::getInteropResource(unsigned int deviceIndex)
{
	switch(m_d3dAPI)
	{
#if WAVEWORKS_ENABLE_D3D11
	case nv_water_d3d_api_d3d11:
		return m_d3d._11.m_pd3d11PerCudaDeviceResources[deviceIndex].m_pd3d11RegisteredDisplacementMapResource;
#endif

#if WAVEWORKS_ENABLE_GL
	case nv_water_d3d_api_gl2:
		return m_d3d._GL2.m_pGL2PerCudaDeviceResources[deviceIndex].m_pGL2RegisteredDisplacementMapResource;
#endif

	default:
		assert(false);	// ...shouldn't ever happen
		return NULL;
	}
}

HRESULT NVWaveWorks_FFT_Simulation_CUDA_Impl::archiveDisplacements()
{
	gfsdk_U64 kickID = GFSDK_WaveWorks_InvalidKickID;
	if(getReadbackCursor(&kickID) && m_pReadbackFIFO)
	{
		// We avoid big memcpys by swapping pointers, specifically we will either evict a FIFO entry or else use a free one and
		// swap it with one of the slots used for in-flight readbacks
		//
		// First job is to check whether the FIFO already contains this result. We know that if it does contain this result,
		// it will be the last one pushed on...
		if(m_pReadbackFIFO->range_count())
		{
			if(kickID == m_pReadbackFIFO->range_at(0).kickID)
			{
				// It is an error to archive the same results twice...
				return E_FAIL;
			}
		}

		// Assuming the current results have not been archived, the next-up readback buffer should match the one we are serving up
		// for addDisplacements...
		assert(m_active_readback_host_Dxyz == m_readback_slots[m_active_readback_slot].m_host_Dxyz);

		ReadbackFIFOSlot& slot = m_pReadbackFIFO->consume_one();
		m_readback_slots[m_active_readback_slot].m_host_Dxyz = slot.host_Dxyz;
		slot.host_Dxyz = m_active_readback_host_Dxyz;
		slot.kickID = kickID;
	}

	return S_OK;
}

#endif //SUPPORT_CUDA